Niveytha27 committed on
Commit c44e6e8 · verified · 1 Parent(s): 9c70e19

Update app.py

Rework the Q&A pipeline: per-page extraction with paragraph and NLTK sentence chunking plus document/section metadata, parallel PDF processing, hybrid BM25 + FAISS retrieval, keyword + cross-encoder reranking, an embedding-based confidence score, a sensitive/toxic-query guardrail, a pickled vector-index cache, and a model switch from microsoft/phi-2 to deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B behind an explicit Initialize step.

Files changed (1)
  1. app.py +234 -136
app.py CHANGED
@@ -1,165 +1,263 @@
 
 import requests
 import io
-import re
 import numpy as np
 import faiss
-import torch
-from pypdf import PdfReader
-from rank_bm25 import BM25Okapi
-from sentence_transformers import SentenceTransformer
 from accelerate import Accelerator
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from bert_score import score
-import gradio as gr

-# --- Preload Data ---
-DEFAULT_PDF_URLS = [
-    "https://www.latentview.com/wp-content/uploads/2023/07/LatentView-Annual-Report-2022-23.pdf",
-    "https://www.latentview.com/wp-content/uploads/2024/08/LatentView-Annual-Report-2023-24.pdf"
-]
-embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

-def preload_data(pdf_urls):
-
-    def download_pdf(url):
-        response = requests.get(url, stream=True)
-        response.raise_for_status()
-        return response.content
-
-    def extract_text_from_pdf(pdf_bytes):
-        pdf_file = io.BytesIO(pdf_bytes)
-        reader = PdfReader(pdf_file)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text() or ""
-        return text
-
-    def preprocess_text(text):
-        financial_symbols = r"\$\€\₹\£\¥\₩\₽\₮\₦\₲"
-        text = re.sub(fr"[^\w\s{financial_symbols}.,%/₹$€¥£-]", "", text)
-        text = re.sub(r'\s+', ' ', text).strip()
-        return text
-
-    def chunk_text(text, chunk_size=1024, overlap_size=100):
-        chunks = []
-        start = 0
-        text_length = len(text)
-        while start < text_length:
-            end = min(start + chunk_size, text_length)
-            if end < text_length and text[end].isalnum():
-                last_space = text.rfind(" ", start, end)
-                if last_space != -1:
-                    end = last_space
-            chunk = text[start:end].strip()
-            if chunk:
-                chunks.append(chunk)
-            if end == text_length:
-                break
-            overlap_start = max(0, end - overlap_size)
-            if overlap_start < end:
-                last_overlap_space = text.rfind(" ", 0, overlap_start)
-                if last_overlap_space != -1 and last_overlap_space > start:
-                    start = last_overlap_space + 1
-                else:
-                    start = end
-            else:
-                start = end
-        return chunks
-
-    all_data = []
-    for url in pdf_urls:
-        pdf_bytes = download_pdf(url)
-        text = extract_text_from_pdf(pdf_bytes)
-        preprocessed_text = preprocess_text(text)
-        all_data.append(preprocessed_text)
-
-    chunks = []
-    for data in all_data:
-        chunks.extend(chunk_text(data))
-
-    embeddings = embedding_model.encode(chunks)
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-
-    return index, chunks

-index, chunks = preload_data(DEFAULT_PDF_URLS)
-accelerator = Accelerator()
-MODEL_NAME = "microsoft/phi-2"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", trust_remote_code=True, cache_dir="./my_models")
-model = accelerator.prepare(model)
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

-def bm25_retrieval(query, documents, top_k=3):
-    tokenized_docs = [doc.split() for doc in documents]
     bm25 = BM25Okapi(tokenized_docs)
-    return [documents[i] for i in np.argsort(bm25.get_scores(query.split()))[::-1][:top_k]]

-def adaptive_retrieval(query, index, chunks, top_k=3, bm25_weight=0.5):
-    query_embedding = embedding_model.encode([query], convert_to_numpy=True, dtype=np.float16)
     _, indices = index.search(query_embedding, top_k)
     vector_results = [chunks[i] for i in indices[0]]
     bm25_results = bm25_retrieval(query, chunks, top_k)
-    return list(set(vector_results + bm25_results))

-def rerank(query, results):
     query_embedding = embedding_model.encode([query], convert_to_numpy=True)
-    result_embeddings = embedding_model.encode(results, convert_to_numpy=True)
-    similarities = np.dot(result_embeddings, query_embedding.T).flatten()
-    return [results[i] for i in np.argsort(similarities)[::-1]], similarities
-
-def merge_chunks(retrieved_chunks, overlap_size=100):
-    merged_chunks = []
-    buffer = retrieved_chunks[0] if retrieved_chunks else ""
-    for i in range(1, len(retrieved_chunks)):
-        chunk = retrieved_chunks[i]
-        overlap_start = buffer[-overlap_size:]
-        overlap_index = chunk.find(overlap_start)
-        if overlap_index != -1:
-            buffer += chunk[overlap_index + overlap_size:]
-        else:
-            merged_chunks.append(buffer)
-            buffer = chunk
-    if buffer:
-        merged_chunks.append(buffer)
-    return merged_chunks
-
-def calculate_confidence(query, answer):
-    P, R, F1 = score([answer], [query], lang="en", verbose=False)
-    return F1.item()

 def generate_response(query, context):
-    prompt = f"""Your task is to analyze the given Context and answer the Question concisely in plain English.
-**Guidelines:**
-- Do NOT include </think> tag, just provide the final answer only.
-- Provide a direct, factual answer based strictly on the Context.
-- Avoid generating Python code, solutions, or any irrelevant information.
-Context: {context}
-Question: {query}
-Answer:
-"""
-    response = generator(prompt, max_new_tokens=150, num_return_sequences=1)[0]['generated_text']
-    answer = response.split("Answer:")[1].strip()
-    return answer

 def process_query(query):
     retrieved_chunks = adaptive_retrieval(query, index, chunks)
-    merged_chunks = merge_chunks(retrieved_chunks, 50)
-    reranked_chunks, similarities = rerank(query, merged_chunks)
-    context = " ".join(reranked_chunks[:3])
-    answer = generate_response(query, context)
-    confidence = calculate_confidence(query, answer)
-    full_response = f"{answer}\n\nConfidence: {confidence:.2f}"
     return full_response

 iface = gr.Interface(
-    fn=process_query,
-    inputs=gr.Textbox(placeholder="Enter your financial question"),
-    outputs="text",
-    title="Financial Document Q&A Chatbot",
-    description="Ask questions about the preloaded financial documents."
 )

-iface.launch()
-accelerator.free_memory()

+import gradio as gr
 import requests
 import io
+from pypdf import PdfReader
+from concurrent.futures import ThreadPoolExecutor, as_completed
 import numpy as np
 import faiss
+from sentence_transformers import SentenceTransformer, CrossEncoder
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from accelerate import Accelerator
+from langchain.text_splitter import NLTKTextSplitter
+from rank_bm25 import BM25Okapi
+import os
+import pickle
+import nltk
+
+nltk.download('punkt', quiet=True)  # NLTKTextSplitter relies on the NLTK punkt tokenizer
+
+# --- Global Variables for Caching ---
+index = None
+chunks = None
+embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+rerank_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+generator = None
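+
+# NOTE: index, chunks, and generator stay None until initialize_app() runs;
+# process_query assumes the "Initialize" button has been clicked first.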
+
+# --- PDF Processing and Embedding ---
+def download_pdf(url):
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+    return response.content
+
+def custom_chunking(text, delimiter="\n\n"):
+    """Splits text based on a specified delimiter."""
+    return text.split(delimiter)
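+
+# The extractor below makes two chunking passes per page: a paragraph split on
+# blank lines (custom_chunking), then a sentence-aware NLTKTextSplitter pass
+# capped at roughly 500 characters. The first 20 words of each page stand in
+# as a crude section header for the chunk metadata.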
+
+def extract_text_from_pdf(pdf_bytes, document_id):
+    """Extracts text from a PDF, page by page, and then chunks each page."""
+    pdf_file = io.BytesIO(pdf_bytes)
+    reader = PdfReader(pdf_file)
+    nltk_splitter = NLTKTextSplitter(chunk_size=500)
+
+    extracted_data = []
+    for page_num, page in enumerate(reader.pages):
+        page_text = page.extract_text() or ""
+        clean_text = " ".join(page_text.split())
+
+        if clean_text:
+            words = clean_text.split()
+            section_header = " ".join(words[:20]) if words else "No Section Name Found"
+            # Split paragraphs on the raw page text; normalizing whitespace first
+            # would erase the "\n\n" boundaries that custom_chunking splits on.
+            custom_chunks = custom_chunking(page_text)
+
+            for custom_chunk in custom_chunks:
+                clean_custom_chunk = " ".join(custom_chunk.split())
+                if clean_custom_chunk:
+                    nltk_chunks = nltk_splitter.split_text(clean_custom_chunk)
+
+                    for nltk_chunk in nltk_chunks:
+                        clean_nltk_chunk = " ".join(nltk_chunk.split())
+                        if clean_nltk_chunk:
+                            extracted_data.append({
+                                "document_id": document_id,
+                                "section_header": section_header,
+                                "text": clean_nltk_chunk
+                            })
+    return extracted_data
+
+def process_single_pdf(url, doc_id):
+    """Processes a single PDF."""
+    pdf_bytes = download_pdf(url)
+    return extract_text_from_pdf(pdf_bytes, doc_id)
+
+def process_pdfs_parallel(pdf_urls, document_ids):
+    """Processes multiple PDFs in parallel."""
+    all_data = []
+    with ThreadPoolExecutor() as pdf_executor:
+        pdf_futures = [pdf_executor.submit(process_single_pdf, url, doc_id) for url, doc_id in zip(pdf_urls, document_ids)]
+        for future in as_completed(pdf_futures):
+            all_data.extend(future.result())
+    return all_data
+
+def create_embeddings_and_index(data):
+    """Creates embeddings and a FAISS L2 index over the chunk texts."""
+    texts = [item['text'] for item in data]
+    embeddings = embedding_model.encode(texts)
+    dimension = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings)
+    return index, data
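+
+# IndexFlatL2 performs exact (brute-force) L2 search, which is fine at this
+# scale; encode() returns float32 vectors, the dtype FAISS expects.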
+
+# --- Retrieval Functions ---
+def bm25_retrieval(query, documents, top_k=10):
+    tokenized_docs = [doc['text'].split() for doc in documents]
     bm25 = BM25Okapi(tokenized_docs)
+    doc_scores = bm25.get_scores(query.split())
+    top_indices = np.argsort(doc_scores)[::-1][:top_k]
+    return [documents[i] for i in top_indices]
+
+def adaptive_retrieval(query, index, chunks, top_k=10):
+    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
     _, indices = index.search(query_embedding, top_k)
     vector_results = [chunks[i] for i in indices[0]]
     bm25_results = bm25_retrieval(query, chunks, top_k)
+    combined_results = vector_results + bm25_results
+
+    unique_results = []
+    seen_texts = set()
+    for result in combined_results:
+        if result['text'] not in seen_texts:
+            unique_results.append(result)
+            seen_texts.add(result['text'])
+
+    return unique_results
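+
+# Hybrid retrieval: dense (FAISS) and lexical (BM25) hits are unioned and
+# de-duplicated by chunk text, keeping the dense results' order first.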
+
+def rerank(query, results, keyword_weight=0.3, cross_encoder_weight=0.7):
+    """Combines keyword-based and cross-encoder reranking."""
+
+    # Keyword-based scoring
+    keywords = query.lower().split()
+    def score_chunk_keywords(chunk):
+        text = chunk['text'].lower()
+        return sum(1 for keyword in keywords if keyword in text)
+
+    keyword_scores = [score_chunk_keywords(chunk) for chunk in results]
+
+    # Cross-encoder scoring (uses the globally loaded rerank_model)
+    query_results = [[query, f"Document: {result['document_id']}, Section: {result['section_header']}, Text: {result['text']}"] for result in results]
+    cross_encoder_scores = rerank_model.predict(query_results)
+
+    # Combine scores
+    combined_scores = [(keyword_scores[i] * keyword_weight) + (cross_encoder_scores[i] * cross_encoder_weight) for i in range(len(results))]
+
+    # Rank and select top 3
+    ranked_results = [results[i] for i in np.argsort(combined_scores)[::-1]]
+    return ranked_results[:3]
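+
+# The combined score is 0.3 * raw keyword-hit count + 0.7 * cross-encoder
+# logit; the two are on different scales, so the weights are heuristic.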
+
+def merge_chunks(retrieved_chunks):
+    """Merges chunks in their reranked order, prefixing each with its metadata."""
+    merged_text = " ".join([
+        f"Document: {chunk['document_id']}, Section: {chunk['section_header']}, Text: {chunk['text']}"
+        for chunk in retrieved_chunks
+    ])
+    return merged_text
+
+# --- Confidence Calculation ---
+def calculate_confidence(query, context, answer):
+    """Calculates confidence from question-context and context-answer similarity."""
+    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
+    context_embedding = embedding_model.encode([context], convert_to_numpy=True)
+    answer_embedding = embedding_model.encode([answer], convert_to_numpy=True)
+
+    query_context_similarity = np.dot(query_embedding, context_embedding.T).item()
+    context_answer_similarity = np.dot(context_embedding, answer_embedding.T).item()
+    confidence = (query_context_similarity + context_answer_similarity) / 2.0  # equal weights
+
+    return confidence
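+
+# all-MiniLM-L6-v2 embeddings are unit-normalized, so these dot products are
+# cosine similarities in [-1, 1].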
+
+# --- Response Generation ---
 def generate_response(query, context):
+    prompt = f"""Your task is to analyze the given Context and answer the Question with a clear, relevant answer in plain English.
+
+**Guidelines:**
+- JUST PROVIDE ONLY THE ANSWER.
+- Provide an elaborate, factual answer based strictly on the Context.
+- Avoid generating Python code, solutions, or any irrelevant information.
+
+Context: {context}
+
+Question: {query}
+
+Answer:"""
+    response = generator(prompt, max_new_tokens=500, num_return_sequences=1)[0]['generated_text']
+    # The pipeline echoes the prompt; keep only the text generated after "Answer:".
+    answer = response.split("Answer:", 1)[-1].strip()
+    return answer
+
+# --- Guardrail ---
+def is_sensitive_query(query):
+    sensitive_keywords = ["personal", "address", "phone", "ssn", "credit card", "bank account", "password", "social security", "private", "location"]
+    query_lower = query.lower()
+    if any(keyword in query_lower for keyword in sensitive_keywords):
+        return True
+
+    # This reloads toxic-bert on every call; it could be hoisted to module
+    # level alongside the other models.
+    classifier = pipeline("text-classification", model="unitary/toxic-bert")
+    result = classifier(query)[0]
+
+    if result["label"] == "toxic" and result["score"] > 0.7:
+        return True
+
+    return False
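+
+# Pipeline per query: guardrail -> hybrid retrieval -> rerank (top 3) ->
+# merge with metadata -> generate -> embedding-based confidence score.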
+
+# --- Process Query ---
 def process_query(query):
+    if is_sensitive_query(query):
+        return "I cannot answer questions that involve sensitive or personal information, or that are toxic in nature."
+
     retrieved_chunks = adaptive_retrieval(query, index, chunks)
+    reranked_chunks = rerank(query, retrieved_chunks)
+    final_chunks = reranked_chunks[:3]
+    merged_result = merge_chunks(final_chunks)
+    answer = generate_response(query, merged_result)
+    # DeepSeek-R1 models emit a <think>...</think> trace; keep only the final answer.
+    if "</think>" in answer:
+        answer = answer.split("</think>", 1)[-1].strip()
+    confidence = calculate_confidence(query, merged_result, answer)
+    full_response = f"{answer}\n\nConfidence: {confidence:.2f}"
     return full_response
+
+# --- Initialization ---
+def initialize_app():
+    global index, chunks, generator
+
+    pdf_urls = ["https://www.latentview.com/wp-content/uploads/2023/07/LatentView-Annual-Report-2022-23.pdf",
+                "https://www.latentview.com/wp-content/uploads/2024/08/LatentView-Annual-Report-2023-24.pdf"]
+    document_ids = ["LatentView-Annual-Report-2022-23", "LatentView-Annual-Report-2023-24"]
+
+    if os.path.exists('vector_cache.pkl'):
+        with open('vector_cache.pkl', 'rb') as f:
+            index_bytes, chunks = pickle.load(f)
+        # FAISS indexes are not directly picklable; store/load a serialized form.
+        index = faiss.deserialize_index(index_bytes)
+    else:
+        extracted_data = process_pdfs_parallel(pdf_urls, document_ids)
+        index, chunks = create_embeddings_and_index(extracted_data)
+        with open('vector_cache.pkl', 'wb') as f:
+            pickle.dump((faiss.serialize_index(index), chunks), f)
+
+    accelerator = Accelerator()
+    accelerator.free_memory()
+    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
+    model = accelerator.prepare(model)
+    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+    return "Initialization Complete!"
+
+# --- Gradio Interface ---
+def gradio_interface(query):
+    return process_query(query)
+
 iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.Textbox(lines=2, placeholder="Enter your question here..."),
+    outputs=gr.Textbox(lines=5, placeholder="Answer will appear here..."),
+    title="Annual Report Q&A Chatbot (LatentView Analytics)",
+    description="Ask questions about the company's annual reports (2022-23 & 2023-24).",
+    examples=[
+        ["What is the total revenue from operations for 2023-24?"],
+        ["Who is the CEO of LatentView Analytics?"],
+        ["Summarize the key financial highlights in 2023-24"],
+        ["What were the total expenses for 2022-23?"],
+    ],
+    cache_examples=False,
 )

+with gr.Blocks() as demo:
+    gr.Markdown("# Annual Report Q&A Chatbot (LatentView Analytics)")
+    init_button = gr.Button("Initialize")
+    init_output = gr.Textbox(label="Initialization Status")
+
+    init_button.click(
+        fn=initialize_app,
+        inputs=[],
+        outputs=init_output,
+    )
+
+    iface.render()
+
+demo.launch()
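+
+# Launch flow: demo.launch() serves the Blocks UI; queries work once
+# initialize_app() has populated index, chunks, and generator.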