Niveytha27 committed on
Commit 1ae29e2 · verified · 1 Parent(s): 28f8859

Update app.py

Files changed (1):
  1. app.py +83 -74
app.py CHANGED
@@ -3,59 +3,94 @@ import io
import re
import numpy as np
import faiss
- import time
- import gradio as gr
+ import torch
from pypdf import PdfReader
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from bert_score import score
+ import gradio as gr

- def chunk_text(text, chunk_size=700, overlap_size=150):
-     """Chunks text without breaking words in the middle (corrected overlap)."""
-     chunks = []
-     start = 0
-     text_length = len(text)
-
-     while start < text_length:
-         end = min(start + chunk_size, text_length)
-
-         # Ensure we do not split words
-         if end < text_length and text[end].isalnum():
-             last_space = text.rfind(" ", start, end)  # Find last space within the chunk
-             if last_space != -1:  # If a space is found, adjust the end
-                 end = last_space
-
-         chunk = text[start:end].strip()
-         if chunk:  # Avoid empty chunks
-             chunks.append(chunk)
-
-         if end == text_length:
-             break
-
-         # Corrected overlap calculation
-         overlap_start = max(0, end - overlap_size)
-         if overlap_start < end:  # Prevent infinite loop if overlap_start is equal to end.
-             last_overlap_space = text.rfind(" ", 0, overlap_start)
-             if last_overlap_space != -1 and last_overlap_space > start:
-                 start = last_overlap_space + 1
-             else:
-                 start = end  # If no space found, start at the last end.
-         else:
-             start = end
-
-     return chunks
-
- chunks = []
- for data in all_data:
-     chunks.extend(chunk_text(data))
-
- embedding_model = SentenceTransformer("BAAI/bge-large-en")
- # embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
- embeddings = embedding_model.encode(chunks)
-
- index = faiss.IndexFlatL2(embeddings.shape[1])
- index.add(embeddings)
+ # --- Preload Data ---
+ DEFAULT_PDF_URLS = [
+     "https://www.latentview.com/wp-content/uploads/2023/07/LatentView-Annual-Report-2022-23.pdf",
+     "https://www.latentview.com/wp-content/uploads/2024/08/LatentView-Annual-Report-2023-24.pdf"
+ ]
+
+ def preload_data(pdf_urls):
+     embedding_model = SentenceTransformer("BAAI/bge-large-en")
+
+     def download_pdf(url):
+         response = requests.get(url, stream=True)
+         response.raise_for_status()
+         return response.content
+
+     def extract_text_from_pdf(pdf_bytes):
+         pdf_file = io.BytesIO(pdf_bytes)
+         reader = PdfReader(pdf_file)
+         text = ""
+         for page in reader.pages:
+             text += page.extract_text() or ""
+         return text
+
+     def preprocess_text(text):
+         financial_symbols = r"\$\€\₹\£\¥\₩\₽\₮\₦\₲"
+         text = re.sub(fr"[^\w\s{financial_symbols}.,%/₹$€¥£-]", "", text)
+         text = re.sub(r'\s+', ' ', text).strip()
+         return text
+
+     def chunk_text(text, chunk_size=700, overlap_size=150):
+         chunks = []
+         start = 0
+         text_length = len(text)
+         while start < text_length:
+             end = min(start + chunk_size, text_length)
+             if end < text_length and text[end].isalnum():
+                 last_space = text.rfind(" ", start, end)
+                 if last_space != -1:
+                     end = last_space
+             chunk = text[start:end].strip()
+             if chunk:
+                 chunks.append(chunk)
+             if end == text_length:
+                 break
+             overlap_start = max(0, end - overlap_size)
+             if overlap_start < end:
+                 last_overlap_space = text.rfind(" ", 0, overlap_start)
+                 if last_overlap_space != -1 and last_overlap_space > start:
+                     start = last_overlap_space + 1
+                 else:
+                     start = end
+             else:
+                 start = end
+         return chunks
+
+     all_data = []
+     for url in pdf_urls:
+         pdf_bytes = download_pdf(url)
+         text = extract_text_from_pdf(pdf_bytes)
+         preprocessed_text = preprocess_text(text)
+         all_data.append(preprocessed_text)
+
+     chunks = []
+     for data in all_data:
+         chunks.extend(chunk_text(data))
+
+     embeddings = embedding_model.encode(chunks)
+     index = faiss.IndexFlatL2(embeddings.shape[1])
+     index.add(embeddings)
+
+     return index, chunks
+
+ index, chunks = preload_data(DEFAULT_PDF_URLS)
+ embedding_model = SentenceTransformer("BAAI/bge-large-en")
+ accelerator = Accelerator()
+ MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", trust_remote_code=True, cache_dir="./my_models")
+ model = accelerator.prepare(model)
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

def bm25_retrieval(query, documents, top_k=3):
    tokenized_docs = [doc.split() for doc in documents]
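The nested chunk_text walks the text in word-aligned windows, then restarts the next window inside the tail of the previous one. A toy-sized sketch of the behavior (the sample string and sizes are illustrative; chunk_text is nested inside preload_data above, so assume it is lifted to module scope for this experiment):

```python
# Shrunken chunk_size/overlap_size make the overlap visible.
text = "annual revenue grew twelve percent driven by analytics services demand"
for c in chunk_text(text, chunk_size=30, overlap_size=10):
    print(repr(c))
# Each chunk ends on a word boundary and repeats the last few words of its
# predecessor, e.g. 'annual revenue grew twelve' then 'grew twelve percent driven by'.
```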
@@ -75,47 +110,26 @@ def rerank(query, results):
    similarities = np.dot(result_embeddings, query_embedding.T).flatten()
    return [results[i] for i in np.argsort(similarities)[::-1]], similarities

- #Chunk merging.
def merge_chunks(retrieved_chunks, overlap_size=100):
-     """Merges overlapping chunks properly by detecting the actual overlap."""
    merged_chunks = []
    buffer = retrieved_chunks[0] if retrieved_chunks else ""
-
    for i in range(1, len(retrieved_chunks)):
        chunk = retrieved_chunks[i]
-
-         # Find actual overlap
-         overlap_start = buffer[-overlap_size:]  # Get the last `overlap_size` chars of the previous chunk
-         overlap_index = chunk.find(overlap_start)  # Find where this part appears in the new chunk
-
+         overlap_start = buffer[-overlap_size:]
+         overlap_index = chunk.find(overlap_start)
        if overlap_index != -1:
-             # Merge only the non-overlapping part
            buffer += chunk[overlap_index + overlap_size:]
        else:
-             # Store completed merged chunk and start a new one
            merged_chunks.append(buffer)
            buffer = chunk
-
    if buffer:
        merged_chunks.append(buffer)
-
    return merged_chunks

- # def calculate_confidence(query, context, similarities):
- #     return np.mean(similarities)  # Averaged similarity scores
def calculate_confidence(query, answer):
    P, R, F1 = score([answer], [query], lang="en", verbose=False)
    return F1.item()

- # Load SLM
- accelerator = Accelerator()
- accelerator.free_memory()
- MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", cache_dir="./my_models")
- model = accelerator.prepare(model)
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
def generate_response(query, context):
    prompt = f"""Your task is to analyze the given Context and answer the Question concisely in plain English.
    **Guidelines:**
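merge_chunks only splices two retrieved chunks together when the last overlap_size characters of the running buffer literally reappear in the next chunk; otherwise it closes the buffer and starts a new merged chunk. A minimal sketch with illustrative strings (overlap_size must match the shared span for the splice to trigger):

```python
# "brown fox" (the buffer's last 9 characters) is found at the start of the
# next chunk, so only the non-overlapping remainder is appended.
pieces = ["the quick brown fox", "brown fox jumps over the dog"]
print(merge_chunks(pieces, overlap_size=9))
# ['the quick brown fox jumps over the dog']
```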
@@ -130,12 +144,7 @@ def generate_response(query, context):
    answer = response.split("Answer:")[1].strip()
    return answer

- def process_query(pdf_urls_text, query):
-     pdf_urls = [url.strip() for url in pdf_urls_text.split("\n") if url.strip()]
-     if not pdf_urls:
-         return "Please enter at least one PDF URL."
-
-     index, chunks = load_and_index_data(pdf_urls)
+ def process_query(query):
    retrieved_chunks = adaptive_retrieval(query, index, chunks)
    merged_chunks = merge_chunks(retrieved_chunks, 150)
    reranked_chunks, similarities = rerank(query, merged_chunks)
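Note that calculate_confidence scores the generated answer against the query with BERTScore, so it measures answer-question similarity rather than faithfulness to the retrieved context. A self-contained sketch of the underlying call (the strings are illustrative; bert_score downloads its scoring model on first use):

```python
from bert_score import score

# One F1 value per (candidate, reference) pair; here a single pair.
P, R, F1 = score(["Revenue grew 12% year over year."],
                 ["What was the revenue growth?"],
                 lang="en", verbose=False)
print(round(F1.item(), 3))
```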
@@ -147,11 +156,11 @@ def process_query(pdf_urls_text, query):

iface = gr.Interface(
    fn=process_query,
-     inputs=[gr.Textbox(lines=3, placeholder="Enter PDF URLs (one per line)"), gr.Textbox(placeholder="Enter your financial question")],
+     inputs=gr.Textbox(placeholder="Enter your financial question"),
    outputs="text",
    title="Financial Document Q&A Chatbot",
-     description="Enter PDF URLs and your question to get answers from the documents."
+     description="Ask questions about the preloaded financial documents."
)

iface.launch()
- accelerator.free_memory()
+ accelerator.free_memory()
 
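With the PDFs now preloaded at import time, the single-textbox app can also be exercised programmatically once it is running. A sketch using gradio_client (the local URL and the /predict endpoint name are assumptions; gr.Interface exposes /predict by default):

```python
from gradio_client import Client

# Assumes the app is serving locally on Gradio's default port.
client = Client("http://127.0.0.1:7860")
answer = client.predict("What does the FY2023-24 annual report say about revenue growth?",
                        api_name="/predict")
print(answer)
```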