HamidOmarov committed on
Commit
88d2e91
·
1 Parent(s): a6ffef9

RAG: pdfminer fallback + last-file summary fallback + relaxed filters + fixes

Browse files
Files changed (3) hide show
  1. app/api.py +3 -1
  2. app/rag_system.py +55 -23
  3. requirements.txt +1 -0
app/api.py CHANGED
@@ -68,12 +68,14 @@ async def upload_pdf(file: UploadFile = File(...)):
68
  added = rag.add_pdf(dest)
69
  return UploadResponse(filename=file.filename, chunks_added=added)
70
 
 
71
  @app.post("/ask_question", response_model=AskResponse)
72
  def ask_question(payload: AskRequest):
73
  hits = rag.search(payload.question, k=max(1, payload.top_k))
74
  contexts = [c for c, _ in hits]
 
75
  answer = rag.synthesize_answer(payload.question, contexts)
76
- return AskResponse(answer=answer, contexts=contexts)
77
 
78
  @app.get("/get_history", response_model=HistoryResponse)
79
  def get_history():
 
68
  added = rag.add_pdf(dest)
69
  return UploadResponse(filename=file.filename, chunks_added=added)
70
 
71
+ # ask_question endpoint in app/api.py
72
  @app.post("/ask_question", response_model=AskResponse)
73
  def ask_question(payload: AskRequest):
74
  hits = rag.search(payload.question, k=max(1, payload.top_k))
75
  contexts = [c for c, _ in hits]
76
+ # fallback (optional): the last uploaded file could also be passed here; synthesize_answer already does this anyway:
77
  answer = rag.synthesize_answer(payload.question, contexts)
78
+ return AskResponse(answer=answer, contexts=contexts or rag.last_added[:5])
79
 
80
  @app.get("/get_history", response_model=HistoryResponse)
81
  def get_history():
app/rag_system.py CHANGED
@@ -23,6 +23,10 @@ OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
23
 
24
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
25
  NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
 
 
 
 
26
 
27
  def _split_sentences(text: str) -> List[str]:
28
  return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]
@@ -36,7 +40,7 @@ def _mostly_numeric(s: str) -> bool:
36
 
37
  def _tabular_like(s: str) -> bool:
38
  hits = len(NUM_TOK_RE.findall(s))
39
- return hits >= 3 or len(s) < 15
40
 
41
  def _clean_for_summary(text: str) -> str:
42
  out = []
@@ -75,21 +79,21 @@ def _keyword_summary_en(contexts: List[str]) -> List[str]:
75
  add("Wallpaper repair or replacement; some areas replaced with plaster and paint.")
76
  if ("alçı boya" in text) or ("boya işi" in text) or ("plaster" in text) or ("boya" in text):
77
  add("Wall plastering and painting works.")
78
- if "seramik" in text:
79
  add("Ceramic tiling works (including grouting).")
80
  if ("dilatasyon" in text) or ("ar 153" in text) or ("ar153" in text):
81
  add("Installation of AR 153–050 floor expansion joint profile with accessories and insulation.")
82
- if "daş yunu" in text:
83
  add("Rock wool insulation installed where required.")
84
- if ("sütunlarda" in text) or ("üzlüyün" in text):
85
  add("Repair of wall cladding on columns.")
86
- if ("m²" in text) or ("ədəd" in text) or ("azn" in text):
87
  add("Bill of quantities style lines with unit prices and measures (m², pcs).")
88
 
89
  if not bullets:
90
  bullets = [
91
- "The document appears to be a bill of quantities for renovation works.",
92
- "Scope includes demolition/reinstallation, finishing (plaster & paint), tiling, and profiles.",
93
  ]
94
  return bullets[:5]
95
 
@@ -112,6 +116,7 @@ class SimpleRAG:
112
  self._translator = None # lazy
113
  self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
114
  self.chunks: List[str] = []
 
115
  self._load()
116
 
117
  def _load(self) -> None:
@@ -134,22 +139,39 @@ class SimpleRAG:
134
 
135
  @staticmethod
136
  def _pdf_to_texts(pdf_path: Path, step: int = 1400) -> List[str]:
137
- reader = PdfReader(str(pdf_path))
138
  pages: List[str] = []
139
- for p in reader.pages:
140
- t = p.extract_text() or ""
141
- if t.strip():
142
- pages.append(t)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  chunks: List[str] = []
144
- for txt in pages:
145
- for i in range(0, len(txt), step):
146
- part = txt[i : i + step].strip()
147
- if part:
148
- chunks.append(part)
149
  return chunks
150
 
151
  def add_pdf(self, pdf_path: Path) -> int:
152
  texts = self._pdf_to_texts(pdf_path)
 
153
  if not texts:
154
  return 0
155
  emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
@@ -187,7 +209,17 @@ class SimpleRAG:
187
  except Exception:
188
  return texts
189
 
 
 
 
 
 
 
 
 
190
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
 
 
191
  if not contexts:
192
  return "No relevant context found. Please upload a PDF or ask a more specific question."
193
 
@@ -195,27 +227,27 @@ class SimpleRAG:
195
  cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
196
  cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
197
  if not cleaned_contexts:
198
- return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
199
 
200
  # 2) Pre-translate paragraphs to EN when target is EN
201
  translated = self._translate_to_en(cleaned_contexts) if OUTPUT_LANG == "en" else cleaned_contexts
202
 
203
- # 3) Split into candidate sentences and filter strictly for completeness
204
  candidates: List[str] = []
205
  for para in translated:
206
  for s in _split_sentences(para):
207
  w = s.split()
208
  if not (6 <= len(w) <= 60):
209
  continue
210
- if s.strip().lower().endswith("e.g."):
211
- continue
212
- if not re.search(r"[.!?](?:[\"'])?$", s): # must end with punctuation
213
  continue
214
  if _tabular_like(s) or _mostly_numeric(s):
215
  continue
216
  candidates.append(" ".join(w))
217
 
218
- # 4) Fallback if no good sentences
219
  if not candidates:
220
  bullets = _keyword_summary_en(cleaned_contexts)
221
  return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)
 
23
 
24
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
25
  NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
26
+ GENERIC_Q_RE = re.compile(
27
+ r"(what\s+is\s+(it|this|the\s+document)\s+about\??|what\s+is\s+about\??|summary|overview)",
28
+ re.IGNORECASE,
29
+ )
30
 
31
  def _split_sentences(text: str) -> List[str]:
32
  return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]
 
40
 
41
  def _tabular_like(s: str) -> bool:
42
  hits = len(NUM_TOK_RE.findall(s))
43
+ return hits >= 4 or len(s) < 15 # more lenient (was >= 3)
44
 
45
  def _clean_for_summary(text: str) -> str:
46
  out = []
 
79
  add("Wallpaper repair or replacement; some areas replaced with plaster and paint.")
80
  if ("alçı boya" in text) or ("boya işi" in text) or ("plaster" in text) or ("boya" in text):
81
  add("Wall plastering and painting works.")
82
+ if "seramik" in text or "ceramic" in text:
83
  add("Ceramic tiling works (including grouting).")
84
  if ("dilatasyon" in text) or ("ar 153" in text) or ("ar153" in text):
85
  add("Installation of AR 153–050 floor expansion joint profile with accessories and insulation.")
86
+ if "daş yunu" in text or "rock wool" in text:
87
  add("Rock wool insulation installed where required.")
88
+ if ("sütunlarda" in text) or ("üzlüyün" in text) or ("cladding" in text):
89
  add("Repair of wall cladding on columns.")
90
+ if ("m²" in text) or ("ədəd" in text) or ("azn" in text) or ("unit price" in text):
91
  add("Bill of quantities style lines with unit prices and measures (m², pcs).")
92
 
93
  if not bullets:
94
  bullets = [
95
+ "The document appears to be a bill of quantities or a structured list of works.",
96
+ "Scope likely includes demolition/reinstallation, finishing (plaster & paint), tiling, and profiles.",
97
  ]
98
  return bullets[:5]
99
 
 
116
  self._translator = None # lazy
117
  self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
118
  self.chunks: List[str] = []
119
+ self.last_added: List[str] = [] # chunks of the most recently uploaded file (kept in RAM only)
120
  self._load()
121
 
122
  def _load(self) -> None:
 
139
 
140
  @staticmethod
141
  def _pdf_to_texts(pdf_path: Path, step: int = 1400) -> List[str]:
142
+ # 1) pypdf
143
  pages: List[str] = []
144
+ try:
145
+ reader = PdfReader(str(pdf_path))
146
+ for p in reader.pages:
147
+ t = p.extract_text() or ""
148
+ if t.strip():
149
+ pages.append(t)
150
+ except Exception:
151
+ pages = []
152
+
153
+ full = " ".join(pages).strip()
154
+ if not full:
155
+ # 2) pdfminer fallback
156
+ try:
157
+ from pdfminer.high_level import extract_text as pdfminer_extract_text
158
+ full = (pdfminer_extract_text(str(pdf_path)) or "").strip()
159
+ except Exception:
160
+ full = ""
161
+
162
+ if not full:
163
+ return []
164
+
165
  chunks: List[str] = []
166
+ for i in range(0, len(full), step):
167
+ part = full[i : i + step].strip()
168
+ if part:
169
+ chunks.append(part)
 
170
  return chunks
171
 
172
  def add_pdf(self, pdf_path: Path) -> int:
173
  texts = self._pdf_to_texts(pdf_path)
174
+ self.last_added = texts[:] # remember the last file (for the summarize fallback)
175
  if not texts:
176
  return 0
177
  emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
 
209
  except Exception:
210
  return texts
211
 
212
+ def _prepare_contexts(self, question: str, contexts: List[str]) -> List[str]:
213
+ # For a generic question or an empty search result: use the most recently uploaded file
214
+ generic = (len(question.split()) <= 5) or bool(GENERIC_Q_RE.search(question or ""))
215
+ if (not contexts or generic) and self.last_added:
216
+ base = self.last_added[:5]
217
+ return base
218
+ return contexts
219
+
220
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
221
+ contexts = self._prepare_contexts(question, contexts)
222
+
223
  if not contexts:
224
  return "No relevant context found. Please upload a PDF or ask a more specific question."
225
 
 
227
  cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
228
  cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
229
  if not cleaned_contexts:
230
+ bullets = _keyword_summary_en(contexts[:5])
231
+ return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)
232
 
233
  # 2) Pre-translate paragraphs to EN when target is EN
234
  translated = self._translate_to_en(cleaned_contexts) if OUTPUT_LANG == "en" else cleaned_contexts
235
 
236
+ # 3) Split into candidate sentences and filter
237
  candidates: List[str] = []
238
  for para in translated:
239
  for s in _split_sentences(para):
240
  w = s.split()
241
  if not (6 <= len(w) <= 60):
242
  continue
243
+ # complete-sentence requirement (either proper terminal punctuation or sufficient length)
244
+ if not re.search(r"[.!?](?:[\"'])?$", s) and len(w) < 18:
 
245
  continue
246
  if _tabular_like(s) or _mostly_numeric(s):
247
  continue
248
  candidates.append(" ".join(w))
249
 
250
+ # 4) Fallback if no sentences
251
  if not candidates:
252
  bullets = _keyword_summary_en(cleaned_contexts)
253
  return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)
requirements.txt CHANGED
@@ -7,3 +7,4 @@ transformers>=4.40
7
  sentencepiece
8
  sacremoses
9
  python-multipart
 
 
7
  sentencepiece
8
  sacremoses
9
  python-multipart
10
+ pdfminer.six