HamidOmarov committed on
Commit 833b4d4 · verified · 1 Parent(s): b1de6d2

Update app/rag_system.py

Files changed (1):
  1. app/rag_system.py +148 -139
app/rag_system.py CHANGED
@@ -1,16 +1,24 @@
  # app/rag_system.py
  from __future__ import annotations

- import os, re
  from pathlib import Path
- from typing import List, Tuple

  import faiss
  import numpy as np
- from pypdf import PdfReader
  from sentence_transformers import SentenceTransformer

- ROOT_DIR = Path(__file__).resolve().parent.parent
  DATA_DIR = ROOT_DIR / "data"
  UPLOAD_DIR = DATA_DIR / "uploads"
  INDEX_DIR = DATA_DIR / "index"
@@ -18,29 +26,40 @@ CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
  for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
      d.mkdir(parents=True, exist_ok=True)

  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
- NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
- GENERIC_Q_RE = re.compile(
-     r"(what\s+is\s+(it|this|the\s+document)\s+about\??|what\s+is\s+about\??|summary|overview)",
-     re.IGNORECASE,
- )

  def _split_sentences(text: str) -> List[str]:
-     return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]

  def _mostly_numeric(s: str) -> bool:
-     alnum = [c for c in s if s and c.isalnum()]
      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
      return digits / max(1, len(alnum)) > 0.3

  def _tabular_like(s: str) -> bool:
-     hits = len(NUM_TOK_RE.findall(s))
-     return hits >= 4 or len(s) < 15

  def _clean_for_summary(text: str) -> str:
      out = []
@@ -58,46 +77,23 @@ def _sim_jaccard(a: str, b: str) -> float:
          return 0.0
      return len(aw & bw) / len(aw | bw)

  def _looks_azerbaijani(s: str) -> bool:
      has_az = any(ch in AZ_CHARS for ch in s)
      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

- def _non_ascii_ratio(s: str) -> float:
-     return sum(ord(c) > 127 for c in s) / max(1, len(s))
-
- def _keyword_summary_en(contexts: List[str]) -> List[str]:
-     text = " ".join(contexts).lower()
-     bullets: List[str] = []
-
-     def add(b: str):
-         if b not in bullets:
-             bullets.append(b)
-
-     if ("şüşə" in text) or ("ara kəsm" in text) or ("s/q" in text):
-         add("Removal and re-installation of glass partitions in sanitary areas.")
-     if "divar kağız" in text:
-         add("Wallpaper repair or replacement; some areas replaced with plaster and paint.")
-     if ("alçı boya" in text) or ("boya işi" in text) or ("plaster" in text) or ("boya" in text):
-         add("Wall plastering and painting works.")
-     if "seramik" in text or "ceramic" in text:
-         add("Ceramic tiling works (including grouting).")
-     if ("dilatasyon" in text) or ("ar 153" in text) or ("ar153" in text):
-         add("Installation of AR 153–050 floor expansion joint profile with accessories and insulation.")
-     if "daş yunu" in text or "rock wool" in text:
-         add("Rock wool insulation installed where required.")
-     if ("sütunlarda" in text) or ("üzlüyün" in text) or ("cladding" in text):
-         add("Repair of wall cladding on columns.")
-     if ("m²" in text) or ("ədəd" in text) or ("azn" in text) or ("unit price" in text):
-         add("Bill of quantities style lines with unit prices and measures (m², pcs).")
-
-     if not bullets:
-         bullets = [
-             "The document appears to be a bill of quantities or a structured list of works.",
-             "Scope likely includes demolition/reinstallation, finishing (plaster & paint), tiling, and profiles.",
-         ]
-     return bullets[:5]
-
  class SimpleRAG:
      def __init__(
          self,
@@ -112,14 +108,16 @@ class SimpleRAG:
          self.cache_dir = Path(cache_dir)

          self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
-         self.embed_dim = self.model.get_sentence_embedding_dimension()

-         self._translator = None  # lazy
          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []

          self._load()

      def _load(self) -> None:
          if self.meta_path.exists():
              try:
@@ -138,56 +136,49 @@ class SimpleRAG:
          faiss.write_index(self.index, str(self.index_path))
          np.save(self.meta_path, np.array(self.chunks, dtype=object))

      @staticmethod
-     def _pdf_to_texts(pdf_path: Path, step: int = 1400) -> List[str]:
-         # 1) pypdf
          pages: List[str] = []
-         try:
-             reader = PdfReader(str(pdf_path))
-             for p in reader.pages:
-                 t = p.extract_text() or ""
-                 if t.strip():
-                     pages.append(t)
-         except Exception:
-             pages = []
-
-         full = " ".join(pages).strip()
-         if not full:
-             # 2) pdfminer fallback
-             try:
-                 from pdfminer.high_level import extract_text as pdfminer_extract_text
-                 full = (pdfminer_extract_text(str(pdf_path)) or "").strip()
-             except Exception:
-                 full = ""
-
-         if not full:
-             return []
-
          chunks: List[str] = []
-         for i in range(0, len(full), step):
-             part = full[i : i + step].strip()
-             if part:
-                 chunks.append(part)
          return chunks

      def add_pdf(self, pdf_path: Path) -> int:
          texts = self._pdf_to_texts(pdf_path)
          if not texts:
-             # IMPORTANT: do NOT clobber last_added if this PDF had no extractable text
              return 0
-
-         self.last_added = texts[:]  # only set if we actually extracted text
-         emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
          self.index.add(emb.astype(np.float32))
          self.chunks.extend(texts)
          self._persist()
          return len(texts)

      def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
-         if self.index is None or self.index.ntotal == 0:
              return []
          q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         D, I = self.index.search(q, min(k, max(1, self.index.ntotal)))
          out: List[Tuple[str, float]] = []
          if I.size > 0 and self.chunks:
              for idx, score in zip(I[0], D[0]):
@@ -195,6 +186,7 @@ class SimpleRAG:
                  out.append((self.chunks[idx], float(score)))
          return out

      def _translate_to_en(self, texts: List[str]) -> List[str]:
          if not texts:
              return texts
@@ -207,78 +199,95 @@ class SimpleRAG:
                  cache_dir=str(self.cache_dir),
                  device=-1,
              )
-             outs = self._translator(texts, max_length=800)
              return [o["translation_text"].strip() for o in outs]
          except Exception:
              return texts

-     def _prepare_contexts(self, question: str, contexts: List[str]) -> List[str]:
-         # Generic question or empty search use last uploaded file snippets
-         generic = (len((question or "").split()) <= 5) or bool(GENERIC_Q_RE.search(question or ""))
-         if (not contexts or generic) and self.last_added:
-             return self.last_added[:5]
-         return contexts

      def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
-         contexts = self._prepare_contexts(question, contexts)
-
-         if not contexts:
-             return "No relevant context found. Please upload a PDF or ask a more specific question."
-
-         # 1) Clean & keep top contexts
-         cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
-         cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
-         if not cleaned_contexts:
-             bullets = _keyword_summary_en(contexts[:5])
-             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)
-
-         # 2) Pre-translate paragraphs to EN when target is EN
-         translated = self._translate_to_en(cleaned_contexts) if OUTPUT_LANG == "en" else cleaned_contexts
-
-         # 3) Split into candidate sentences and filter
-         candidates: List[str] = []
-         for para in translated:
-             for s in _split_sentences(para):
                  w = s.split()
-                 if not (6 <= len(w) <= 60):
-                     continue
-                 # full sentence requirement: punctuation at end OR sufficiently long
-                 if not re.search(r"[.!?](?:[\"'])?$", s) and len(w) < 18:
                      continue
                  if _tabular_like(s) or _mostly_numeric(s):
                      continue
-                 candidates.append(" ".join(w))

-         # 4) Fallback if no sentences
-         if not candidates:
-             bullets = _keyword_summary_en(cleaned_contexts)
-             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)

-         # 5) Rank by similarity to the question
-         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         scores = (cand_emb @ q_emb.T).ravel()
-         order = np.argsort(-scores)

-         # 6) Aggressive near-duplicate removal
-         selected: List[str] = []
-         for i in order:
-             s = candidates[i].strip()
-             if any(_sim_jaccard(s, t) >= 0.90 for t in selected):
-                 continue
-             selected.append(s)
-             if len(selected) >= max_sentences:
-                 break

-         # 7) If still looks non-English, use keyword fallback
-         if not selected or (sum(_non_ascii_ratio(s) for s in selected) / len(selected) > 0.10):
-             bullets = _keyword_summary_en(cleaned_contexts)
-             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)

          bullets = "\n".join(f"- {s}" for s in selected)
          return f"Answer (based on document context):\n{bullets}"

- def synthesize_answer(question: str, contexts: List[str]) -> str:
-     return SimpleRAG().synthesize_answer(question, contexts)

- __all__ = ["SimpleRAG", "synthesize_answer", "DATA_DIR", "UPLOAD_DIR", "INDEX_DIR", "CACHE_DIR", "MODEL_NAME"]

  # app/rag_system.py
  from __future__ import annotations

+ import os
+ import re
  from pathlib import Path
+ from typing import List, Tuple, Optional

  import faiss
  import numpy as np
+
+ # Prefer pypdf; fallback to PyPDF2 if needed
+ try:
+     from pypdf import PdfReader
+ except Exception:
+     from PyPDF2 import PdfReader  # type: ignore
+
  from sentence_transformers import SentenceTransformer

+ # ---------------- Paths & Cache ----------------
+ ROOT_DIR = Path(__file__).resolve().parent
  DATA_DIR = ROOT_DIR / "data"
  UPLOAD_DIR = DATA_DIR / "uploads"
  INDEX_DIR = DATA_DIR / "index"

  for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
      d.mkdir(parents=True, exist_ok=True)

+ # ---------------- Config ----------------
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

+ # ---------------- Helpers ----------------
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+
+ def _fix_mojibake(s: str) -> str:
+     """Fix common UTF-8-as-Latin-1 mojibake."""
+     if not s:
+         return s
+     if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
+         try:
+             return s.encode("latin-1", "ignore").decode("utf-8", "ignore")
+         except Exception:
+             return s
+     return s

  def _split_sentences(text: str) -> List[str]:
+     # Split on punctuation boundaries and line breaks
+     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

  def _mostly_numeric(s: str) -> bool:
+     alnum = [c for c in s if c.isalnum()]
      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
      return digits / max(1, len(alnum)) > 0.3

+ NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
+
  def _tabular_like(s: str) -> bool:
+     hits = len(NUM_TOKEN_RE.findall(s))
+     return hits >= 2 or "Page" in s or len(s) < 20

  def _clean_for_summary(text: str) -> str:
      out = []

          return 0.0
      return len(aw & bw) / len(aw | bw)

+ STOPWORDS = {
+     "the","a","an","and","or","of","to","in","on","for","with","by",
+     "this","that","these","those","is","are","was","were","be","been","being",
+     "at","as","it","its","from","into","about","over","after","before","than",
+     "such","can","could","should","would","may","might","will","shall"
+ }
+
+ def _keywords(text: str) -> List[str]:
+     toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
+     return [t for t in toks if t not in STOPWORDS and len(t) > 2]
+
  def _looks_azerbaijani(s: str) -> bool:
      has_az = any(ch in AZ_CHARS for ch in s)
      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

+ # ---------------- RAG Core ----------------
  class SimpleRAG:
      def __init__(
          self,

          self.cache_dir = Path(cache_dir)

          self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
+         self.embed_dim = int(self.model.get_sentence_embedding_dimension())

          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
+         self._translator = None  # lazy init
+
          self._load()

+     # ---------- Persistence ----------
      def _load(self) -> None:
          if self.meta_path.exists():
              try:

          faiss.write_index(self.index, str(self.index_path))
          np.save(self.meta_path, np.array(self.chunks, dtype=object))

+     # ---------- Utilities ----------
+     @property
+     def is_empty(self) -> bool:
+         return getattr(self.index, "ntotal", 0) == 0 or not self.chunks
+
      @staticmethod
+     def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
+         reader = PdfReader(str(pdf_path))
          pages: List[str] = []
+         for p in reader.pages:
+             t = p.extract_text() or ""
+             t = _fix_mojibake(t)
+             if t.strip():
+                 pages.append(t)

          chunks: List[str] = []
+         for txt in pages:
+             for i in range(0, len(txt), step):
+                 part = txt[i : i + step].strip()
+                 if part:
+                     chunks.append(part)
          return chunks

+     # ---------- Indexing ----------
      def add_pdf(self, pdf_path: Path) -> int:
          texts = self._pdf_to_texts(pdf_path)
          if not texts:
              return 0
+         emb = self.model.encode(
+             texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
+         )
          self.index.add(emb.astype(np.float32))
          self.chunks.extend(texts)
+         self.last_added = texts[:]
          self._persist()
          return len(texts)

+     # ---------- Search ----------
      def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
+         if self.is_empty:
              return []
          q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+         k = max(1, min(int(k or 5), getattr(self.index, "ntotal", 1)))
+         D, I = self.index.search(q, k)
          out: List[Tuple[str, float]] = []
          if I.size > 0 and self.chunks:
              for idx, score in zip(I[0], D[0]):

                  out.append((self.chunks[idx], float(score)))
          return out

+     # ---------- Translation (optional) ----------
      def _translate_to_en(self, texts: List[str]) -> List[str]:
          if not texts:
              return texts

                  cache_dir=str(self.cache_dir),
                  device=-1,
              )
+             outs = self._translator(texts, max_length=400)
              return [o["translation_text"].strip() for o in outs]
          except Exception:
              return texts

+     # ---------- Fallbacks ----------
+     def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
+         """Pick sentences sharing keywords with the question (question-dependent even if dense retrieval is weak)."""
+         qk = set(_keywords(question))
+         if not qk:
+             return []
+         candidates: List[Tuple[float, str]] = []
+         for text in pool[:200]:
+             cleaned = _clean_for_summary(text)
+             for s in _split_sentences(cleaned):
+                 if _tabular_like(s) or _mostly_numeric(s):
+                     continue
+                 toks = set(_keywords(s))
+                 if not toks:
+                     continue
+                 overlap = len(qk & toks)
+                 if overlap == 0:
+                     continue
+                 length_penalty = max(8, min(40, len(s.split())))
+                 score = overlap + min(0.5, overlap / length_penalty)
+                 candidates.append((score, s))
+         candidates.sort(key=lambda x: x[0], reverse=True)
+         out: List[str] = []
+         for _, s in candidates:
+             if any(_sim_jaccard(s, t) >= 0.82 for t in out):
+                 continue
+             out.append(s)
+             if len(out) >= limit_sentences:
+                 break
+         return out

+     # ---------- Answer Synthesis ----------
      def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
+         """Extractive summary over retrieved contexts; falls back to keyword selection; EN translation if needed."""
+         if not contexts and self.is_empty:
+             return "No relevant context found. Index is empty — upload a PDF first."
+
+         # Fix mojibake in contexts
+         contexts = [_fix_mojibake(c) for c in (contexts or [])]
+
+         # Build candidate sentences from nearby contexts
+         local_pool: List[str] = []
+         for c in (contexts or [])[:5]:  # keep it light
+             cleaned = _clean_for_summary(c)
+             for s in _split_sentences(cleaned):
                  w = s.split()
+                 if not (8 <= len(w) <= 35):
                      continue
                  if _tabular_like(s) or _mostly_numeric(s):
                      continue
+                 local_pool.append(" ".join(w))

+         selected: List[str] = []
+         if local_pool:
+             q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+             cand_emb = self.model.encode(local_pool, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+             scores = (cand_emb @ q_emb.T).ravel()
+             order = np.argsort(-scores)
+             for i in order:
+                 s = local_pool[i].strip()
+                 if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
+                     continue
+                 selected.append(s)
+                 if len(selected) >= max_sentences:
+                     break

+         # Keyword fallback if needed
+         if not selected:
+             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

+         if not selected:
+             return "No readable sentences matched the question. Try a more specific query."

+         # Translate to EN if looks AZ and OUTPUT_LANG = en
+         if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
+             selected = self._translate_to_en(selected)

          bullets = "\n".join(f"- {s}" for s in selected)
          return f"Answer (based on document context):\n{bullets}"

+ # Public API
+ __all__ = [
+     "SimpleRAG",
+     "UPLOAD_DIR",
+     "INDEX_DIR",
+ ]
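
For reference, a minimal usage sketch of the SimpleRAG API as it stands after this commit. The file name "example.pdf" and the question text are placeholders (not part of the repository), and constructing SimpleRAG() with its default arguments follows the pattern of the removed module-level synthesize_answer helper.

    # Usage sketch, assuming a readable PDF placed under UPLOAD_DIR (placeholder name below).
    from app.rag_system import SimpleRAG, UPLOAD_DIR

    rag = SimpleRAG()                      # loads any persisted FAISS index and chunk metadata

    pdf_path = UPLOAD_DIR / "example.pdf"  # placeholder file name
    n_chunks = rag.add_pdf(pdf_path)       # extract text, chunk, embed, persist
    print(f"Indexed {n_chunks} chunks")

    question = "What works are included?"  # placeholder question
    hits = rag.search(question, k=5)       # list of (chunk, similarity score) pairs
    answer = rag.synthesize_answer(question, [chunk for chunk, _ in hits])
    print(answer)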