HamidOmarov committed
Commit a6ffef9 · 1 Parent(s): a037cf8

RAG: robust EN summarization (pre-translate, filters, fallback)

Files changed (1)
  1. app/rag_system.py +17 -22
app/rag_system.py CHANGED
@@ -25,7 +25,7 @@ AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
 NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
 
 def _split_sentences(text: str) -> List[str]:
-    return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
+    return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]
 
 def _mostly_numeric(s: str) -> bool:
    alnum = [c for c in s if c.isalnum()]
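For a quick sanity check of the tightened splitter (the characters need no escaping inside a character class, so the new pattern matches exactly as before), an illustrative snippet, not part of the commit:

import re
from typing import List

def _split_sentences(text: str) -> List[str]:
    # Split after sentence-ending punctuation followed by whitespace, or on line breaks.
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]

print(_split_sentences("Ceramic tiling works. Grouting included!\nSee page 3"))
# -> ['Ceramic tiling works.', 'Grouting included!', 'See page 3']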
@@ -36,7 +36,7 @@ def _mostly_numeric(s: str) -> bool:
 
 def _tabular_like(s: str) -> bool:
     hits = len(NUM_TOK_RE.findall(s))
-    return hits >= 2 or "Page" in s or len(s) < 20
+    return hits >= 3 or len(s) < 15
 
 def _clean_for_summary(text: str) -> str:
     out = []
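The relaxed heuristic now requires three numeric/unit tokens (or a very short line) before discarding text, and no longer drops anything containing "Page"; a small illustration reusing NUM_TOK_RE from above:

import re

NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

def _tabular_like(s: str) -> bool:  # new heuristic
    hits = len(NUM_TOK_RE.findall(s))
    return hits >= 3 or len(s) < 15

print(_tabular_like("12 set 450 azn 3 mt"))                              # True  -> treated as a table row
print(_tabular_like("Rooms 12 and 14 were re-tiled during the works."))  # False -> kept now (the old rule, hits >= 2, dropped it)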
@@ -47,11 +47,6 @@ def _clean_for_summary(text: str) -> str:
         out.append(t)
     return " ".join(out)
 
-def _norm_fingerprint(s: str) -> str:
-    s = s.lower()
-    s = "".join(ch for ch in s if ch.isalpha() or ch.isspace())
-    return " ".join(s.split())
-
 def _sim_jaccard(a: str, b: str) -> float:
     aw = set(a.lower().split())
     bw = set(b.lower().split())
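With _norm_fingerprint removed, deduplication relies on _sim_jaccard alone. The diff only shows the first lines of that function; assuming the usual intersection-over-union body for the part not shown, a toy example of the score it produces:

def _sim_jaccard(a: str, b: str) -> float:
    aw = set(a.lower().split())
    bw = set(b.lower().split())
    if not aw or not bw:
        return 0.0
    return len(aw & bw) / len(aw | bw)   # assumed body beyond the lines shown above

print(_sim_jaccard("wall plastering and painting works",
                   "wall plastering and painting works in the lobby"))   # 0.625 -> scored as a near-duplicate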
@@ -68,7 +63,6 @@ def _non_ascii_ratio(s: str) -> float:
     return sum(ord(c) > 127 for c in s) / max(1, len(s))
 
 def _keyword_summary_en(contexts: List[str]) -> List[str]:
-    """English fallback: infer main items from keywords."""
     text = " ".join(contexts).lower()
     bullets: List[str] = []
     def add(b: str):
@@ -78,13 +72,13 @@ def _keyword_summary_en(contexts: List[str]) -> List[str]:
     if ("şüşə" in text) or ("ara kəsm" in text) or ("s/q" in text):
         add("Removal and re-installation of glass partitions in sanitary areas.")
     if "divar kağız" in text:
-        add("Wallpaper repair or replacement; in some areas replaced with plaster and paint.")
+        add("Wallpaper repair or replacement; some areas replaced with plaster and paint.")
     if ("alçı boya" in text) or ("boya işi" in text) or ("plaster" in text) or ("boya" in text):
         add("Wall plastering and painting works.")
     if "seramik" in text:
         add("Ceramic tiling works (including grouting).")
     if ("dilatasyon" in text) or ("ar 153" in text) or ("ar153" in text):
-        add("Installation of AR 153–050 floor expansion joint profile with required accessories and insulation.")
+        add("Installation of AR 153–050 floor expansion joint profile with accessories and insulation.")
     if "daş yunu" in text:
         add("Rock wool insulation installed where required.")
     if ("sütunlarda" in text) or ("üzlüyün" in text):
@@ -139,7 +133,7 @@ class SimpleRAG:
         np.save(self.meta_path, np.array(self.chunks, dtype=object))
 
     @staticmethod
-    def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
+    def _pdf_to_texts(pdf_path: Path, step: int = 1400) -> List[str]:
         reader = PdfReader(str(pdf_path))
         pages: List[str] = []
         for p in reader.pages:
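The rest of _pdf_to_texts sits outside this hunk. Assuming it windows each page's extracted text into step-sized character chunks, raising step from 800 to 1400 roughly halves the number of chunks and keeps more complete sentences per chunk; a hypothetical sketch of that assumed behaviour:

from pathlib import Path
from typing import List
from pypdf import PdfReader  # assuming pypdf; the module's own import is not shown in this hunk

def _pdf_to_texts_sketch(pdf_path: Path, step: int = 1400) -> List[str]:
    # Hypothetical reconstruction for illustration; the real method body may differ.
    reader = PdfReader(str(pdf_path))
    chunks: List[str] = []
    for p in reader.pages:
        text = (p.extract_text() or "").strip()
        for i in range(0, len(text), step):
            chunks.append(text[i:i + step])
    return chunks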
@@ -203,35 +197,36 @@ class SimpleRAG:
         if not cleaned_contexts:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 2) Try to pre-translate paragraphs to EN
-        if OUTPUT_LANG == "en":
-            translated = self._translate_to_en(cleaned_contexts)
-        else:
-            translated = cleaned_contexts
+        # 2) Pre-translate paragraphs to EN when target is EN
+        translated = self._translate_to_en(cleaned_contexts) if OUTPUT_LANG == "en" else cleaned_contexts
 
-        # 3) Split paragraphs into candidate sentences and filter
+        # 3) Split into candidate sentences and filter strictly for completeness
         candidates: List[str] = []
         for para in translated:
             for s in _split_sentences(para):
                 w = s.split()
-                if not (8 <= len(w) <= 35):
+                if not (6 <= len(w) <= 60):
+                    continue
+                if s.strip().lower().endswith("e.g."):
+                    continue
+                if not re.search(r"[.!?](?:[\"'])?$", s):  # must end with punctuation
                     continue
                 if _tabular_like(s) or _mostly_numeric(s):
                     continue
                 candidates.append(" ".join(w))
 
-        # 4) If we still don't have good EN sentences, fallback to keyword summary
+        # 4) Fallback if no good sentences
         if not candidates:
             bullets = _keyword_summary_en(cleaned_contexts)
             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)
 
-        # 5) Rank by similarity
+        # 5) Rank by similarity to the question
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        # 6) Deduplicate (aggressive)
+        # 6) Aggressive near-duplicate removal
         selected: List[str] = []
         for i in order:
             s = candidates[i].strip()
@@ -241,7 +236,7 @@ class SimpleRAG:
             if len(selected) >= max_sentences:
                 break
 
-        # 7) If selected lines still look non-English, use keyword fallback
+        # 7) If still looks non-English, use keyword fallback
         if not selected or (sum(_non_ascii_ratio(s) for s in selected) / len(selected) > 0.10):
             bullets = _keyword_summary_en(cleaned_contexts)
             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)
 