HamidOmarov committed on
Commit 4448508 · 1 Parent(s): 0a78f5f

RAG: fix mojibake via ftfy; descoped-mode to keep tabular lines; better AZ→EN handling

Files changed (1): app/rag_system.py (+83 −112)
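The fix leans on ftfy's fix_text, which detects and reverses text that was decoded with the wrong codec. A minimal illustration of the repair this commit targets (sample strings are ours, not from the repo):

    from ftfy import fix_text  # pip install ftfy

    fix_text("Ã§Ã¼rÃ¼k")          # UTF-8 bytes read as Latin-1 -> "çürük"
    fix_text("âœ” No problems")   # ftfy's canonical example   -> "✔ No problems"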
app/rag_system.py CHANGED
@@ -8,147 +8,117 @@ from typing import List, Tuple
 
 import faiss
 import numpy as np
-from ftfy import fix_text
 
 # Prefer pypdf; fallback to PyPDF2 if needed
 try:
     from pypdf import PdfReader
-except Exception:  # pragma: no cover
+except Exception:
     from PyPDF2 import PdfReader  # type: ignore
 
 from sentence_transformers import SentenceTransformer
+from ftfy import fix_text
 
-
-# ===================== Paths & Cache (HF-safe) =====================
-# Writable base in HF Spaces is /app. Allow ENV overrides for local runs.
+# ---------------- Paths & Cache (HF-safe) ----------------
 ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
 DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
 UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
 INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
-CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers prefers HF_HOME
+CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
 
 for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
-
-# ============================= Config ==============================
+# ---------------- Config ----------------
 MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
 
-
-# ============================ Helpers ==============================
+# ---------------- Helpers ----------------
 AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
 NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
 
-STOPWORDS = {
-    "the","a","an","and","or","of","to","in","on","for","with","by",
-    "this","that","these","those","is","are","was","were","be","been","being",
-    "at","as","it","its","from","into","about","over","after","before","than",
-    "such","can","could","should","would","may","might","will","shall"
-}
-
 AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
 _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
 
-KEYWORD_HINTS = [
-    "descoped", "out of scope", "exclude", "excluded", "scope change",
-    "çıxar", "çıxarılan", "daxil deyil", "kənar", "silin", "dəyişiklik",
-]
-
-
-def _fix_mojibake(s: str) -> str:
-    """Fix common UTF-8-as-Latin-1 mojibake artifacts."""
-    if not s:
-        return s
-    if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
-        try:
-            return s.encode("latin-1", "ignore").decode("utf-8", "ignore")
-        except Exception:
-            return s
-    return s
-
-
 def _fix_intra_word_spaces(s: str) -> str:
-    """Join sequences like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words."""
+    # "H Ə F T Ə" -> "HƏFTƏ"
     if not s:
         return s
     return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
 
-
-def _fix_word_breaks(s: str) -> str:
-    """Repair hyphen/newline word-breaks and collapse excessive spaces."""
-    s = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", s)  # join hyphen breaks
-    return re.sub(r"[ \t]+", " ", s)
-
+def _fix_mojibake(s: str) -> str:
+    # Try to undo latin-1/utf-8 mess, then ftfy as final pass
+    if not s:
+        return s
+    try:
+        if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
+            s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
+    except Exception:
+        pass
+    s = fix_text(s)
+    s = _fix_intra_word_spaces(s)
+    return s
 
 def _split_sentences(text: str) -> List[str]:
-    # sentence-ish splitter that also breaks on line breaks
-    return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
-
+    return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
 
 def _mostly_numeric(s: str) -> bool:
-    """Treat a line as numeric/tabular if >60% of alnum chars are digits."""
     alnum = [c for c in s if c.isalnum()]
     if not alnum:
         return True
     digits = sum(c.isdigit() for c in alnum)
-    return digits / max(1, len(alnum)) > 0.6
-
+    return digits / max(1, len(alnum)) > 0.3
 
 def _tabular_like(s: str) -> bool:
-    """Heuristic for table-ish lines; relax threshold so we don't drop everything."""
    hits = len(NUM_TOKEN_RE.findall(s))
-    return hits >= 3  # was 2; set to 3 to be less aggressive
-
+    return hits >= 2 or "Page" in s or len(s) < 20
 
 def _clean_for_summary(text: str) -> str:
     out = []
     for ln in text.splitlines():
         t = " ".join(ln.split())
+        t = _fix_mojibake(t)
         if not t or _mostly_numeric(t) or _tabular_like(t):
             continue
         out.append(t)
     return " ".join(out)
 
-
 def _sim_jaccard(a: str, b: str) -> float:
-    aw = set(a.lower().split())
-    bw = set(b.lower().split())
+    aw = set(a.lower().split()); bw = set(b.lower().split())
     if not aw or not bw:
         return 0.0
     return len(aw & bw) / len(aw | bw)
 
-
+STOPWORDS = {
+    "the","a","an","and","or","of","to","in","on","for","with","by",
+    "this","that","these","those","is","are","was","were","be","been","being",
+    "at","as","it","its","from","into","about","over","after","before","than",
+    "such","can","could","should","would","may","might","will","shall"
+}
 def _keywords(text: str) -> List[str]:
     toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
     return [t for t in toks if t not in STOPWORDS and len(t) > 2]
 
-
 def _looks_azerbaijani(s: str) -> bool:
     has_az = any(ch in AZ_CHARS for ch in s)
     non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
     return has_az or non_ascii_ratio > 0.15
 
-
-def _extract_keyword_lines(question: str, pool: List[str], limit: int = 6) -> List[str]:
-    """Directly lift lines containing descoped/scope-change hints."""
-    keys = set(_keywords(question)) | {k.lower() for k in KEYWORD_HINTS}
-    hits: List[str] = []
-    for text in pool[:200]:
-        t = fix_text(_fix_intra_word_spaces(_fix_word_breaks(_fix_mojibake(text))))
-        for line in t.splitlines():
-            s = " ".join(line.split())
-            if not s or len(s.split()) < 4:
-                continue
-            lo = s.lower()
-            if any(k in lo for k in keys):
-                hits.append(s)
-                if len(hits) >= limit:
-                    return hits
-    return hits
-
+# ---- Descoped/out-of-scope heuristics ----
+DESCOPED_KWS = [
+    "descoped","out of scope","out-of-scope","exclude","excluded","exclusion",
+    "çıxarılan","çıxarıl","çıxarıldı","daxil deyil","sökül","demontaj","kəsilmə",
+]
+def _descoped_mode(question: str) -> bool:
+    ql = (question or "").lower()
+    return any(k in ql for k in DESCOPED_KWS) or "descop" in ql
 
-# ============================ RAG Core =============================
+def _is_descoped_line(s: str) -> bool:
+    sl = s.lower()
+    if any(k in sl for k in DESCOPED_KWS):
+        return True
+    return bool(re.search(r"\b(out[-\s]?of[-\s]?scope|descop)", sl))
+
+# ---------------- RAG Core ----------------
 class SimpleRAG:
     def __init__(
         self,
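Note that the retuned filters above are stricter than before (_mostly_numeric drops from a 0.6 to a 0.3 digit ratio, _tabular_like fires on two numeric tokens, "Page" markers, or short lines), which is exactly why the new descoped heuristics exist: scope-change rows tend to look tabular. A quick sketch of the behaviour on sample lines (inputs are ours, not from the repo):

    _mostly_numeric("12 450 AZN 3 750 USD")      # True: 9 of 15 alphanumerics are digits (> 0.3)
    _tabular_like("Item 4  2 set  120 m²")       # True: at least two numeric/unit tokens
    _tabular_like("Short line")                  # True: under 20 characters
    _is_descoped_line("Bu işlər daxil deyil")    # True: "daxil deyil" is in DESCOPED_KWS
                                                 # (AZ: "These works are not included")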
@@ -168,7 +138,7 @@ class SimpleRAG:
         self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
         self.chunks: List[str] = []
         self.last_added: List[str] = []
-        self._translator = None  # lazy init
+        self._translator = None  # lazy
 
         self._load()
 
@@ -202,13 +172,9 @@ class SimpleRAG:
         pages: List[str] = []
         for p in reader.pages:
             t = p.extract_text() or ""
+            t = _fix_mojibake(t)
             if t.strip():
-                t = _fix_mojibake(t)
-                t = fix_text(t)
-                t = _fix_word_breaks(t)
-                t = _fix_intra_word_spaces(t)
                 pages.append(t)
-
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
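With _fix_mojibake applied to every page up front, repair now happens before the emptiness check rather than after it. The chunking that follows is a plain fixed-width character window; a compact sketch of what the loop produces (size and overlap are hypothetical here, the visible diff only shows step):

    txt = "x" * 2500
    size, overlap = 1000, 200          # assumed values for illustration
    step = size - overlap              # -> 800
    chunks = [txt[i:i + size] for i in range(0, len(txt), step)]
    # windows start at 0, 800, 1600, 2400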
@@ -259,33 +225,35 @@
                 device=-1,
             )
             outs = self._translator(texts, max_length=400)
-            return [o["translation_text"].strip() for o in outs]
+            return [fix_text(o["translation_text"].strip()) for o in outs]
         except Exception:
             return texts
 
     # ---------- Fallbacks ----------
-    def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
+    def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4, allow_numeric: bool = False) -> List[str]:
         qk = set(_keywords(question))
         if not qk:
             return []
         candidates: List[Tuple[float, str]] = []
-        for text in pool[:200]:
-            cleaned = _clean_for_summary(text)
+        for text in pool[:400]:
+            cleaned = _fix_mojibake(" ".join(text.split()))
             for s in _split_sentences(cleaned):
-                if _tabular_like(s) or _mostly_numeric(s):
-                    continue
+                if not allow_numeric:
+                    if _tabular_like(s) or _mostly_numeric(s):
+                        continue
                 toks = set(_keywords(s))
                 if not toks:
                     continue
                 overlap = len(qk & toks)
-                if overlap == 0:
+                if overlap == 0 and not _is_descoped_line(s):
                     continue
-                length_penalty = max(8, min(40, len(s.split())))
-                score = overlap + min(0.5, overlap / length_penalty)
+                length_penalty = max(6, min(60, len(s.split())))
+                score = overlap + (0.3 if _is_descoped_line(s) else 0.0) + min(0.5, overlap / length_penalty)
                 candidates.append((score, s))
         candidates.sort(key=lambda x: x[0], reverse=True)
         out: List[str] = []
         for _, s in candidates:
+            s = fix_text(s).strip()
             if any(_sim_jaccard(s, t) >= 0.82 for t in out):
                 continue
             out.append(s)
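Worked by hand, the new scoring rewards descoped-looking lines even with zero keyword overlap, which the old version always dropped (the sentence and overlap below are invented for illustration):

    s = "Descoped: facade cladding works excluded from contract scope"
    overlap = 3                                       # keywords shared with the query
    length_penalty = max(6, min(60, len(s.split())))  # -> 8
    # _is_descoped_line(s) is True ("descoped", "excluded"), so the 0.3 bonus applies:
    score = overlap + 0.3 + min(0.5, overlap / length_penalty)  # -> 3.675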
@@ -298,19 +266,24 @@
         if not contexts and self.is_empty:
             return "No relevant context found. Index is empty — upload a PDF first."
 
-        # Fix mojibake in contexts, normalize spacing
-        contexts = [fix_text(_fix_intra_word_spaces(_fix_word_breaks(_fix_mojibake(c or "")))) for c in (contexts or [])]
+        desc_mode = _descoped_mode(question)
 
-        # Build candidate sentences from nearby contexts (use more windows)
+        # Build candidate sentences from nearby contexts
         local_pool: List[str] = []
-        for c in (contexts or [])[:8]:
-            cleaned = _clean_for_summary(c)
+        scan_n = 8 if desc_mode else 5
+        for c in (contexts or [])[:scan_n]:
+            cleaned = _fix_mojibake(" ".join(c.split()))
             for s in _split_sentences(cleaned):
                 w = s.split()
-                if not (8 <= len(w) <= 35):
-                    continue
-                if _tabular_like(s) or _mostly_numeric(s):
+                if not ( (6 if desc_mode else 8) <= len(w) <= (60 if desc_mode else 35) ):
                     continue
+                if not desc_mode:
+                    if _tabular_like(s) or _mostly_numeric(s):
+                        continue
+                else:
+                    # allow numeric/tabular if it looks like descoped line
+                    if not _is_descoped_line(s) and (_tabular_like(s) or _mostly_numeric(s)):
+                        continue
                 local_pool.append(" ".join(w))
 
         selected: List[str] = []
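The desc_mode switch driving these relaxed limits is a plain keyword test on the question; for example:

    _descoped_mode("Which items were descoped from the contract?")  # True ("descoped")
    _descoped_mode("Hansı işlər daxil deyil?")                      # True ("daxil deyil";
                                                                    #  AZ: "Which works are not included?")
    _descoped_mode("What is the total project cost?")               # False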
@@ -320,34 +293,32 @@
             scores = (cand_emb @ q_emb.T).ravel()
             order = np.argsort(-scores)
             for i in order:
-                s = local_pool[i].strip()
+                s = fix_text(local_pool[i]).strip()
                 if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
                     continue
                 selected.append(s)
                 if len(selected) >= max_sentences:
                     break
 
-        # keyword-based sentence-level selection across corpus
         if not selected:
-            selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
-
-        # final direct-line extraction if still empty
-        if not selected:
-            selected = _extract_keyword_lines(question, self.chunks, limit=max_sentences)
+            selected = self._keyword_fallback(
+                question,
+                self.chunks,
+                limit_sentences=max_sentences,
+                allow_numeric=desc_mode,  # relax numeric filter for descoped Qs
+            )
 
         if not selected:
             return "No readable sentences matched the question. Try a more specific query."
 
-        # translate to EN if needed
-        if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
-            selected = self._translate_to_en(selected)
+        # Translate to EN if needed (and requested)
+        if OUTPUT_LANG == "en":
+            needs_tr = any(_looks_azerbaijani(s) for s in selected) or any(ch in "".join(selected) for ch in ("Ã","Ä","Þ"))
+            if needs_tr:
+                selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 
 
-__all__ = [
-    "SimpleRAG",
-    "UPLOAD_DIR",
-    "INDEX_DIR",
-]
+__all__ = ["SimpleRAG", "UPLOAD_DIR", "INDEX_DIR"]
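The translation gate at the end is the subtler AZ→EN change: instead of firing on any non-ASCII character, it now requires text that actually looks Azerbaijani or still carries mojibake markers, so legitimate symbols such as m² no longer trigger a useless translation pass. With our own samples:

    _looks_azerbaijani("Layihə üzrə işlər dayandırılıb")  # True: "ə" is in AZ_CHARS
                                                          # (AZ: "Work on the project has been suspended")
    _looks_azerbaijani("Total area is 120 m²")            # False: no AZ letters, 1/20 non-ASCII < 0.15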
 