HamidOmarov committed on
Commit
0a78f5f
·
1 Parent(s): 1fb5688

RAG: fix mojibake/word-breaks; relax tabular filter; keyword-line fallback for scope changes

Browse files
Files changed (1) hide show
  1. app/rag_system.py +85 -57
app/rag_system.py CHANGED
@@ -8,55 +8,36 @@ from typing import List, Tuple
8
 
9
  import faiss
10
  import numpy as np
 
11
 
12
- # --- ftfy (mojibake/normalizasiya) ---
13
- try:
14
- from ftfy import fix_text as _ftfy
15
- except Exception: # ftfy yoxdursa, no-op
16
- def _ftfy(x: str) -> str:
17
- return x
18
-
19
- # pypdf -> PyPDF2 fallback
20
  try:
21
  from pypdf import PdfReader
22
- except Exception:
23
  from PyPDF2 import PdfReader # type: ignore
24
 
25
  from sentence_transformers import SentenceTransformer
26
 
27
- # ---------------- Paths & Cache (HF-safe) ----------------
28
- # Default: repo kökü; APP_ROOT verilərsə ona keç
29
- DEFAULT_ROOT = Path(__file__).resolve().parents[1]
30
- ROOT_DIR = Path(os.getenv("APP_ROOT", str(DEFAULT_ROOT)))
31
  DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
32
  UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
33
  INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
34
- CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache"))) # transformers üçün ən yaxşısı HF_HOME
35
 
36
- # cəhd et yaratmağa; icazə problemi olsa, local ./data-a düş
37
- for pth in (CACHE_DIR,):
38
- try:
39
- pth.mkdir(parents=True, exist_ok=True)
40
- except PermissionError:
41
- pass
42
 
43
- try:
44
- DATA_DIR.mkdir(parents=True, exist_ok=True)
45
- UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
46
- INDEX_DIR.mkdir(parents=True, exist_ok=True)
47
- except PermissionError:
48
- DATA_DIR = Path("./data"); DATA_DIR.mkdir(parents=True, exist_ok=True)
49
- UPLOAD_DIR = DATA_DIR / "uploads"; UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
50
- INDEX_DIR = DATA_DIR / "index"; INDEX_DIR.mkdir(parents=True, exist_ok=True)
51
-
52
- # ---------------- Config ----------------
53
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
54
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
55
 
56
- # ---------------- Helpers ----------------
 
57
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
58
- AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
59
- _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
60
  NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
61
 
62
  STOPWORDS = {
@@ -66,14 +47,17 @@ STOPWORDS = {
66
  "such","can","could","should","would","may","might","will","shall"
67
  }
68
 
69
- def _fix_intra_word_spaces(s: str) -> str:
70
- """'c l a s s' → 'class', 'H Ə F T Ə' → 'HƏFTƏ' (yalnız ardıcıl tək-hərflər)."""
71
- if not s:
72
- return s
73
- return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
 
 
 
74
 
75
  def _fix_mojibake(s: str) -> str:
76
- """UTF-8-in Latin-1 kimi oxunmasından yaranan 'ö' və s. pozuntuları yumşaq düzəlt."""
77
  if not s:
78
  return s
79
  if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
@@ -83,19 +67,39 @@ def _fix_mojibake(s: str) -> str:
83
  return s
84
  return s
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def _split_sentences(text: str) -> List[str]:
87
- return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
 
 
88
 
89
  def _mostly_numeric(s: str) -> bool:
 
90
  alnum = [c for c in s if c.isalnum()]
91
  if not alnum:
92
  return True
93
  digits = sum(c.isdigit() for c in alnum)
94
- return digits / max(1, len(alnum)) > 0.3
 
95
 
96
  def _tabular_like(s: str) -> bool:
 
97
  hits = len(NUM_TOKEN_RE.findall(s))
98
- return hits >= 2 or "Page" in s or len(s) < 20
 
99
 
100
  def _clean_for_summary(text: str) -> str:
101
  out = []
@@ -106,6 +110,7 @@ def _clean_for_summary(text: str) -> str:
106
  out.append(t)
107
  return " ".join(out)
108
 
 
109
  def _sim_jaccard(a: str, b: str) -> float:
110
  aw = set(a.lower().split())
111
  bw = set(b.lower().split())
@@ -113,16 +118,37 @@ def _sim_jaccard(a: str, b: str) -> float:
113
  return 0.0
114
  return len(aw & bw) / len(aw | bw)
115
 
 
116
  def _keywords(text: str) -> List[str]:
117
  toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
118
  return [t for t in toks if t not in STOPWORDS and len(t) > 2]
119
 
 
120
  def _looks_azerbaijani(s: str) -> bool:
121
  has_az = any(ch in AZ_CHARS for ch in s)
122
  non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
123
  return has_az or non_ascii_ratio > 0.15
124
 
125
- # ---------------- RAG Core ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  class SimpleRAG:
127
  def __init__(
128
  self,
@@ -176,13 +202,13 @@ class SimpleRAG:
176
  pages: List[str] = []
177
  for p in reader.pages:
178
  t = p.extract_text() or ""
179
- # normalizasiya ardıcıllığı
180
- t = _ftfy(t)
181
- t = _fix_mojibake(t)
182
- t = _fix_intra_word_spaces(t)
183
- t = re.sub(r"\s+", " ", t).strip()
184
- if t:
185
  pages.append(t)
 
186
  chunks: List[str] = []
187
  for txt in pages:
188
  for i in range(0, len(txt), step):
@@ -272,15 +298,12 @@ class SimpleRAG:
272
  if not contexts and self.is_empty:
273
  return "No relevant context found. Index is empty — upload a PDF first."
274
 
275
- # konteksləri təmizlə
276
- contexts = [
277
- re.sub(r"\s+", " ", _fix_intra_word_spaces(_fix_mojibake(_ftfy(c)))).strip()
278
- for c in (contexts or [])
279
- ]
280
 
281
- # Yaxın kontekstlərdən namizədlər
282
  local_pool: List[str] = []
283
- for c in (contexts or [])[:5]:
284
  cleaned = _clean_for_summary(c)
285
  for s in _split_sentences(cleaned):
286
  w = s.split()
@@ -304,13 +327,18 @@ class SimpleRAG:
304
  if len(selected) >= max_sentences:
305
  break
306
 
 
307
  if not selected:
308
  selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
309
 
 
 
 
 
310
  if not selected:
311
  return "No readable sentences matched the question. Try a more specific query."
312
 
313
- # EN istəyə uyğun tərcümə
314
  if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
315
  selected = self._translate_to_en(selected)
316
 
 
8
 
9
  import faiss
10
  import numpy as np
11
+ from ftfy import fix_text
12
 
13
+ # Prefer pypdf; fallback to PyPDF2 if needed
 
 
 
 
 
 
 
14
  try:
15
  from pypdf import PdfReader
16
+ except Exception: # pragma: no cover
17
  from PyPDF2 import PdfReader # type: ignore
18
 
19
  from sentence_transformers import SentenceTransformer
20
 
21
+
22
+ # ===================== Paths & Cache (HF-safe) =====================
23
+ # Writable base in HF Spaces is /app. Allow ENV overrides for local runs.
24
+ ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
25
  DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
26
  UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
27
  INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
28
+ CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache"))) # transformers prefers HF_HOME
29
 
30
+ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
31
+ d.mkdir(parents=True, exist_ok=True)
 
 
 
 
32
 
33
+
34
+ # ============================= Config ==============================
 
 
 
 
 
 
 
 
35
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
36
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
37
 
38
+
39
+ # ============================ Helpers ==============================
40
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
 
 
41
  NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
42
 
43
  STOPWORDS = {
 
47
  "such","can","could","should","would","may","might","will","shall"
48
  }
49
 
50
# Character-class body for letters eligible for spaced-letter repair: ASCII
# letters plus the Azerbaijani Latin letters. Ü/ü were previously missing, so
# runs such as "Ü Z Ü M" were never joined even though AZ_CHARS lists them.
AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇçÜü"
# Matches three or more single letters separated by whitespace,
# e.g. "H Ə F T Ə"; ordinary multi-letter words do not match.
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
52
+
53
+ KEYWORD_HINTS = [
54
+ "descoped", "out of scope", "exclude", "excluded", "scope change",
55
+ "çıxar", "çıxarılan", "daxil deyil", "kənar", "silin", "dəyişiklik",
56
+ ]
57
+
58
 
59
  def _fix_mojibake(s: str) -> str:
60
+ """Fix common UTF-8-as-Latin-1 mojibake artifacts."""
61
  if not s:
62
  return s
63
  if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
 
67
  return s
68
  return s
69
 
70
+
71
+ def _fix_intra_word_spaces(s: str) -> str:
72
+ """Join sequences like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words."""
73
+ if not s:
74
+ return s
75
+ return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
76
+
77
+
78
def _fix_word_breaks(s: str) -> str:
    """Repair hyphenated line-wrap splits and collapse runs of spaces/tabs.

    A hyphen followed by a line break between two word characters is treated
    as a wrapping artifact and removed, so the two halves are joined into one
    word. Other newlines are preserved; only spaces and tabs are collapsed
    into a single space.

    Empty or ``None`` input is returned unchanged, matching the guards in
    ``_fix_mojibake`` and ``_fix_intra_word_spaces`` so the helpers can be
    chained safely.
    """
    if not s:
        return s
    # Join "word-<newline>word"; surrounding whitespace (incl. "\r") is dropped.
    s = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", s)
    # Collapse horizontal whitespace only, so line structure survives.
    return re.sub(r"[ \t]+", " ", s)
82
+
83
+
84
  def _split_sentences(text: str) -> List[str]:
85
+ # sentence-ish splitter that also breaks on line breaks
86
+ return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
87
+
88
 
89
  def _mostly_numeric(s: str) -> bool:
90
+ """Treat a line as numeric/tabular if >60% of alnum chars are digits."""
91
  alnum = [c for c in s if c.isalnum()]
92
  if not alnum:
93
  return True
94
  digits = sum(c.isdigit() for c in alnum)
95
+ return digits / max(1, len(alnum)) > 0.6
96
+
97
 
98
  def _tabular_like(s: str) -> bool:
99
+ """Heuristic for table-ish lines; relax threshold so we don't drop everything."""
100
  hits = len(NUM_TOKEN_RE.findall(s))
101
+ return hits >= 3 # was 2; set to 3 to be less aggressive
102
+
103
 
104
  def _clean_for_summary(text: str) -> str:
105
  out = []
 
110
  out.append(t)
111
  return " ".join(out)
112
 
113
+
114
  def _sim_jaccard(a: str, b: str) -> float:
115
  aw = set(a.lower().split())
116
  bw = set(b.lower().split())
 
118
  return 0.0
119
  return len(aw & bw) / len(aw | bw)
120
 
121
+
122
  def _keywords(text: str) -> List[str]:
123
  toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
124
  return [t for t in toks if t not in STOPWORDS and len(t) > 2]
125
 
126
+
127
  def _looks_azerbaijani(s: str) -> bool:
128
  has_az = any(ch in AZ_CHARS for ch in s)
129
  non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
130
  return has_az or non_ascii_ratio > 0.15
131
 
132
+
133
def _extract_keyword_lines(question: str, pool: List[str], limit: int = 6) -> List[str]:
    """Lift lines that contain question keywords or scope-change hints.

    Args:
        question: User question; its keywords (via ``_keywords``) are matched
            as lowercase substrings.
        pool: Text chunks to scan; only the first 200 entries are inspected.
        limit: Maximum number of lines to return. Non-positive values yield
            an empty list.

    Returns:
        Up to ``limit`` whitespace-normalized lines of at least four words,
        in first-seen order, each containing a question keyword or one of
        ``KEYWORD_HINTS``. Lines that are identical ignoring case are returned
        once, so repeated text cannot fill the result with copies.
    """
    if limit <= 0:
        return []
    keys = set(_keywords(question)) | {k.lower() for k in KEYWORD_HINTS}
    hits: List[str] = []
    seen = set()  # lowercase forms of lines already collected
    for text in pool[:200]:
        if not text:
            # Nothing to scan; also avoids passing None into the fixers.
            continue
        t = fix_text(_fix_intra_word_spaces(_fix_word_breaks(_fix_mojibake(text))))
        for line in t.splitlines():
            s = " ".join(line.split())
            if len(s.split()) < 4:
                # Skips blank lines and fragments too short to be informative.
                continue
            lo = s.lower()
            if lo in seen or not any(k in lo for k in keys):
                continue
            seen.add(lo)
            hits.append(s)
            if len(hits) >= limit:
                return hits
    return hits
149
+
150
+
151
+ # ============================ RAG Core =============================
152
  class SimpleRAG:
153
  def __init__(
154
  self,
 
202
  pages: List[str] = []
203
  for p in reader.pages:
204
  t = p.extract_text() or ""
205
+ if t.strip():
206
+ t = _fix_mojibake(t)
207
+ t = fix_text(t)
208
+ t = _fix_word_breaks(t)
209
+ t = _fix_intra_word_spaces(t)
 
210
  pages.append(t)
211
+
212
  chunks: List[str] = []
213
  for txt in pages:
214
  for i in range(0, len(txt), step):
 
298
  if not contexts and self.is_empty:
299
  return "No relevant context found. Index is empty — upload a PDF first."
300
 
301
+ # Fix mojibake in contexts, normalize spacing
302
+ contexts = [fix_text(_fix_intra_word_spaces(_fix_word_breaks(_fix_mojibake(c or "")))) for c in (contexts or [])]
 
 
 
303
 
304
+ # Build candidate sentences from nearby contexts (use more windows)
305
  local_pool: List[str] = []
306
+ for c in (contexts or [])[:8]:
307
  cleaned = _clean_for_summary(c)
308
  for s in _split_sentences(cleaned):
309
  w = s.split()
 
327
  if len(selected) >= max_sentences:
328
  break
329
 
330
+ # keyword-based sentence-level selection across corpus
331
  if not selected:
332
  selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
333
 
334
+ # final direct-line extraction if still empty
335
+ if not selected:
336
+ selected = _extract_keyword_lines(question, self.chunks, limit=max_sentences)
337
+
338
  if not selected:
339
  return "No readable sentences matched the question. Try a more specific query."
340
 
341
+ # translate to EN if needed
342
  if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
343
  selected = self._translate_to_en(selected)
344