HamidOmarov committed on
Commit 1fb5688 · 1 Parent(s): 9dc7698

Text cleanups: join inter-letter spaces + ftfy normalize

Files changed (1):
  1. app/rag_system.py +59 -60
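
In short, the commit drops the standalone `_normalize_text` helper and applies the cleanup inline during PDF extraction. A minimal, self-contained sketch of the two fixes named in the title (the sample strings are made up):

    import re
    from ftfy import fix_text  # general mojibake/encoding repair

    AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
    SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")

    def cleanup(s: str) -> str:
        s = fix_text(s)  # e.g. "sÃ©ance" -> "séance"
        # join inter-letter spaces: "H Ə F T Ə" -> "HƏFTƏ"
        return SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

    print(cleanup("H Ə F T Ə L İ K hesabat"))  # -> "HƏFTƏLİK hesabat"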
app/rag_system.py CHANGED
@@ -8,53 +8,72 @@ from typing import List, Tuple

  import faiss
  import numpy as np
- from ftfy import fix_text

- # Prefer pypdf; fallback to PyPDF2
  try:
      from pypdf import PdfReader
- except Exception:  # pragma: no cover
      from PyPDF2 import PdfReader  # type: ignore

  from sentence_transformers import SentenceTransformer

- # ===================== Paths (HF-safe) =====================
- # Writable base dir for HF Spaces (/app); also works locally.
- ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
  DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
  UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
  INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
- CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))

- for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
-     d.mkdir(parents=True, exist_ok=True)

- # ===================== Config =====================
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

- # ===================== Helpers =====================
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
  NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

  STOPWORDS = {
-     "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "by",
-     "this", "that", "these", "those", "is", "are", "was", "were", "be", "been", "being",
-     "at", "as", "it", "its", "from", "into", "about", "over", "after", "before", "than",
-     "such", "can", "could", "should", "would", "may", "might", "will", "shall"
  }

- AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
- _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
-
  def _fix_intra_word_spaces(s: str) -> str:
-     """'H Ə F T Ə' → 'HƏFTƏ' (joins only consecutive single-letter runs)."""
      if not s:
          return s
      return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

  def _fix_mojibake(s: str) -> str:
-     """Quick fix for typical UTF-8-read-as-Latin-1 mojibake."""
      if not s:
          return s
      if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
@@ -64,17 +83,6 @@ def _fix_mojibake(s: str) -> str:
          return s
      return s

- def _normalize_text(s: str) -> str:
-     if not s:
-         return s
-     s = fix_text(s)                # general fixes via ftfy
-     s = _fix_mojibake(s)           # attempt the Latin-1 → UTF-8 repair
-     s = s.replace("ﬁ", "fi").replace("ﬂ", "fl")
-     s = _fix_intra_word_spaces(s)  # H Ə F T Ə → HƏFTƏ
-     s = re.sub(r"[ \t]+", " ", s)
-     s = re.sub(r"\s+\n", "\n", s)
-     return s.strip()
-
  def _split_sentences(text: str) -> List[str]:
      return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
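
The hunk elides the body of `_fix_mojibake`, but the standard repair for this family of corruption is a Latin-1 → UTF-8 round trip. A standalone illustration (an assumption about the elided body, not code from this commit):

    def demojibake(s: str) -> str:
        # UTF-8 "ö" (0xC3 0xB6) decoded as Latin-1 shows up as "Ã¶";
        # re-encoding as Latin-1 recovers the original UTF-8 bytes.
        try:
            return s.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            return s  # leave the text alone if the round trip fails

    print(demojibake("Ã¶z"))  # -> "öz"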
 
@@ -83,7 +91,7 @@ def _mostly_numeric(s: str) -> bool:
      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
-     return digits / max(1, len(alnum)) > 0.30

  def _tabular_like(s: str) -> bool:
      hits = len(NUM_TOKEN_RE.findall(s))
@@ -93,11 +101,7 @@ def _clean_for_summary(text: str) -> str:
      out = []
      for ln in text.splitlines():
          t = " ".join(ln.split())
-         if not t:
-             continue
-         if len(t) < 25:
-             continue
-         if _mostly_numeric(t) or _tabular_like(t):
              continue
          out.append(t)
      return " ".join(out)
@@ -118,7 +122,7 @@ def _looks_azerbaijani(s: str) -> bool:
      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

- # ===================== RAG Core =====================
  class SimpleRAG:
      def __init__(
          self,
@@ -138,7 +142,7 @@ class SimpleRAG:
          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
-         self._translator = None  # lazy

          self._load()
@@ -172,26 +176,20 @@ class SimpleRAG:
          pages: List[str] = []
          for p in reader.pages:
              t = p.extract_text() or ""
-             if t.strip():
-                 t = _normalize_text(t)
                  pages.append(t)
-
          chunks: List[str] = []
          for txt in pages:
              for i in range(0, len(txt), step):
-                 part = txt[i: i + step].strip()
                  if part:
                      chunks.append(part)
-
-         # simple dedup to avoid exact repeats
-         seen = set()
-         uniq: List[str] = []
-         for c in chunks:
-             if c in seen:
-                 continue
-             seen.add(c)
-             uniq.append(c)
-         return uniq

      # ---------- Indexing ----------
      def add_pdf(self, pdf_path: Path) -> int:
@@ -274,10 +272,13 @@
          if not contexts and self.is_empty:
              return "No relevant context found. Index is empty — upload a PDF first."

-         # normalize contexts (mojibake, spacing, etc.)
-         contexts = [_normalize_text(c) for c in (contexts or [])]

-         # 1) local candidate pool
          local_pool: List[str] = []
          for c in (contexts or [])[:5]:
              cleaned = _clean_for_summary(c)
@@ -289,7 +290,6 @@
                  continue
              local_pool.append(" ".join(w))

-         # 2) rank by similarity to question
          selected: List[str] = []
          if local_pool:
              q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
@@ -304,15 +304,14 @@
              if len(selected) >= max_sentences:
                  break

-         # 3) keyword fallback (whole corpus) if the results are weak
          if not selected:
              selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

          if not selected:
              return "No readable sentences matched the question. Try a more specific query."

-         # 4) translate to EN if needed
-         if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
              selected = self._translate_to_en(selected)

          bullets = "\n".join(f"- {s}" for s in selected)

  import faiss
  import numpy as np

+ # --- ftfy (mojibake/normalization) ---
+ try:
+     from ftfy import fix_text as _ftfy
+ except Exception:  # no-op fallback if ftfy is unavailable
+     def _ftfy(x: str) -> str:
+         return x
+
+ # pypdf -> PyPDF2 fallback
  try:
      from pypdf import PdfReader
+ except Exception:
      from PyPDF2 import PdfReader  # type: ignore

  from sentence_transformers import SentenceTransformer

+ # ---------------- Paths & Cache (HF-safe) ----------------
+ # Default: the repo root; switch to APP_ROOT when it is set
+ DEFAULT_ROOT = Path(__file__).resolve().parents[1]
+ ROOT_DIR = Path(os.getenv("APP_ROOT", str(DEFAULT_ROOT)))
  DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
  UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
  INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
+ CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # HF_HOME is the best cache root for transformers

+ # try to create the cache dir; skip it on permission problems
+ for pth in (CACHE_DIR,):
+     try:
+         pth.mkdir(parents=True, exist_ok=True)
+     except PermissionError:
+         pass

+ # create the data dirs; fall back to local ./data on permission errors
+ try:
+     DATA_DIR.mkdir(parents=True, exist_ok=True)
+     UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+     INDEX_DIR.mkdir(parents=True, exist_ok=True)
+ except PermissionError:
+     DATA_DIR = Path("./data"); DATA_DIR.mkdir(parents=True, exist_ok=True)
+     UPLOAD_DIR = DATA_DIR / "uploads"; UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+     INDEX_DIR = DATA_DIR / "index"; INDEX_DIR.mkdir(parents=True, exist_ok=True)
+
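
CACHE_DIR is computed and created here but not otherwise referenced in the hunks shown; presumably it is meant to hold the model cache, and the HF_HOME variable itself would still need to be set in the environment for transformers to pick it up. One way to wire it up explicitly, using sentence-transformers' real `cache_folder` parameter (a sketch, not part of this commit):

    from sentence_transformers import SentenceTransformer

    # download/cache the embedding model under CACHE_DIR explicitly,
    # instead of relying on the HF_HOME environment variable
    model = SentenceTransformer(MODEL_NAME, cache_folder=str(CACHE_DIR))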
+ # ---------------- Config ----------------
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

+ # ---------------- Helpers ----------------
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+ AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
+ _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
  NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

  STOPWORDS = {
+     "the","a","an","and","or","of","to","in","on","for","with","by",
+     "this","that","these","those","is","are","was","were","be","been","being",
+     "at","as","it","its","from","into","about","over","after","before","than",
+     "such","can","could","should","would","may","might","will","shall"
  }

  def _fix_intra_word_spaces(s: str) -> str:
+     """'c l a s s' → 'class', 'H Ə F T Ə' → 'HƏFTƏ' (consecutive single letters only)."""
      if not s:
          return s
      return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

  def _fix_mojibake(s: str) -> str:
+     """Gently repair artifacts like 'ö' left by UTF-8 text being read as Latin-1."""
      if not s:
          return s
      if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):

          return s
      return s

  def _split_sentences(text: str) -> List[str]:
      return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
+     return digits / max(1, len(alnum)) > 0.3

  def _tabular_like(s: str) -> bool:
      hits = len(NUM_TOKEN_RE.findall(s))

      out = []
      for ln in text.splitlines():
          t = " ".join(ln.split())
+         if not t or _mostly_numeric(t) or _tabular_like(t):
              continue
          out.append(t)
      return " ".join(out)

      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

+ # ---------------- RAG Core ----------------
  class SimpleRAG:
      def __init__(
          self,

          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
+         self._translator = None  # lazy init

          self._load()

          pages: List[str] = []
          for p in reader.pages:
              t = p.extract_text() or ""
+             # normalization sequence
+             t = _ftfy(t)
+             t = _fix_mojibake(t)
+             t = _fix_intra_word_spaces(t)
+             t = re.sub(r"\s+", " ", t).strip()
+             if t:
                  pages.append(t)

          chunks: List[str] = []
          for txt in pages:
              for i in range(0, len(txt), step):
+                 part = txt[i : i + step].strip()
                  if part:
                      chunks.append(part)
+         return chunks
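
Note that the new version returns `chunks` directly, dropping the old exact-duplicate filter. The windowing itself is a fixed-width character split with no overlap; a self-contained equivalent (the `step` default here is illustrative):

    def window(text: str, step: int = 800) -> list[str]:
        # non-overlapping fixed-width windows; empty slices dropped
        return [p for p in (text[i : i + step].strip()
                            for i in range(0, len(text), step)) if p]

    assert window("abcdef", 4) == ["abcd", "ef"]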
 
      # ---------- Indexing ----------
      def add_pdf(self, pdf_path: Path) -> int:

          if not contexts and self.is_empty:
              return "No relevant context found. Index is empty — upload a PDF first."

+         # clean the retrieved contexts
+         contexts = [
+             re.sub(r"\s+", " ", _fix_intra_word_spaces(_fix_mojibake(_ftfy(c)))).strip()
+             for c in (contexts or [])
+         ]

+         # candidates from the closest contexts
          local_pool: List[str] = []
          for c in (contexts or [])[:5]:
              cleaned = _clean_for_summary(c)

                  continue
              local_pool.append(" ".join(w))

          selected: List[str] = []
          if local_pool:
              q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
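
Because the index is `faiss.IndexFlatIP` and `encode` is called with `normalize_embeddings=True`, the inner-product scores are cosine similarities. A standalone sketch of that ranking step (the sample sentences are made up):

    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    sents = ["The invoice totals 1200 AZN.", "Work proceeded on schedule."]
    emb = model.encode(sents, convert_to_numpy=True,
                       normalize_embeddings=True).astype(np.float32)

    index = faiss.IndexFlatIP(emb.shape[1])  # inner product on unit vectors = cosine
    index.add(emb)
    q = model.encode(["How much was billed?"], convert_to_numpy=True,
                     normalize_embeddings=True).astype(np.float32)
    scores, ids = index.search(q, 2)  # ranked best-first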
 
              if len(selected) >= max_sentences:
                  break

          if not selected:
              selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

          if not selected:
              return "No readable sentences matched the question. Try a more specific query."

+         # translate to EN when requested
+         if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
              selected = self._translate_to_en(selected)

          bullets = "\n".join(f"- {s}" for s in selected)