HamidOmarov committed
Commit 9dc7698 · Parent(s): 2fbf4a0

Normalize PDF text (ftfy) + stricter cleaning to reduce mojibake/noise

Files changed (2):
  1. app/rag_system.py +66 -42
  2. requirements.txt +1 -0
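Reviewer note: the diff below introduces a _normalize_text pipeline — ftfy first, then a targeted Latin-1 round-trip, ligature expansion, and a regex that rejoins letter-spaced headings. A minimal sketch of the two key repairs; the first sample string comes from ftfy's documentation, and the "H Ə F T Ə" input is an assumed, typical PDF-extraction artifact:

import re
from ftfy import fix_text

AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")

print(fix_text("âœ” No problems"))  # -> "✔ No problems" (mojibake repaired by ftfy)
# Rejoin letter-spaced runs like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words
print(_SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), "H Ə F T Ə"))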
app/rag_system.py CHANGED
@@ -4,51 +4,57 @@ from __future__ import annotations
 import os
 import re
 from pathlib import Path
-from typing import List, Tuple, Optional
+from typing import List, Tuple
 
 import faiss
 import numpy as np
+from ftfy import fix_text
 
-# -- add near other helpers --
-import re
-
-AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
-_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
-
-def _fix_intra_word_spaces(s: str) -> str:
-    """Join sequences like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words."""
-    if not s:
-        return s
-    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
-
-# Prefer pypdf; fallback to PyPDF2 if needed
+# Prefer pypdf; fallback to PyPDF2
 try:
     from pypdf import PdfReader
-except Exception:
+except Exception:  # pragma: no cover
     from PyPDF2 import PdfReader  # type: ignore
 
 from sentence_transformers import SentenceTransformer
 
-# ---------------- Paths & Cache (HF-safe) ----------------
-# Writeable base is /app in HF Spaces. Allow ENV overrides.
+# ===================== Paths (HF-safe) =====================
+# The writable base on HF Spaces is /app; this also works in local environments.
 ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
 DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
 UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
 INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
-CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers prefers HF_HOME
+CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
 
 for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
-# ---------------- Config ----------------
+# ===================== Config =====================
 MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
 
-# ---------------- Helpers ----------------
+# ===================== Helpers =====================
 AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
+
+STOPWORDS = {
+    "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "by",
+    "this", "that", "these", "those", "is", "are", "was", "were", "be", "been", "being",
+    "at", "as", "it", "its", "from", "into", "about", "over", "after", "before", "than",
+    "such", "can", "could", "should", "would", "may", "might", "will", "shall"
+}
+
+AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
+_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
+
+def _fix_intra_word_spaces(s: str) -> str:
+    """'H Ə F T Ə' → 'HƏFTƏ' (joins only consecutive single-letter runs)."""
+    if not s:
+        return s
+    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
 
 def _fix_mojibake(s: str) -> str:
-    """Fix common UTF-8-as-Latin-1 mojibake."""
+    """Quick fix for typical UTF-8-as-Latin-1 mojibake."""
     if not s:
         return s
     if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
@@ -58,6 +64,17 @@ def _fix_mojibake(s: str) -> str:
             return s
     return s
 
+def _normalize_text(s: str) -> str:
+    if not s:
+        return s
+    s = fix_text(s)                # general repairs via ftfy
+    s = _fix_mojibake(s)           # attempt the Latin-1 → UTF-8 round trip
+    s = s.replace("ﬁ", "fi").replace("ﬂ", "fl")  # expand fi/fl ligatures
+    s = _fix_intra_word_spaces(s)  # H Ə F T Ə → HƏFTƏ
+    s = re.sub(r"[ \t]+", " ", s)
+    s = re.sub(r"\s+\n", "\n", s)
+    return s.strip()
+
 def _split_sentences(text: str) -> List[str]:
     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
 
@@ -66,9 +83,7 @@ def _mostly_numeric(s: str) -> bool:
     if not alnum:
         return True
     digits = sum(c.isdigit() for c in alnum)
-    return digits / max(1, len(alnum)) > 0.3
-
-NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
+    return digits / max(1, len(alnum)) > 0.30
 
 def _tabular_like(s: str) -> bool:
     hits = len(NUM_TOKEN_RE.findall(s))
@@ -78,7 +93,11 @@ def _clean_for_summary(text: str) -> str:
     out = []
     for ln in text.splitlines():
         t = " ".join(ln.split())
-        if not t or _mostly_numeric(t) or _tabular_like(t):
+        if not t:
+            continue
+        if len(t) < 25:
+            continue
+        if _mostly_numeric(t) or _tabular_like(t):
             continue
         out.append(t)
     return " ".join(out)
@@ -90,13 +109,6 @@ def _sim_jaccard(a: str, b: str) -> float:
         return 0.0
     return len(aw & bw) / len(aw | bw)
 
-STOPWORDS = {
-    "the","a","an","and","or","of","to","in","on","for","with","by",
-    "this","that","these","those","is","are","was","were","be","been","being",
-    "at","as","it","its","from","into","about","over","after","before","than",
-    "such","can","could","should","would","may","might","will","shall"
-}
-
 def _keywords(text: str) -> List[str]:
     toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
     return [t for t in toks if t not in STOPWORDS and len(t) > 2]
@@ -106,7 +118,7 @@ def _looks_azerbaijani(s: str) -> bool:
     non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
     return has_az or non_ascii_ratio > 0.15
 
-# ---------------- RAG Core ----------------
+# ===================== RAG Core =====================
 class SimpleRAG:
     def __init__(
         self,
@@ -126,7 +138,7 @@ class SimpleRAG:
         self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
         self.chunks: List[str] = []
         self.last_added: List[str] = []
-        self._translator = None  # lazy init
+        self._translator = None  # lazy
 
         self._load()
 
@@ -160,16 +172,26 @@ class SimpleRAG:
         pages: List[str] = []
         for p in reader.pages:
            t = p.extract_text() or ""
-            t = _fix_mojibake(t)
            if t.strip():
+                t = _normalize_text(t)
                pages.append(t)
+
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i : i + step].strip()
+                part = txt[i: i + step].strip()
                 if part:
                     chunks.append(part)
-        return chunks
+
+        # simple dedup to avoid exact repeats
+        seen = set()
+        uniq: List[str] = []
+        for c in chunks:
+            if c in seen:
+                continue
+            seen.add(c)
+            uniq.append(c)
+        return uniq
 
     # ---------- Indexing ----------
     def add_pdf(self, pdf_path: Path) -> int:
@@ -252,10 +274,10 @@ class SimpleRAG:
         if not contexts and self.is_empty:
             return "No relevant context found. Index is empty — upload a PDF first."
 
-        # Fix mojibake in contexts
-        contexts = [_fix_mojibake(c) for c in (contexts or [])]
+        # normalize contexts (mojibake, spacing, etc.)
+        contexts = [_normalize_text(c) for c in (contexts or [])]
 
-        # Build candidate sentences from nearby contexts
+        # 1) local candidate pool
         local_pool: List[str] = []
         for c in (contexts or [])[:5]:
             cleaned = _clean_for_summary(c)
@@ -267,6 +289,7 @@ class SimpleRAG:
                 continue
             local_pool.append(" ".join(w))
 
+        # 2) rank by similarity to question
         selected: List[str] = []
         if local_pool:
             q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
@@ -281,20 +304,21 @@ class SimpleRAG:
                 if len(selected) >= max_sentences:
                     break
 
+        # 3) keyword fallback over the whole corpus if results are weak
         if not selected:
             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
 
         if not selected:
             return "No readable sentences matched the question. Try a more specific query."
 
-        if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
+        # 4) translate to EN if needed
+        if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
             selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 
 
-# Public API
 __all__ = [
     "SimpleRAG",
     "UPLOAD_DIR",
requirements.txt CHANGED
@@ -10,3 +10,4 @@ python-multipart
 pdfminer.six
 numpy
 requests
+ftfy
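One behavioral note on the summary path: the old translation gate fired whenever any output character was non-ASCII, so an English sentence containing "m²" or a curly quote could trigger translation; the new gate uses _looks_azerbaijani, which requires Azerbaijani letters or a >15% non-ASCII ratio. A small sketch of the difference; the has_az line is reconstructed from the diff's context (it is not shown there), and the sample sentence is assumed:

AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")

def _looks_azerbaijani(s: str) -> bool:
    has_az = any(ch in AZ_CHARS for ch in s)  # reconstructed; not visible in the diff
    non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
    return has_az or non_ascii_ratio > 0.15

selected = ["Total covered area is 120 m²."]
print(any(ord(ch) > 127 for ch in " ".join(selected)))  # old gate: True (would translate)
print(any(_looks_azerbaijani(s) for s in selected))     # new gate: False (kept as-is)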