Commit 9dc7698 · Parent(s): 2fbf4a0
Normalize PDF text (ftfy) + stricter cleaning to reduce mojibake/noise

Files changed:
- app/rag_system.py  +66 -42
- requirements.txt   +1 -0
app/rag_system.py
CHANGED
@@ -4,51 +4,57 @@ from __future__ import annotations
 import os
 import re
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple
 
 import faiss
 import numpy as np
+from ftfy import fix_text
 
-#
-import re
-
-AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
-_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
-
-def _fix_intra_word_spaces(s: str) -> str:
-    """Join sequences like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words."""
-    if not s:
-        return s
-    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
-
-# Prefer pypdf; fallback to PyPDF2 if needed
+# Prefer pypdf; fallback to PyPDF2
 try:
     from pypdf import PdfReader
-except Exception:
+except Exception:  # pragma: no cover
    from PyPDF2 import PdfReader  # type: ignore
 
 from sentence_transformers import SentenceTransformer
 
-#
-#
+# ===================== Paths (HF-safe) =====================
+# Writable base dir for HF Spaces (/app); works in a local environment too.
 ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
 DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
 UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
 INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
-CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
+CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
 
 for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
-#
+# ===================== Config =====================
 MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
 
-#
+# ===================== Helpers =====================
 AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
+
+STOPWORDS = {
+    "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "by",
+    "this", "that", "these", "those", "is", "are", "was", "were", "be", "been", "being",
+    "at", "as", "it", "its", "from", "into", "about", "over", "after", "before", "than",
+    "such", "can", "could", "should", "would", "may", "might", "will", "shall"
+}
+
+AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
+_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
+
+def _fix_intra_word_spaces(s: str) -> str:
+    """'H Ə F T Ə' → 'HƏFTƏ' (joins only consecutive single-letter runs)."""
+    if not s:
+        return s
+    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
 
 def _fix_mojibake(s: str) -> str:
-    """
+    """Quick fix for typical UTF-8-as-Latin-1 mojibake."""
     if not s:
         return s
     if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
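
A note on the heuristic kept here: the any(...) guard fires on marker characters such as "Ã" and "Ä" that appear when UTF-8 bytes are decoded as Latin-1. A minimal illustration (not code from this commit):

    # UTF-8 bytes mis-decoded as Latin-1, then repaired (illustrative)
    s = "ölçü".encode("utf-8").decode("latin-1")
    print(s)                                    # 'Ã¶lÃ§Ã¼', note the telltale 'Ã'
    print(s.encode("latin-1").decode("utf-8"))  # 'ölçü'
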
@@ -58,6 +64,17 @@ def _fix_mojibake(s: str) -> str:
         return s
     return s
 
+def _normalize_text(s: str) -> str:
+    if not s:
+        return s
+    s = fix_text(s)                # general repairs via ftfy
+    s = _fix_mojibake(s)           # attempted latin-1 → utf-8 repair
+    s = s.replace("ﬁ", "fi").replace("ﬂ", "fl")  # unfold PDF ligatures
+    s = _fix_intra_word_spaces(s)  # H Ə F T Ə → HƏFTƏ
+    s = re.sub(r"[ \t]+", " ", s)
+    s = re.sub(r"\s+\n", "\n", s)
+    return s.strip()
+
 def _split_sentences(text: str) -> List[str]:
     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
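
The new _normalize_text composes the repairs in a fixed order: ftfy, the Latin-1 heuristic, ligature unfolding, single-letter-run joining, then whitespace collapsing. A quick sanity check (a sketch with made-up inputs, assuming the Space's layout makes the module importable as app.rag_system):

    # Sketch: exercising _normalize_text on typical PDF-extraction damage
    from app.rag_system import _normalize_text

    print(_normalize_text("hÉ™ftÉ™lik plan"))  # expected: 'həftəlik plan' (ftfy repairs the mojibake)
    print(_normalize_text("H Ə F T Ə L İ K"))  # expected: 'HƏFTƏLİK' (letter run joined)
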
@@ -66,9 +83,7 @@ def _mostly_numeric(s: str) -> bool:
     if not alnum:
         return True
     digits = sum(c.isdigit() for c in alnum)
-    return digits / max(1, len(alnum)) > 0.
-
-NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
+    return digits / max(1, len(alnum)) > 0.30
 
 def _tabular_like(s: str) -> bool:
     hits = len(NUM_TOKEN_RE.findall(s))
@@ -78,7 +93,11 @@ def _clean_for_summary(text: str) -> str:
     out = []
     for ln in text.splitlines():
         t = " ".join(ln.split())
-        if not t
+        if not t:
+            continue
+        if len(t) < 25:
+            continue
+        if _mostly_numeric(t) or _tabular_like(t):
             continue
         out.append(t)
     return " ".join(out)
@@ -90,13 +109,6 @@ def _sim_jaccard(a: str, b: str) -> float:
         return 0.0
     return len(aw & bw) / len(aw | bw)
 
-STOPWORDS = {
-    "the","a","an","and","or","of","to","in","on","for","with","by",
-    "this","that","these","those","is","are","was","were","be","been","being",
-    "at","as","it","its","from","into","about","over","after","before","than",
-    "such","can","could","should","would","may","might","will","shall"
-}
-
 def _keywords(text: str) -> List[str]:
     toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
     return [t for t in toks if t not in STOPWORDS and len(t) > 2]
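
With the stricter _clean_for_summary, lines shorter than 25 characters, mostly-numeric lines, and table-like rows are all dropped before sentence selection. Roughly (a sketch with made-up input, same import assumption as above):

    # Sketch: what the stricter cleaning keeps vs. drops
    from app.rag_system import _clean_for_summary

    text = "\n".join([
        "Cəmi 12 345 AZN",  # short and numeric/table-like: dropped
        "Page 3",           # short: dropped
        "The quarterly report highlights steady growth in regional sales.",
    ])
    print(_clean_for_summary(text))
    # expected: 'The quarterly report highlights steady growth in regional sales.'
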
@@ -106,7 +118,7 @@ def _looks_azerbaijani(s: str) -> bool:
     non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
     return has_az or non_ascii_ratio > 0.15
 
-#
+# ===================== RAG Core =====================
 class SimpleRAG:
     def __init__(
         self,
@@ -126,7 +138,7 @@ class SimpleRAG:
         self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
         self.chunks: List[str] = []
         self.last_added: List[str] = []
-        self._translator = None  # lazy
+        self._translator = None  # lazy
 
         self._load()
 
@@ -160,16 +172,26 @@
         pages: List[str] = []
         for p in reader.pages:
             t = p.extract_text() or ""
-            t = _fix_mojibake(t)
             if t.strip():
+                t = _normalize_text(t)
                 pages.append(t)
+
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i
+                part = txt[i: i + step].strip()
                 if part:
                     chunks.append(part)
-        return chunks
+
+        # simple dedup to avoid exact repeats
+        seen = set()
+        uniq: List[str] = []
+        for c in chunks:
+            if c in seen:
+                continue
+            seen.add(c)
+            uniq.append(c)
+        return uniq
 
     # ---------- Indexing ----------
     def add_pdf(self, pdf_path: Path) -> int:
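
Because fixed-step slicing re-emits repeated headers and footers verbatim, the dedup pass keeps only the first occurrence while preserving order, the same result dict.fromkeys gives:

    # Order-preserving dedup, equivalent to the seen/uniq loop above
    chunks = ["HEADER", "body text A", "HEADER", "body text B"]
    print(list(dict.fromkeys(chunks)))  # ['HEADER', 'body text A', 'body text B']
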
@@ -252,10 +274,10 @@ class SimpleRAG:
         if not contexts and self.is_empty:
             return "No relevant context found. Index is empty — upload a PDF first."
 
-        #
-        contexts = [
+        # normalize contexts (mojibake, spacing, etc.)
+        contexts = [_normalize_text(c) for c in (contexts or [])]
 
-        #
+        # 1) local candidate pool
         local_pool: List[str] = []
         for c in (contexts or [])[:5]:
             cleaned = _clean_for_summary(c)
@@ -267,6 +289,7 @@ class SimpleRAG:
                 continue
             local_pool.append(" ".join(w))
 
+        # 2) rank by similarity to question
         selected: List[str] = []
         if local_pool:
             q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
@@ -281,20 +304,21 @@ class SimpleRAG:
             if len(selected) >= max_sentences:
                 break
 
+        # 3) keyword fallback (whole corpus) if the results are weak
         if not selected:
             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
 
         if not selected:
             return "No readable sentences matched the question. Try a more specific query."
 
-
+        # 4) translate to EN if needed
+        if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
             selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 
 
-# Public API
 __all__ = [
     "SimpleRAG",
     "UPLOAD_DIR",
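
End to end, the answer path is now: normalize retrieved contexts → build a local candidate pool → rank sentences against the question embedding → keyword fallback over the whole corpus → translate to English only when the selection looks Azerbaijani. A hedged usage sketch (the query method's name is not visible in this diff; answer() below is a placeholder, and SimpleRAG's defaults are assumed):

    # Hypothetical driver; answer() and the no-arg constructor are assumptions
    from app.rag_system import SimpleRAG, UPLOAD_DIR

    rag = SimpleRAG()
    rag.add_pdf(UPLOAD_DIR / "report.pdf")           # extract → normalize → chunk → dedup → index
    print(rag.answer("What are the weekly goals?"))  # steps 1-4 above
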
requirements.txt
CHANGED
@@ -10,3 +10,4 @@ python-multipart
 pdfminer.six
 numpy
 requests
+ftfy