Commit 6b6b475 · Parent: a5f927b
Fix indentation; stable EN extractive summarizer

app/rag_system.py CHANGED (+61 −64)
@@ -19,18 +19,15 @@ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
 MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
-
-# Output language – keep "en" for EN output (default en)
 OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
 
-
-
+AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
 
 def _split_sentences(text: str) -> List[str]:
     return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
 
 def _mostly_numeric(s: str) -> bool:
-    # more aggressive threshold
     alnum = [c for c in s if c.isalnum()]
     if not alnum:
         return True
@@ -38,20 +35,34 @@ def _mostly_numeric(s: str) -> bool:
     return digits / max(1, len(alnum)) > 0.3
 
 def _tabular_like(s: str) -> bool:
-
-    hits = len(NUM_PAT.findall(s))
+    hits = len(NUM_TOK_RE.findall(s))
     return hits >= 2 or "Page" in s or len(s) < 20
 
 def _clean_for_summary(text: str) -> str:
-
+    out = []
     for ln in text.splitlines():
         t = " ".join(ln.split())
-        if not t:
-            continue
-        if _mostly_numeric(t) or _tabular_like(t):
+        if not t or _mostly_numeric(t) or _tabular_like(t):
             continue
-
-    return " ".join(
+        out.append(t)
+    return " ".join(out)
+
+def _norm_fingerprint(s: str) -> str:
+    s = s.lower()
+    s = "".join(ch for ch in s if ch.isalpha() or ch.isspace())
+    return " ".join(s.split())
+
+def _sim_jaccard(a: str, b: str) -> float:
+    aw = set(a.lower().split())
+    bw = set(b.lower().split())
+    if not aw or not bw:
+        return 0.0
+    return len(aw & bw) / len(aw | bw)
+
+def _looks_azerbaijani(s: str) -> bool:
+    has_az = any(ch in AZ_CHARS for ch in s)
+    non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
+    return has_az or non_ascii_ratio > 0.15
 
 class SimpleRAG:
     def __init__(
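The new module-level helpers are self-contained; a quick sanity check (a hypothetical snippet, not part of the commit; the import path assumes the Space's app/rag_system.py layout) shows the intended behavior:

```python
# Hypothetical sanity check for the new helpers (not part of the commit).
from app.rag_system import _sim_jaccard, _looks_azerbaijani

# Word-set Jaccard: identical sentences score 1.0, disjoint ones 0.0;
# synthesize_answer drops near-duplicates at >= 0.82.
assert _sim_jaccard("total cost is high", "total cost is high") == 1.0
assert _sim_jaccard("alpha beta", "gamma delta") == 0.0

# Azerbaijani detection: special letters or a high non-ASCII ratio.
assert _looks_azerbaijani("layihənin ümumi dəyəri")      # contains ə/ü
assert not _looks_azerbaijani("the total project cost")  # plain ASCII
```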
@@ -69,33 +80,11 @@ class SimpleRAG:
         self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
         self.embed_dim = self.model.get_sentence_embedding_dimension()
 
-        #
-        self.
-
-        self.index: faiss.Index = None  # type: ignore
+        self._translator = None  # lazy
+        self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
         self.chunks: List[str] = []
         self._load()
 
-    # ---- translator (az->en) ----
-    def _translate_to_en(self, texts: List[str]) -> List[str]:
-        if OUTPUT_LANG != "en" or not texts:
-            return texts
-        try:
-            if self._translator is None:
-                from transformers import pipeline
-                # Helsinki-NLP az->en
-                self._translator = pipeline(
-                    "translation",
-                    model="Helsinki-NLP/opus-mt-az-en",
-                    cache_dir=str(self.cache_dir),
-                    device=-1,
-                )
-            outs = self._translator(texts, max_length=400)
-            return [o["translation_text"] for o in outs]
-        except Exception:
-            # if translation fails, return the original
-            return texts
-
     def _load(self) -> None:
         if self.meta_path.exists():
             try:
@@ -105,11 +94,10 @@ class SimpleRAG:
         if self.index_path.exists():
             try:
                 idx = faiss.read_index(str(self.index_path))
-
+                if getattr(idx, "d", None) == self.embed_dim:
+                    self.index = idx
             except Exception:
-
-            else:
-                self.index = faiss.IndexFlatIP(self.embed_dim)
+                pass
 
     def _persist(self) -> None:
         faiss.write_index(self.index, str(self.index_path))
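The load path now adopts a persisted index only when its dimensionality matches the current embedding model, keeping the fresh IndexFlatIP otherwise. A minimal sketch of that guard (assuming faiss-cpu and the 384-dimensional MiniLM default):

```python
# Minimal sketch of the dimension guard; assumes faiss-cpu is installed.
import faiss
import numpy as np

embed_dim = 384  # all-MiniLM-L6-v2 embedding size
index = faiss.IndexFlatIP(embed_dim)
index.add(np.zeros((1, embed_dim), dtype=np.float32))

# A stale index written by a different model would fail this check
# and be discarded in favor of the fresh empty index.
assert getattr(index, "d", None) == embed_dim
```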
@@ -118,7 +106,7 @@ class SimpleRAG:
     @staticmethod
     def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
         reader = PdfReader(str(pdf_path))
-        pages = []
+        pages: List[str] = []
         for p in reader.pages:
             t = p.extract_text() or ""
             if t.strip():
@@ -126,7 +114,7 @@ class SimpleRAG:
         chunks: List[str] = []
         for txt in pages:
            for i in range(0, len(txt), step):
-                part = txt[i:i+step].strip()
+                part = txt[i : i + step].strip()
                 if part:
                     chunks.append(part)
         return chunks
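_pdf_to_texts still chunks each page's text with a fixed character window; with the default step of 800 the arithmetic looks like this (illustrative snippet, not part of the commit):

```python
# Fixed-size chunking as in _pdf_to_texts (default step = 800 chars).
text = "x" * 2000
step = 800
chunks = [text[i : i + step].strip() for i in range(0, len(text), step)]
assert [len(c) for c in chunks] == [800, 800, 400]
```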
@@ -153,36 +141,52 @@ class SimpleRAG:
             out.append((self.chunks[idx], float(score)))
         return out
 
+    def _translate_to_en(self, texts: List[str]) -> List[str]:
+        if not texts:
+            return texts
+        try:
+            from transformers import pipeline
+            if self._translator is None:
+                self._translator = pipeline(
+                    "translation",
+                    model="Helsinki-NLP/opus-mt-az-en",
+                    cache_dir=str(self.cache_dir),
+                    device=-1,
+                )
+            outs = self._translator(texts, max_length=400)
+            return [o["translation_text"].strip() for o in outs]
+        except Exception:
+            return texts
+
     def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
         if not contexts:
             return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-        # 1)
-
+        # 1) candidates (aggressive clean)
+        candidates: List[str] = []
         for c in contexts[:5]:
-
+            cleaned = _clean_for_summary(c)
             for s in _split_sentences(cleaned):
-                # length and quality filters
                 w = s.split()
                 if not (8 <= len(w) <= 35):
-
+                    continue
                 if _tabular_like(s) or _mostly_numeric(s):
-
-                candidates.append(" ".join(w))
+                    continue
+                candidates.append(" ".join(w))
 
         if not candidates:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 2)
+        # 2) rank by similarity
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        # 3)
+        # 3) near-duplicate dedup
         selected: List[str] = []
         for i in order:
-
+            s = candidates[i].strip()
             if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
                 continue
             selected.append(s)
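Because both encodings use normalize_embeddings=True, the dot product in step 2 is exactly cosine similarity, so argsort(-scores) ranks candidates from most to least question-like. A toy illustration with made-up 2-D unit vectors:

```python
# Toy illustration of step 2; vectors are invented and already unit-norm.
import numpy as np

q_emb = np.array([[1.0, 0.0]], dtype=np.float32)               # question
cand_emb = np.array([[0.8, 0.6], [0.0, 1.0]], dtype=np.float32)
scores = (cand_emb @ q_emb.T).ravel()                          # [0.8, 0.0]
order = np.argsort(-scores)                                    # best first
assert list(order) == [0, 1]
```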
@@ -192,22 +196,15 @@ class SimpleRAG:
         if not selected:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 4)
-        if
-
+        # 4) translate to EN if needed
+        if OUTPUT_LANG == "en":
+            if any(_looks_azerbaijani(s) for s in selected):
+                selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 
-def _sim_jaccard(a: str, b: str) -> float:
-    aw = set(a.lower().split())
-    bw = set(b.lower().split())
-    if not aw or not bw:
-        return 0.0
-    return len(aw & bw) / len(aw | bw)
-
 def synthesize_answer(question: str, contexts: List[str]) -> str:
     return SimpleRAG().synthesize_answer(question, contexts)
 
-
 __all__ = ["SimpleRAG", "synthesize_answer", "DATA_DIR", "UPLOAD_DIR", "INDEX_DIR", "CACHE_DIR", "MODEL_NAME"]
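For reference, a hypothetical end-to-end call through the module-level wrapper (the context string is invented; the first run downloads the embedding model, and the sentence passes the 8–35 word filter so it survives cleaning):

```python
# Hypothetical usage of the module-level wrapper; not part of the commit.
from app.rag_system import synthesize_answer

contexts = [
    "The project started in March 2021 and was completed within two years "
    "despite several procurement delays reported by the contractor."
]
print(synthesize_answer("When did the project start?", contexts))
# -> Answer (based on document context):
#    - The project started in March 2021 and was completed ...
```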