Spaces:
Running
Running
Commit
·
f06409c
1
Parent(s):
a46e32d
Force EN translation + strong dedup filtering
Browse files- app/rag_system.py +26 -15
app/rag_system.py
CHANGED
@@ -153,47 +153,58 @@ class SimpleRAG:
|
|
153 |
out.append((self.chunks[idx], float(score)))
|
154 |
return out
|
155 |
|
156 |
-
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int =
|
157 |
if not contexts:
|
158 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
159 |
|
160 |
-
# Candidate
|
161 |
-
|
162 |
for c in contexts[:5]:
|
163 |
-
|
164 |
for s in _split_sentences(cleaned):
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
if not candidates:
|
169 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
170 |
|
171 |
-
#
|
172 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
173 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
174 |
scores = (cand_emb @ q_emb.T).ravel()
|
175 |
order = np.argsort(-scores)
|
176 |
|
177 |
-
#
|
178 |
selected: List[str] = []
|
179 |
-
seen = set()
|
180 |
for i in order:
|
181 |
-
|
182 |
-
|
183 |
-
if key in seen:
|
184 |
continue
|
185 |
-
seen.add(key)
|
186 |
selected.append(s)
|
187 |
if len(selected) >= max_sentences:
|
188 |
break
|
189 |
|
190 |
-
|
191 |
-
|
|
|
|
|
|
|
192 |
selected = self._translate_to_en(selected)
|
193 |
|
194 |
bullets = "\n".join(f"- {s}" for s in selected)
|
195 |
return f"Answer (based on document context):\n{bullets}"
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
def synthesize_answer(question: str, contexts: List[str]) -> str:
    """Answer *question* from *contexts* via a freshly constructed SimpleRAG.

    Module-level convenience wrapper: a new ``SimpleRAG`` instance is
    created on every call and its ``synthesize_answer`` method is invoked
    with only ``question`` and ``contexts``, leaving any other parameters
    of that method at their defaults.
    """
    rag = SimpleRAG()
    answer = rag.synthesize_answer(question, contexts)
    return answer
|
|
|
153 |
out.append((self.chunks[idx], float(score)))
|
154 |
return out
|
155 |
|
156 |
+
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
    """Build a short bulleted answer from the sentences closest to the question.

    Sentences are pulled from the first five contexts, cleaned and filtered,
    ranked by the dot product of their normalized embedding with the
    question's normalized embedding, and then picked greedily while
    skipping near-duplicates of sentences that were already picked.

    Args:
        question: The user's question; embedded with ``self.model``.
        contexts: Retrieved text chunks; only the first five are used.
        max_sentences: Selection stops once this many sentences are kept.
            The best-ranked sentence is always kept, so values below 1
            still yield one bullet.

    Returns:
        A fallback message when ``contexts`` is empty or no sentence
        survives filtering; otherwise a header line followed by one
        ``- `` bullet per selected sentence.
    """
    if not contexts:
        return "No relevant context found. Please upload a PDF or ask a more specific question."

    # 1) Candidate sentences (aggressive clean).
    candidates: List[str] = []
    for c in contexts[:5]:
        cleaned = _clean_for_summary(c)
        for s in _split_sentences(cleaned):
            # Length and quality filters: keep sentences of 8 to 35 words
            # that the helpers do not flag as tabular or mostly numeric.
            w = s.split()
            if not (8 <= len(w) <= 35):
                continue
            if _tabular_like(s) or _mostly_numeric(s):
                continue
            # Normalization: words joined by exactly one space, so the
            # stored candidate has no leading or trailing whitespace.
            candidates.append(" ".join(w))

    if not candidates:
        return "The document appears largely tabular/numeric; couldn't extract readable sentences."

    # 2) Rank candidates by similarity to the question (best first).
    q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores = (cand_emb @ q_emb.T).ravel()
    order = np.argsort(-scores)

    # 3) Near-duplicate dedup (Jaccard over word sets), threshold 0.82.
    selected: List[str] = []
    for i in order:
        s = candidates[i]
        if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
            continue
        selected.append(s)
        if len(selected) >= max_sentences:
            break

    # No emptiness check is needed here: `candidates` is non-empty and the
    # first ranked candidate is always accepted, because there is nothing
    # in `selected` yet for it to duplicate.

    # 4) Translate to English unless the OUTPUT_LANG environment variable
    #    is set to something other than "en" (it defaults to "en").
    if os.getenv("OUTPUT_LANG", "en").lower() == "en":
        selected = self._translate_to_en(selected)

    bullets = "\n".join(f"- {s}" for s in selected)
    return f"Answer (based on document context):\n{bullets}"
|
201 |
|
202 |
+
def _sim_jaccard(a: str, b: str) -> float:
|
203 |
+
aw = set(a.lower().split())
|
204 |
+
bw = set(b.lower().split())
|
205 |
+
if not aw or not bw:
|
206 |
+
return 0.0
|
207 |
+
return len(aw & bw) / len(aw | bw)
|
208 |
|
209 |
def synthesize_answer(question: str, contexts: List[str]) -> str:
    """Answer *question* from *contexts* via a freshly constructed SimpleRAG.

    Module-level convenience wrapper: a new ``SimpleRAG`` instance is
    created on every call and its ``synthesize_answer`` method is invoked
    with only ``question`` and ``contexts``, leaving any other parameters
    of that method at their defaults.
    """
    rag = SimpleRAG()
    answer = rag.synthesize_answer(question, contexts)
    return answer
|