Update app/rag_system.py
app/rag_system.py  +10 -13
@@ -17,12 +17,14 @@ except Exception:
 
 from sentence_transformers import SentenceTransformer
 
-# ---------------- Paths & Cache ----------------
-
-
-
-
-
+# ---------------- Paths & Cache (HF-safe) ----------------
+# Writeable base is /app in HF Spaces. Allow ENV overrides.
+ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
+DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
+UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
+INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
+CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers prefers HF_HOME
+
 for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
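The rewritten path block resolves every location from an environment variable with an /app fallback and routes the model cache through HF_HOME. A minimal sketch of how a deployment could redirect these to persistent storage before importing the module; the /data mount point and file names below are assumptions, only the variable names come from the diff:

    import os

    # Redirect data, index, and model cache to a persistent volume (hypothetical paths).
    os.environ.setdefault("APP_ROOT", "/app")
    os.environ.setdefault("DATA_DIR", "/data/rag")
    os.environ.setdefault("UPLOAD_DIR", "/data/rag/uploads")
    os.environ.setdefault("INDEX_DIR", "/data/rag/index")
    os.environ.setdefault("HF_HOME", "/data/hf-cache")  # intended model cache location (CACHE_DIR above)

    # Import after the overrides so the module-level Path constants pick them up.
    from app.rag_system import DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR
    print(DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR)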
@@ -45,7 +47,6 @@ def _fix_mojibake(s: str) -> str:
     return s
 
 def _split_sentences(text: str) -> List[str]:
-    # Split on punctuation boundaries and line breaks
     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
 
 def _mostly_numeric(s: str) -> bool:
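Only the inline comment was dropped from _split_sentences; the regex itself is unchanged. For reference, the same pattern in isolation, with the split it produces on a short sample:

    import re

    def split_sentences(text):
        # Same pattern as in the diff: split after ., ! or ? followed by
        # whitespace, or on any run of line breaks; drop empty pieces.
        return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

    print(split_sentences("Costs rose 5%. Why? See table 3.\nNew section"))
    # ['Costs rose 5%.', 'Why?', 'See table 3.', 'New section']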
@@ -206,7 +207,6 @@ class SimpleRAG:
 
     # ---------- Fallbacks ----------
     def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
-        """Pick sentences sharing keywords with the question (question-dependent even if dense retrieval is weak)."""
         qk = set(_keywords(question))
         if not qk:
             return []
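The docstring removed here was the only in-code description of the fallback: pick sentences that share keywords with the question, so the output stays question-dependent even when dense retrieval is weak. A rough standalone sketch of that idea; the tokenizer stands in for the project's _keywords helper, whose real implementation is not shown in this diff:

    import re
    from typing import List

    def _keywords(text: str) -> List[str]:
        # Stand-in tokenizer; the real helper may filter stopwords differently.
        return [w.lower() for w in re.findall(r"\w+", text) if len(w) > 2]

    def keyword_fallback(question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
        qk = set(_keywords(question))
        if not qk:
            return []
        # Rank pool sentences by how many question keywords they share, keep the overlapping ones.
        scored = sorted(pool, key=lambda s: len(qk & set(_keywords(s))), reverse=True)
        return [s for s in scored if qk & set(_keywords(s))][:limit_sentences]

    print(keyword_fallback("What drives revenue growth?",
                           ["Revenue growth is driven by exports.", "The office is in Baku."]))
    # ['Revenue growth is driven by exports.']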
@@ -237,7 +237,6 @@ class SimpleRAG:
 
     # ---------- Answer Synthesis ----------
     def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
-        """Extractive summary over retrieved contexts; falls back to keyword selection; EN translation if needed."""
         if not contexts and self.is_empty:
             return "No relevant context found. Index is empty — upload a PDF first."
 
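The removed docstring summarized synthesize_answer's flow: extractive summary over the retrieved contexts, then keyword fallback, then translation to English when needed. A hedged usage sketch; the no-argument constructor and the example contexts are placeholders, since neither appears in this diff:

    from app.rag_system import SimpleRAG

    # Placeholder construction; the real constructor arguments are not shown here.
    rag = SimpleRAG()
    contexts = [
        "Revenue grew 12% in 2023.",
        "Growth was driven mainly by exports.",
    ]
    print(rag.synthesize_answer("What drove revenue growth?", contexts, max_sentences=2))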
@@ -246,7 +245,7 @@
 
         # Build candidate sentences from nearby contexts
         local_pool: List[str] = []
-        for c in (contexts or [])[:5]:
+        for c in (contexts or [])[:5]:
             cleaned = _clean_for_summary(c)
             for s in _split_sentences(cleaned):
                 w = s.split()
@@ -270,15 +269,13 @@
             if len(selected) >= max_sentences:
                 break
 
-        # Keyword fallback if needed
         if not selected:
             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
 
         if not selected:
             return "No readable sentences matched the question. Try a more specific query."
 
-
-        if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
+        if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
             selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
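The last hunk replaces the _looks_azerbaijani test with a plain non-ASCII check before translating, so any non-ASCII character in the selected sentences now triggers translation, not just Azerbaijani-looking text. A small comparison of the two conditions; the character set below is illustrative, not the removed helper's actual logic:

    AZ_CHARS = set("əƏıİöÖüÜçÇşŞğĞ")  # letters typical of Azerbaijani text (illustrative)

    def looks_azerbaijani(s: str) -> bool:
        # Illustrative stand-in for the removed helper.
        return any(ch in AZ_CHARS for ch in s)

    def has_non_ascii(selected) -> bool:
        # New condition from the diff: any non-ASCII character triggers translation.
        return any(ord(ch) > 127 for ch in " ".join(selected))

    samples = [["Gəlir 12% artıb."], ["Revenue grew by 12%."], ["Café revenue grew."]]
    for sel in samples:
        print(sel[0], "| az-like:", any(looks_azerbaijani(s) for s in sel), "| non-ascii:", has_non_ascii(sel))
    # The third sample shows the new check is broader: it also fires on accented Latin text.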