HamidOmarov committed on
Commit cf92f2c · verified · 1 Parent(s): 833b4d4

Update app/rag_system.py

Files changed (1):
  1. app/rag_system.py  +10 -13
app/rag_system.py CHANGED
@@ -17,12 +17,14 @@ except Exception:
 
 from sentence_transformers import SentenceTransformer
 
-# ---------------- Paths & Cache ----------------
-ROOT_DIR = Path(__file__).resolve().parent
-DATA_DIR = ROOT_DIR / "data"
-UPLOAD_DIR = DATA_DIR / "uploads"
-INDEX_DIR = DATA_DIR / "index"
-CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
+# ---------------- Paths & Cache (HF-safe) ----------------
+# Writeable base is /app in HF Spaces. Allow ENV overrides.
+ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
+DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
+UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
+INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
+CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers prefers HF_HOME
+
 for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
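Note: every path above is now read through os.getenv, so a Space or a local run can relocate the data and model-cache directories without editing the module. Below is a minimal sketch of how the overrides could be exercised; it is not part of the commit, the /tmp paths are invented, and it assumes the repo root is on sys.path so the module imports as app.rag_system. The variables must be set before the import, since the constants are resolved at module load time.

import os

# Hypothetical override: keep uploads, index and HF cache under a throwaway directory.
os.environ["APP_ROOT"] = "/tmp/rag_app"
os.environ["DATA_DIR"] = "/tmp/rag_app/data"
os.environ["HF_HOME"] = "/tmp/rag_app/.cache"

from app.rag_system import DATA_DIR, CACHE_DIR  # imported after the env vars are set

print(DATA_DIR)   # /tmp/rag_app/data
print(CACHE_DIR)  # /tmp/rag_app/.cache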
 
@@ -45,7 +47,6 @@ def _fix_mojibake(s: str) -> str:
     return s
 
 def _split_sentences(text: str) -> List[str]:
-    # Split on punctuation boundaries and line breaks
     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
 
 def _mostly_numeric(s: str) -> bool:
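
For reference, the splitter breaks on sentence-ending punctuation followed by whitespace, and on any run of line breaks. A quick standalone check of the same regex (the sample text is made up):

import re
from typing import List

def _split_sentences(text: str) -> List[str]:
    # Same one-liner as in the hunk above.
    return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

print(_split_sentences("Hello world. Second line\nThird! Done"))
# ['Hello world.', 'Second line', 'Third!', 'Done']
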
@@ -206,7 +207,6 @@ class SimpleRAG:
 
     # ---------- Fallbacks ----------
     def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
-        """Pick sentences sharing keywords with the question (question-dependent even if dense retrieval is weak)."""
         qk = set(_keywords(question))
         if not qk:
             return []
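
The removed docstring described the intent: when dense retrieval is weak, fall back to sentences that share keywords with the question. An illustrative sketch of that selection idea, not the actual method body (the real code uses the module's _keywords helper and its own scoring):

from typing import List, Set

def keyword_overlap_pick(question_keywords: Set[str], pool: List[str], limit: int = 4) -> List[str]:
    # Score each candidate sentence by how many question keywords it contains.
    scored = []
    for sent in pool:
        words = {w.lower().strip(".,!?") for w in sent.split()}
        overlap = len(question_keywords & words)
        if overlap:
            scored.append((overlap, sent))
    # Highest-overlap sentences first, capped at `limit`.
    scored.sort(key=lambda t: t[0], reverse=True)
    return [sent for _, sent in scored[:limit]]
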
@@ -237,7 +237,6 @@
 
     # ---------- Answer Synthesis ----------
     def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
-        """Extractive summary over retrieved contexts; falls back to keyword selection; EN translation if needed."""
         if not contexts and self.is_empty:
             return "No relevant context found. Index is empty — upload a PDF first."
 
@@ -246,7 +245,7 @@
 
         # Build candidate sentences from nearby contexts
         local_pool: List[str] = []
-        for c in (contexts or [])[:5]:  # keep it light
+        for c in (contexts or [])[:5]:
             cleaned = _clean_for_summary(c)
             for s in _split_sentences(cleaned):
                 w = s.split()
@@ -270,15 +269,13 @@
             if len(selected) >= max_sentences:
                 break
 
-        # Keyword fallback if needed
         if not selected:
             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)
 
         if not selected:
             return "No readable sentences matched the question. Try a more specific query."
 
-        # Translate to EN if looks AZ and OUTPUT_LANG = en
-        if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
+        if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
             selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
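
The translation trigger also changes: instead of the _looks_azerbaijani heuristic, any non-ASCII character in the selected sentences now flags them for translation when OUTPUT_LANG is "en". This is broader than the old check, since any accented or non-Latin text will be routed through _translate_to_en. A small illustration of the new condition (sample strings are invented; the first one is Azerbaijani for "The total amount is 120 AZN."):

selected = ["Ümumi məbləğ 120 AZN təşkil edir.", "Invoice date: 2024-05-01."]
print(any(ord(ch) > 127 for ch in " ".join(selected)))   # True: Ü, ə, ğ are non-ASCII

selected = ["Total amount is 120 AZN.", "Invoice date: 2024-05-01."]
print(any(ord(ch) > 127 for ch in " ".join(selected)))   # False: pure ASCII, no translation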
 