HamidOmarov committed
Commit 3e29f58 · 2 Parent(s): de26ec1 1cc2b5b

Merge HF Space updates into local

Files changed (2)
  1. app/api.py +173 -18
  2. app/rag_system.py +159 -141
app/api.py CHANGED
@@ -1,16 +1,26 @@
  # app/api.py
- from typing import List

- import faiss, os
  from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import JSONResponse, RedirectResponse
- from pydantic import BaseModel

  from .rag_system import SimpleRAG, UPLOAD_DIR, INDEX_DIR

- app = FastAPI(title="RAG API", version="1.3.0")

  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
@@ -21,30 +31,127 @@ app.add_middleware(

  rag = SimpleRAG()

- # ---------- Schemas ----------
  class UploadResponse(BaseModel):
      filename: str
      chunks_added: int

  class AskRequest(BaseModel):
-     question: str
-     top_k: int = 5

  class AskResponse(BaseModel):
      answer: str
      contexts: List[str]

  class HistoryResponse(BaseModel):
      total_chunks: int

- # ---------- Utility ----------
  @app.get("/")
  def root():
      return RedirectResponse(url="/docs")

  @app.get("/health")
  def health():
-     return {"status": "ok", "version": app.version, "summarizer": "extractive_en + translate + fallback"}

  @app.get("/debug/translate")
  def debug_translate():
@@ -56,9 +163,11 @@ def debug_translate():
      except Exception as e:
          return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})

- # ---------- Core ----------
  @app.post("/upload_pdf", response_model=UploadResponse)
  async def upload_pdf(file: UploadFile = File(...)):
      dest = UPLOAD_DIR / file.filename
      with open(dest, "wb") as f:
          while True:
@@ -66,30 +175,71 @@ async def upload_pdf(file: UploadFile = File(...)):
              if not chunk:
                  break
              f.write(chunk)
      added = rag.add_pdf(dest)
      if added == 0:
-         # Clear message for scanned/empty PDFs
          raise HTTPException(status_code=400, detail="No extractable text found (likely a scanned image PDF).")
      return UploadResponse(filename=file.filename, chunks_added=added)

  @app.post("/ask_question", response_model=AskResponse)
  def ask_question(payload: AskRequest):
-     hits = rag.search(payload.question, k=max(1, payload.top_k))
-     contexts = [c for c, _ in hits]
-     answer = rag.synthesize_answer(payload.question, contexts)
-     return AskResponse(answer=answer, contexts=contexts or rag.last_added[:5])

  @app.get("/get_history", response_model=HistoryResponse)
  def get_history():
-     return HistoryResponse(total_chunks=len(rag.chunks))

  @app.get("/stats")
- def stats():
      return {
          "total_chunks": len(rag.chunks),
          "faiss_ntotal": int(getattr(rag.index, "ntotal", 0)),
          "model_dim": int(getattr(rag.index, "d", rag.embed_dim)),
-         "last_added_chunks": len(rag.last_added),
          "version": app.version,
      }

@@ -104,6 +254,11 @@ def reset_index():
              os.remove(p)
          except FileNotFoundError:
              pass
          return {"ok": True}
      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))
 
  # app/api.py
+ from __future__ import annotations

+ import os
+ import re
+ from collections import deque
+ from datetime import datetime, timezone
+ from time import perf_counter
+ from typing import List, Optional, Dict, Any
+
+ import faiss
  from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import JSONResponse, RedirectResponse
+ from pydantic import BaseModel, Field

  from .rag_system import SimpleRAG, UPLOAD_DIR, INDEX_DIR

+ __version__ = "1.3.1"
+
+ app = FastAPI(title="RAG API", version=__version__)

+ # CORS (for the Streamlit UI)
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
 

  rag = SimpleRAG()

+ # -------------------- Schemas --------------------
  class UploadResponse(BaseModel):
      filename: str
      chunks_added: int

  class AskRequest(BaseModel):
+     question: str = Field(..., min_length=1)
+     top_k: int = Field(5, ge=1, le=20)

  class AskResponse(BaseModel):
      answer: str
      contexts: List[str]

+ class HistoryItem(BaseModel):
+     question: str
+     timestamp: str
+
  class HistoryResponse(BaseModel):
      total_chunks: int
+     history: List[HistoryItem] = []
+
+ # -------------------- Stats (in-memory) --------------------
+ class StatsStore:
+     def __init__(self):
+         self.documents_indexed = 0
+         self.questions_answered = 0
+         self.latencies_ms = deque(maxlen=500)
+         self.last7_questions = deque([0] * 7, maxlen=7)  # simple daily counter
+         self.history = deque(maxlen=50)
+
+     def add_docs(self, n: int):
+         if n > 0:
+             self.documents_indexed += int(n)
+
+     def add_question(self, latency_ms: Optional[int] = None, q: Optional[str] = None):
+         self.questions_answered += 1
+         if latency_ms is not None:
+             self.latencies_ms.append(int(latency_ms))
+         if len(self.last7_questions) == 7:
+             self.last7_questions[0] += 1
+         if q:
+             self.history.appendleft(
+                 {"question": q, "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds")}
+             )
+
+     @property
+     def avg_ms(self) -> int:
+         return int(sum(self.latencies_ms) / len(self.latencies_ms)) if self.latencies_ms else 0
+
+ stats = StatsStore()
+
+ # -------------------- Helpers --------------------
+ _STOPWORDS = {
+     "the","a","an","of","for","and","or","in","on","to","from","with","by","is","are",
+     "was","were","be","been","being","at","as","that","this","these","those","it","its",
+     "into","than","then","so","such","about","over","per","via","vs","within"
+ }
+
+ def _tokenize(s: str) -> List[str]:
+     return [w for w in re.findall(r"[a-zA-Z0-9]+", s.lower()) if w and w not in _STOPWORDS and len(w) > 2]
+
+ def _is_generic_answer(text: str) -> bool:
+     if not text:
+         return True
+     low = text.strip().lower()
+     if len(low) < 15:
+         return True
+     # typical generic patterns
+     if "based on document context" in low or "appears to be" in low:
+         return True
+     return False
+
+ def _extractive_fallback(question: str, contexts: List[str], max_chars: int = 600) -> str:
+     """Pick sentences from the context based on the question's keywords."""
+     if not contexts:
+         return "I couldn't find relevant information in the indexed documents for this question."
+     qtok = set(_tokenize(question))
+     if not qtok:
+         return (contexts[0] or "")[:max_chars]
+
+     # split into sentences and score them
+     sentences: List[str] = []
+     for c in contexts:
+         for s in re.split(r"(?<=[\.!\?])\s+|\n+", (c or "").strip()):
+             s = s.strip()
+             if s:
+                 sentences.append(s)
+
+     scored: List[tuple[int, str]] = []
+     for s in sentences:
+         st = set(_tokenize(s))
+         scored.append((len(qtok & st), s))
+     scored.sort(key=lambda x: (x[0], len(x[1])), reverse=True)
+
+     picked: List[str] = []
+     for sc, s in scored:
+         if sc <= 0 and picked:
+             break
+         if len((" ".join(picked) + " " + s).strip()) > max_chars:
+             break
+         picked.append(s)
+
+     if not picked:
+         return (contexts[0] or "")[:max_chars]
+     bullets = "\n".join(f"- {p}" for p in picked)
+     return f"Answer (based on document context):\n{bullets}"

+ # -------------------- Routes --------------------
  @app.get("/")
  def root():
      return RedirectResponse(url="/docs")

  @app.get("/health")
  def health():
+     return {
+         "status": "ok",
+         "version": app.version,
+         "summarizer": "extractive_en + translate + keyword_fallback",
+         "faiss_ntotal": int(getattr(rag.index, "ntotal", 0)),
+         "model_dim": int(getattr(rag.index, "d", rag.embed_dim)),
+     }

  @app.get("/debug/translate")
  def debug_translate():

      except Exception as e:
          return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})

  @app.post("/upload_pdf", response_model=UploadResponse)
  async def upload_pdf(file: UploadFile = File(...)):
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are allowed.")
+
      dest = UPLOAD_DIR / file.filename
      with open(dest, "wb") as f:
          while True:

              if not chunk:
                  break
              f.write(chunk)
+
      added = rag.add_pdf(dest)
      if added == 0:
          raise HTTPException(status_code=400, detail="No extractable text found (likely a scanned image PDF).")
+
+     stats.add_docs(added)
      return UploadResponse(filename=file.filename, chunks_added=added)

  @app.post("/ask_question", response_model=AskResponse)
  def ask_question(payload: AskRequest):
+     q = (payload.question or "").strip()
+     if not q:
+         raise HTTPException(status_code=400, detail="Missing 'question'.")
+
+     k = max(1, int(payload.top_k))
+     t0 = perf_counter()
+
+     # 1) Always search with the question embedding
+     try:
+         hits = rag.search(q, k=k)  # List[Tuple[text, score]]
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Search failed: {e}")
+
+     contexts = [c for c, _ in (hits or []) if c] or (getattr(rag, "last_added", [])[:k] if getattr(rag, "last_added", None) else [])
+
+     if not contexts:
+         latency_ms = int((perf_counter() - t0) * 1000)
+         stats.add_question(latency_ms, q=q)
+         return AskResponse(
+             answer="I couldn't find relevant information in the indexed documents for this question.",
+             contexts=[]
+         )
+
+     # 2) Synthesize the answer (rag may use an LLM or rule-based logic internally)
+     try:
+         synthesized = (rag.synthesize_answer(q, contexts) or "").strip()
+     except Exception:
+         synthesized = ""
+
+     # 3) If the answer looks generic, use the extractive fallback
+     if _is_generic_answer(synthesized):
+         synthesized = _extractive_fallback(q, contexts, max_chars=600)
+
+     latency_ms = int((perf_counter() - t0) * 1000)
+     stats.add_question(latency_ms, q=q)
+     return AskResponse(answer=synthesized, contexts=contexts)

  @app.get("/get_history", response_model=HistoryResponse)
  def get_history():
+     return HistoryResponse(
+         total_chunks=len(rag.chunks),
+         history=[HistoryItem(**h) for h in list(stats.history)]
+     )

  @app.get("/stats")
+ def stats_endpoint():
      return {
+         "documents_indexed": stats.documents_indexed,
+         "questions_answered": stats.questions_answered,
+         "avg_ms": stats.avg_ms,
+         "last7_questions": list(stats.last7_questions),
          "total_chunks": len(rag.chunks),
          "faiss_ntotal": int(getattr(rag.index, "ntotal", 0)),
          "model_dim": int(getattr(rag.index, "d", rag.embed_dim)),
+         "last_added_chunks": len(getattr(rag, "last_added", [])),
          "version": app.version,
      }

              os.remove(p)
          except FileNotFoundError:
              pass
+         stats.documents_indexed = 0
+         stats.questions_answered = 0
+         stats.latencies_ms.clear()
+         stats.last7_questions = deque([0] * 7, maxlen=7)
+         stats.history.clear()
          return {"ok": True}
      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))
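
For reference, here is a minimal client-side sketch exercising the endpoints touched by this merge; the base URL, port, and sample.pdf are illustrative assumptions, not part of the commit.

# Hypothetical client calls against the merged API (assumes the Space runs
# locally on port 7860 and that sample.pdf exists next to this script).
import requests

BASE = "http://localhost:7860"

# Upload a PDF; after this merge, non-.pdf filenames are rejected with HTTP 400.
with open("sample.pdf", "rb") as fh:
    r = requests.post(f"{BASE}/upload_pdf",
                      files={"file": ("sample.pdf", fh, "application/pdf")})
    r.raise_for_status()
    print(r.json())  # {"filename": "...", "chunks_added": N}

# Ask a question; top_k is now validated to the 1..20 range by Pydantic.
r = requests.post(f"{BASE}/ask_question",
                  json={"question": "What works are listed?", "top_k": 5})
print(r.json()["answer"])

# The new in-memory StatsStore feeds /stats and /get_history.
print(requests.get(f"{BASE}/stats").json())
print(requests.get(f"{BASE}/get_history").json())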
app/rag_system.py CHANGED
@@ -1,46 +1,78 @@
  # app/rag_system.py
  from __future__ import annotations

- import os, re
  from pathlib import Path
- from typing import List, Tuple

  import faiss
  import numpy as np
- from pypdf import PdfReader
  from sentence_transformers import SentenceTransformer

- ROOT_DIR = Path(__file__).resolve().parent.parent
- DATA_DIR = ROOT_DIR / "data"
- UPLOAD_DIR = DATA_DIR / "uploads"
- INDEX_DIR = DATA_DIR / "index"
- CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
  for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
      d.mkdir(parents=True, exist_ok=True)

  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
- NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
- GENERIC_Q_RE = re.compile(
-     r"(what\s+is\s+(it|this|the\s+document)\s+about\??|what\s+is\s+about\??|summary|overview)",
-     re.IGNORECASE,
- )

  def _split_sentences(text: str) -> List[str]:
-     return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]

  def _mostly_numeric(s: str) -> bool:
-     alnum = [c for c in s if s and c.isalnum()]
      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
      return digits / max(1, len(alnum)) > 0.3

  def _tabular_like(s: str) -> bool:
-     hits = len(NUM_TOK_RE.findall(s))
-     return hits >= 4 or len(s) < 15

  def _clean_for_summary(text: str) -> str:
      out = []
@@ -58,46 +90,23 @@ def _sim_jaccard(a: str, b: str) -> float:
          return 0.0
      return len(aw & bw) / len(aw | bw)

  def _looks_azerbaijani(s: str) -> bool:
      has_az = any(ch in AZ_CHARS for ch in s)
      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

- def _non_ascii_ratio(s: str) -> float:
-     return sum(ord(c) > 127 for c in s) / max(1, len(s))
-
- def _keyword_summary_en(contexts: List[str]) -> List[str]:
-     text = " ".join(contexts).lower()
-     bullets: List[str] = []
-
-     def add(b: str):
-         if b not in bullets:
-             bullets.append(b)
-
-     if ("şüşə" in text) or ("ara kəsm" in text) or ("s/q" in text):
-         add("Removal and re-installation of glass partitions in sanitary areas.")
-     if "divar kağız" in text:
-         add("Wallpaper repair or replacement; some areas replaced with plaster and paint.")
-     if ("alçı boya" in text) or ("boya işi" in text) or ("plaster" in text) or ("boya" in text):
-         add("Wall plastering and painting works.")
-     if "seramik" in text or "ceramic" in text:
-         add("Ceramic tiling works (including grouting).")
-     if ("dilatasyon" in text) or ("ar 153" in text) or ("ar153" in text):
-         add("Installation of AR 153–050 floor expansion joint profile with accessories and insulation.")
-     if "daş yunu" in text or "rock wool" in text:
-         add("Rock wool insulation installed where required.")
-     if ("sütunlarda" in text) or ("üzlüyün" in text) or ("cladding" in text):
-         add("Repair of wall cladding on columns.")
-     if ("m²" in text) or ("ədəd" in text) or ("azn" in text) or ("unit price" in text):
-         add("Bill of quantities style lines with unit prices and measures (m², pcs).")
-
-     if not bullets:
-         bullets = [
-             "The document appears to be a bill of quantities or a structured list of works.",
-             "Scope likely includes demolition/reinstallation, finishing (plaster & paint), tiling, and profiles.",
-         ]
-     return bullets[:5]
-
  class SimpleRAG:
      def __init__(
          self,
@@ -112,14 +121,16 @@ class SimpleRAG:
          self.cache_dir = Path(cache_dir)

          self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
-         self.embed_dim = self.model.get_sentence_embedding_dimension()

-         self._translator = None  # lazy
          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
          self._load()

      def _load(self) -> None:
          if self.meta_path.exists():
              try:
@@ -138,56 +149,49 @@ class SimpleRAG:
          faiss.write_index(self.index, str(self.index_path))
          np.save(self.meta_path, np.array(self.chunks, dtype=object))

      @staticmethod
-     def _pdf_to_texts(pdf_path: Path, step: int = 1400) -> List[str]:
-         # 1) pypdf
          pages: List[str] = []
-         try:
-             reader = PdfReader(str(pdf_path))
-             for p in reader.pages:
-                 t = p.extract_text() or ""
-                 if t.strip():
-                     pages.append(t)
-         except Exception:
-             pages = []
-
-         full = " ".join(pages).strip()
-         if not full:
-             # 2) pdfminer fallback
-             try:
-                 from pdfminer.high_level import extract_text as pdfminer_extract_text
-                 full = (pdfminer_extract_text(str(pdf_path)) or "").strip()
-             except Exception:
-                 full = ""
-
-         if not full:
-             return []
-
          chunks: List[str] = []
-         for i in range(0, len(full), step):
-             part = full[i : i + step].strip()
-             if part:
-                 chunks.append(part)
          return chunks

      def add_pdf(self, pdf_path: Path) -> int:
          texts = self._pdf_to_texts(pdf_path)
          if not texts:
-             # IMPORTANT: do NOT clobber last_added if this PDF had no extractable text
              return 0
-
-         self.last_added = texts[:]  # only set if we actually extracted text
-         emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
          self.index.add(emb.astype(np.float32))
          self.chunks.extend(texts)
          self._persist()
          return len(texts)

      def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
-         if self.index is None or self.index.ntotal == 0:
              return []
          q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         D, I = self.index.search(q, min(k, max(1, self.index.ntotal)))
          out: List[Tuple[str, float]] = []
          if I.size > 0 and self.chunks:
              for idx, score in zip(I[0], D[0]):
@@ -195,6 +199,7 @@ class SimpleRAG:
                  out.append((self.chunks[idx], float(score)))
          return out

      def _translate_to_en(self, texts: List[str]) -> List[str]:
          if not texts:
              return texts
@@ -207,78 +212,91 @@ class SimpleRAG:
                  cache_dir=str(self.cache_dir),
                  device=-1,
              )
-             outs = self._translator(texts, max_length=800)
              return [o["translation_text"].strip() for o in outs]
          except Exception:
              return texts

-     def _prepare_contexts(self, question: str, contexts: List[str]) -> List[str]:
-         # Generic question or empty search use last uploaded file snippets
-         generic = (len((question or "").split()) <= 5) or bool(GENERIC_Q_RE.search(question or ""))
-         if (not contexts or generic) and self.last_added:
-             return self.last_added[:5]
-         return contexts

      def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
-         contexts = self._prepare_contexts(question, contexts)
-
-         if not contexts:
-             return "No relevant context found. Please upload a PDF or ask a more specific question."

-         # 1) Clean & keep top contexts
-         cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
-         cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
-         if not cleaned_contexts:
-             bullets = _keyword_summary_en(contexts[:5])
-             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)

-         # 2) Pre-translate paragraphs to EN when target is EN
-         translated = self._translate_to_en(cleaned_contexts) if OUTPUT_LANG == "en" else cleaned_contexts
-
-         # 3) Split into candidate sentences and filter
-         candidates: List[str] = []
-         for para in translated:
-             for s in _split_sentences(para):
                  w = s.split()
-                 if not (6 <= len(w) <= 60):
-                     continue
-                 # full sentence requirement: punctuation at end OR sufficiently long
-                 if not re.search(r"[.!?](?:[\"'])?$", s) and len(w) < 18:
                      continue
                  if _tabular_like(s) or _mostly_numeric(s):
                      continue
-                 candidates.append(" ".join(w))

-         # 4) Fallback if no sentences
-         if not candidates:
-             bullets = _keyword_summary_en(cleaned_contexts)
-             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)

-         # 5) Rank by similarity to the question
-         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         scores = (cand_emb @ q_emb.T).ravel()
-         order = np.argsort(-scores)

-         # 6) Aggressive near-duplicate removal
-         selected: List[str] = []
-         for i in order:
-             s = candidates[i].strip()
-             if any(_sim_jaccard(s, t) >= 0.90 for t in selected):
-                 continue
-             selected.append(s)
-             if len(selected) >= max_sentences:
-                 break

-         # 7) If still looks non-English, use keyword fallback
-         if not selected or (sum(_non_ascii_ratio(s) for s in selected) / len(selected) > 0.10):
-             bullets = _keyword_summary_en(cleaned_contexts)
-             return "Answer (based on document context):\n" + "\n".join(f"- {b}" for b in bullets)

          bullets = "\n".join(f"- {s}" for s in selected)
          return f"Answer (based on document context):\n{bullets}"

- def synthesize_answer(question: str, contexts: List[str]) -> str:
-     return SimpleRAG().synthesize_answer(question, contexts)

- __all__ = ["SimpleRAG", "synthesize_answer", "DATA_DIR", "UPLOAD_DIR", "INDEX_DIR", "CACHE_DIR", "MODEL_NAME"]

  # app/rag_system.py
  from __future__ import annotations

+ import os
+ import re
  from pathlib import Path
+ from typing import List, Tuple, Optional

  import faiss
  import numpy as np
+
+ # -- add near other helpers --
+ import re
+
+ AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
+ _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
+
+ def _fix_intra_word_spaces(s: str) -> str:
+     """Join sequences like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words."""
+     if not s:
+         return s
+     return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)
+
+ # Prefer pypdf; fallback to PyPDF2 if needed
+ try:
+     from pypdf import PdfReader
+ except Exception:
+     from PyPDF2 import PdfReader  # type: ignore
+
  from sentence_transformers import SentenceTransformer

+ # ---------------- Paths & Cache (HF-safe) ----------------
+ # Writeable base is /app in HF Spaces. Allow ENV overrides.
+ ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
+ DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
+ UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
+ INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
+ CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers prefers HF_HOME
+
  for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
      d.mkdir(parents=True, exist_ok=True)

+ # ---------------- Config ----------------
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

+ # ---------------- Helpers ----------------
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+
+ def _fix_mojibake(s: str) -> str:
+     """Fix common UTF-8-as-Latin-1 mojibake."""
+     if not s:
+         return s
+     if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
+         try:
+             return s.encode("latin-1", "ignore").decode("utf-8", "ignore")
+         except Exception:
+             return s
+     return s

  def _split_sentences(text: str) -> List[str]:
+     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

  def _mostly_numeric(s: str) -> bool:
+     alnum = [c for c in s if c.isalnum()]
      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
      return digits / max(1, len(alnum)) > 0.3

+ NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
+
  def _tabular_like(s: str) -> bool:
+     hits = len(NUM_TOKEN_RE.findall(s))
+     return hits >= 2 or "Page" in s or len(s) < 20

  def _clean_for_summary(text: str) -> str:
      out = []

          return 0.0
      return len(aw & bw) / len(aw | bw)

+ STOPWORDS = {
+     "the","a","an","and","or","of","to","in","on","for","with","by",
+     "this","that","these","those","is","are","was","were","be","been","being",
+     "at","as","it","its","from","into","about","over","after","before","than",
+     "such","can","could","should","would","may","might","will","shall"
+ }
+
+ def _keywords(text: str) -> List[str]:
+     toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
+     return [t for t in toks if t not in STOPWORDS and len(t) > 2]
+
  def _looks_azerbaijani(s: str) -> bool:
      has_az = any(ch in AZ_CHARS for ch in s)
      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

+ # ---------------- RAG Core ----------------
  class SimpleRAG:
      def __init__(
          self,

          self.cache_dir = Path(cache_dir)

          self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
+         self.embed_dim = int(self.model.get_sentence_embedding_dimension())

          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
+         self._translator = None  # lazy init
+
          self._load()

+     # ---------- Persistence ----------
      def _load(self) -> None:
          if self.meta_path.exists():
              try:

          faiss.write_index(self.index, str(self.index_path))
          np.save(self.meta_path, np.array(self.chunks, dtype=object))

+     # ---------- Utilities ----------
+     @property
+     def is_empty(self) -> bool:
+         return getattr(self.index, "ntotal", 0) == 0 or not self.chunks
+
      @staticmethod
+     def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
+         reader = PdfReader(str(pdf_path))
          pages: List[str] = []
+         for p in reader.pages:
+             t = p.extract_text() or ""
+             t = _fix_mojibake(t)
+             if t.strip():
+                 pages.append(t)
          chunks: List[str] = []
+         for txt in pages:
+             for i in range(0, len(txt), step):
+                 part = txt[i : i + step].strip()
+                 if part:
+                     chunks.append(part)
          return chunks

+     # ---------- Indexing ----------
      def add_pdf(self, pdf_path: Path) -> int:
          texts = self._pdf_to_texts(pdf_path)
          if not texts:
              return 0
+         emb = self.model.encode(
+             texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
+         )
          self.index.add(emb.astype(np.float32))
          self.chunks.extend(texts)
+         self.last_added = texts[:]
          self._persist()
          return len(texts)

+     # ---------- Search ----------
      def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
+         if self.is_empty:
              return []
          q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+         k = max(1, min(int(k or 5), getattr(self.index, "ntotal", 1)))
+         D, I = self.index.search(q, k)
          out: List[Tuple[str, float]] = []
          if I.size > 0 and self.chunks:
              for idx, score in zip(I[0], D[0]):

                  out.append((self.chunks[idx], float(score)))
          return out

+     # ---------- Translation (optional) ----------
      def _translate_to_en(self, texts: List[str]) -> List[str]:
          if not texts:
              return texts

                  cache_dir=str(self.cache_dir),
                  device=-1,
              )
+             outs = self._translator(texts, max_length=400)
              return [o["translation_text"].strip() for o in outs]
          except Exception:
              return texts

+     # ---------- Fallbacks ----------
+     def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
+         qk = set(_keywords(question))
+         if not qk:
+             return []
+         candidates: List[Tuple[float, str]] = []
+         for text in pool[:200]:
+             cleaned = _clean_for_summary(text)
+             for s in _split_sentences(cleaned):
+                 if _tabular_like(s) or _mostly_numeric(s):
+                     continue
+                 toks = set(_keywords(s))
+                 if not toks:
+                     continue
+                 overlap = len(qk & toks)
+                 if overlap == 0:
+                     continue
+                 length_penalty = max(8, min(40, len(s.split())))
+                 score = overlap + min(0.5, overlap / length_penalty)
+                 candidates.append((score, s))
+         candidates.sort(key=lambda x: x[0], reverse=True)
+         out: List[str] = []
+         for _, s in candidates:
+             if any(_sim_jaccard(s, t) >= 0.82 for t in out):
+                 continue
+             out.append(s)
+             if len(out) >= limit_sentences:
+                 break
+         return out

+     # ---------- Answer Synthesis ----------
      def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
+         if not contexts and self.is_empty:
+             return "No relevant context found. Index is empty — upload a PDF first."

+         # Fix mojibake in contexts
+         contexts = [_fix_mojibake(c) for c in (contexts or [])]

+         # Build candidate sentences from nearby contexts
+         local_pool: List[str] = []
+         for c in (contexts or [])[:5]:
+             cleaned = _clean_for_summary(c)
+             for s in _split_sentences(cleaned):
                  w = s.split()
+                 if not (8 <= len(w) <= 35):
                      continue
                  if _tabular_like(s) or _mostly_numeric(s):
                      continue
+                 local_pool.append(" ".join(w))

+         selected: List[str] = []
+         if local_pool:
+             q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+             cand_emb = self.model.encode(local_pool, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
+             scores = (cand_emb @ q_emb.T).ravel()
+             order = np.argsort(-scores)
+             for i in order:
+                 s = local_pool[i].strip()
+                 if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
+                     continue
+                 selected.append(s)
+                 if len(selected) >= max_sentences:
+                     break

+         if not selected:
+             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

+         if not selected:
+             return "No readable sentences matched the question. Try a more specific query."

+         if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
+             selected = self._translate_to_en(selected)

          bullets = "\n".join(f"- {s}" for s in selected)
          return f"Answer (based on document context):\n{bullets}"

+ # Public API
+ __all__ = [
+     "SimpleRAG",
+     "UPLOAD_DIR",
+     "INDEX_DIR",
+ ]
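
For context, a small sketch of driving the refactored SimpleRAG directly, as the API layer above does; the PDF name is an illustrative assumption and the embedding model is downloaded on first use.

# Illustrative, assumption-laden usage of the refactored SimpleRAG.
from app.rag_system import SimpleRAG, UPLOAD_DIR

rag = SimpleRAG()  # loads the MiniLM embedder and any persisted FAISS index

added = rag.add_pdf(UPLOAD_DIR / "sample.pdf")   # pages are chunked in 800-char steps
print(f"indexed {added} chunks")

hits = rag.search("expansion joint profile", k=5)  # [(chunk_text, score), ...]
for text, score in hits:
    print(round(score, 3), text[:80])

# synthesize_answer ranks candidate sentences against the question and
# falls back to keyword matching over all chunks when nothing qualifies.
print(rag.synthesize_answer("What works are listed?", [t for t, _ in hits]))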