Merge HF Space updates into local
Files changed:
- app/api.py        +173 -18
- app/rag_system.py +159 -141
app/api.py
CHANGED
@@ -1,16 +1,26 @@
# app/api.py
from __future__ import annotations

import os
import re
from collections import deque
from datetime import datetime, timezone
from time import perf_counter
from typing import List, Optional, Dict, Any

import faiss
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, RedirectResponse
from pydantic import BaseModel, Field

from .rag_system import SimpleRAG, UPLOAD_DIR, INDEX_DIR

__version__ = "1.3.1"

app = FastAPI(title="RAG API", version=__version__)

# CORS (for the Streamlit UI)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],

@@ -21,30 +31,127 @@ app.add_middleware(

rag = SimpleRAG()

# -------------------- Schemas --------------------
class UploadResponse(BaseModel):
    filename: str
    chunks_added: int

class AskRequest(BaseModel):
    question: str = Field(..., min_length=1)
    top_k: int = Field(5, ge=1, le=20)

class AskResponse(BaseModel):
    answer: str
    contexts: List[str]

class HistoryItem(BaseModel):
    question: str
    timestamp: str

class HistoryResponse(BaseModel):
    total_chunks: int
    history: List[HistoryItem] = []

# -------------------- Stats (in-memory) --------------------
class StatsStore:
    def __init__(self):
        self.documents_indexed = 0
        self.questions_answered = 0
        self.latencies_ms = deque(maxlen=500)
        self.last7_questions = deque([0] * 7, maxlen=7)  # simple daily counter
        self.history = deque(maxlen=50)

    def add_docs(self, n: int):
        if n > 0:
            self.documents_indexed += int(n)

    def add_question(self, latency_ms: Optional[int] = None, q: Optional[str] = None):
        self.questions_answered += 1
        if latency_ms is not None:
            self.latencies_ms.append(int(latency_ms))
        if len(self.last7_questions) == 7:
            self.last7_questions[0] += 1
        if q:
            self.history.appendleft(
                {"question": q, "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds")}
            )

    @property
    def avg_ms(self) -> int:
        return int(sum(self.latencies_ms) / len(self.latencies_ms)) if self.latencies_ms else 0

stats = StatsStore()

# -------------------- Helpers --------------------
_STOPWORDS = {
    "the","a","an","of","for","and","or","in","on","to","from","with","by","is","are",
    "was","were","be","been","being","at","as","that","this","these","those","it","its",
    "into","than","then","so","such","about","over","per","via","vs","within"
}

def _tokenize(s: str) -> List[str]:
    return [w for w in re.findall(r"[a-zA-Z0-9]+", s.lower()) if w and w not in _STOPWORDS and len(w) > 2]

def _is_generic_answer(text: str) -> bool:
    if not text:
        return True
    low = text.strip().lower()
    if len(low) < 15:
        return True
    # typical generic patterns
    if "based on document context" in low or "appears to be" in low:
        return True
    return False

def _extractive_fallback(question: str, contexts: List[str], max_chars: int = 600) -> str:
    """Pick sentences from the context based on the question's keywords."""
    if not contexts:
        return "I couldn't find relevant information in the indexed documents for this question."
    qtok = set(_tokenize(question))
    if not qtok:
        return (contexts[0] or "")[:max_chars]

    # split into sentences and score them
    sentences: List[str] = []
    for c in contexts:
        for s in re.split(r"(?<=[\.!\?])\s+|\n+", (c or "").strip()):
            s = s.strip()
            if s:
                sentences.append(s)

    scored: List[tuple[int, str]] = []
    for s in sentences:
        st = set(_tokenize(s))
        scored.append((len(qtok & st), s))
    scored.sort(key=lambda x: (x[0], len(x[1])), reverse=True)

    picked: List[str] = []
    for sc, s in scored:
        if sc <= 0 and picked:
            break
        if len((" ".join(picked) + " " + s).strip()) > max_chars:
            break
        picked.append(s)

    if not picked:
        return (contexts[0] or "")[:max_chars]
    bullets = "\n".join(f"- {p}" for p in picked)
    return f"Answer (based on document context):\n{bullets}"

# -------------------- Routes --------------------
@app.get("/")
def root():
    return RedirectResponse(url="/docs")

@app.get("/health")
def health():
    return {
        "status": "ok",
        "version": app.version,
        "summarizer": "extractive_en + translate + keyword_fallback",
        "faiss_ntotal": int(getattr(rag.index, "ntotal", 0)),
        "model_dim": int(getattr(rag.index, "d", rag.embed_dim)),
    }

@app.get("/debug/translate")
def debug_translate():

@@ -56,9 +163,11 @@ def debug_translate():

    except Exception as e:
        return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})

@app.post("/upload_pdf", response_model=UploadResponse)
async def upload_pdf(file: UploadFile = File(...)):
    if not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")

    dest = UPLOAD_DIR / file.filename
    with open(dest, "wb") as f:
        while True:

@@ -66,30 +175,71 @@ async def upload_pdf(file: UploadFile = File(...)):

            if not chunk:
                break
            f.write(chunk)

    added = rag.add_pdf(dest)
    if added == 0:
        raise HTTPException(status_code=400, detail="No extractable text found (likely a scanned image PDF).")

    stats.add_docs(added)
    return UploadResponse(filename=file.filename, chunks_added=added)

@app.post("/ask_question", response_model=AskResponse)
def ask_question(payload: AskRequest):
    q = (payload.question or "").strip()
    if not q:
        raise HTTPException(status_code=400, detail="Missing 'question'.")

    k = max(1, int(payload.top_k))
    t0 = perf_counter()

    # 1) Always search with the question embedding
    try:
        hits = rag.search(q, k=k)  # List[Tuple[text, score]]
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {e}")

    contexts = [c for c, _ in (hits or []) if c] or (getattr(rag, "last_added", [])[:k] if getattr(rag, "last_added", None) else [])

    if not contexts:
        latency_ms = int((perf_counter() - t0) * 1000)
        stats.add_question(latency_ms, q=q)
        return AskResponse(
            answer="I couldn't find relevant information in the indexed documents for this question.",
            contexts=[]
        )

    # 2) Synthesize the answer (rag may be LLM- or rule-based internally)
    try:
        synthesized = (rag.synthesize_answer(q, contexts) or "").strip()
    except Exception:
        synthesized = ""

    # 3) If it looks generic, use the extractive fallback
    if _is_generic_answer(synthesized):
        synthesized = _extractive_fallback(q, contexts, max_chars=600)

    latency_ms = int((perf_counter() - t0) * 1000)
    stats.add_question(latency_ms, q=q)
    return AskResponse(answer=synthesized, contexts=contexts)

@app.get("/get_history", response_model=HistoryResponse)
def get_history():
    return HistoryResponse(
        total_chunks=len(rag.chunks),
        history=[HistoryItem(**h) for h in list(stats.history)]
    )

@app.get("/stats")
def stats_endpoint():
    return {
        "documents_indexed": stats.documents_indexed,
        "questions_answered": stats.questions_answered,
        "avg_ms": stats.avg_ms,
        "last7_questions": list(stats.last7_questions),
        "total_chunks": len(rag.chunks),
        "faiss_ntotal": int(getattr(rag.index, "ntotal", 0)),
        "model_dim": int(getattr(rag.index, "d", rag.embed_dim)),
        "last_added_chunks": len(getattr(rag, "last_added", [])),
        "version": app.version,
    }

@@ -104,6 +254,11 @@ def reset_index():

                os.remove(p)
            except FileNotFoundError:
                pass
        stats.documents_indexed = 0
        stats.questions_answered = 0
        stats.latencies_ms.clear()
        stats.last7_questions = deque([0] * 7, maxlen=7)
        stats.history.clear()
        return {"ok": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
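For orientation, a minimal client sketch against the routes added above; the base URL and the sample file name are assumptions for illustration, not part of the commit.

# client_sketch.py -- hypothetical smoke test for the endpoints above; BASE and sample.pdf are placeholders.
import requests

BASE = "http://localhost:7860"  # assumed local port; use the running Space URL in practice

# Upload a PDF. Non-.pdf names and scanned/empty PDFs are rejected with HTTP 400.
with open("sample.pdf", "rb") as fh:
    r = requests.post(f"{BASE}/upload_pdf",
                      files={"file": ("sample.pdf", fh, "application/pdf")})
    r.raise_for_status()
    print(r.json())  # UploadResponse: {"filename": ..., "chunks_added": ...}

# Ask a question. AskRequest validates top_k into the 1..20 range.
r = requests.post(f"{BASE}/ask_question",
                  json={"question": "What works are listed in the document?", "top_k": 5})
r.raise_for_status()
print(r.json()["answer"])

# Counters kept by the in-memory StatsStore (cleared on restart or by reset_index).
print(requests.get(f"{BASE}/stats").json())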
app/rag_system.py
CHANGED
@@ -1,46 +1,78 @@
# app/rag_system.py
from __future__ import annotations

import os
import re
from pathlib import Path
from typing import List, Tuple, Optional

import faiss
import numpy as np

# -- add near other helpers --
import re

AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")

def _fix_intra_word_spaces(s: str) -> str:
    """Join sequences like 'H Ə F T Ə' -> 'HƏFTƏ' without touching normal words."""
    if not s:
        return s
    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

# Prefer pypdf; fall back to PyPDF2 if needed
try:
    from pypdf import PdfReader
except Exception:
    from PyPDF2 import PdfReader  # type: ignore

from sentence_transformers import SentenceTransformer

# ---------------- Paths & Cache (HF-safe) ----------------
# Writeable base is /app in HF Spaces. Allow ENV overrides.
ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers prefers HF_HOME

for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
    d.mkdir(parents=True, exist_ok=True)

# ---------------- Config ----------------
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

# ---------------- Helpers ----------------
AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")

def _fix_mojibake(s: str) -> str:
    """Fix common UTF-8-as-Latin-1 mojibake."""
    if not s:
        return s
    if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
        try:
            return s.encode("latin-1", "ignore").decode("utf-8", "ignore")
        except Exception:
            return s
    return s

def _split_sentences(text: str) -> List[str]:
    return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

def _mostly_numeric(s: str) -> bool:
    alnum = [c for c in s if c.isalnum()]
    if not alnum:
        return True
    digits = sum(c.isdigit() for c in alnum)
    return digits / max(1, len(alnum)) > 0.3

NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

def _tabular_like(s: str) -> bool:
    hits = len(NUM_TOKEN_RE.findall(s))
    return hits >= 2 or "Page" in s or len(s) < 20

def _clean_for_summary(text: str) -> str:
    out = []

@@ -58,46 +90,23 @@ def _sim_jaccard(a: str, b: str) -> float:

        return 0.0
    return len(aw & bw) / len(aw | bw)

STOPWORDS = {
    "the","a","an","and","or","of","to","in","on","for","with","by",
    "this","that","these","those","is","are","was","were","be","been","being",
    "at","as","it","its","from","into","about","over","after","before","than",
    "such","can","could","should","would","may","might","will","shall"
}

def _keywords(text: str) -> List[str]:
    toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
    return [t for t in toks if t not in STOPWORDS and len(t) > 2]

def _looks_azerbaijani(s: str) -> bool:
    has_az = any(ch in AZ_CHARS for ch in s)
    non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
    return has_az or non_ascii_ratio > 0.15

# ---------------- RAG Core ----------------
class SimpleRAG:
    def __init__(
        self,

@@ -112,14 +121,16 @@ class SimpleRAG:

        self.cache_dir = Path(cache_dir)

        self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
        self.embed_dim = int(self.model.get_sentence_embedding_dimension())

        self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
        self.chunks: List[str] = []
        self.last_added: List[str] = []
        self._translator = None  # lazy init

        self._load()

    # ---------- Persistence ----------
    def _load(self) -> None:
        if self.meta_path.exists():
            try:

@@ -138,56 +149,49 @@ class SimpleRAG:

        faiss.write_index(self.index, str(self.index_path))
        np.save(self.meta_path, np.array(self.chunks, dtype=object))

    # ---------- Utilities ----------
    @property
    def is_empty(self) -> bool:
        return getattr(self.index, "ntotal", 0) == 0 or not self.chunks

    @staticmethod
    def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
        reader = PdfReader(str(pdf_path))
        pages: List[str] = []
        for p in reader.pages:
            t = p.extract_text() or ""
            t = _fix_mojibake(t)
            if t.strip():
                pages.append(t)
        chunks: List[str] = []
        for txt in pages:
            for i in range(0, len(txt), step):
                part = txt[i : i + step].strip()
                if part:
                    chunks.append(part)
        return chunks

    # ---------- Indexing ----------
    def add_pdf(self, pdf_path: Path) -> int:
        texts = self._pdf_to_texts(pdf_path)
        if not texts:
            return 0
        emb = self.model.encode(
            texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
        )
        self.index.add(emb.astype(np.float32))
        self.chunks.extend(texts)
        self.last_added = texts[:]
        self._persist()
        return len(texts)

    # ---------- Search ----------
    def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
        if self.is_empty:
            return []
        q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
        k = max(1, min(int(k or 5), getattr(self.index, "ntotal", 1)))
        D, I = self.index.search(q, k)
        out: List[Tuple[str, float]] = []
        if I.size > 0 and self.chunks:
            for idx, score in zip(I[0], D[0]):

@@ -195,6 +199,7 @@ class SimpleRAG:

                out.append((self.chunks[idx], float(score)))
        return out

    # ---------- Translation (optional) ----------
    def _translate_to_en(self, texts: List[str]) -> List[str]:
        if not texts:
            return texts

@@ -207,78 +212,91 @@ class SimpleRAG:

                cache_dir=str(self.cache_dir),
                device=-1,
            )
            outs = self._translator(texts, max_length=400)
            return [o["translation_text"].strip() for o in outs]
        except Exception:
            return texts

    # ---------- Fallbacks ----------
    def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
        qk = set(_keywords(question))
        if not qk:
            return []
        candidates: List[Tuple[float, str]] = []
        for text in pool[:200]:
            cleaned = _clean_for_summary(text)
            for s in _split_sentences(cleaned):
                if _tabular_like(s) or _mostly_numeric(s):
                    continue
                toks = set(_keywords(s))
                if not toks:
                    continue
                overlap = len(qk & toks)
                if overlap == 0:
                    continue
                length_penalty = max(8, min(40, len(s.split())))
                score = overlap + min(0.5, overlap / length_penalty)
                candidates.append((score, s))
        candidates.sort(key=lambda x: x[0], reverse=True)
        out: List[str] = []
        for _, s in candidates:
            if any(_sim_jaccard(s, t) >= 0.82 for t in out):
                continue
            out.append(s)
            if len(out) >= limit_sentences:
                break
        return out

    # ---------- Answer Synthesis ----------
    def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
        if not contexts and self.is_empty:
            return "No relevant context found. Index is empty — upload a PDF first."

        # Fix mojibake in contexts
        contexts = [_fix_mojibake(c) for c in (contexts or [])]

        # Build candidate sentences from nearby contexts
        local_pool: List[str] = []
        for c in (contexts or [])[:5]:
            cleaned = _clean_for_summary(c)
            for s in _split_sentences(cleaned):
                w = s.split()
                if not (8 <= len(w) <= 35):
                    continue
                if _tabular_like(s) or _mostly_numeric(s):
                    continue
                local_pool.append(" ".join(w))

        selected: List[str] = []
        if local_pool:
            q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
            cand_emb = self.model.encode(local_pool, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
            scores = (cand_emb @ q_emb.T).ravel()
            order = np.argsort(-scores)
            for i in order:
                s = local_pool[i].strip()
                if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
                    continue
                selected.append(s)
                if len(selected) >= max_sentences:
                    break

        if not selected:
            selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

        if not selected:
            return "No readable sentences matched the question. Try a more specific query."

        if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
            selected = self._translate_to_en(selected)

        bullets = "\n".join(f"- {s}" for s in selected)
        return f"Answer (based on document context):\n{bullets}"


# Public API
__all__ = [
    "SimpleRAG",
    "UPLOAD_DIR",
    "INDEX_DIR",
]
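As a usage note, a minimal sketch of driving SimpleRAG directly, without the FastAPI layer; the PDF name is a placeholder, and the index persists to INDEX_DIR through _persist/_load as shown above.

# rag_sketch.py -- hypothetical direct use of SimpleRAG; the PDF path is a placeholder.
from app.rag_system import SimpleRAG, UPLOAD_DIR

rag = SimpleRAG()  # reloads any FAISS index previously persisted to INDEX_DIR

added = rag.add_pdf(UPLOAD_DIR / "sample.pdf")  # number of ~800-char chunks indexed
print("chunks added:", added)

# search() returns (chunk_text, score) pairs; scores are inner products of
# normalized embeddings, i.e. cosine similarity on the IndexFlatIP index.
hits = rag.search("floor expansion joint profile", k=5)
contexts = [text for text, _ in hits]

print(rag.synthesize_answer("What works does the document cover?", contexts))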