Spaces:
Running
Running
Commit
·
a46e32d
1
Parent(s):
0d2dbdc
Aggressive tabular filtering + az->en translation (Helsinki-NLP)
Browse files
- app/rag_system.py +49 -23
- requirements.txt +3 -0
app/rag_system.py
CHANGED
@@ -10,7 +10,6 @@ import numpy as np
|
|
10 |
from pypdf import PdfReader
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
|
13 |
-
# Paths & caches
|
14 |
ROOT_DIR = Path(__file__).resolve().parent.parent
|
15 |
DATA_DIR = ROOT_DIR / "data"
|
16 |
UPLOAD_DIR = DATA_DIR / "uploads"
|
@@ -21,36 +20,40 @@ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
|
|
21 |
|
22 |
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def _split_sentences(text: str) -> List[str]:
|
25 |
-
# Split by sentence end or newlines
|
26 |
return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
|
27 |
|
28 |
def _mostly_numeric(s: str) -> bool:
|
|
|
29 |
alnum = [c for c in s if c.isalnum()]
|
30 |
if not alnum:
|
31 |
return True
|
32 |
digits = sum(c.isdigit() for c in alnum)
|
33 |
-
return digits / len(alnum) > 0.
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def _clean_for_summary(text: str) -> str:
|
36 |
-
# Drop lines that are mostly numbers / too short
|
37 |
lines = []
|
38 |
for ln in text.splitlines():
|
39 |
t = " ".join(ln.split())
|
40 |
-
if
|
41 |
continue
|
42 |
-
if _mostly_numeric(t):
|
43 |
continue
|
44 |
lines.append(t)
|
45 |
return " ".join(lines)
|
46 |
|
47 |
class SimpleRAG:
|
48 |
-
"""
|
49 |
-
- PDF -> text chunking
|
50 |
-
- Sentence-Transformers embeddings (cosine/IP)
|
51 |
-
- FAISS index
|
52 |
-
- Extractive answer in EN
|
53 |
-
"""
|
54 |
def __init__(
|
55 |
self,
|
56 |
index_path: Path = INDEX_DIR / "faiss.index",
|
@@ -66,10 +69,33 @@ class SimpleRAG:
|
|
66 |
self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
|
67 |
self.embed_dim = self.model.get_sentence_embedding_dimension()
|
68 |
|
|
|
|
|
|
|
69 |
self.index: faiss.Index = None # type: ignore
|
70 |
self.chunks: List[str] = []
|
71 |
self._load()
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
def _load(self) -> None:
|
74 |
if self.meta_path.exists():
|
75 |
try:
|
@@ -127,30 +153,28 @@ class SimpleRAG:
|
|
127 |
out.append((self.chunks[idx], float(score)))
|
128 |
return out
|
129 |
|
130 |
-
# -------- Improved English answer --------
|
131 |
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 5) -> str:
|
132 |
if not contexts:
|
133 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
134 |
|
135 |
-
#
|
136 |
candidates: List[str] = []
|
137 |
for c in contexts[:5]:
|
138 |
cleaned = _clean_for_summary(c)
|
139 |
for s in _split_sentences(cleaned):
|
140 |
-
if
|
141 |
candidates.append(s)
|
142 |
|
143 |
-
# Fallback if still nothing
|
144 |
if not candidates:
|
145 |
-
return "The document appears
|
146 |
|
147 |
-
# Rank
|
148 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
149 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
150 |
scores = (cand_emb @ q_emb.T).ravel()
|
151 |
order = np.argsort(-scores)
|
152 |
|
153 |
-
# Pick top sentences with
|
154 |
selected: List[str] = []
|
155 |
seen = set()
|
156 |
for i in order:
|
@@ -163,12 +187,14 @@ class SimpleRAG:
|
|
163 |
if len(selected) >= max_sentences:
|
164 |
break
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
|
|
|
|
|
|
|
169 |
|
170 |
|
171 |
-
# Module-level alias
|
172 |
def synthesize_answer(question: str, contexts: List[str]) -> str:
|
173 |
return SimpleRAG().synthesize_answer(question, contexts)
|
174 |
|
|
|
10 |
from pypdf import PdfReader
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
|
|
|
13 |
ROOT_DIR = Path(__file__).resolve().parent.parent
|
14 |
DATA_DIR = ROOT_DIR / "data"
|
15 |
UPLOAD_DIR = DATA_DIR / "uploads"
|
|
|
20 |
|
21 |
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
22 |
|
23 |
+
# Output dili – EN üçün "en" saxla (default en)
|
24 |
+
OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
|
25 |
+
|
26 |
+
# --- util funksiyalar ---
|
27 |
+
NUM_PAT = re.compile(r"(\d+([.,]\d+)?|%|m²|AZN|usd|eur|\bset\b|\bmt\b)", re.IGNORECASE)
|
28 |
+
|
29 |
def _split_sentences(text: str) -> List[str]:
|
|
|
30 |
return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
|
31 |
|
32 |
def _mostly_numeric(s: str) -> bool:
|
33 |
+
# daha aqressiv threshold
|
34 |
alnum = [c for c in s if c.isalnum()]
|
35 |
if not alnum:
|
36 |
return True
|
37 |
digits = sum(c.isdigit() for c in alnum)
|
38 |
+
return digits / max(1, len(alnum)) > 0.3
|
39 |
+
|
40 |
+
def _tabular_like(s: str) -> bool:
|
41 |
+
# rəqəmlər/ölçülər/valyuta bol olan sətirləri at
|
42 |
+
hits = len(NUM_PAT.findall(s))
|
43 |
+
return hits >= 2 or "Page" in s or len(s) < 20
|
44 |
|
45 |
def _clean_for_summary(text: str) -> str:
|
|
|
46 |
lines = []
|
47 |
for ln in text.splitlines():
|
48 |
t = " ".join(ln.split())
|
49 |
+
if not t:
|
50 |
continue
|
51 |
+
if _mostly_numeric(t) or _tabular_like(t):
|
52 |
continue
|
53 |
lines.append(t)
|
54 |
return " ".join(lines)
|
55 |
|
56 |
class SimpleRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
def __init__(
|
58 |
self,
|
59 |
index_path: Path = INDEX_DIR / "faiss.index",
|
|
|
69 |
self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
|
70 |
self.embed_dim = self.model.get_sentence_embedding_dimension()
|
71 |
|
72 |
+
# translator lazy-load
|
73 |
+
self._translator = None
|
74 |
+
|
75 |
self.index: faiss.Index = None # type: ignore
|
76 |
self.chunks: List[str] = []
|
77 |
self._load()
|
78 |
|
79 |
+
# ---- translator (az->en) ----
|
80 |
+
def _translate_to_en(self, texts: List[str]) -> List[str]:
|
81 |
+
if OUTPUT_LANG != "en" or not texts:
|
82 |
+
return texts
|
83 |
+
try:
|
84 |
+
if self._translator is None:
|
85 |
+
from transformers import pipeline
|
86 |
+
# Helsinki-NLP az->en
|
87 |
+
self._translator = pipeline(
|
88 |
+
"translation",
|
89 |
+
model="Helsinki-NLP/opus-mt-az-en",
|
90 |
+
cache_dir=str(self.cache_dir),
|
91 |
+
device=-1,
|
92 |
+
)
|
93 |
+
outs = self._translator(texts, max_length=400)
|
94 |
+
return [o["translation_text"] for o in outs]
|
95 |
+
except Exception:
|
96 |
+
# tərcümə alınmasa, orijinalı qaytar
|
97 |
+
return texts
|
98 |
+
|
99 |
def _load(self) -> None:
|
100 |
if self.meta_path.exists():
|
101 |
try:
|
|
|
153 |
out.append((self.chunks[idx], float(score)))
|
154 |
return out
|
155 |
|
|
|
156 |
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 5) -> str:
|
157 |
if not contexts:
|
158 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
159 |
|
160 |
+
# Candidate sentences (clean + split)
|
161 |
candidates: List[str] = []
|
162 |
for c in contexts[:5]:
|
163 |
cleaned = _clean_for_summary(c)
|
164 |
for s in _split_sentences(cleaned):
|
165 |
+
if 40 <= len(s) <= 240 and not _tabular_like(s):
|
166 |
candidates.append(s)
|
167 |
|
|
|
168 |
if not candidates:
|
169 |
+
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
170 |
|
171 |
+
# Rank by similarity
|
172 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
173 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
174 |
scores = (cand_emb @ q_emb.T).ravel()
|
175 |
order = np.argsort(-scores)
|
176 |
|
177 |
+
# Pick top sentences with dedup by lowercase
|
178 |
selected: List[str] = []
|
179 |
seen = set()
|
180 |
for i in order:
|
|
|
187 |
if len(selected) >= max_sentences:
|
188 |
break
|
189 |
|
190 |
+
# Translate to EN if needed
|
191 |
+
if OUTPUT_LANG == "en":
|
192 |
+
selected = self._translate_to_en(selected)
|
193 |
+
|
194 |
+
bullets = "\n".join(f"- {s}" for s in selected)
|
195 |
+
return f"Answer (based on document context):\n{bullets}"
|
196 |
|
197 |
|
|
|
198 |
def synthesize_answer(question: str, contexts: List[str]) -> str:
|
199 |
return SimpleRAG().synthesize_answer(question, contexts)
|
200 |
|
requirements.txt
CHANGED
@@ -7,3 +7,6 @@ sentence-transformers
|
|
7 |
faiss-cpu
|
8 |
pypdf
|
9 |
python-dotenv
|
|
|
|
|
|
|
|
7 |
faiss-cpu
|
8 |
pypdf
|
9 |
python-dotenv
|
10 |
+
transformers>=4.40
|
11 |
+
sentencepiece
|
12 |
+
sacremoses
|