HamidOmarov committed on
Commit
a46e32d
·
1 Parent(s): 0d2dbdc

Aggressive tabular filtering + az->en translation (Helsinki-NLP)

Browse files
Files changed (2) hide show
  1. app/rag_system.py +49 -23
  2. requirements.txt +3 -0
app/rag_system.py CHANGED
@@ -10,7 +10,6 @@ import numpy as np
10
  from pypdf import PdfReader
11
  from sentence_transformers import SentenceTransformer
12
 
13
- # Paths & caches
14
  ROOT_DIR = Path(__file__).resolve().parent.parent
15
  DATA_DIR = ROOT_DIR / "data"
16
  UPLOAD_DIR = DATA_DIR / "uploads"
@@ -21,36 +20,40 @@ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
21
 
22
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
23
 
 
 
 
 
 
 
24
  def _split_sentences(text: str) -> List[str]:
25
- # Split by sentence end or newlines
26
  return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
27
 
28
  def _mostly_numeric(s: str) -> bool:
 
29
  alnum = [c for c in s if c.isalnum()]
30
  if not alnum:
31
  return True
32
  digits = sum(c.isdigit() for c in alnum)
33
- return digits / len(alnum) > 0.5
 
 
 
 
 
34
 
35
  def _clean_for_summary(text: str) -> str:
36
- # Drop lines that are mostly numbers / too short
37
  lines = []
38
  for ln in text.splitlines():
39
  t = " ".join(ln.split())
40
- if len(t) < 10:
41
  continue
42
- if _mostly_numeric(t):
43
  continue
44
  lines.append(t)
45
  return " ".join(lines)
46
 
47
  class SimpleRAG:
48
- """
49
- - PDF -> text chunking
50
- - Sentence-Transformers embeddings (cosine/IP)
51
- - FAISS index
52
- - Extractive answer in EN
53
- """
54
  def __init__(
55
  self,
56
  index_path: Path = INDEX_DIR / "faiss.index",
@@ -66,10 +69,33 @@ class SimpleRAG:
66
  self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
67
  self.embed_dim = self.model.get_sentence_embedding_dimension()
68
 
 
 
 
69
  self.index: faiss.Index = None # type: ignore
70
  self.chunks: List[str] = []
71
  self._load()
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def _load(self) -> None:
74
  if self.meta_path.exists():
75
  try:
@@ -127,30 +153,28 @@ class SimpleRAG:
127
  out.append((self.chunks[idx], float(score)))
128
  return out
129
 
130
- # -------- Improved English answer --------
131
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 5) -> str:
132
  if not contexts:
133
  return "No relevant context found. Please upload a PDF or ask a more specific question."
134
 
135
- # Prepare candidate sentences
136
  candidates: List[str] = []
137
  for c in contexts[:5]:
138
  cleaned = _clean_for_summary(c)
139
  for s in _split_sentences(cleaned):
140
- if 20 <= len(s) <= 240 and not _mostly_numeric(s):
141
  candidates.append(s)
142
 
143
- # Fallback if still nothing
144
  if not candidates:
145
- return "The document appears to be mostly tabular/numeric; no clear sentences to summarize."
146
 
147
- # Rank candidates by cosine similarity to the question
148
  q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
149
  cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
150
  scores = (cand_emb @ q_emb.T).ravel()
151
  order = np.argsort(-scores)
152
 
153
- # Pick top sentences with simple de-dup
154
  selected: List[str] = []
155
  seen = set()
156
  for i in order:
@@ -163,12 +187,14 @@ class SimpleRAG:
163
  if len(selected) >= max_sentences:
164
  break
165
 
166
- bullet = "\n".join(f"- {s}" for s in selected)
167
- note = " (The PDF seems largely tabular; extracted the most relevant lines.)" if all(_mostly_numeric(c) for c in contexts) else ""
168
- return f"Answer (based on document context):\n{bullet}{note}"
 
 
 
169
 
170
 
171
- # Module-level alias
172
  def synthesize_answer(question: str, contexts: List[str]) -> str:
173
  return SimpleRAG().synthesize_answer(question, contexts)
174
 
 
10
  from pypdf import PdfReader
11
  from sentence_transformers import SentenceTransformer
12
 
 
13
  ROOT_DIR = Path(__file__).resolve().parent.parent
14
  DATA_DIR = ROOT_DIR / "data"
15
  UPLOAD_DIR = DATA_DIR / "uploads"
 
20
 
21
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
22
 
23
# Output language – keep "en" for English output (default: en)
OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

# --- utility helpers ---
# Matches numbers (incl. decimal comma/point), percent signs, area units,
# currency codes, and table words ("set", "mt") — used to spot tabular lines.
NUM_PAT = re.compile(r"(\d+([.,]\d+)?|%|m²|AZN|usd|eur|\bset\b|\bmt\b)", re.IGNORECASE)
28
+
29
  def _split_sentences(text: str) -> List[str]:
 
30
  return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
31
 
32
  def _mostly_numeric(s: str) -> bool:
33
+ # daha aqressiv threshold
34
  alnum = [c for c in s if c.isalnum()]
35
  if not alnum:
36
  return True
37
  digits = sum(c.isdigit() for c in alnum)
38
+ return digits / max(1, len(alnum)) > 0.3
39
+
40
def _tabular_like(s: str) -> bool:
    """Return True for lines to discard: rich in numbers/measurements/currency,
    containing a page marker, or too short to be a real sentence."""
    numeric_hits = len(NUM_PAT.findall(s))
    if numeric_hits >= 2:
        return True
    return "Page" in s or len(s) < 20
44
 
45
def _clean_for_summary(text: str) -> str:
    """Collapse whitespace line by line and drop blank, mostly-numeric,
    or table-looking lines; return the survivors joined with spaces."""
    kept = []
    for raw_line in text.splitlines():
        collapsed = " ".join(raw_line.split())
        if collapsed and not (_mostly_numeric(collapsed) or _tabular_like(collapsed)):
            kept.append(collapsed)
    return " ".join(kept)
55
 
56
  class SimpleRAG:
 
 
 
 
 
 
57
  def __init__(
58
  self,
59
  index_path: Path = INDEX_DIR / "faiss.index",
 
69
  self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
70
  self.embed_dim = self.model.get_sentence_embedding_dimension()
71
 
72
+ # translator lazy-load
73
+ self._translator = None
74
+
75
  self.index: faiss.Index = None # type: ignore
76
  self.chunks: List[str] = []
77
  self._load()
78
 
79
+ # ---- translator (az->en) ----
80
+ def _translate_to_en(self, texts: List[str]) -> List[str]:
81
+ if OUTPUT_LANG != "en" or not texts:
82
+ return texts
83
+ try:
84
+ if self._translator is None:
85
+ from transformers import pipeline
86
+ # Helsinki-NLP az->en
87
+ self._translator = pipeline(
88
+ "translation",
89
+ model="Helsinki-NLP/opus-mt-az-en",
90
+ cache_dir=str(self.cache_dir),
91
+ device=-1,
92
+ )
93
+ outs = self._translator(texts, max_length=400)
94
+ return [o["translation_text"] for o in outs]
95
+ except Exception:
96
+ # tərcümə alınmasa, orijinalı qaytar
97
+ return texts
98
+
99
  def _load(self) -> None:
100
  if self.meta_path.exists():
101
  try:
 
153
  out.append((self.chunks[idx], float(score)))
154
  return out
155
 
 
156
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 5) -> str:
157
  if not contexts:
158
  return "No relevant context found. Please upload a PDF or ask a more specific question."
159
 
160
+ # Candidate sentences (clean + split)
161
  candidates: List[str] = []
162
  for c in contexts[:5]:
163
  cleaned = _clean_for_summary(c)
164
  for s in _split_sentences(cleaned):
165
+ if 40 <= len(s) <= 240 and not _tabular_like(s):
166
  candidates.append(s)
167
 
 
168
  if not candidates:
169
+ return "The document appears largely tabular/numeric; couldn't extract readable sentences."
170
 
171
+ # Rank by similarity
172
  q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
173
  cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
174
  scores = (cand_emb @ q_emb.T).ravel()
175
  order = np.argsort(-scores)
176
 
177
+ # Pick top sentences with dedup by lowercase
178
  selected: List[str] = []
179
  seen = set()
180
  for i in order:
 
187
  if len(selected) >= max_sentences:
188
  break
189
 
190
+ # Translate to EN if needed
191
+ if OUTPUT_LANG == "en":
192
+ selected = self._translate_to_en(selected)
193
+
194
+ bullets = "\n".join(f"- {s}" for s in selected)
195
+ return f"Answer (based on document context):\n{bullets}"
196
 
197
 
 
198
# Lazily created shared instance for the module-level alias below.
_DEFAULT_RAG = None


def synthesize_answer(question: str, contexts: List[str]) -> str:
    """Module-level convenience wrapper around SimpleRAG.synthesize_answer.

    Reuses a single lazily constructed SimpleRAG instead of rebuilding the
    embedding model and FAISS index on every call (the original constructed
    a fresh SimpleRAG per invocation, which is very expensive).
    """
    global _DEFAULT_RAG
    if _DEFAULT_RAG is None:
        _DEFAULT_RAG = SimpleRAG()
    return _DEFAULT_RAG.synthesize_answer(question, contexts)
200
 
requirements.txt CHANGED
@@ -7,3 +7,6 @@ sentence-transformers
7
  faiss-cpu
8
  pypdf
9
  python-dotenv
 
 
 
 
7
  faiss-cpu
8
  pypdf
9
  python-dotenv
10
+ transformers>=4.40
11
+ sentencepiece
12
+ sacremoses