HamidOmarov committed on
Commit 6b6b475 · 1 Parent(s): a5f927b

Fix indentation; stable EN extractive summarizer

Files changed (1)
  1. app/rag_system.py +61 -64
app/rag_system.py CHANGED
@@ -19,18 +19,15 @@ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
     d.mkdir(parents=True, exist_ok=True)
 
 MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
-
-# Output language: keep "en" for EN (default "en")
 OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
 
-# --- utility functions ---
-NUM_PAT = re.compile(r"(\d+([.,]\d+)?|%|m²|AZN|usd|eur|\bset\b|\bmt\b)", re.IGNORECASE)
+AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)
 
 def _split_sentences(text: str) -> List[str]:
     return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
 
 def _mostly_numeric(s: str) -> bool:
-    # more aggressive threshold
     alnum = [c for c in s if c.isalnum()]
     if not alnum:
         return True
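The retooled token pattern is what makes the tabular filter stable: NUM_TOK_RE matches whole numeric/currency/unit tokens, and _tabular_like counts hits per line. An illustrative check (made-up strings, not from the repo):

import re

NUM_TOK_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

print(len(NUM_TOK_RE.findall("Cable tray 40 mt 12.50 AZN")))                      # 4 hits -> tabular
print(len(NUM_TOK_RE.findall("The works shall follow the approved schedule.")))   # 0 hits -> prose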
@@ -38,20 +35,34 @@ def _mostly_numeric(s: str) -> bool:
     return digits / max(1, len(alnum)) > 0.3
 
 def _tabular_like(s: str) -> bool:
-    # drop lines dense with numbers/measurements/currency
-    hits = len(NUM_PAT.findall(s))
+    hits = len(NUM_TOK_RE.findall(s))
     return hits >= 2 or "Page" in s or len(s) < 20
 
 def _clean_for_summary(text: str) -> str:
-    lines = []
+    out = []
     for ln in text.splitlines():
         t = " ".join(ln.split())
-        if not t:
-            continue
-        if _mostly_numeric(t) or _tabular_like(t):
+        if not t or _mostly_numeric(t) or _tabular_like(t):
             continue
-        lines.append(t)
-    return " ".join(lines)
+        out.append(t)
+    return " ".join(out)
+
+def _norm_fingerprint(s: str) -> str:
+    s = s.lower()
+    s = "".join(ch for ch in s if ch.isalpha() or ch.isspace())
+    return " ".join(s.split())
+
+def _sim_jaccard(a: str, b: str) -> float:
+    aw = set(a.lower().split())
+    bw = set(b.lower().split())
+    if not aw or not bw:
+        return 0.0
+    return len(aw & bw) / len(aw | bw)
+
+def _looks_azerbaijani(s: str) -> bool:
+    has_az = any(ch in AZ_CHARS for ch in s)
+    non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
+    return has_az or non_ascii_ratio > 0.15
 
 class SimpleRAG:
     def __init__(
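The module-level _sim_jaccard backs the 0.82 near-duplicate threshold applied later in synthesize_answer. A quick illustration with invented sentences:

a = "the contractor shall complete the works within 30 days"
b = "the contractor shall complete all works within 30 days"
aw, bw = set(a.split()), set(b.split())
print(len(aw & bw) / len(aw | bw))   # ~0.89 -> above 0.82, so the second sentence is dropped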
@@ -69,33 +80,11 @@ class SimpleRAG:
         self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
         self.embed_dim = self.model.get_sentence_embedding_dimension()
 
-        # translator lazy-load
-        self._translator = None
-
-        self.index: faiss.Index = None  # type: ignore
+        self._translator = None  # lazy
+        self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
         self.chunks: List[str] = []
         self._load()
 
-    # ---- translator (az->en) ----
-    def _translate_to_en(self, texts: List[str]) -> List[str]:
-        if OUTPUT_LANG != "en" or not texts:
-            return texts
-        try:
-            if self._translator is None:
-                from transformers import pipeline
-                # Helsinki-NLP az->en
-                self._translator = pipeline(
-                    "translation",
-                    model="Helsinki-NLP/opus-mt-az-en",
-                    cache_dir=str(self.cache_dir),
-                    device=-1,
-                )
-            outs = self._translator(texts, max_length=400)
-            return [o["translation_text"] for o in outs]
-        except Exception:
-            # if translation fails, return the originals
-            return texts
-
     def _load(self) -> None:
         if self.meta_path.exists():
             try:
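Creating faiss.IndexFlatIP(self.embed_dim) eagerly removes the None state the old constructor had to guard; with normalized embeddings, inner product equals cosine similarity. A minimal sketch (random 384-dim vectors, the size all-MiniLM-L6-v2 produces):

import faiss
import numpy as np

index = faiss.IndexFlatIP(384)                    # inner-product index
vecs = np.random.rand(5, 384).astype(np.float32)
faiss.normalize_L2(vecs)                          # unit length, so IP == cosine
index.add(vecs)
scores, ids = index.search(vecs[:1], 3)           # top-3 neighbours of the first vector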
@@ -105,11 +94,10 @@ class SimpleRAG:
         if self.index_path.exists():
             try:
                 idx = faiss.read_index(str(self.index_path))
-                self.index = idx if getattr(idx, "d", None) == self.embed_dim else faiss.IndexFlatIP(self.embed_dim)
+                if getattr(idx, "d", None) == self.embed_dim:
+                    self.index = idx
             except Exception:
-                self.index = faiss.IndexFlatIP(self.embed_dim)
-        else:
-            self.index = faiss.IndexFlatIP(self.embed_dim)
+                pass
 
     def _persist(self) -> None:
         faiss.write_index(self.index, str(self.index_path))
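The reworked _load only adopts the on-disk index when its dimension matches the current embedding model; otherwise the fresh empty index from __init__ stays. The guard in isolation (hypothetical file name):

import faiss

index = faiss.IndexFlatIP(384)                    # fresh fallback from the constructor
try:
    idx = faiss.read_index("faiss.index")         # hypothetical on-disk index
    if idx.d == 384:                              # dimension must match the embedder
        index = idx
except Exception:
    pass                                          # unreadable or missing: keep the empty index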
@@ -118,7 +106,7 @@
     @staticmethod
     def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
         reader = PdfReader(str(pdf_path))
-        pages = []
+        pages: List[str] = []
         for p in reader.pages:
             t = p.extract_text() or ""
             if t.strip():
@@ -126,7 +114,7 @@
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i:i+step].strip()
+                part = txt[i : i + step].strip()
                 if part:
                     chunks.append(part)
         return chunks
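_pdf_to_texts slices each page's text into fixed 800-character windows. The windowing arithmetic in two lines:

text = "x" * 2000
print([len(text[i : i + 800]) for i in range(0, len(text), 800)])   # [800, 800, 400]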
@@ -153,36 +141,52 @@ class SimpleRAG:
             out.append((self.chunks[idx], float(score)))
         return out
 
+    def _translate_to_en(self, texts: List[str]) -> List[str]:
+        if not texts:
+            return texts
+        try:
+            from transformers import pipeline
+            if self._translator is None:
+                self._translator = pipeline(
+                    "translation",
+                    model="Helsinki-NLP/opus-mt-az-en",
+                    cache_dir=str(self.cache_dir),
+                    device=-1,
+                )
+            outs = self._translator(texts, max_length=400)
+            return [o["translation_text"].strip() for o in outs]
+        except Exception:
+            return texts
+
     def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
         if not contexts:
             return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-        # 1) candidate sentences (aggressive clean)
-        candidates: List[str] = []
+        # 1) candidates (aggressive clean)
+        candidates: List[str] = []
         for c in contexts[:5]:
-            cleaned = _clean_for_summary(c)
+            cleaned = _clean_for_summary(c)
             for s in _split_sentences(cleaned):
-                # length and quality filters
                 w = s.split()
                 if not (8 <= len(w) <= 35):
-                    continue
+                    continue
                 if _tabular_like(s) or _mostly_numeric(s):
-                    continue
-                candidates.append(" ".join(w))  # normalization: single spaces
+                    continue
+                candidates.append(" ".join(w))
 
         if not candidates:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 2) ranking by similarity
+        # 2) rank by similarity
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        # 3) near-duplicate dedup (Jaccard word sets), threshold 0.82
+        # 3) near-duplicate dedup
         selected: List[str] = []
         for i in order:
-            s = candidates[i].strip()
+            s = candidates[i].strip()
             if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
                 continue
             selected.append(s)
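Since encode(..., normalize_embeddings=True) yields unit-length rows, cand_emb @ q_emb.T is a column of cosine similarities and argsort(-scores) ranks best-first. In miniature, with 2-dim toy vectors:

import numpy as np

q = np.array([[0.6, 0.8]], dtype=np.float32)                   # unit-length query
cands = np.array([[0.6, 0.8], [1.0, 0.0]], dtype=np.float32)   # unit-length candidates
scores = (cands @ q.T).ravel()
print(scores, np.argsort(-scores))                             # [1.  0.6] [0 1]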
@@ -192,22 +196,15 @@ class SimpleRAG:
         if not selected:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 4) ALWAYS translate to EN (as requested)
-        if os.getenv("OUTPUT_LANG", "en").lower() == "en":
-            selected = self._translate_to_en(selected)
+        # 4) translate to EN if needed
+        if OUTPUT_LANG == "en":
+            if any(_looks_azerbaijani(s) for s in selected):
+                selected = self._translate_to_en(selected)
 
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"
 
-    def _sim_jaccard(a: str, b: str) -> float:
-        aw = set(a.lower().split())
-        bw = set(b.lower().split())
-        if not aw or not bw:
-            return 0.0
-        return len(aw & bw) / len(aw | bw)
-
 def synthesize_answer(question: str, contexts: List[str]) -> str:
     return SimpleRAG().synthesize_answer(question, contexts)
 
-
 __all__ = ["SimpleRAG", "synthesize_answer", "DATA_DIR", "UPLOAD_DIR", "INDEX_DIR", "CACHE_DIR", "MODEL_NAME"]
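End to end, the committed flow can be exercised roughly like this (import path per this repo's layout; the context string is invented):

from app.rag_system import SimpleRAG

rag = SimpleRAG()
contexts = [
    "The contractor shall complete the installation works within thirty days of handover.",
]
print(rag.synthesize_answer("When must the works be completed?", contexts))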
 