HamidOmarov committed on
Commit
ebbe4db
·
1 Parent(s): edc48fd

English UX + extractive summarizer

Browse files
Files changed (2) hide show
  1. app/api.py +2 -2
  2. app/rag_system.py +88 -97
app/api.py CHANGED
@@ -32,12 +32,12 @@ def health():
32
  async def upload_pdf(file: UploadFile = File(...)):
33
  try:
34
  if not file.filename.lower().endswith(".pdf"):
35
- http400("Yalnız PDF faylları qəbul olunur.")
36
  dest = UPLOAD_DIR / file.filename
37
  with dest.open("wb") as f:
38
  shutil.copyfileobj(file.file, f)
39
  chunks_added = rag.add_pdf(dest)
40
- return UploadResponse(filename=file.filename, chunks_added=chunks_added)
41
  except Exception as e:
42
  traceback.print_exc()
43
  return JSONResponse(status_code=500, content={"detail": f"Server xətası: {str(e)}"})
 
32
  async def upload_pdf(file: UploadFile = File(...)):
33
  try:
34
  if not file.filename.lower().endswith(".pdf"):
35
+ http400("Only PDF files are accepted.")
36
  dest = UPLOAD_DIR / file.filename
37
  with dest.open("wb") as f:
38
  shutil.copyfileobj(file.file, f)
39
  chunks_added = rag.add_pdf(dest)
40
+ return JSONResponse(status_code=500, content={"detail": f"Server error: {str(e)}"})
41
  except Exception as e:
42
  traceback.print_exc()
43
  return JSONResponse(status_code=500, content={"detail": f"Server xətası: {str(e)}"})
app/rag_system.py CHANGED
@@ -1,7 +1,7 @@
1
  # app/rag_system.py
2
  from __future__ import annotations
3
 
4
- import os
5
  from pathlib import Path
6
  from typing import List, Tuple
7
 
@@ -10,32 +10,47 @@ import numpy as np
10
  from pypdf import PdfReader
11
  from sentence_transformers import SentenceTransformer
12
 
13
-
14
- # -----------------------------
15
- # Konfiqurasiya & qovluqlar
16
- # -----------------------------
17
  ROOT_DIR = Path(__file__).resolve().parent.parent
18
  DATA_DIR = ROOT_DIR / "data"
19
  UPLOAD_DIR = DATA_DIR / "uploads"
20
  INDEX_DIR = DATA_DIR / "index"
21
-
22
- # HF Spaces-də yazma icazəsi olan cache qovluğu
23
  CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
24
  for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
25
  d.mkdir(parents=True, exist_ok=True)
26
 
27
- # Model adı ENV-dən dəyişdirilə bilər
28
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  class SimpleRAG:
32
  """
33
- Sadə RAG nüvəsi:
34
- - PDF -> mətn parçalama
35
- - Sentence-Transformers embeddings
36
- - FAISS Index (IP / cosine bərabərləşdirilmiş)
37
  """
38
-
39
  def __init__(
40
  self,
41
  index_path: Path = INDEX_DIR / "faiss.index",
@@ -48,39 +63,23 @@ class SimpleRAG:
48
  self.model_name = model_name
49
  self.cache_dir = Path(cache_dir)
50
 
51
- # Model
52
- # cache_folder Spaces-də /.cache icazə xətasının qarşısını alır
53
  self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
54
  self.embed_dim = self.model.get_sentence_embedding_dimension()
55
 
56
- # FAISS index və meta (chunks)
57
  self.index: faiss.Index = None # type: ignore
58
  self.chunks: List[str] = []
59
-
60
  self._load()
61
 
62
- # -----------------------------
63
- # Yükləmə / Saxlama
64
- # -----------------------------
65
  def _load(self) -> None:
66
- # Chunks (meta) yüklə
67
  if self.meta_path.exists():
68
  try:
69
  self.chunks = np.load(self.meta_path, allow_pickle=True).tolist()
70
  except Exception:
71
- # zədələnmişsə sıfırla
72
  self.chunks = []
73
-
74
- # FAISS index yüklə
75
  if self.index_path.exists():
76
  try:
77
  idx = faiss.read_index(str(self.index_path))
78
- # ölçü uyğunluğunu yoxla
79
- if hasattr(idx, "d") and idx.d == self.embed_dim:
80
- self.index = idx
81
- else:
82
- # uyğunsuzluqda sıfırdan qur
83
- self.index = faiss.IndexFlatIP(self.embed_dim)
84
  except Exception:
85
  self.index = faiss.IndexFlatIP(self.embed_dim)
86
  else:
@@ -90,96 +89,88 @@ class SimpleRAG:
90
  faiss.write_index(self.index, str(self.index_path))
91
  np.save(self.meta_path, np.array(self.chunks, dtype=object))
92
 
93
- # -----------------------------
94
- # PDF -> Mətn -> Parçalama
95
- # -----------------------------
96
  @staticmethod
97
  def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
98
  reader = PdfReader(str(pdf_path))
99
- pages_text: List[str] = []
100
- for page in reader.pages:
101
- t = page.extract_text() or ""
102
  if t.strip():
103
- pages_text.append(t)
104
-
105
  chunks: List[str] = []
106
- for txt in pages_text:
107
  for i in range(0, len(txt), step):
108
- chunk = txt[i : i + step].strip()
109
- if chunk:
110
- chunks.append(chunk)
111
  return chunks
112
 
113
- # -----------------------------
114
- # Index-ə əlavə
115
- # -----------------------------
116
  def add_pdf(self, pdf_path: Path) -> int:
117
  texts = self._pdf_to_texts(pdf_path)
118
  if not texts:
119
  return 0
120
-
121
- emb = self.model.encode(
122
- texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
123
- )
124
- # FAISS-ə əlavə
125
  self.index.add(emb.astype(np.float32))
126
- # Meta-ya əlavə
127
  self.chunks.extend(texts)
128
- # Diskə yaz
129
  self._persist()
130
  return len(texts)
131
 
132
- # -----------------------------
133
- # Axtarış
134
- # -----------------------------
135
  def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
136
- if self.index is None:
137
  return []
138
-
139
- q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
140
- # FAISS float32 gözləyir
141
- D, I = self.index.search(q.astype(np.float32), min(k, max(1, self.index.ntotal)))
142
- results: List[Tuple[str, float]] = []
143
-
144
  if I.size > 0 and self.chunks:
145
  for idx, score in zip(I[0], D[0]):
146
  if 0 <= idx < len(self.chunks):
147
- results.append((self.chunks[idx], float(score)))
148
- return results
149
 
150
- # -----------------------------
151
- # Cavab Sinttezi (LLM-siz demo)
152
- # -----------------------------
153
- def synthesize_answer(self, question: str, contexts: List[str]) -> str:
154
  if not contexts:
155
- return "Kontekst tapılmadı. Sualı daha dəqiq verin ya PDF yükləyin."
156
- joined = "\n---\n".join(contexts[:3])
157
- return (
158
- f"Sual: {question}\n\n"
159
- f"Cavab (kontekstdən çıxarış):\n{joined}\n\n"
160
- f"(Qeyd: Demo rejimi — LLM inteqrasiyası üçün sonradan OpenAI/Groq və s. əlavə edilə bilər.)"
161
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
 
164
- # Köhnə import yolunu dəstəkləmək üçün eyni funksiyanı modul səviyyəsində də saxlayırıq
165
- def synthesize_answer(question: str, contexts: List[str]) -> str:
166
- if not contexts:
167
- return "Kontekst tapılmadı. Sualı daha dəqiq verin və ya PDF yükləyin."
168
- joined = "\n---\n".join(contexts[:3])
169
- return (
170
- f"Sual: {question}\n\n"
171
- f"Cavab (kontekstdən çıxarış):\n{joined}\n\n"
172
- f"(Qeyd: Demo rejimi — LLM inteqrasiyası üçün sonradan OpenAI/Groq və s. əlavə edilə bilər.)"
173
- )
174
-
175
-
176
- # Faylı import edən tərəfin rahatlığı üçün bu qovluqları export edirik
177
- __all__ = [
178
- "SimpleRAG",
179
- "synthesize_answer",
180
- "DATA_DIR",
181
- "UPLOAD_DIR",
182
- "INDEX_DIR",
183
- "CACHE_DIR",
184
- "MODEL_NAME",
185
- ]
 
1
  # app/rag_system.py
2
  from __future__ import annotations
3
 
4
+ import os, re
5
  from pathlib import Path
6
  from typing import List, Tuple
7
 
 
10
  from pypdf import PdfReader
11
  from sentence_transformers import SentenceTransformer
12
 
13
+ # Paths & caches
 
 
 
14
  ROOT_DIR = Path(__file__).resolve().parent.parent
15
  DATA_DIR = ROOT_DIR / "data"
16
  UPLOAD_DIR = DATA_DIR / "uploads"
17
  INDEX_DIR = DATA_DIR / "index"
 
 
18
  CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
19
  for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
20
  d.mkdir(parents=True, exist_ok=True)
21
 
 
22
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
23
 
24
+ def _split_sentences(text: str) -> List[str]:
25
+ # Split by sentence end or newlines
26
+ return [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+|[\r\n]+', text) if s.strip()]
27
+
28
+ def _mostly_numeric(s: str) -> bool:
29
+ alnum = [c for c in s if c.isalnum()]
30
+ if not alnum:
31
+ return True
32
+ digits = sum(c.isdigit() for c in alnum)
33
+ return digits / len(alnum) > 0.5
34
+
35
def _clean_for_summary(text: str) -> str:
    """Collapse *text* into one line, keeping only prose-like source lines.

    Each line has its internal whitespace squeezed to single spaces. Lines
    shorter than 10 characters after squeezing, and lines that
    ``_mostly_numeric`` flags, are discarded. Survivors are joined with a
    single space.
    """
    squeezed = (" ".join(raw.split()) for raw in text.splitlines())
    kept = [line for line in squeezed if len(line) >= 10 and not _mostly_numeric(line)]
    return " ".join(kept)
46
 
47
  class SimpleRAG:
48
  """
49
+ - PDF -> text chunking
50
+ - Sentence-Transformers embeddings (cosine/IP)
51
+ - FAISS index
52
+ - Extractive answer in EN
53
  """
 
54
  def __init__(
55
  self,
56
  index_path: Path = INDEX_DIR / "faiss.index",
 
63
  self.model_name = model_name
64
  self.cache_dir = Path(cache_dir)
65
 
 
 
66
  self.model = SentenceTransformer(self.model_name, cache_folder=str(self.cache_dir))
67
  self.embed_dim = self.model.get_sentence_embedding_dimension()
68
 
 
69
  self.index: faiss.Index = None # type: ignore
70
  self.chunks: List[str] = []
 
71
  self._load()
72
 
 
 
 
73
  def _load(self) -> None:
 
74
  if self.meta_path.exists():
75
  try:
76
  self.chunks = np.load(self.meta_path, allow_pickle=True).tolist()
77
  except Exception:
 
78
  self.chunks = []
 
 
79
  if self.index_path.exists():
80
  try:
81
  idx = faiss.read_index(str(self.index_path))
82
+ self.index = idx if getattr(idx, "d", None) == self.embed_dim else faiss.IndexFlatIP(self.embed_dim)
 
 
 
 
 
83
  except Exception:
84
  self.index = faiss.IndexFlatIP(self.embed_dim)
85
  else:
 
89
  faiss.write_index(self.index, str(self.index_path))
90
  np.save(self.meta_path, np.array(self.chunks, dtype=object))
91
 
 
 
 
92
  @staticmethod
93
  def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
94
  reader = PdfReader(str(pdf_path))
95
+ pages = []
96
+ for p in reader.pages:
97
+ t = p.extract_text() or ""
98
  if t.strip():
99
+ pages.append(t)
 
100
  chunks: List[str] = []
101
+ for txt in pages:
102
  for i in range(0, len(txt), step):
103
+ part = txt[i:i+step].strip()
104
+ if part:
105
+ chunks.append(part)
106
  return chunks
107
 
 
 
 
108
  def add_pdf(self, pdf_path: Path) -> int:
109
  texts = self._pdf_to_texts(pdf_path)
110
  if not texts:
111
  return 0
112
+ emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
 
 
 
 
113
  self.index.add(emb.astype(np.float32))
 
114
  self.chunks.extend(texts)
 
115
  self._persist()
116
  return len(texts)
117
 
 
 
 
118
  def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
119
+ if self.index is None or self.index.ntotal == 0:
120
  return []
121
+ q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
122
+ D, I = self.index.search(q, min(k, max(1, self.index.ntotal)))
123
+ out: List[Tuple[str, float]] = []
 
 
 
124
  if I.size > 0 and self.chunks:
125
  for idx, score in zip(I[0], D[0]):
126
  if 0 <= idx < len(self.chunks):
127
+ out.append((self.chunks[idx], float(score)))
128
+ return out
129
 
130
+ # -------- Improved English answer --------
131
+ def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 5) -> str:
 
 
132
  if not contexts:
133
+ return "No relevant context found. Please upload a PDF or ask a more specific question."
134
+
135
+ # Prepare candidate sentences
136
+ candidates: List[str] = []
137
+ for c in contexts[:5]:
138
+ cleaned = _clean_for_summary(c)
139
+ for s in _split_sentences(cleaned):
140
+ if 20 <= len(s) <= 240 and not _mostly_numeric(s):
141
+ candidates.append(s)
142
+
143
+ # Fallback if still nothing
144
+ if not candidates:
145
+ return "The document appears to be mostly tabular/numeric; no clear sentences to summarize."
146
+
147
+ # Rank candidates by cosine similarity to the question
148
+ q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
149
+ cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
150
+ scores = (cand_emb @ q_emb.T).ravel()
151
+ order = np.argsort(-scores)
152
+
153
+ # Pick top sentences with simple de-dup
154
+ selected: List[str] = []
155
+ seen = set()
156
+ for i in order:
157
+ s = candidates[i].strip()
158
+ key = s.lower()
159
+ if key in seen:
160
+ continue
161
+ seen.add(key)
162
+ selected.append(s)
163
+ if len(selected) >= max_sentences:
164
+ break
165
+
166
+ bullet = "\n".join(f"- {s}" for s in selected)
167
+ note = " (The PDF seems largely tabular; extracted the most relevant lines.)" if all(_mostly_numeric(c) for c in contexts) else ""
168
+ return f"Answer (based on document context):\n{bullet}{note}"
169
+
170
+
171
+ # Module-level alias
172
def synthesize_answer(question: str, contexts: List[str]) -> str:
    """Module-level convenience wrapper around ``SimpleRAG.synthesize_answer``.

    Constructing ``SimpleRAG`` loads the embedding model and the on-disk
    index, so one instance is created on first use and reused afterwards
    instead of being rebuilt on every call.

    Args:
        question: The user's question.
        contexts: Retrieved passages to summarize.

    Returns:
        The extractive answer produced by the shared ``SimpleRAG`` instance.
    """
    rag = getattr(synthesize_answer, "_rag", None)
    if rag is None:
        rag = SimpleRAG()
        # Cache on the function object so no extra module-level name is needed.
        synthesize_answer._rag = rag
    return rag.synthesize_answer(question, contexts)
174
 
175
 
176
# Public names exported by this module.
__all__ = [
    "SimpleRAG",
    "synthesize_answer",
    "DATA_DIR",
    "UPLOAD_DIR",
    "INDEX_DIR",
    "CACHE_DIR",
    "MODEL_NAME",
]