HamidOmarov committed on
Commit
40a908e
·
1 Parent(s): 88d2e91

Robust RAG: pdfminer fallback, safe last_added, 400 on scanned PDFs, stats & reset endpoints

Browse files
Files changed (2) hide show
  1. app/api.py +43 -16
  2. app/rag_system.py +11 -9
app/api.py CHANGED
@@ -1,14 +1,15 @@
1
  # app/api.py
2
- from typing import List, Optional
3
 
4
- from fastapi import FastAPI, UploadFile, File
 
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from fastapi.responses import JSONResponse, RedirectResponse
7
  from pydantic import BaseModel
8
 
9
- from .rag_system import SimpleRAG, UPLOAD_DIR
10
 
11
- app = FastAPI(title="RAG API", version="1.2.3")
12
 
13
  app.add_middleware(
14
  CORSMiddleware,
@@ -20,7 +21,7 @@ app.add_middleware(
20
 
21
  rag = SimpleRAG()
22
 
23
- # ---------- Models ----------
24
  class UploadResponse(BaseModel):
25
  filename: str
26
  chunks_added: int
@@ -36,7 +37,15 @@ class AskResponse(BaseModel):
36
  class HistoryResponse(BaseModel):
37
  total_chunks: int
38
 
39
- # ---------- Debug ----------
 
 
 
 
 
 
 
 
40
  @app.get("/debug/translate")
41
  def debug_translate():
42
  try:
@@ -48,14 +57,6 @@ def debug_translate():
48
  return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})
49
 
50
  # ---------- Core ----------
51
- @app.get("/")
52
- def root():
53
- return RedirectResponse(url="/docs")
54
-
55
- @app.get("/health")
56
- def health():
57
- return {"status": "ok", "version": app.version, "summarizer": "extractive_en+translate+fallback"}
58
-
59
  @app.post("/upload_pdf", response_model=UploadResponse)
60
  async def upload_pdf(file: UploadFile = File(...)):
61
  dest = UPLOAD_DIR / file.filename
@@ -66,17 +67,43 @@ async def upload_pdf(file: UploadFile = File(...)):
66
  break
67
  f.write(chunk)
68
  added = rag.add_pdf(dest)
 
 
 
69
  return UploadResponse(filename=file.filename, chunks_added=added)
70
 
71
- # app/api.py içində ask_question endpoint
72
  @app.post("/ask_question", response_model=AskResponse)
73
  def ask_question(payload: AskRequest):
74
  hits = rag.search(payload.question, k=max(1, payload.top_k))
75
  contexts = [c for c, _ in hits]
76
- # fallback: (optional) burda da son faylı ötürmək olar; synthesize_answer onsuz da edir:
77
  answer = rag.synthesize_answer(payload.question, contexts)
78
  return AskResponse(answer=answer, contexts=contexts or rag.last_added[:5])
79
 
80
  @app.get("/get_history", response_model=HistoryResponse)
81
  def get_history():
82
  return HistoryResponse(total_chunks=len(rag.chunks))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # app/api.py
2
+ from typing import List
3
 
4
+ import faiss, os
5
+ from fastapi import FastAPI, UploadFile, File, HTTPException
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from fastapi.responses import JSONResponse, RedirectResponse
8
  from pydantic import BaseModel
9
 
10
+ from .rag_system import SimpleRAG, UPLOAD_DIR, INDEX_DIR
11
 
12
+ app = FastAPI(title="RAG API", version="1.3.0")
13
 
14
  app.add_middleware(
15
  CORSMiddleware,
 
21
 
22
  rag = SimpleRAG()
23
 
24
+ # ---------- Schemas ----------
25
  class UploadResponse(BaseModel):
26
  filename: str
27
  chunks_added: int
 
37
  class HistoryResponse(BaseModel):
38
  total_chunks: int
39
 
40
+ # ---------- Utility ----------
41
+ @app.get("/")
42
+ def root():
43
+ return RedirectResponse(url="/docs")
44
+
45
+ @app.get("/health")
46
+ def health():
47
+ return {"status": "ok", "version": app.version, "summarizer": "extractive_en + translate + fallback"}
48
+
49
  @app.get("/debug/translate")
50
  def debug_translate():
51
  try:
 
57
  return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})
58
 
59
  # ---------- Core ----------
 
 
 
 
 
 
 
 
60
  @app.post("/upload_pdf", response_model=UploadResponse)
61
  async def upload_pdf(file: UploadFile = File(...)):
62
  dest = UPLOAD_DIR / file.filename
 
67
  break
68
  f.write(chunk)
69
  added = rag.add_pdf(dest)
70
+ if added == 0:
71
+ # Clear message for scanned/empty PDFs
72
+ raise HTTPException(status_code=400, detail="No extractable text found (likely a scanned image PDF).")
73
  return UploadResponse(filename=file.filename, chunks_added=added)
74
 
 
75
  @app.post("/ask_question", response_model=AskResponse)
76
  def ask_question(payload: AskRequest):
77
  hits = rag.search(payload.question, k=max(1, payload.top_k))
78
  contexts = [c for c, _ in hits]
 
79
  answer = rag.synthesize_answer(payload.question, contexts)
80
  return AskResponse(answer=answer, contexts=contexts or rag.last_added[:5])
81
 
82
  @app.get("/get_history", response_model=HistoryResponse)
83
  def get_history():
84
  return HistoryResponse(total_chunks=len(rag.chunks))
85
+
86
+ @app.get("/stats")
87
+ def stats():
88
+ return {
89
+ "total_chunks": len(rag.chunks),
90
+ "faiss_ntotal": int(getattr(rag.index, "ntotal", 0)),
91
+ "model_dim": int(getattr(rag.index, "d", rag.embed_dim)),
92
+ "last_added_chunks": len(rag.last_added),
93
+ "version": app.version,
94
+ }
95
+
96
+ @app.post("/reset_index")
97
+ def reset_index():
98
+ try:
99
+ rag.index = faiss.IndexFlatIP(rag.embed_dim)
100
+ rag.chunks = []
101
+ rag.last_added = []
102
+ for p in [INDEX_DIR / "faiss.index", INDEX_DIR / "meta.npy"]:
103
+ try:
104
+ os.remove(p)
105
+ except FileNotFoundError:
106
+ pass
107
+ return {"ok": True}
108
+ except Exception as e:
109
+ raise HTTPException(status_code=500, detail=str(e))
app/rag_system.py CHANGED
@@ -32,7 +32,7 @@ def _split_sentences(text: str) -> List[str]:
32
  return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]
33
 
34
  def _mostly_numeric(s: str) -> bool:
35
- alnum = [c for c in s if c.isalnum()]
36
  if not alnum:
37
  return True
38
  digits = sum(c.isdigit() for c in alnum)
@@ -40,7 +40,7 @@ def _mostly_numeric(s: str) -> bool:
40
 
41
  def _tabular_like(s: str) -> bool:
42
  hits = len(NUM_TOK_RE.findall(s))
43
- return hits >= 4 or len(s) < 15 # daha səxavətli
44
 
45
  def _clean_for_summary(text: str) -> str:
46
  out = []
@@ -69,6 +69,7 @@ def _non_ascii_ratio(s: str) -> float:
69
  def _keyword_summary_en(contexts: List[str]) -> List[str]:
70
  text = " ".join(contexts).lower()
71
  bullets: List[str] = []
 
72
  def add(b: str):
73
  if b not in bullets:
74
  bullets.append(b)
@@ -116,7 +117,7 @@ class SimpleRAG:
116
  self._translator = None # lazy
117
  self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
118
  self.chunks: List[str] = []
119
- self.last_added: List[str] = [] # son yüklənən faylın parçaları (RAM)
120
  self._load()
121
 
122
  def _load(self) -> None:
@@ -171,9 +172,11 @@ class SimpleRAG:
171
 
172
  def add_pdf(self, pdf_path: Path) -> int:
173
  texts = self._pdf_to_texts(pdf_path)
174
- self.last_added = texts[:] # son faylı yadda saxla (summarize fallback üçün)
175
  if not texts:
 
176
  return 0
 
 
177
  emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
178
  self.index.add(emb.astype(np.float32))
179
  self.chunks.extend(texts)
@@ -210,11 +213,10 @@ class SimpleRAG:
210
  return texts
211
 
212
  def _prepare_contexts(self, question: str, contexts: List[str]) -> List[str]:
213
- # Generik sual ya boş axtarış halında: son yüklənən fayldan istifadə et
214
- generic = (len(question.split()) <= 5) or bool(GENERIC_Q_RE.search(question or ""))
215
  if (not contexts or generic) and self.last_added:
216
- base = self.last_added[:5]
217
- return base
218
  return contexts
219
 
220
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
@@ -240,7 +242,7 @@ class SimpleRAG:
240
  w = s.split()
241
  if not (6 <= len(w) <= 60):
242
  continue
243
- # tam cümlə tələbi (ya düzgün sonlu durğu, ya da kifayət qədər uzunluq)
244
  if not re.search(r"[.!?](?:[\"'])?$", s) and len(w) < 18:
245
  continue
246
  if _tabular_like(s) or _mostly_numeric(s):
 
32
  return [s.strip() for s in re.split(r'(?<=[.!?])\s+|[\r\n]+', text) if s.strip()]
33
 
34
  def _mostly_numeric(s: str) -> bool:
35
+ alnum = [c for c in s if s and c.isalnum()]
36
  if not alnum:
37
  return True
38
  digits = sum(c.isdigit() for c in alnum)
 
40
 
41
  def _tabular_like(s: str) -> bool:
42
  hits = len(NUM_TOK_RE.findall(s))
43
+ return hits >= 4 or len(s) < 15
44
 
45
  def _clean_for_summary(text: str) -> str:
46
  out = []
 
69
  def _keyword_summary_en(contexts: List[str]) -> List[str]:
70
  text = " ".join(contexts).lower()
71
  bullets: List[str] = []
72
+
73
  def add(b: str):
74
  if b not in bullets:
75
  bullets.append(b)
 
117
  self._translator = None # lazy
118
  self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
119
  self.chunks: List[str] = []
120
+ self.last_added: List[str] = []
121
  self._load()
122
 
123
  def _load(self) -> None:
 
172
 
173
  def add_pdf(self, pdf_path: Path) -> int:
174
  texts = self._pdf_to_texts(pdf_path)
 
175
  if not texts:
176
+ # IMPORTANT: do NOT clobber last_added if this PDF had no extractable text
177
  return 0
178
+
179
+ self.last_added = texts[:] # only set if we actually extracted text
180
  emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
181
  self.index.add(emb.astype(np.float32))
182
  self.chunks.extend(texts)
 
213
  return texts
214
 
215
  def _prepare_contexts(self, question: str, contexts: List[str]) -> List[str]:
216
+ # Generic question or empty search: use last uploaded file snippets
217
+ generic = (len((question or "").split()) <= 5) or bool(GENERIC_Q_RE.search(question or ""))
218
  if (not contexts or generic) and self.last_added:
219
+ return self.last_added[:5]
 
220
  return contexts
221
 
222
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
 
242
  w = s.split()
243
  if not (6 <= len(w) <= 60):
244
  continue
245
+ # full sentence requirement: punctuation at end OR sufficiently long
246
  if not re.search(r"[.!?](?:[\"'])?$", s) and len(w) < 18:
247
  continue
248
  if _tabular_like(s) or _mostly_numeric(s):