HamidOmarov committed
Commit a7ef914 · Parent: 70b60a8

Clean Dockerfile; pre-translate paragraphs; add /debug/translate

Files changed (3)
  1. Dockerfile +1 -1
  2. app/api.py +13 -0
  3. app/rag_system.py +6 -6
Dockerfile CHANGED
@@ -14,7 +14,7 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
     && rm -rf /var/lib/apt/lists/*
 
-# Bust pip cache layer when requirements change
+# Bust pip cache when requirements change
 ARG CACHEBUST=20250810
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
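Bumping the CACHEBUST build argument is a standard cache-busting trick: an ARG in scope becomes part of the cache key for every later RUN, so changing its value forces `RUN pip install --no-cache-dir -r requirements.txt` to execute again instead of reusing a stale layer. An illustrative invocation (the image tag is hypothetical, not part of this commit):

docker build --build-arg CACHEBUST=$(date +%Y%m%d) -t rag-app .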
app/api.py CHANGED
@@ -5,6 +5,19 @@ from fastapi.responses import JSONResponse, RedirectResponse
 from pathlib import Path
 import shutil
 import traceback
+# app/api.py (below the imports)
+from fastapi.responses import JSONResponse
+
+@app.get("/debug/translate")
+def debug_translate():
+    try:
+        from transformers import pipeline
+        tr = pipeline("translation", model="Helsinki-NLP/opus-mt-az-en", cache_dir=str(CACHE_DIR), device=-1)
+        out = tr("Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", max_length=80)[0]["translation_text"]
+        return {"ok": True, "example_in": "Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", "example_out": out}
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})
+
 
 from .rag_system import SimpleRAG, UPLOAD_DIR, synthesize_answer as summarize
 from .schemas import AskRequest, AskResponse, UploadResponse, HistoryResponse, HistoryItem
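A quick smoke test for the new endpoint, as a sketch: it assumes the API is already running on localhost:8000 and that requests is installed, neither of which this commit specifies. (The handler itself relies on app and CACHE_DIR being defined earlier in app/api.py.)

import requests  # assumed to be available; any HTTP client works

# The first call downloads the Helsinki-NLP model, so allow a generous timeout.
resp = requests.get("http://localhost:8000/debug/translate", timeout=300)
print(resp.status_code, resp.json())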
app/rag_system.py CHANGED
@@ -114,7 +114,7 @@ class SimpleRAG:
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i:i+step].strip()
+                part = txt[i : i + step].strip()
                 if part:
                     chunks.append(part)
         return chunks
@@ -162,20 +162,20 @@ class SimpleRAG:
         if not contexts:
             return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-        # 1) Clean & keep top contexts
+        # 1) Clean top contexts
         cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
         cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
         if not cleaned_contexts:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 2) Pre-translate paragraphs to EN (if output language is EN)
+        # 2) Pre-translate paragraphs to EN (if target is EN)
         if OUTPUT_LANG == "en":
             try:
                 cleaned_contexts = self._translate_to_en(cleaned_contexts)
             except Exception:
                 pass
 
-        # 3) Split into candidate sentences and filter
+        # 3) Split into sentence candidates & filter
         candidates: List[str] = []
         for para in cleaned_contexts:
             for s in _split_sentences(para):
@@ -189,13 +189,13 @@ class SimpleRAG:
         if not candidates:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 4) Rank by similarity
+        # 4) Rank by similarity to question
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        # 5) Aggressive near-duplicate removal
+        # 5) Aggressive near-duplicate removal (Jaccard >= 0.90)
         selected: List[str] = []
         for i in order:
             s = candidates[i].strip()
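Step 2 calls self._translate_to_en, whose body isn't part of this diff. A minimal sketch of what it plausibly does, reusing the same Helsinki-NLP pipeline the debug endpoint exercises; the lazy-init attribute and max_length are assumptions, not the commit's actual code:

# Hypothetical sketch of _translate_to_en; only the model name and
# cache_dir usage are confirmed by this commit.
def _translate_to_en(self, paragraphs):
    if getattr(self, "_translator", None) is None:  # lazy one-time init (assumed)
        from transformers import pipeline
        self._translator = pipeline(
            "translation",
            model="Helsinki-NLP/opus-mt-az-en",
            cache_dir=str(CACHE_DIR),
            device=-1,  # CPU
        )
    outputs = self._translator(paragraphs, max_length=512)
    return [o["translation_text"] for o in outputs]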
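The diff cuts off inside step 5, so the dedup loop's body isn't visible; the new comment only pins the policy to Jaccard >= 0.90. A self-contained sketch of steps 4-5 under that assumption (the _jaccard helper and the selection cap are illustrative, not the author's code):

import numpy as np

def _jaccard(a: str, b: str) -> float:
    # Token-set Jaccard similarity; 1.0 means identical token sets.
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / max(1, len(ta | tb))

# 4) Embeddings are L2-normalized, so the dot product is cosine similarity.
#    (q_emb, cand_emb, candidates as computed in the method above)
scores = (cand_emb @ q_emb.T).ravel()
order = np.argsort(-scores)  # best-scoring sentences first

# 5) Greedily keep a sentence only if it is not a near-duplicate of one kept.
selected = []
for i in order:
    s = candidates[i].strip()
    if all(_jaccard(s, kept) < 0.90 for kept in selected):
        selected.append(s)
    if len(selected) >= 5:  # cap is an assumption
        break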