Spaces: Sleeping
Commit · a7ef914
Parent(s): 70b60a8
Clean Dockerfile; pre-translate paragraphs; add /debug/translate
Browse files
- Dockerfile +1 -1
- app/api.py +13 -0
- app/rag_system.py +6 -6
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ WORKDIR /app
|
|
14 |
RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
|
15 |
&& rm -rf /var/lib/apt/lists/*
|
16 |
|
17 |
-
# Bust pip cache
|
18 |
ARG CACHEBUST=20250810
|
19 |
COPY requirements.txt .
|
20 |
RUN pip install --no-cache-dir -r requirements.txt
|
|
|
14 |
RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
|
15 |
&& rm -rf /var/lib/apt/lists/*
|
16 |
|
17 |
+
# Bust pip cache when requirements change
|
18 |
ARG CACHEBUST=20250810
|
19 |
COPY requirements.txt .
|
20 |
RUN pip install --no-cache-dir -r requirements.txt
|
app/api.py
CHANGED
@@ -5,6 +5,19 @@ from fastapi.responses import JSONResponse, RedirectResponse
|
|
5 |
from pathlib import Path
|
6 |
import shutil
|
7 |
import traceback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
from .rag_system import SimpleRAG, UPLOAD_DIR, synthesize_answer as summarize
|
10 |
from .schemas import AskRequest, AskResponse, UploadResponse, HistoryResponse, HistoryItem
|
|
|
5 |
from pathlib import Path
|
6 |
import shutil
|
7 |
import traceback
|
8 |
+
# app/api.py (importların altından)
|
9 |
+
from fastapi.responses import JSONResponse
|
10 |
+
|
11 |
+
@app.get("/debug/translate")
|
12 |
+
def debug_translate():
|
13 |
+
try:
|
14 |
+
from transformers import pipeline
|
15 |
+
tr = pipeline("translation", model="Helsinki-NLP/opus-mt-az-en", cache_dir=str(CACHE_DIR), device=-1)
|
16 |
+
out = tr("Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", max_length=80)[0]["translation_text"]
|
17 |
+
return {"ok": True, "example_in": "Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", "example_out": out}
|
18 |
+
except Exception as e:
|
19 |
+
return JSONResponse(status_code=500, content={"ok": False, "error": str(e)})
|
20 |
+
|
21 |
|
22 |
from .rag_system import SimpleRAG, UPLOAD_DIR, synthesize_answer as summarize
|
23 |
from .schemas import AskRequest, AskResponse, UploadResponse, HistoryResponse, HistoryItem
|
app/rag_system.py
CHANGED
@@ -114,7 +114,7 @@ class SimpleRAG:
|
|
114 |
chunks: List[str] = []
|
115 |
for txt in pages:
|
116 |
for i in range(0, len(txt), step):
|
117 |
-
part = txt[i:i+step].strip()
|
118 |
if part:
|
119 |
chunks.append(part)
|
120 |
return chunks
|
@@ -162,20 +162,20 @@ class SimpleRAG:
|
|
162 |
if not contexts:
|
163 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
164 |
|
165 |
-
# 1) Clean
|
166 |
cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
|
167 |
cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
|
168 |
if not cleaned_contexts:
|
169 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
170 |
|
171 |
-
# 2) Pre-translate paragraphs to EN (if
|
172 |
if OUTPUT_LANG == "en":
|
173 |
try:
|
174 |
cleaned_contexts = self._translate_to_en(cleaned_contexts)
|
175 |
except Exception:
|
176 |
pass
|
177 |
|
178 |
-
# 3) Split into
|
179 |
candidates: List[str] = []
|
180 |
for para in cleaned_contexts:
|
181 |
for s in _split_sentences(para):
|
@@ -189,13 +189,13 @@ class SimpleRAG:
|
|
189 |
if not candidates:
|
190 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
191 |
|
192 |
-
# 4) Rank by similarity
|
193 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
194 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
195 |
scores = (cand_emb @ q_emb.T).ravel()
|
196 |
order = np.argsort(-scores)
|
197 |
|
198 |
-
# 5) Aggressive near-duplicate removal
|
199 |
selected: List[str] = []
|
200 |
for i in order:
|
201 |
s = candidates[i].strip()
|
|
|
114 |
chunks: List[str] = []
|
115 |
for txt in pages:
|
116 |
for i in range(0, len(txt), step):
|
117 |
+
part = txt[i : i + step].strip()
|
118 |
if part:
|
119 |
chunks.append(part)
|
120 |
return chunks
|
|
|
162 |
if not contexts:
|
163 |
return "No relevant context found. Please upload a PDF or ask a more specific question."
|
164 |
|
165 |
+
# 1) Clean top contexts
|
166 |
cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
|
167 |
cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
|
168 |
if not cleaned_contexts:
|
169 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
170 |
|
171 |
+
# 2) Pre-translate paragraphs to EN (if target is EN)
|
172 |
if OUTPUT_LANG == "en":
|
173 |
try:
|
174 |
cleaned_contexts = self._translate_to_en(cleaned_contexts)
|
175 |
except Exception:
|
176 |
pass
|
177 |
|
178 |
+
# 3) Split into sentence candidates & filter
|
179 |
candidates: List[str] = []
|
180 |
for para in cleaned_contexts:
|
181 |
for s in _split_sentences(para):
|
|
|
189 |
if not candidates:
|
190 |
return "The document appears largely tabular/numeric; couldn't extract readable sentences."
|
191 |
|
192 |
+
# 4) Rank by similarity to question
|
193 |
q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
194 |
cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
|
195 |
scores = (cand_emb @ q_emb.T).ravel()
|
196 |
order = np.argsort(-scores)
|
197 |
|
198 |
+
# 5) Aggressive near-duplicate removal (Jaccard >= 0.90)
|
199 |
selected: List[str] = []
|
200 |
for i in order:
|
201 |
s = candidates[i].strip()
|