Commit 70b60a8
Parent(s): a0a7164

Clean Dockerfile; pre-translate contexts to EN before summarizing

Files changed:
- Dockerfile (+7 -14)
- app/rag_system.py (+21 -14)
Dockerfile
CHANGED
@@ -1,9 +1,4 @@
 FROM python:3.11-slim
-WORKDIR /app
-ARG CACHEBUST=20250810
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-FROM python:3.11-slim
 
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
@@ -15,25 +10,23 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 
 WORKDIR /app
 
+# System deps
 RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
     && rm -rf /var/lib/apt/lists/*
 
-
+# Bust pip cache layer when requirements change
+ARG CACHEBUST=20250810
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+# App code
 COPY . .
 
-#
-RUN mkdir -p /app/.cache /app/data/uploads /app/data/index
+# Writable caches/data
+RUN mkdir -p /app/.cache /app/data/uploads /app/data/index \
+    && chmod -R 777 /app/.cache /app/data
 
 ENV PORT=7860
 EXPOSE 7860
 
 CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
-
-COPY . .
-RUN mkdir -p /app/data/uploads /app/data/index
-ENV PORT=7860
-EXPOSE 7860
-CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
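Note: besides removing the duplicated build stages, this diff moves ARG CACHEBUST below the system-deps layer (so only the pip layer is busted) and pre-creates /app/.cache and the data directories with chmod 777, which points to a non-root runtime user needing writable locations. A minimal sketch of how application code might resolve such a cache directory; the resolve_cache_dir helper and the HF_HOME fallback are illustrative assumptions, not code from this repo:

import os
from pathlib import Path

def resolve_cache_dir(default: str = "/app/.cache") -> Path:
    """Pick a writable cache directory for model downloads (hypothetical helper)."""
    # Prefer an explicit HF_HOME if the platform sets one; otherwise fall back
    # to the directory this Dockerfile creates and makes world-writable.
    cache_dir = Path(os.environ.get("HF_HOME", default))
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir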
app/rag_system.py
CHANGED
app/rag_system.py
CHANGED
@@ -114,7 +114,7 @@ class SimpleRAG:
         chunks: List[str] = []
         for txt in pages:
             for i in range(0, len(txt), step):
-                part = txt[i
+                part = txt[i:i+step].strip()
                 if part:
                     chunks.append(part)
         return chunks
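The hunk above repairs the truncated slice in the chunker: each page is cut into fixed-size character windows and stripped before being kept. A standalone sketch of the same logic, assuming step is the window size in characters (the real method lives on SimpleRAG and its default step is not shown in this diff):

from typing import List

def chunk_pages(pages: List[str], step: int = 800) -> List[str]:
    """Split page texts into fixed-size character windows (illustrative sketch)."""
    chunks: List[str] = []
    for txt in pages:
        for i in range(0, len(txt), step):
            part = txt[i:i + step].strip()
            if part:  # skip windows that are empty after stripping
                chunks.append(part)
    return chunks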
@@ -153,7 +153,7 @@ class SimpleRAG:
                 cache_dir=str(self.cache_dir),
                 device=-1,
             )
-            outs = self._translator(texts, max_length=
+            outs = self._translator(texts, max_length=800)
             return [o["translation_text"].strip() for o in outs]
         except Exception:
             return texts
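The hunk above restores the truncated generation limit: the translator is called with max_length=800 and the method falls back to the original texts on any error. A hedged sketch of an equivalent standalone helper; the model name is an assumption (the removed code further down checks _looks_azerbaijani, so an az-to-en Marian model is a plausible guess), and cache_dir is routed through model_kwargs here rather than passed directly as in the class:

from typing import List
from transformers import pipeline

def translate_to_en(texts: List[str], cache_dir: str = "/app/.cache") -> List[str]:
    """Translate a batch of paragraphs to English, returning the input on failure."""
    try:
        translator = pipeline(
            "translation",
            model="Helsinki-NLP/opus-mt-az-en",  # assumed model; not shown in this diff
            model_kwargs={"cache_dir": cache_dir},
            device=-1,  # CPU
        )
        outs = translator(texts, max_length=800)
        return [o["translation_text"].strip() for o in outs]
    except Exception:
        # Mirror the class behaviour: never fail the request over translation.
        return texts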
@@ -162,11 +162,23 @@ class SimpleRAG:
         if not contexts:
             return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-        # 1)
+        # 1) Clean & keep top contexts
+        cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
+        cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
+        if not cleaned_contexts:
+            return "The document appears largely tabular/numeric; couldn't extract readable sentences."
+
+        # 2) Pre-translate paragraphs to EN (if output language is EN)
+        if OUTPUT_LANG == "en":
+            try:
+                cleaned_contexts = self._translate_to_en(cleaned_contexts)
+            except Exception:
+                pass
+
+        # 3) Split into candidate sentences and filter
         candidates: List[str] = []
-        for
-
-            for s in _split_sentences(cleaned):
+        for para in cleaned_contexts:
+            for s in _split_sentences(para):
                 w = s.split()
                 if not (8 <= len(w) <= 35):
                     continue
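The rewritten block above now cleans the top retrieved contexts, optionally pre-translates them to English, then splits each paragraph into candidate sentences and keeps only those between 8 and 35 words. _clean_for_summary and _split_sentences are module helpers not shown in this diff; the sketch below illustrates the splitting and length filter with a naive regex splitter standing in for the real helper:

import re
from typing import List

def split_sentences(text: str) -> List[str]:
    """Naive sentence splitter for illustration only; the real _split_sentences may differ."""
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

def candidate_sentences(paragraphs: List[str], min_words: int = 8, max_words: int = 35) -> List[str]:
    """Keep sentences whose word count falls within [min_words, max_words]."""
    out: List[str] = []
    for para in paragraphs:
        for s in split_sentences(para):
            if min_words <= len(s.split()) <= max_words:
                out.append(s)
    return out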
@@ -177,17 +189,17 @@ class SimpleRAG:
         if not candidates:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        #
+        # 4) Rank by similarity
         q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
         scores = (cand_emb @ q_emb.T).ravel()
         order = np.argsort(-scores)
 
-        #
+        # 5) Aggressive near-duplicate removal
         selected: List[str] = []
         for i in order:
             s = candidates[i].strip()
-            if any(_sim_jaccard(s, t) >= 0.
+            if any(_sim_jaccard(s, t) >= 0.90 for t in selected):
                 continue
             selected.append(s)
             if len(selected) >= max_sentences:
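The hunk above restores the near-duplicate filter: candidates are walked in descending similarity order and a sentence is skipped if its Jaccard similarity to any already-selected sentence is 0.90 or higher. _sim_jaccard itself is not shown in the diff; a token-set Jaccard is the usual choice, so the sketch assumes that, and limit stands in for the method's max_sentences argument:

from typing import List

def sim_jaccard(a: str, b: str) -> float:
    """Token-set Jaccard similarity (assumed implementation of _sim_jaccard)."""
    sa, sb = set(a.lower().split()), set(b.lower().split())
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)

def dedupe(ordered: List[str], threshold: float = 0.90, limit: int = 4) -> List[str]:
    """Greedy selection that skips sentences too similar to ones already kept."""
    selected: List[str] = []
    for s in ordered:
        if any(sim_jaccard(s, t) >= threshold for t in selected):
            continue
        selected.append(s)
        if len(selected) >= limit:
            break
    return selected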
@@ -196,11 +208,6 @@ class SimpleRAG:
         if not selected:
             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-        # 4) translate to EN if needed
-        if OUTPUT_LANG == "en":
-            if any(_looks_azerbaijani(s) for s in selected):
-                selected = self._translate_to_en(selected)
-
         bullets = "\n".join(f"- {s}" for s in selected)
         return f"Answer (based on document context):\n{bullets}"