HamidOmarov committed on
Commit 70b60a8 · 1 Parent(s): a0a7164

Clean Dockerfile; pre-translate contexts to EN before summarizing

Files changed (2)
  1. Dockerfile +7 -14
  2. app/rag_system.py +21 -14
Dockerfile CHANGED
@@ -1,9 +1,4 @@
  FROM python:3.11-slim
- WORKDIR /app
- ARG CACHEBUST=20250810
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
- FROM python:3.11-slim
 
  ENV PYTHONDONTWRITEBYTECODE=1 \
      PYTHONUNBUFFERED=1 \
@@ -15,25 +10,23 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 
  WORKDIR /app
 
+ # System deps
  RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
      && rm -rf /var/lib/apt/lists/*
 
-
+ # Bust pip cache layer when requirements change
+ ARG CACHEBUST=20250810
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
 
+ # App code
  COPY . .
 
- # Cache/data folders
- RUN mkdir -p /app/.cache /app/data/uploads /app/data/index && chmod -R 777 /app/.cache /app/data
+ # Writable caches/data
+ RUN mkdir -p /app/.cache /app/data/uploads /app/data/index \
+     && chmod -R 777 /app/.cache /app/data
 
  ENV PORT=7860
  EXPOSE 7860
 
  CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
-
- COPY . .
- RUN mkdir -p /app/data/uploads /app/data/index
- ENV PORT=7860
- EXPOSE 7860
- CMD ["uvicorn", "app.api:app", "--host", "0.0.0.0", "--port", "7860"]
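Note on the cache-bust argument: changing the value of a Docker ARG invalidates the build cache for the RUN instructions that follow it, so bumping CACHEBUST (e.g. docker build --build-arg CACHEBUST=$(date +%Y%m%d) .) forces the pip install layer to rebuild even when requirements.txt is unchanged; edits to requirements.txt already invalidate that layer on their own via the COPY checksum. The chmod -R 777 on /app/.cache and /app/data keeps those directories writable when the container runs as a non-root user, as it typically does on Hugging Face Spaces.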
 
app/rag_system.py CHANGED
@@ -114,7 +114,7 @@ class SimpleRAG:
          chunks: List[str] = []
          for txt in pages:
              for i in range(0, len(txt), step):
-                 part = txt[i : i + step].strip()
+                 part = txt[i:i+step].strip()
                  if part:
                      chunks.append(part)
          return chunks
@@ -153,7 +153,7 @@ class SimpleRAG:
                  cache_dir=str(self.cache_dir),
                  device=-1,
              )
-             outs = self._translator(texts, max_length=400)
+             outs = self._translator(texts, max_length=800)
              return [o["translation_text"].strip() for o in outs]
          except Exception:
              return texts
@@ -162,11 +162,23 @@ class SimpleRAG:
          if not contexts:
              return "No relevant context found. Please upload a PDF or ask a more specific question."
 
-         # 1) candidates (aggressive clean)
+         # 1) Clean & keep top contexts
+         cleaned_contexts = [_clean_for_summary(c) for c in contexts[:5]]
+         cleaned_contexts = [c for c in cleaned_contexts if len(c) > 40]
+         if not cleaned_contexts:
+             return "The document appears largely tabular/numeric; couldn't extract readable sentences."
+
+         # 2) Pre-translate paragraphs to EN (if output language is EN)
+         if OUTPUT_LANG == "en":
+             try:
+                 cleaned_contexts = self._translate_to_en(cleaned_contexts)
+             except Exception:
+                 pass
+
+         # 3) Split into candidate sentences and filter
          candidates: List[str] = []
-         for c in contexts[:5]:
-             cleaned = _clean_for_summary(c)
-             for s in _split_sentences(cleaned):
+         for para in cleaned_contexts:
+             for s in _split_sentences(para):
                  w = s.split()
                  if not (8 <= len(w) <= 35):
                      continue
@@ -177,17 +189,17 @@ class SimpleRAG:
          if not candidates:
              return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-         # 2) rank by similarity
+         # 4) Rank by similarity
          q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
          cand_emb = self.model.encode(candidates, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
          scores = (cand_emb @ q_emb.T).ravel()
          order = np.argsort(-scores)
 
-         # 3) near-duplicate dedup
+         # 5) Aggressive near-duplicate removal
          selected: List[str] = []
          for i in order:
              s = candidates[i].strip()
-             if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
+             if any(_sim_jaccard(s, t) >= 0.90 for t in selected):
                  continue
              selected.append(s)
              if len(selected) >= max_sentences:
@@ -196,11 +208,6 @@ class SimpleRAG:
          if not selected:
              return "The document appears largely tabular/numeric; couldn't extract readable sentences."
 
-         # 4) translate to EN if needed
-         if OUTPUT_LANG == "en":
-             if any(_looks_azerbaijani(s) for s in selected):
-                 selected = self._translate_to_en(selected)
-
          bullets = "\n".join(f"- {s}" for s in selected)
          return f"Answer (based on document context):\n{bullets}"
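A note on the two parameter changes above: the translator's max_length doubles to 800 presumably because whole cleaned paragraphs, rather than individual short sentences, are now pushed through _translate_to_en before sentence splitting. The dedup threshold also tightens from 0.82 to 0.90, so only near-identical sentences are dropped. _sim_jaccard itself is not part of this diff; the sketch below assumes it is a plain token-set Jaccard similarity and exists only to illustrate what the new cutoff means (the example sentences are invented):

# Assumed shape of the helper referenced in the diff; the real _sim_jaccard
# in app/rag_system.py may differ.
def _sim_jaccard(a: str, b: str) -> float:
    ta, tb = set(a.lower().split()), set(b.lower().split())
    if not ta or not tb:
        return 0.0
    return len(ta & tb) / len(ta | tb)

# Similarity here is ~0.89: the old 0.82 cutoff would have discarded the
# second sentence as a near-duplicate, while the new 0.90 cutoff keeps it.
s1 = "The contractor completed the works on the site in March"
s2 = "The contractor completed the works on the site in March 2024"
print(round(_sim_jaccard(s1, s2), 2))  # 0.89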