Spaces:
Running
Running
Commit
·
8ea6d3a
1
Parent(s):
7d8f85a
chore: use DATA_DIR (/data) via env; create folders on startup
Browse files- app.py +1 -1
- app/api.py +7 -7
- app/paths.py +6 -0
- app/rag_system.py +11 -11
- app/schemas.py +1 -1
- app/store.py +2 -2
- app/utils.py +1 -1
- main.py +1 -1
app.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
# app.py (repo
|
2 |
from app.api import app # FastAPI instance
|
|
|
1 |
+
# app.py (repo kökündə)
|
2 |
from app.api import app # FastAPI instance
|
app/api.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# app/api.py
|
2 |
from __future__ import annotations
|
3 |
|
4 |
import time
|
@@ -18,7 +18,7 @@ __version__ = "1.3.2"
|
|
18 |
|
19 |
app = FastAPI(title="RAG API", version=__version__)
|
20 |
|
21 |
-
#
|
22 |
app.add_middleware(
|
23 |
CORSMiddleware,
|
24 |
allow_origins=["*"], # tighten if needed
|
@@ -27,7 +27,7 @@ app.add_middleware(
|
|
27 |
allow_headers=["*"],
|
28 |
)
|
29 |
|
30 |
-
#
|
31 |
rag = SimpleRAG()
|
32 |
|
33 |
METRICS: Dict[str, Any] = {
|
@@ -38,7 +38,7 @@ METRICS: Dict[str, Any] = {
|
|
38 |
}
|
39 |
HISTORY: List[Dict[str, Any]] = [] # [{"question":..., "timestamp":...}]
|
40 |
|
41 |
-
#
|
42 |
class UploadResponse(BaseModel):
|
43 |
message: str
|
44 |
filename: str
|
@@ -60,7 +60,7 @@ class HistoryResponse(BaseModel):
|
|
60 |
total_chunks: int
|
61 |
history: List[Dict[str, Any]]
|
62 |
|
63 |
-
#
|
64 |
@app.get("/")
|
65 |
def root():
|
66 |
return RedirectResponse(url="/docs")
|
@@ -78,7 +78,7 @@ def health():
|
|
78 |
@app.get("/debug/translate")
|
79 |
def debug_translate():
|
80 |
"""
|
81 |
-
Simple smoke test for the AZ
|
82 |
"""
|
83 |
try:
|
84 |
from transformers import pipeline # type: ignore
|
@@ -88,7 +88,7 @@ def debug_translate():
|
|
88 |
cache_dir=str(rag.cache_dir),
|
89 |
device=-1,
|
90 |
)
|
91 |
-
out = tr("
|
92 |
return {"ok": True, "example_out": out}
|
93 |
except Exception as e:
|
94 |
return {"ok": False, "error": str(e)}
|
|
|
1 |
+
# app/api.py
|
2 |
from __future__ import annotations
|
3 |
|
4 |
import time
|
|
|
18 |
|
19 |
app = FastAPI(title="RAG API", version=__version__)
|
20 |
|
21 |
+
# ───────────────────────── CORS ─────────────────────────
|
22 |
app.add_middleware(
|
23 |
CORSMiddleware,
|
24 |
allow_origins=["*"], # tighten if needed
|
|
|
27 |
allow_headers=["*"],
|
28 |
)
|
29 |
|
30 |
+
# ──────────────────── Core singleton & metrics ────────────────────
|
31 |
rag = SimpleRAG()
|
32 |
|
33 |
METRICS: Dict[str, Any] = {
|
|
|
38 |
}
|
39 |
HISTORY: List[Dict[str, Any]] = [] # [{"question":..., "timestamp":...}]
|
40 |
|
41 |
+
# ───────────────────────── Models ─────────────────────────
|
42 |
class UploadResponse(BaseModel):
|
43 |
message: str
|
44 |
filename: str
|
|
|
60 |
total_chunks: int
|
61 |
history: List[Dict[str, Any]]
|
62 |
|
63 |
+
# ───────────────────────── Routes ─────────────────────────
|
64 |
@app.get("/")
|
65 |
def root():
|
66 |
return RedirectResponse(url="/docs")
|
|
|
78 |
@app.get("/debug/translate")
|
79 |
def debug_translate():
|
80 |
"""
|
81 |
+
Simple smoke test for the AZ→EN translator pipeline (if available).
|
82 |
"""
|
83 |
try:
|
84 |
from transformers import pipeline # type: ignore
|
|
|
88 |
cache_dir=str(rag.cache_dir),
|
89 |
device=-1,
|
90 |
)
|
91 |
+
out = tr("Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", max_length=80)[0]["translation_text"]
|
92 |
return {"ok": True, "example_out": out}
|
93 |
except Exception as e:
|
94 |
return {"ok": False, "error": str(e)}
|
app/paths.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
DATA_DIR = Path(os.getenv("DATA_DIR", "/data"))
|
4 |
+
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
5 |
+
INDEX_DIR = DATA_DIR / "index"; INDEX_DIR.mkdir(exist_ok=True)
|
6 |
+
HISTORY_JSON = DATA_DIR / "history.json"
|
app/rag_system.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# app/rag_system.py
|
2 |
from __future__ import annotations
|
3 |
|
4 |
import os
|
@@ -36,12 +36,12 @@ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
|
|
36 |
|
37 |
# ---------------- Config ----------------
|
38 |
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
39 |
-
OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").strip().lower() # "en"
|
40 |
|
41 |
|
42 |
# ---------------- Text helpers ----------------
|
43 |
-
# Join AZ letters split by spaces (e.g., "H
|
44 |
-
AZ_LATIN = "A-Za-
|
45 |
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
|
46 |
|
47 |
def _fix_intra_word_spaces(s: str) -> str:
|
@@ -53,7 +53,7 @@ def _fix_mojibake(s: str) -> str:
|
|
53 |
"""Fix common UTF-8-as-Latin-1 mojibake quickly; then ftfy."""
|
54 |
if not s:
|
55 |
return s
|
56 |
-
if any(sym in s for sym in ("
|
57 |
try:
|
58 |
s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
|
59 |
except Exception:
|
@@ -63,7 +63,7 @@ def _fix_mojibake(s: str) -> str:
|
|
63 |
|
64 |
def _clean_for_summary(text: str) -> str:
|
65 |
"""Remove ultra-short / numeric / tabular-ish lines, collapse spaces."""
|
66 |
-
NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m
|
67 |
|
68 |
def _mostly_numeric(s: str) -> bool:
|
69 |
alnum = [c for c in s if c.isalnum()]
|
@@ -96,7 +96,7 @@ STOPWORDS = {
|
|
96 |
}
|
97 |
|
98 |
def _keywords(text: str) -> List[str]:
|
99 |
-
toks = re.findall(r"[A-Za-z
|
100 |
return [t for t in toks if t not in STOPWORDS and len(t) > 2]
|
101 |
|
102 |
def _sim_jaccard(a: str, b: str) -> float:
|
@@ -112,7 +112,7 @@ class SimpleRAG:
|
|
112 |
"""
|
113 |
Minimal RAG core:
|
114 |
- FAISS (IP) over sentence-transformers embeddings
|
115 |
-
- PDF
|
116 |
- Extractive answer synthesis with embedding ranking + keyword fallback
|
117 |
"""
|
118 |
|
@@ -185,7 +185,7 @@ class SimpleRAG:
|
|
185 |
except Exception:
|
186 |
pass
|
187 |
|
188 |
-
# ---------- PDF
|
189 |
@staticmethod
|
190 |
def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
|
191 |
texts: List[str] = []
|
@@ -308,7 +308,7 @@ class SimpleRAG:
|
|
308 |
# ---------- Answer Synthesis ----------
|
309 |
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
|
310 |
if not contexts and self.is_empty:
|
311 |
-
return "No relevant context found. Index is empty
|
312 |
|
313 |
# Strong decoding & spacing fixes on contexts
|
314 |
contexts = [_fix_mojibake(_fix_intra_word_spaces(c)) for c in (contexts or [])]
|
@@ -344,7 +344,7 @@ class SimpleRAG:
|
|
344 |
if not selected:
|
345 |
return "No readable sentences matched the question. Try a more specific query."
|
346 |
|
347 |
-
# Optional AZ
|
348 |
if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
|
349 |
try:
|
350 |
selected = self._translate_to_en(selected)
|
|
|
1 |
+
# app/rag_system.py
|
2 |
from __future__ import annotations
|
3 |
|
4 |
import os
|
|
|
36 |
|
37 |
# ---------------- Config ----------------
|
38 |
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
39 |
+
OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").strip().lower()  # "en" → translate AZ→EN
|
40 |
|
41 |
|
42 |
# ---------------- Text helpers ----------------
|
43 |
+
# Join AZ letters split by spaces (e.g., "H Ə F T Ə" → "HƏFTƏ")
|
44 |
+
AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇçÜü"
|
45 |
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
|
46 |
|
47 |
def _fix_intra_word_spaces(s: str) -> str:
|
|
|
53 |
"""Fix common UTF-8-as-Latin-1 mojibake quickly; then ftfy."""
|
54 |
if not s:
|
55 |
return s
|
56 |
+
if any(sym in s for sym in ("Ã", "Ä", "Å", "Ð", "Þ", "þ", "â")):
|
57 |
try:
|
58 |
s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
|
59 |
except Exception:
|
|
|
63 |
|
64 |
def _clean_for_summary(text: str) -> str:
|
65 |
"""Remove ultra-short / numeric / tabular-ish lines, collapse spaces."""
|
66 |
+
NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|mt|m2)\b", re.IGNORECASE)
|
67 |
|
68 |
def _mostly_numeric(s: str) -> bool:
|
69 |
alnum = [c for c in s if c.isalnum()]
|
|
|
96 |
}
|
97 |
|
98 |
def _keywords(text: str) -> List[str]:
|
99 |
+
toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
|
100 |
return [t for t in toks if t not in STOPWORDS and len(t) > 2]
|
101 |
|
102 |
def _sim_jaccard(a: str, b: str) -> float:
|
|
|
112 |
"""
|
113 |
Minimal RAG core:
|
114 |
- FAISS (IP) over sentence-transformers embeddings
|
115 |
+
- PDF → texts with robust decoding (pypdf/PyPDF2 + ftfy; optional pdfminer fallback)
|
116 |
- Extractive answer synthesis with embedding ranking + keyword fallback
|
117 |
"""
|
118 |
|
|
|
185 |
except Exception:
|
186 |
pass
|
187 |
|
188 |
+
# ---------- PDF → texts ----------
|
189 |
@staticmethod
|
190 |
def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
|
191 |
texts: List[str] = []
|
|
|
308 |
# ---------- Answer Synthesis ----------
|
309 |
def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
|
310 |
if not contexts and self.is_empty:
|
311 |
+
return "No relevant context found. Index is empty — upload a PDF first."
|
312 |
|
313 |
# Strong decoding & spacing fixes on contexts
|
314 |
contexts = [_fix_mojibake(_fix_intra_word_spaces(c)) for c in (contexts or [])]
|
|
|
344 |
if not selected:
|
345 |
return "No readable sentences matched the question. Try a more specific query."
|
346 |
|
347 |
+
# Optional AZ→EN translate if output language is English and text is non-ASCII
|
348 |
if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
|
349 |
try:
|
350 |
selected = self._translate_to_en(selected)
|
app/schemas.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# app/schemas.py
|
2 |
from pydantic import BaseModel, Field
|
3 |
from typing import Optional, List
|
4 |
|
|
|
1 |
+
# app/schemas.py
|
2 |
from pydantic import BaseModel, Field
|
3 |
from typing import Optional, List
|
4 |
|
app/store.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
-
# app/store.py
|
2 |
from collections import defaultdict
|
3 |
from typing import List, Dict
|
4 |
|
5 |
-
# in-memory chat tarixi (prod
|
6 |
_history: Dict[str, List[dict]] = defaultdict(list)
|
7 |
|
8 |
def add_history(session_id: str, role: str, content: str):
|
|
|
1 |
+
# app/store.py
|
2 |
from collections import defaultdict
|
3 |
from typing import List, Dict
|
4 |
|
5 |
+
# in-memory chat tarixi (prod üçün Redis/Postgres məsləhətdir)
|
6 |
_history: Dict[str, List[dict]] = defaultdict(list)
|
7 |
|
8 |
def add_history(session_id: str, role: str, content: str):
|
app/utils.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# app/utils.py
|
2 |
import uuid
|
3 |
from fastapi import HTTPException
|
4 |
|
|
|
1 |
+
# app/utils.py
|
2 |
import uuid
|
3 |
from fastapi import HTTPException
|
4 |
|
main.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# --- ADD: /generate alias for compatibility ---
|
2 |
from fastapi import Form
|
3 |
|
4 |
@app.post("/generate")
|
|
|
1 |
+
# --- ADD: /generate alias for compatibility ---
|
2 |
from fastapi import Form
|
3 |
|
4 |
@app.post("/generate")
|