HamidOmarov committed on
Commit
8ea6d3a
·
1 Parent(s): 7d8f85a

chore: use DATA_DIR (/data) via env; create folders on startup

Browse files
Files changed (8) hide show
  1. app.py +1 -1
  2. app/api.py +7 -7
  3. app/paths.py +6 -0
  4. app/rag_system.py +11 -11
  5. app/schemas.py +1 -1
  6. app/store.py +2 -2
  7. app/utils.py +1 -1
  8. main.py +1 -1
app.py CHANGED
@@ -1,2 +1,2 @@
1
- # app.py (repo kökündə)
2
  from app.api import app # FastAPI instance
 
1
+ ο»Ώ# app.py (repo kΠ“ΒΆkΠ“Ρ˜ndΠ™β„’)
2
  from app.api import app # FastAPI instance
app/api.py CHANGED
@@ -1,4 +1,4 @@
1
- # app/api.py
2
  from __future__ import annotations
3
 
4
  import time
@@ -18,7 +18,7 @@ __version__ = "1.3.2"
18
 
19
  app = FastAPI(title="RAG API", version=__version__)
20
 
21
- # ───────────────────────── CORS ─────────────────────────
22
  app.add_middleware(
23
  CORSMiddleware,
24
  allow_origins=["*"], # tighten if needed
@@ -27,7 +27,7 @@ app.add_middleware(
27
  allow_headers=["*"],
28
  )
29
 
30
- # ──────────────────── Core singleton & metrics ────────────────────
31
  rag = SimpleRAG()
32
 
33
  METRICS: Dict[str, Any] = {
@@ -38,7 +38,7 @@ METRICS: Dict[str, Any] = {
38
  }
39
  HISTORY: List[Dict[str, Any]] = [] # [{"question":..., "timestamp":...}]
40
 
41
- # ───────────────────────── Models ─────────────────────────
42
  class UploadResponse(BaseModel):
43
  message: str
44
  filename: str
@@ -60,7 +60,7 @@ class HistoryResponse(BaseModel):
60
  total_chunks: int
61
  history: List[Dict[str, Any]]
62
 
63
- # ───────────────────────── Routes ─────────────────────────
64
  @app.get("/")
65
  def root():
66
  return RedirectResponse(url="/docs")
@@ -78,7 +78,7 @@ def health():
78
  @app.get("/debug/translate")
79
  def debug_translate():
80
  """
81
- Simple smoke test for the AZ→EN translator pipeline (if available).
82
  """
83
  try:
84
  from transformers import pipeline # type: ignore
@@ -88,7 +88,7 @@ def debug_translate():
88
  cache_dir=str(rag.cache_dir),
89
  device=-1,
90
  )
91
- out = tr("Sənəd təmiri və quraşdırılması ilə bağlı işlər görülüb.", max_length=80)[0]["translation_text"]
92
  return {"ok": True, "example_out": out}
93
  except Exception as e:
94
  return {"ok": False, "error": str(e)}
 
1
+ ο»Ώ# app/api.py
2
  from __future__ import annotations
3
 
4
  import time
 
18
 
19
  app = FastAPI(title="RAG API", version=__version__)
20
 
21
+ # в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ CORS в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ
22
  app.add_middleware(
23
  CORSMiddleware,
24
  allow_origins=["*"], # tighten if needed
 
27
  allow_headers=["*"],
28
  )
29
 
30
+ # в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ Core singleton & metrics в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ
31
  rag = SimpleRAG()
32
 
33
  METRICS: Dict[str, Any] = {
 
38
  }
39
  HISTORY: List[Dict[str, Any]] = [] # [{"question":..., "timestamp":...}]
40
 
41
+ # в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ Models в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ
42
  class UploadResponse(BaseModel):
43
  message: str
44
  filename: str
 
60
  total_chunks: int
61
  history: List[Dict[str, Any]]
62
 
63
+ # в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ Routes в”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђв”Ђ
64
  @app.get("/")
65
  def root():
66
  return RedirectResponse(url="/docs")
 
78
  @app.get("/debug/translate")
79
  def debug_translate():
80
  """
81
+ Simple smoke test for the AZ→EN translator pipeline (if available).
82
  """
83
  try:
84
  from transformers import pipeline # type: ignore
 
88
  cache_dir=str(rag.cache_dir),
89
  device=-1,
90
  )
91
+ out = tr("SΠ™β„’nΠ™β„’d tΠ™β„’miri vΠ™β„’ quraΠ•ΡŸdΠ”Β±rΠ”Β±lmasΠ”Β± ilΠ™β„’ baΠ”ΡŸlΠ”Β± iΠ•ΡŸlΠ™β„’r gΠ“ΒΆrΠ“Ρ˜lΠ“Ρ˜b.", max_length=80)[0]["translation_text"]
92
  return {"ok": True, "example_out": out}
93
  except Exception as e:
94
  return {"ok": False, "error": str(e)}
app/paths.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ο»Ώimport os
2
+ from pathlib import Path
3
+ DATA_DIR = Path(os.getenv("DATA_DIR", "/data"))
4
+ DATA_DIR.mkdir(parents=True, exist_ok=True)
5
+ INDEX_DIR = DATA_DIR / "index"; INDEX_DIR.mkdir(exist_ok=True)
6
+ HISTORY_JSON = DATA_DIR / "history.json"
app/rag_system.py CHANGED
@@ -1,4 +1,4 @@
1
- # app/rag_system.py
2
  from __future__ import annotations
3
 
4
  import os
@@ -36,12 +36,12 @@ for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
36
 
37
  # ---------------- Config ----------------
38
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
39
- OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").strip().lower() # "en" → translate AZ→EN
40
 
41
 
42
  # ---------------- Text helpers ----------------
43
- # Join AZ letters split by spaces (e.g., "H Ə F T Ə" → "HƏFTƏ")
44
- AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇçÜü"
45
  _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
46
 
47
  def _fix_intra_word_spaces(s: str) -> str:
@@ -53,7 +53,7 @@ def _fix_mojibake(s: str) -> str:
53
  """Fix common UTF-8-as-Latin-1 mojibake quickly; then ftfy."""
54
  if not s:
55
  return s
56
- if any(sym in s for sym in ("Ã", "Ä", "Å", "Ð", "Þ", "þ", "â")):
57
  try:
58
  s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
59
  except Exception:
@@ -63,7 +63,7 @@ def _fix_mojibake(s: str) -> str:
63
 
64
  def _clean_for_summary(text: str) -> str:
65
  """Remove ultra-short / numeric / tabular-ish lines, collapse spaces."""
66
- NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|mt|m2)\b", re.IGNORECASE)
67
 
68
  def _mostly_numeric(s: str) -> bool:
69
  alnum = [c for c in s if c.isalnum()]
@@ -96,7 +96,7 @@ STOPWORDS = {
96
  }
97
 
98
  def _keywords(text: str) -> List[str]:
99
- toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
100
  return [t for t in toks if t not in STOPWORDS and len(t) > 2]
101
 
102
  def _sim_jaccard(a: str, b: str) -> float:
@@ -112,7 +112,7 @@ class SimpleRAG:
112
  """
113
  Minimal RAG core:
114
  - FAISS (IP) over sentence-transformers embeddings
115
- - PDF → texts with robust decoding (pypdf/PyPDF2 + ftfy; optional pdfminer fallback)
116
  - Extractive answer synthesis with embedding ranking + keyword fallback
117
  """
118
 
@@ -185,7 +185,7 @@ class SimpleRAG:
185
  except Exception:
186
  pass
187
 
188
- # ---------- PDF → texts ----------
189
  @staticmethod
190
  def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
191
  texts: List[str] = []
@@ -308,7 +308,7 @@ class SimpleRAG:
308
  # ---------- Answer Synthesis ----------
309
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
310
  if not contexts and self.is_empty:
311
- return "No relevant context found. Index is empty — upload a PDF first."
312
 
313
  # Strong decoding & spacing fixes on contexts
314
  contexts = [_fix_mojibake(_fix_intra_word_spaces(c)) for c in (contexts or [])]
@@ -344,7 +344,7 @@ class SimpleRAG:
344
  if not selected:
345
  return "No readable sentences matched the question. Try a more specific query."
346
 
347
- # Optional AZ→EN translate if output language is English and text is non-ASCII
348
  if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
349
  try:
350
  selected = self._translate_to_en(selected)
 
1
+ ο»Ώ# app/rag_system.py
2
  from __future__ import annotations
3
 
4
  import os
 
36
 
37
  # ---------------- Config ----------------
38
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
39
+ OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").strip().lower() # "en" → translate AZ→EN
40
 
41
 
42
  # ---------------- Text helpers ----------------
43
+ # Join AZ letters split by spaces (e.g., "H ЖЏ F T ЖЏ" в†’ "HЖЏFTЖЏ")
44
+ AZ_LATIN = "A-Za-zΠ–ΠΠ™β„’Π”Ρ›Π”ΡŸΠ”Β°Π”Β±Π“β€“Π“ΒΆΠ•Ρ›Π•ΡŸΠ“β€‘Π“Β§Π“ΡšΠ“Ρ˜"
45
  _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
46
 
47
  def _fix_intra_word_spaces(s: str) -> str:
 
53
  """Fix common UTF-8-as-Latin-1 mojibake quickly; then ftfy."""
54
  if not s:
55
  return s
56
+ if any(sym in s for sym in ("Π“Ρ“", "Π“β€ž", "Г…", "Π“Ρ’", "Π“Ρ›", "Π“Ρ•", "Π“Ρž")):
57
  try:
58
  s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
59
  except Exception:
 
63
 
64
  def _clean_for_summary(text: str) -> str:
65
  """Remove ultra-short / numeric / tabular-ish lines, collapse spaces."""
66
+ NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|mΠ’Π†|azn|usd|eur|mt|m2)\b", re.IGNORECASE)
67
 
68
  def _mostly_numeric(s: str) -> bool:
69
  alnum = [c for c in s if c.isalnum()]
 
96
  }
97
 
98
  def _keywords(text: str) -> List[str]:
99
+ toks = re.findall(r"[A-Za-zΠ“Π‚-Π“β€“Π“Β˜-Π“ΒΆΠ“Ρ‘-Π“Ρ—0-9]+", text.lower())
100
  return [t for t in toks if t not in STOPWORDS and len(t) > 2]
101
 
102
  def _sim_jaccard(a: str, b: str) -> float:
 
112
  """
113
  Minimal RAG core:
114
  - FAISS (IP) over sentence-transformers embeddings
115
+ - PDF в†’ texts with robust decoding (pypdf/PyPDF2 + ftfy; optional pdfminer fallback)
116
  - Extractive answer synthesis with embedding ranking + keyword fallback
117
  """
118
 
 
185
  except Exception:
186
  pass
187
 
188
+ # ---------- PDF в†’ texts ----------
189
  @staticmethod
190
  def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
191
  texts: List[str] = []
 
308
  # ---------- Answer Synthesis ----------
309
  def synthesize_answer(self, question: str, contexts: List[str], max_sentences: int = 4) -> str:
310
  if not contexts and self.is_empty:
311
+ return "No relevant context found. Index is empty — upload a PDF first."
312
 
313
  # Strong decoding & spacing fixes on contexts
314
  contexts = [_fix_mojibake(_fix_intra_word_spaces(c)) for c in (contexts or [])]
 
344
  if not selected:
345
  return "No readable sentences matched the question. Try a more specific query."
346
 
347
+ # Optional AZ→EN translate if output language is English and text is non-ASCII
348
  if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
349
  try:
350
  selected = self._translate_to_en(selected)
app/schemas.py CHANGED
@@ -1,4 +1,4 @@
1
- # app/schemas.py
2
  from pydantic import BaseModel, Field
3
  from typing import Optional, List
4
 
 
1
+ ο»Ώ# app/schemas.py
2
  from pydantic import BaseModel, Field
3
  from typing import Optional, List
4
 
app/store.py CHANGED
@@ -1,8 +1,8 @@
1
- # app/store.py
2
  from collections import defaultdict
3
  from typing import List, Dict
4
 
5
- # in-memory chat tarixi (prod üçün Redis/Postgres məsləhətdir)
6
  _history: Dict[str, List[dict]] = defaultdict(list)
7
 
8
  def add_history(session_id: str, role: str, content: str):
 
1
+ ο»Ώ# app/store.py
2
  from collections import defaultdict
3
  from typing import List, Dict
4
 
5
+ # in-memory chat tarixi (prod Π“Ρ˜Π“Β§Π“Ρ˜n Redis/Postgres mΠ™β„’slΠ™β„’hΠ™β„’tdir)
6
  _history: Dict[str, List[dict]] = defaultdict(list)
7
 
8
  def add_history(session_id: str, role: str, content: str):
app/utils.py CHANGED
@@ -1,4 +1,4 @@
1
- # app/utils.py
2
  import uuid
3
  from fastapi import HTTPException
4
 
 
1
+ ο»Ώ# app/utils.py
2
  import uuid
3
  from fastapi import HTTPException
4
 
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # --- ADD: /generate alias for compatibility ---
2
  from fastapi import Form
3
 
4
  @app.post("/generate")
 
1
+ ο»Ώ# --- ADD: /generate alias for compatibility ---
2
  from fastapi import Form
3
 
4
  @app.post("/generate")