HamidOmarov committed on
Commit 1fb5688 · 1 Parent(s): 9dc7698

Text cleanups: join inter-letter spaces + ftfy normalize

Files changed (1):
  1. app/rag_system.py +59 -60
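
In short, the commit drops the standalone `_normalize_text` helper and applies the cleanup inline during PDF extraction. A minimal, self-contained sketch of the two fixes named in the title (the sample strings are made up):

    import re
    from ftfy import fix_text  # general mojibake/encoding repair

    AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
    SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")

    def cleanup(s: str) -> str:
        s = fix_text(s)  # e.g. "sÃ©ance" -> "séance"
        # join inter-letter spaces: "H Ə F T Ə" -> "HƏFTƏ"
        return SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

    print(cleanup("H Ə F T Ə L İ K hesabat"))  # -> "HƏFTƏLİK hesabat"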
app/rag_system.py CHANGED
@@ -8,53 +8,72 @@ from typing import List, Tuple

  import faiss
  import numpy as np
- from ftfy import fix_text

- # Prefer pypdf; fallback to PyPDF2
  try:
      from pypdf import PdfReader
- except Exception:  # pragma: no cover
      from PyPDF2 import PdfReader  # type: ignore

  from sentence_transformers import SentenceTransformer

- # ===================== Paths (HF-safe) =====================
- # Writable base dir for HF Spaces (/app); also works locally.
- ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
  DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
  UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
  INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
- CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))

- for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
-     d.mkdir(parents=True, exist_ok=True)

- # ===================== Config =====================
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

- # ===================== Helpers =====================
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
  NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

  STOPWORDS = {
-     "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "by",
-     "this", "that", "these", "those", "is", "are", "was", "were", "be", "been", "being",
-     "at", "as", "it", "its", "from", "into", "about", "over", "after", "before", "than",
-     "such", "can", "could", "should", "would", "may", "might", "will", "shall"
  }

- AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
- _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
-
  def _fix_intra_word_spaces(s: str) -> str:
-     """'H Ə F T Ə' → 'HƏFTƏ' (joins only consecutive single-letter runs)."""
      if not s:
          return s
      return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

  def _fix_mojibake(s: str) -> str:
-     """Quick fix for typical UTF-8-read-as-Latin-1 mojibake."""
      if not s:
          return s
      if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
@@ -64,17 +83,6 @@ def _fix_mojibake(s: str) -> str:
          return s
      return s

- def _normalize_text(s: str) -> str:
-     if not s:
-         return s
-     s = fix_text(s)                # general fixes via ftfy
-     s = _fix_mojibake(s)           # attempt the Latin-1 → UTF-8 repair
-     s = s.replace("ﬁ", "fi").replace("ﬂ", "fl")
-     s = _fix_intra_word_spaces(s)  # H Ə F T Ə → HƏFTƏ
-     s = re.sub(r"[ \t]+", " ", s)
-     s = re.sub(r"\s+\n", "\n", s)
-     return s.strip()
-
  def _split_sentences(text: str) -> List[str]:
      return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
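
The hunk elides the body of `_fix_mojibake`, but the standard repair for this family of corruption is a Latin-1 → UTF-8 round trip. A standalone illustration (an assumption about the elided body, not code from this commit):

    def demojibake(s: str) -> str:
        # UTF-8 "ö" (0xC3 0xB6) decoded as Latin-1 shows up as "Ã¶";
        # re-encoding as Latin-1 recovers the original UTF-8 bytes.
        try:
            return s.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            return s  # leave the text alone if the round trip fails

    print(demojibake("Ã¶z"))  # -> "öz"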
 
@@ -83,7 +91,7 @@ def _mostly_numeric(s: str) -> bool:
      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
-     return digits / max(1, len(alnum)) > 0.30

  def _tabular_like(s: str) -> bool:
      hits = len(NUM_TOKEN_RE.findall(s))
@@ -93,11 +101,7 @@ def _clean_for_summary(text: str) -> str:
      out = []
      for ln in text.splitlines():
          t = " ".join(ln.split())
-         if not t:
-             continue
-         if len(t) < 25:
-             continue
-         if _mostly_numeric(t) or _tabular_like(t):
              continue
          out.append(t)
      return " ".join(out)
@@ -118,7 +122,7 @@ def _looks_azerbaijani(s: str) -> bool:
      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

- # ===================== RAG Core =====================
  class SimpleRAG:
      def __init__(
          self,
@@ -138,7 +142,7 @@ class SimpleRAG:
          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
-         self._translator = None  # lazy

          self._load()
@@ -172,26 +176,20 @@ class SimpleRAG:
          pages: List[str] = []
          for p in reader.pages:
              t = p.extract_text() or ""
-             if t.strip():
-                 t = _normalize_text(t)
                  pages.append(t)
-
          chunks: List[str] = []
          for txt in pages:
              for i in range(0, len(txt), step):
-                 part = txt[i: i + step].strip()
                  if part:
                      chunks.append(part)
-
-         # simple dedup to avoid exact repeats
-         seen = set()
-         uniq: List[str] = []
-         for c in chunks:
-             if c in seen:
-                 continue
-             seen.add(c)
-             uniq.append(c)
-         return uniq

      # ---------- Indexing ----------
      def add_pdf(self, pdf_path: Path) -> int:
@@ -274,10 +272,13 @@
          if not contexts and self.is_empty:
              return "No relevant context found. Index is empty — upload a PDF first."

-         # normalize contexts (mojibake, spacing, etc.)
-         contexts = [_normalize_text(c) for c in (contexts or [])]

-         # 1) local candidate pool
          local_pool: List[str] = []
          for c in (contexts or [])[:5]:
              cleaned = _clean_for_summary(c)
@@ -289,7 +290,6 @@
                  continue
              local_pool.append(" ".join(w))

-         # 2) rank by similarity to question
          selected: List[str] = []
          if local_pool:
              q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
@@ -304,15 +304,14 @@
              if len(selected) >= max_sentences:
                  break

-         # 3) keyword fallback (whole corpus) if the results are weak
          if not selected:
              selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

          if not selected:
              return "No readable sentences matched the question. Try a more specific query."

-         # 4) translate to EN if needed
-         if OUTPUT_LANG == "en" and any(_looks_azerbaijani(s) for s in selected):
              selected = self._translate_to_en(selected)

          bullets = "\n".join(f"- {s}" for s in selected)

  import faiss
  import numpy as np

+ # --- ftfy (mojibake/normalization) ---
+ try:
+     from ftfy import fix_text as _ftfy
+ except Exception:  # no-op fallback if ftfy is unavailable
+     def _ftfy(x: str) -> str:
+         return x
+
+ # pypdf -> PyPDF2 fallback
  try:
      from pypdf import PdfReader
+ except Exception:
      from PyPDF2 import PdfReader  # type: ignore

  from sentence_transformers import SentenceTransformer

+ # ---------------- Paths & Cache (HF-safe) ----------------
+ # Default: the repo root; switch to APP_ROOT when it is set
+ DEFAULT_ROOT = Path(__file__).resolve().parents[1]
+ ROOT_DIR = Path(os.getenv("APP_ROOT", str(DEFAULT_ROOT)))
  DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
  UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
  INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
+ CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # HF_HOME is the best cache root for transformers

+ # try to create the cache dir; skip it on permission problems
+ for pth in (CACHE_DIR,):
+     try:
+         pth.mkdir(parents=True, exist_ok=True)
+     except PermissionError:
+         pass

+ # create the data dirs; fall back to local ./data on permission errors
+ try:
+     DATA_DIR.mkdir(parents=True, exist_ok=True)
+     UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+     INDEX_DIR.mkdir(parents=True, exist_ok=True)
+ except PermissionError:
+     DATA_DIR = Path("./data"); DATA_DIR.mkdir(parents=True, exist_ok=True)
+     UPLOAD_DIR = DATA_DIR / "uploads"; UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+     INDEX_DIR = DATA_DIR / "index"; INDEX_DIR.mkdir(parents=True, exist_ok=True)
+
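
CACHE_DIR is computed and created here but not otherwise referenced in the hunks shown; presumably it is meant to hold the model cache, and the HF_HOME variable itself would still need to be set in the environment for transformers to pick it up. One way to wire it up explicitly, using sentence-transformers' real `cache_folder` parameter (a sketch, not part of this commit):

    from sentence_transformers import SentenceTransformer

    # download/cache the embedding model under CACHE_DIR explicitly,
    # instead of relying on the HF_HOME environment variable
    model = SentenceTransformer(MODEL_NAME, cache_folder=str(CACHE_DIR))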
+ # ---------------- Config ----------------
  MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
  OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()

+ # ---------------- Helpers ----------------
  AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
+ AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
+ _SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")
  NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

  STOPWORDS = {
+     "the","a","an","and","or","of","to","in","on","for","with","by",
+     "this","that","these","those","is","are","was","were","be","been","being",
+     "at","as","it","its","from","into","about","over","after","before","than",
+     "such","can","could","should","would","may","might","will","shall"
  }

  def _fix_intra_word_spaces(s: str) -> str:
+     """'c l a s s' → 'class', 'H Ə F T Ə' → 'HƏFTƏ' (consecutive single letters only)."""
      if not s:
          return s
      return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

  def _fix_mojibake(s: str) -> str:
+     """Gently repair artifacts like 'ö' left by UTF-8 text being read as Latin-1."""
      if not s:
          return s
      if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):

          return s
      return s

  def _split_sentences(text: str) -> List[str]:
      return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

      if not alnum:
          return True
      digits = sum(c.isdigit() for c in alnum)
+     return digits / max(1, len(alnum)) > 0.3

  def _tabular_like(s: str) -> bool:
      hits = len(NUM_TOKEN_RE.findall(s))

      out = []
      for ln in text.splitlines():
          t = " ".join(ln.split())
+         if not t or _mostly_numeric(t) or _tabular_like(t):
              continue
          out.append(t)
      return " ".join(out)

      non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
      return has_az or non_ascii_ratio > 0.15

+ # ---------------- RAG Core ----------------
  class SimpleRAG:
      def __init__(
          self,

          self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
          self.chunks: List[str] = []
          self.last_added: List[str] = []
+         self._translator = None  # lazy init

          self._load()

          pages: List[str] = []
          for p in reader.pages:
              t = p.extract_text() or ""
+             # normalization sequence
+             t = _ftfy(t)
+             t = _fix_mojibake(t)
+             t = _fix_intra_word_spaces(t)
+             t = re.sub(r"\s+", " ", t).strip()
+             if t:
                  pages.append(t)

          chunks: List[str] = []
          for txt in pages:
              for i in range(0, len(txt), step):
+                 part = txt[i : i + step].strip()
                  if part:
                      chunks.append(part)
+         return chunks
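
Note that the new version returns `chunks` directly, dropping the old exact-duplicate filter. The windowing itself is a fixed-width character split with no overlap; a self-contained equivalent (the `step` default here is illustrative):

    def window(text: str, step: int = 800) -> list[str]:
        # non-overlapping fixed-width windows; empty slices dropped
        return [p for p in (text[i : i + step].strip()
                            for i in range(0, len(text), step)) if p]

    assert window("abcdef", 4) == ["abcd", "ef"]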
 
      # ---------- Indexing ----------
      def add_pdf(self, pdf_path: Path) -> int:

          if not contexts and self.is_empty:
              return "No relevant context found. Index is empty — upload a PDF first."

+         # clean the retrieved contexts
+         contexts = [
+             re.sub(r"\s+", " ", _fix_intra_word_spaces(_fix_mojibake(_ftfy(c)))).strip()
+             for c in (contexts or [])
+         ]

+         # candidates from the closest contexts
          local_pool: List[str] = []
          for c in (contexts or [])[:5]:
              cleaned = _clean_for_summary(c)

                  continue
              local_pool.append(" ".join(w))

          selected: List[str] = []
          if local_pool:
              q_emb = self.model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
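
Because the index is `faiss.IndexFlatIP` and `encode` is called with `normalize_embeddings=True`, the inner-product scores are cosine similarities. A standalone sketch of that ranking step (the sample sentences are made up):

    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    sents = ["The invoice totals 1200 AZN.", "Work proceeded on schedule."]
    emb = model.encode(sents, convert_to_numpy=True,
                       normalize_embeddings=True).astype(np.float32)

    index = faiss.IndexFlatIP(emb.shape[1])  # inner product on unit vectors = cosine
    index.add(emb)
    q = model.encode(["How much was billed?"], convert_to_numpy=True,
                     normalize_embeddings=True).astype(np.float32)
    scores, ids = index.search(q, 2)  # ranked best-first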
 
              if len(selected) >= max_sentences:
                  break

          if not selected:
              selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

          if not selected:
              return "No readable sentences matched the question. Try a more specific query."

+         # translate to EN when requested
+         if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
              selected = self._translate_to_en(selected)

          bullets = "\n".join(f"- {s}" for s in selected)