HemanM committed
Commit be24b1a · verified · 1 Parent(s): ef43d13

Update indexer.py

Files changed (1)
  1. indexer.py  +125 -60
indexer.py CHANGED
@@ -1,31 +1,35 @@
  """
- Step 3: Minimal indexer for TXT files under data/seed/.
-
- What it does (Objective):
- - Scans data/seed/ for *.txt files
- - Splits each file into overlapping chunks (for better retrieval)
- - Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- - Builds a FAISS index (inner-product / cosine-like) and saves:
-     - data/index.faiss (the vector index)
-     - data/meta.json (mapping from vector -> source chunk)
-     - docs_out/*.txt (chunked text files for easy loading later)
- - Returns a short status string for the UI
-
- Design notes (Objective):
- - TXT-only for this step to avoid parser complexity.
- - We use normalize_embeddings=True so inner product approximates cosine.
- - Safe defaults: max_tokens=700 words, stride=200 words.
  """

  import os
  import json
- from pathlib import Path
  import hashlib
- from typing import List, Dict

- import numpy as np
  import faiss
  from sentence_transformers import SentenceTransformer

  # ---- Paths (Objective)
  DATA_DIR = Path("data")

@@ -34,99 +38,160 @@ DOCS_DIR = Path("docs_out")
  INDEX_PATH = DATA_DIR / "index.faiss"
  META_PATH = DATA_DIR / "meta.json"

- # ---- Embedding model (Objective): small, fast, good enough for MVP
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


  def _read_txt(path: Path) -> str:
-     """
-     Read a UTF-8 text file safely.
-     (Objective) Returns empty string on failure; we guard later.
-     """
      try:
          return path.read_text(encoding="utf-8", errors="ignore")
      except Exception:
          return ""


- def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
      """
-     Split long text into overlapping word chunks.
-     (Objective) Overlap helps retrieval recall.
      """
      words = text.split()
      chunks: List[str] = []
      i = 0
      while i < len(words):
-         chunk_words = words[i : i + max_tokens]
-         chunk = " ".join(chunk_words).strip()
-         if len(chunk) >= 50:  # ignore tiny fragments
-             chunks.append(chunk)
-         # Move forward by (max - stride) to create overlap
-         i += max(1, max_tokens - stride)
      return chunks


  def build_index() -> str:
      """
-     Build the FAISS index from TXT files in data/seed/.
-     Saves index + metadata to disk. Returns a human-readable status.
-     (Objective)
      """
-     # Ensure folders exist
      DATA_DIR.mkdir(exist_ok=True)
      SEED_DIR.mkdir(parents=True, exist_ok=True)
      DOCS_DIR.mkdir(exist_ok=True)

-     # Collect all .txt files
-     txt_files = sorted(SEED_DIR.glob("*.txt"))
-     if not txt_files:
-         return "No TXT files found in data/seed/. Add *.txt and try again."

-     # Read and chunk
      docs: List[str] = []
      metas: List[Dict] = []
-     for fp in txt_files:
-         raw = _read_txt(fp)
-         if not raw or len(raw.strip()) < 50:
-             # Skip empty/near-empty
              continue

-         chunks = _chunk_text(raw)
-         # Stable ID per source file for nice chunk filenames
          src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

          for j, ch in enumerate(chunks):
-             outp = DOCS_DIR / f"{src_id}_{j}.txt"
-             outp.write_text(ch, encoding="utf-8")
              metas.append({
                  "file": str(fp),
-                 "chunk_file": str(outp),
                  "chunk_id": f"{src_id}_{j}",
              })
              docs.append(ch)

      if not docs:
-         return "Found TXT files, but no usable content (after filtering)."

-     # Embed
      model = SentenceTransformer(EMBED_MODEL)
-     # normalize_embeddings=True -> inner product becomes cosine-like
      emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

-     # Build FAISS IP index
      d = emb.shape[1]
      index = faiss.IndexFlatIP(d)
      index.add(emb)

-     # Save index + metadata
      faiss.write_index(index, str(INDEX_PATH))
      META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

-     return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."
-

  if __name__ == "__main__":
-     # Allow running `python indexer.py` locally. In HF Spaces
-     # we'll call build_index() from the UI button in app.py.
      print(build_index())
 
Updated file (added lines marked with "+"):

  """
+ Step 7: Indexer for TXT + PDF + HTML with a clearer status message.
+
+ (Objective)
+ - Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now).
+ - Extracts text safely:
+     - TXT: read as UTF-8
+     - PDF: pdfminer.six
+     - HTML: BeautifulSoup (remove scripts/styles/nav)
+ - Chunks text with overlap for better recall.
+ - Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like).
+ - Saves:
+     data/index.faiss  -> FAISS vector index
+     data/meta.json    -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
+     docs_out/*.txt    -> individual chunk files
+
+ Quality-of-life:
+ - Computes a hash per *source file content* to detect when a source changed.
+ - Returns a status string reporting files seen, chunks built, and updated file count.
  """

  import os
  import json
  import hashlib
+ from pathlib import Path
+ from typing import List, Dict, Tuple

  import faiss
+ import numpy as np
  from sentence_transformers import SentenceTransformer
+ from pdfminer.high_level import extract_text as pdf_extract_text
+ from bs4 import BeautifulSoup

  # ---- Paths (Objective)
  DATA_DIR = Path("data")

  INDEX_PATH = DATA_DIR / "index.faiss"
  META_PATH = DATA_DIR / "meta.json"

+ # ---- Embedding model (Objective)
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

+ # ---- Supported extensions (Objective)
+ TXT_EXT = {".txt", ".md"}
+ PDF_EXT = {".pdf"}
+ HTML_EXT = {".html", ".htm", ".xhtml"}
+ ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT
+

  def _read_txt(path: Path) -> str:
      try:
          return path.read_text(encoding="utf-8", errors="ignore")
      except Exception:
          return ""


+ def _read_pdf(path: Path) -> str:
+     try:
+         return pdf_extract_text(str(path)) or ""
+     except Exception:
+         return ""
+
+
+ def _read_html(path: Path) -> str:
+     try:
+         raw = path.read_bytes()
+         soup = BeautifulSoup(raw, "lxml")
+         # remove noisy tags
+         for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
+             tag.decompose()
+         text = " ".join(soup.get_text(separator=" ").split())
+         return text
+     except Exception:
+         return ""
+
+
+ def _load_source(path: Path) -> str:
      """
+     (Objective) Route by extension and return plain text.
+     """
+     ext = path.suffix.lower()
+     if ext in TXT_EXT:
+         return _read_txt(path)
+     if ext in PDF_EXT:
+         return _read_pdf(path)
+     if ext in HTML_EXT:
+         return _read_html(path)
+     return ""
+
+
+ def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
+     """
+     (Objective) Split long text into overlapping word chunks.
      """
      words = text.split()
      chunks: List[str] = []
      i = 0
      while i < len(words):
+         seg = " ".join(words[i : i + max_words]).strip()
+         if len(seg) >= 50:  # ignore tiny bits
+             chunks.append(seg)
+         i += max(1, max_words - stride)
      return chunks


+ def _hash_text(text: str) -> str:
+     """
+     (Objective) Stable hash of text to detect source changes between runs.
+     """
+     return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()
+
+
+ def _load_previous_hashes() -> Dict[str, str]:
+     """
+     (Objective) Load previous (file -> src_hash) mapping from meta.json if present.
+     """
+     if not META_PATH.exists():
+         return {}
+     try:
+         metas = json.loads(META_PATH.read_text(encoding="utf-8"))
+         hashes = {}
+         for m in metas:
+             # save the last seen hash per file (any chunk has same src_hash)
+             hashes[m["file"]] = m.get("src_hash", "")
+         return hashes
+     except Exception:
+         return {}
+
+
  def build_index() -> str:
      """
+     (Objective) Build FAISS index from TXT+PDF+HTML under data/seed/.
+     Returns a human-friendly status string.
      """
      DATA_DIR.mkdir(exist_ok=True)
      SEED_DIR.mkdir(parents=True, exist_ok=True)
      DOCS_DIR.mkdir(exist_ok=True)

+     # Collect files (non-recursive)
+     files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
+     if not files:
+         return "No files found under data/seed/. Supported: .txt, .pdf, .html"
+
+     prev_hashes = _load_previous_hashes()

      docs: List[str] = []
      metas: List[Dict] = []
+     updated_src_files = set()
+
+     for fp in sorted(files):
+         text = _load_source(fp)
+         if len(text.strip()) < 50:
+             # skip empty/near-empty sources
              continue

+         src_hash = _hash_text(text)
+         if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
+             updated_src_files.add(str(fp))
+
+         # Chunk and persist per-chunk text files for quick reading later
+         chunks = _chunk_text(text)
          src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

          for j, ch in enumerate(chunks):
+             chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
+             chunk_file.write_text(ch, encoding="utf-8")
              metas.append({
                  "file": str(fp),
+                 "chunk_file": str(chunk_file),
                  "chunk_id": f"{src_id}_{j}",
+                 "src_hash": src_hash,
              })
              docs.append(ch)

      if not docs:
+         return "Found files but no usable content after parsing."

+     # Embed all chunks
      model = SentenceTransformer(EMBED_MODEL)
      emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

+     # Build FAISS index
      d = emb.shape[1]
      index = faiss.IndexFlatIP(d)
      index.add(emb)

+     # Save
      faiss.write_index(index, str(INDEX_PATH))
      META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

+     changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
+     return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."
+

  if __name__ == "__main__":
      print(build_index())
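
For context on how the artifacts written by build_index() might be consumed at query time, here is a minimal retrieval-side sketch. It is not part of this commit: the search() helper, its top_k parameter, and the standalone script framing are assumptions for illustration. Only the file layout (data/index.faiss, data/meta.json, docs_out/*.txt) and the normalize_embeddings=True + IndexFlatIP pairing come from the code above; the actual query path in app.py may differ.

# retrieve.py (hypothetical) -- query the index written by indexer.build_index()
import json
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH = Path("data") / "index.faiss"
META_PATH = Path("data") / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # must match the indexer


def search(query: str, top_k: int = 5):
    """Return (score, metadata, chunk_text) for the top_k best-matching chunks."""
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))

    model = SentenceTransformer(EMBED_MODEL)
    # Normalize the query the same way the chunks were normalized,
    # so the inner-product score behaves like cosine similarity.
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)

    scores, ids = index.search(q, top_k)
    results = []
    for score, idx in zip(scores[0], ids[0]):
        if idx < 0:  # FAISS pads with -1 when the index holds fewer than top_k vectors
            continue
        meta = metas[idx]  # meta.json rows are stored in the same order as the vectors
        chunk_text = Path(meta["chunk_file"]).read_text(encoding="utf-8")
        results.append((float(score), meta, chunk_text))
    return results


if __name__ == "__main__":
    for score, meta, text in search("what does the indexer do?"):
        print(f"{score:.3f}  {meta['chunk_id']}  {text[:80]}...")

Because both the chunk and query embeddings are unit-normalized, the scores returned by IndexFlatIP are effectively cosine similarities, which is why the indexer can use a plain inner-product index. A later step could also compare meta["src_hash"] values against fresh hashes to decide when a rebuild is needed.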