Create indexer.py
indexer.py  ADDED  (+132, -0)
@@ -0,0 +1,132 @@
"""
Step 3: Minimal indexer for TXT files under data/seed/.

What it does (Objective):
- Scans data/seed/ for *.txt files
- Splits each file into overlapping chunks (for better retrieval)
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- Builds a FAISS index (inner-product / cosine-like) and saves:
    - data/index.faiss (the vector index)
    - data/meta.json (mapping from vector -> source chunk)
    - docs_out/*.txt (chunked text files for easy loading later)
- Returns a short status string for the UI

Design notes (Objective):
- TXT-only for this step to avoid parser complexity.
- We use normalize_embeddings=True so inner product approximates cosine.
- Safe defaults: max_tokens=700 words, stride=200 words.
"""

import os
import json
from pathlib import Path
import hashlib
from typing import List, Dict

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective): small, fast, good enough for MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def _read_txt(path: Path) -> str:
    """
    Read a UTF-8 text file safely.
    (Objective) Returns empty string on failure; we guard later.
    """
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
    """
    Split long text into overlapping word chunks.
    (Objective) Overlap helps retrieval recall.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        chunk_words = words[i : i + max_tokens]
        chunk = " ".join(chunk_words).strip()
        if len(chunk) >= 50:  # ignore tiny fragments
            chunks.append(chunk)
        # Move forward by (max - stride) to create overlap
        i += max(1, max_tokens - stride)
    return chunks


def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.
    Saves index + metadata to disk. Returns a human-readable status.
    (Objective)
    """
    # Ensure folders exist
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty
            continue

        chunks = _chunk_text(raw)
        # Stable ID per source file for nice chunk filenames
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)

    if not docs:
        return "Found TXT files, but no usable content (after filtering)."

    # Embed
    model = SentenceTransformer(EMBED_MODEL)
    # normalize_embeddings=True -> inner product becomes cosine-like
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS IP index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index + metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."


if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())