"""
Step 3: Minimal indexer for TXT files under data/seed/.

What it does (Objective):
- Scans data/seed/ for *.txt files
- Splits each file into overlapping chunks (for better retrieval)
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- Builds a FAISS index (inner-product / cosine-like) and saves:
    - data/index.faiss  (the vector index)
    - data/meta.json    (mapping from vector -> source chunk)
    - docs_out/*.txt    (chunked text files for easy loading later)
- Returns a short status string for the UI

Design notes (Objective):
- TXT-only for this step to avoid parser complexity.
- We use normalize_embeddings=True so inner product equals cosine similarity.
- Safe defaults: max_tokens=700 words per chunk, stride=200 words of overlap
  between consecutive chunks.
"""

import json
from pathlib import Path
import hashlib
from typing import List, Dict

import faiss
from sentence_transformers import SentenceTransformer

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective): small, fast, good enough for MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def _read_txt(path: Path) -> str:
    """
    Read a UTF-8 text file safely.
    (Objective) Returns empty string on failure; we guard later.
    """
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
    """
    Split long text into overlapping word chunks.
    `max_tokens` is the chunk size in words; `stride` is the overlap in words,
    so the window advances by (max_tokens - stride) words each step.
    (Objective) Overlap helps retrieval recall.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        chunk_words = words[i : i + max_tokens]
        chunk = " ".join(chunk_words).strip()
        if len(chunk) >= 50:  # ignore tiny fragments
            chunks.append(chunk)
        # Advance by (max_tokens - stride) words so consecutive chunks overlap by `stride` words
        i += max(1, max_tokens - stride)
    return chunks
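
# Illustrative example (assuming the defaults above): for a 1,500-word input,
# _chunk_text() yields chunks covering words [0:700], [500:1200], [1000:1500].
# Each step advances by max_tokens - stride = 500 words, so consecutive
# chunks share a 200-word overlap.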


def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.
    Saves index + metadata to disk. Returns a human-readable status.
    (Objective)
    """
    # Ensure folders exist
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty
            continue

        chunks = _chunk_text(raw)
        # Stable ID per source file for nice chunk filenames
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)

    if not docs:
        return "Found TXT files, but no usable content (after filtering)."

    # Embed
    model = SentenceTransformer(EMBED_MODEL)
    # normalize_embeddings=True -> inner product on unit vectors equals cosine similarity
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build an exact (brute-force) inner-product FAISS index; fine for an MVP-sized corpus
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index + metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."


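# ---- Retrieval sketch (assumption): build_index() only writes the index; the
# query side presumably lives in app.py. The helper below is a minimal, hedged
# example of how the saved artifacts could be used; the name `search` and the
# returned dict shape are illustrative, not part of the existing pipeline.
def search(query: str, k: int = 5) -> List[Dict]:
    """
    Load index.faiss + meta.json and return the top-k chunks for a query.
    (Illustrative) Scores are cosine similarities because the stored
    embeddings were normalized.
    """
    if not (INDEX_PATH.exists() and META_PATH.exists()):
        return []
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, k)
    results: List[Dict] = []
    for score, idx in zip(scores[0], ids[0]):
        if idx < 0:  # FAISS pads missing neighbours with -1
            continue
        meta = metas[int(idx)]
        results.append({
            "score": float(score),
            "chunk_id": meta["chunk_id"],
            "source": meta["file"],
            "text": Path(meta["chunk_file"]).read_text(encoding="utf-8"),
        })
    return results

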
if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())
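
# (Assumption) The UI framework for app.py is not specified here; if it is a
# Gradio app on HF Spaces, the button wiring could look roughly like this:
#
#   import gradio as gr
#   from indexer import build_index
#
#   with gr.Blocks() as demo:
#       status = gr.Textbox(label="Index status")
#       gr.Button("Build index").click(fn=build_index, outputs=status)
#   demo.launch()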