""" Step 3: Minimal indexer for TXT files under data/seed/. What it does (Objective): - Scans data/seed/ for *.txt files - Splits each file into overlapping chunks (for better retrieval) - Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small) - Builds a FAISS index (inner-product / cosine-like) and saves: - data/index.faiss (the vector index) - data/meta.json (mapping from vector -> source chunk) - docs_out/*.txt (chunked text files for easy loading later) - Returns a short status string for the UI Design notes (Objective): - TXT-only for this step to avoid parser complexity. - We use normalize_embeddings=True so inner product approximates cosine. - Safe defaults: max_tokens=700 words, stride=200 words. """ import os import json from pathlib import Path import hashlib from typing import List, Dict import numpy as np import faiss from sentence_transformers import SentenceTransformer # ---- Paths (Objective) DATA_DIR = Path("data") SEED_DIR = DATA_DIR / "seed" DOCS_DIR = Path("docs_out") INDEX_PATH = DATA_DIR / "index.faiss" META_PATH = DATA_DIR / "meta.json" # ---- Embedding model (Objective): small, fast, good enough for MVP EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" def _read_txt(path: Path) -> str: """ Read a UTF-8 text file safely. (Objective) Returns empty string on failure; we guard later. """ try: return path.read_text(encoding="utf-8", errors="ignore") except Exception: return "" def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]: """ Split long text into overlapping word chunks. (Objective) Overlap helps retrieval recall. """ words = text.split() chunks: List[str] = [] i = 0 while i < len(words): chunk_words = words[i : i + max_tokens] chunk = " ".join(chunk_words).strip() if len(chunk) >= 50: # ignore tiny fragments chunks.append(chunk) # Move forward by (max - stride) to create overlap i += max(1, max_tokens - stride) return chunks def build_index() -> str: """ Build the FAISS index from TXT files in data/seed/. Saves index + metadata to disk. Returns a human-readable status. (Objective) """ # Ensure folders exist DATA_DIR.mkdir(exist_ok=True) SEED_DIR.mkdir(parents=True, exist_ok=True) DOCS_DIR.mkdir(exist_ok=True) # Collect all .txt files txt_files = sorted(SEED_DIR.glob("*.txt")) if not txt_files: return "No TXT files found in data/seed/. Add *.txt and try again." # Read and chunk docs: List[str] = [] metas: List[Dict] = [] for fp in txt_files: raw = _read_txt(fp) if not raw or len(raw.strip()) < 50: # Skip empty/near-empty continue chunks = _chunk_text(raw) # Stable ID per source file for nice chunk filenames src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10] for j, ch in enumerate(chunks): outp = DOCS_DIR / f"{src_id}_{j}.txt" outp.write_text(ch, encoding="utf-8") metas.append({ "file": str(fp), "chunk_file": str(outp), "chunk_id": f"{src_id}_{j}", }) docs.append(ch) if not docs: return "Found TXT files, but no usable content (after filtering)." # Embed model = SentenceTransformer(EMBED_MODEL) # normalize_embeddings=True -> inner product becomes cosine-like emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True) # Build FAISS IP index d = emb.shape[1] index = faiss.IndexFlatIP(d) index.add(emb) # Save index + metadata faiss.write_index(index, str(INDEX_PATH)) META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8") return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}." 
if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())
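
# ---- For reference only (assumption): one way app.py might wire the UI button on
# HF Spaces. This sketch assumes a Gradio app, which is common on Spaces but is not
# confirmed by this file; names like `demo` and the button label are illustrative.
#
#   import gradio as gr
#   from indexer import build_index
#
#   with gr.Blocks() as demo:
#       status = gr.Textbox(label="Index status")
#       gr.Button("Build index").click(fn=build_index, outputs=status)
#   demo.launch()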