# evo-gov-copilot-mu / indexer.py
"""
Step 3: Minimal indexer for TXT files under data/seed/.
What it does (Objective):
- Scans data/seed/ for *.txt files
- Splits each file into overlapping chunks (for better retrieval)
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- Builds a FAISS index (inner-product / cosine-like) and saves:
    - data/index.faiss (the vector index)
    - data/meta.json (mapping from vector -> source chunk)
    - docs_out/*.txt (chunked text files for easy loading later)
- Returns a short status string for the UI
Design notes (Objective):
- TXT-only for this step to avoid parser complexity.
- We use normalize_embeddings=True so inner product approximates cosine.
- Safe defaults: max_tokens=700 words per chunk, stride=200 words of overlap between consecutive chunks.
"""
import os
import json
from pathlib import Path
import hashlib
from typing import List, Dict
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
# ---- Embedding model (Objective): small, fast, good enough for MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
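# Note: all-MiniLM-L6-v2 outputs 384-dimensional sentence embeddings; the index
# dimension below is taken from the encoder output rather than hard-coded.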
def _read_txt(path: Path) -> str:
"""
Read a UTF-8 text file safely.
(Objective) Returns empty string on failure; we guard later.
"""
try:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception:
return ""
def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
"""
Split long text into overlapping word chunks.
(Objective) Overlap helps retrieval recall.
"""
words = text.split()
chunks: List[str] = []
i = 0
while i < len(words):
chunk_words = words[i : i + max_tokens]
chunk = " ".join(chunk_words).strip()
if len(chunk) >= 50: # ignore tiny fragments
chunks.append(chunk)
# Move forward by (max - stride) to create overlap
i += max(1, max_tokens - stride)
return chunks
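# Worked example of the chunking arithmetic (illustrative numbers): with the
# defaults max_tokens=700 and stride=200, the window advances by 700 - 200 = 500
# words per step, so a 1,200-word file produces chunks starting at words 0, 500,
# and 1000, and each consecutive pair of chunks shares 200 words.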
def build_index() -> str:
"""
Build the FAISS index from TXT files in data/seed/.
Saves index + metadata to disk. Returns a human-readable status.
(Objective)
"""
# Ensure folders exist
DATA_DIR.mkdir(exist_ok=True)
SEED_DIR.mkdir(parents=True, exist_ok=True)
DOCS_DIR.mkdir(exist_ok=True)
# Collect all .txt files
txt_files = sorted(SEED_DIR.glob("*.txt"))
if not txt_files:
return "No TXT files found in data/seed/. Add *.txt and try again."
# Read and chunk
docs: List[str] = []
metas: List[Dict] = []
for fp in txt_files:
raw = _read_txt(fp)
if not raw or len(raw.strip()) < 50:
# Skip empty/near-empty
continue
chunks = _chunk_text(raw)
# Stable ID per source file for nice chunk filenames
src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
for j, ch in enumerate(chunks):
outp = DOCS_DIR / f"{src_id}_{j}.txt"
outp.write_text(ch, encoding="utf-8")
metas.append({
"file": str(fp),
"chunk_file": str(outp),
"chunk_id": f"{src_id}_{j}",
})
docs.append(ch)
if not docs:
return "Found TXT files, but no usable content (after filtering)."
# Embed
model = SentenceTransformer(EMBED_MODEL)
# normalize_embeddings=True -> inner product becomes cosine-like
emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
# Build FAISS IP index
d = emb.shape[1]
index = faiss.IndexFlatIP(d)
index.add(emb)
# Save index + metadata
faiss.write_index(index, str(INDEX_PATH))
META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."
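
# Minimal retrieval sketch showing how the saved artifacts (index.faiss +
# meta.json) are meant to be consumed later, e.g. from app.py. The function
# name `search` and its `top_k` parameter are illustrative, not part of this
# step's required interface.
def search(query: str, top_k: int = 5) -> List[Dict]:
    """Embed a query and return metadata for the top_k most similar chunks."""
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, top_k)  # inner product ~ cosine on normalized vectors
    results: List[Dict] = []
    for score, idx in zip(scores[0], ids[0]):
        if idx == -1:  # FAISS pads with -1 when fewer than top_k vectors exist
            continue
        results.append({**metas[idx], "score": float(score)})
    return results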
if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())