# evo-gov-copilot-mu / indexer.py
"""
Step 7: Indexer for TXT + PDF + HTML with a clearer status message.
(Objective)
- Scans data/seed/ for .txt/.md, .pdf, and .html/.htm/.xhtml files (non-recursive for now).
- Extracts text safely:
  - TXT/MD: read as UTF-8 (decode errors ignored)
  - PDF: pdfminer.six
  - HTML: BeautifulSoup (scripts/styles/noscript/header/footer/nav removed)
- Chunks text with overlap for better recall.
- Embeds chunks with all-MiniLM-L6-v2 (L2-normalized) and builds a FAISS inner-product
  index, which on normalized vectors ranks by cosine similarity.
- Saves:
data/index.faiss -> FAISS vector index
data/meta.json -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
docs_out/*.txt -> individual chunk files
Quality-of-life:
- Computes a SHA-1 hash of each source's extracted text to detect when a source changed between runs.
- Returns a status string reporting files seen, chunks built, and updated file count.
"""
import os
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Tuple
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup
# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
# ---- Embedding model (Objective)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# ---- Supported extensions (Objective)
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}
ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT
def _read_txt(path: Path) -> str:
try:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception:
return ""
def _read_pdf(path: Path) -> str:
try:
return pdf_extract_text(str(path)) or ""
except Exception:
return ""
def _read_html(path: Path) -> str:
try:
raw = path.read_bytes()
soup = BeautifulSoup(raw, "lxml")
# remove noisy tags
for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
tag.decompose()
text = " ".join(soup.get_text(separator=" ").split())
return text
except Exception:
return ""
def _load_source(path: Path) -> str:
"""
(Objective) Route by extension and return plain text.
"""
ext = path.suffix.lower()
if ext in TXT_EXT:
return _read_txt(path)
if ext in PDF_EXT:
return _read_pdf(path)
if ext in HTML_EXT:
return _read_html(path)
return ""
def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
"""
(Objective) Split long text into overlapping word chunks.
"""
words = text.split()
chunks: List[str] = []
i = 0
while i < len(words):
seg = " ".join(words[i : i + max_words]).strip()
if len(seg) >= 50: # ignore tiny bits
chunks.append(seg)
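        # advance by (max_words - stride) words, e.g. 700 - 200 = 500 with the defaults,
        # so consecutive chunks overlap by roughly `stride` words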
i += max(1, max_words - stride)
return chunks
def _hash_text(text: str) -> str:
"""
(Objective) Stable hash of text to detect source changes between runs.
"""
return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()
def _load_previous_hashes() -> Dict[str, str]:
"""
(Objective) Load previous (file -> src_hash) mapping from meta.json if present.
"""
if not META_PATH.exists():
return {}
try:
metas = json.loads(META_PATH.read_text(encoding="utf-8"))
hashes = {}
for m in metas:
            # keep the last seen hash per file (every chunk of a file shares the same src_hash)
hashes[m["file"]] = m.get("src_hash", "")
return hashes
except Exception:
return {}
def build_index() -> str:
"""
    (Objective) Build a FAISS index from TXT/MD, PDF, and HTML sources under data/seed/.
Returns a human-friendly status string.
"""
DATA_DIR.mkdir(exist_ok=True)
SEED_DIR.mkdir(parents=True, exist_ok=True)
DOCS_DIR.mkdir(exist_ok=True)
# Collect files (non-recursive)
files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
if not files:
return "No files found under data/seed/. Supported: .txt, .pdf, .html"
prev_hashes = _load_previous_hashes()
docs: List[str] = []
metas: List[Dict] = []
updated_src_files = set()
for fp in sorted(files):
text = _load_source(fp)
if len(text.strip()) < 50:
# skip empty/near-empty sources
continue
src_hash = _hash_text(text)
if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
updated_src_files.add(str(fp))
# Chunk and persist per-chunk text files for quick reading later
chunks = _chunk_text(text)
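        # short, stable per-source id derived from the file path; used to name the chunk files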
src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
for j, ch in enumerate(chunks):
chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
chunk_file.write_text(ch, encoding="utf-8")
metas.append({
"file": str(fp),
"chunk_file": str(chunk_file),
"chunk_id": f"{src_id}_{j}",
"src_hash": src_hash,
})
docs.append(ch)
if not docs:
return "Found files but no usable content after parsing."
# Embed all chunks
model = SentenceTransformer(EMBED_MODEL)
emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
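    # normalize_embeddings=True yields unit-length vectors, so the inner-product
    # index built below effectively ranks by cosine similarity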
# Build FAISS index
d = emb.shape[1]
index = faiss.IndexFlatIP(d)
index.add(emb)
# Save
faiss.write_index(index, str(INDEX_PATH))
META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."
if __name__ == "__main__":
print(build_index())