""" | |
Step 7: Indexer for TXT + PDF + HTML. | |
- Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml | |
- Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index | |
- Saves: | |
data/index.faiss (vectors) | |
data/meta.json (chunk metadata) | |
docs_out/*.txt (chunk files) | |
""" | |
import hashlib
import json
from pathlib import Path
from typing import Dict, List

import faiss
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text as pdf_extract_text
from sentence_transformers import SentenceTransformer
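
# Third-party dependencies implied by the imports above (these are the usual
# PyPI distribution names; adjust for your environment):
#   pip install faiss-cpu sentence-transformers pdfminer.six beautifulsoup4 lxml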

DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}

def _read_txt(p: Path) -> str:
    try:
        return p.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""

def _read_pdf(p: Path) -> str:
    try:
        return pdf_extract_text(str(p)) or ""
    except Exception:
        return ""

def _read_html(p: Path) -> str:
    try:
        soup = BeautifulSoup(p.read_bytes(), "lxml")
        # Drop non-content elements before extracting visible text.
        for t in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            t.decompose()
        return " ".join(soup.get_text(" ").split())
    except Exception:
        return ""

def _load_source(p: Path) -> str:
    """Dispatch to the appropriate reader based on file extension."""
    ext = p.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(p)
    if ext in PDF_EXT:
        return _read_pdf(p)
    if ext in HTML_EXT:
        return _read_html(p)
    return ""

def _chunk(text: str, max_words: int = 700, overlap: int = 200) -> List[str]:
    """Split text into windows of up to max_words words, overlapping by `overlap` words."""
    words = text.split()
    out: List[str] = []
    step = max(1, max_words - overlap)
    i = 0
    while i < len(words):
        seg = " ".join(words[i:i + max_words]).strip()
        if len(seg) >= 50:  # skip tiny fragments
            out.append(seg)
        i += step
    return out

def build_index() -> str:
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    supported = TXT_EXT | PDF_EXT | HTML_EXT
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in supported]
    if not files:
        return "No files found under data/seed/. Supported: .txt .md .pdf .html .htm .xhtml"

    docs: List[str] = []
    metas: List[Dict] = []
    for p in sorted(files):
        text = _load_source(p)
        if len(text.strip()) < 50:
            continue  # skip empty or unparseable sources
        src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
        for j, ch in enumerate(_chunk(text)):
            cf = DOCS_DIR / f"{src_id}_{j}.txt"
            cf.write_text(ch, encoding="utf-8")
            metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
            docs.append(ch)
    if not docs:
        return "Found files but no usable content after parsing."

    # Embed all chunks; normalized embeddings + inner-product index = cosine similarity.
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."
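
# Hedged usage sketch: a minimal example of how the saved index and metadata
# could be queried later. It assumes the same embedding model; `_example_search`,
# `query`, and `top_k` are illustrative names, not part of the original script.
def _example_search(query: str, top_k: int = 5) -> List[Dict]:
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, top_k)
    hits = []
    for score, i in zip(scores[0], ids[0]):
        if i < 0:
            continue  # FAISS pads with -1 when fewer than top_k vectors exist
        hits.append({**metas[i], "score": float(score)})
    return hits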
if __name__ == "__main__": | |
print(build_index()) | |