# evo-gov-copilot-mu / indexer.py
"""
Step 7: Indexer for TXT + PDF + HTML with a clearer status message.
(Objective)
- Scans data/seed/ for .txt/.md, .pdf, and .html/.htm/.xhtml files (non-recursive for now).
- Extracts text safely:
  - TXT/MD: read as UTF-8 (decode errors ignored)
  - PDF: pdfminer.six
  - HTML: BeautifulSoup (scripts/styles/noscript/header/footer/nav removed)
- Chunks text with overlap for better recall.
- Embeds chunks with all-MiniLM-L6-v2 (L2-normalized) and builds a FAISS inner-product
  index, which on normalized vectors ranks by cosine similarity.
- Saves:
data/index.faiss -> FAISS vector index
data/meta.json -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
docs_out/*.txt -> individual chunk files
Quality-of-life:
- Computes a SHA-1 hash of each source's extracted text to detect when a source changed between runs.
- Returns a status string reporting files seen, chunks built, and updated file count.
"""
import os
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Tuple
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup
# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
# ---- Embedding model (Objective)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# ---- Supported extensions (Objective)
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}
ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT
def _read_txt(path: Path) -> str:
try:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception:
return ""
def _read_pdf(path: Path) -> str:
try:
return pdf_extract_text(str(path)) or ""
except Exception:
return ""
def _read_html(path: Path) -> str:
try:
raw = path.read_bytes()
soup = BeautifulSoup(raw, "lxml")
# remove noisy tags
for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
tag.decompose()
text = " ".join(soup.get_text(separator=" ").split())
return text
except Exception:
return ""
def _load_source(path: Path) -> str:
"""
(Objective) Route by extension and return plain text.
"""
ext = path.suffix.lower()
if ext in TXT_EXT:
return _read_txt(path)
if ext in PDF_EXT:
return _read_pdf(path)
if ext in HTML_EXT:
return _read_html(path)
return ""
def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
"""
(Objective) Split long text into overlapping word chunks.
"""
words = text.split()
chunks: List[str] = []
i = 0
while i < len(words):
seg = " ".join(words[i : i + max_words]).strip()
if len(seg) >= 50: # ignore tiny bits
chunks.append(seg)
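        # advance by (max_words - stride) words, e.g. 700 - 200 = 500 with the defaults,
        # so consecutive chunks overlap by roughly `stride` words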
i += max(1, max_words - stride)
return chunks
def _hash_text(text: str) -> str:
"""
(Objective) Stable hash of text to detect source changes between runs.
"""
return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()
def _load_previous_hashes() -> Dict[str, str]:
"""
(Objective) Load previous (file -> src_hash) mapping from meta.json if present.
"""
if not META_PATH.exists():
return {}
try:
metas = json.loads(META_PATH.read_text(encoding="utf-8"))
hashes = {}
for m in metas:
            # keep the last seen hash per file (every chunk of a file shares the same src_hash)
hashes[m["file"]] = m.get("src_hash", "")
return hashes
except Exception:
return {}
def build_index() -> str:
"""
    (Objective) Build a FAISS index from TXT/MD, PDF, and HTML sources under data/seed/.
Returns a human-friendly status string.
"""
DATA_DIR.mkdir(exist_ok=True)
SEED_DIR.mkdir(parents=True, exist_ok=True)
DOCS_DIR.mkdir(exist_ok=True)
# Collect files (non-recursive)
files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
if not files:
return "No files found under data/seed/. Supported: .txt, .pdf, .html"
prev_hashes = _load_previous_hashes()
docs: List[str] = []
metas: List[Dict] = []
updated_src_files = set()
for fp in sorted(files):
text = _load_source(fp)
if len(text.strip()) < 50:
# skip empty/near-empty sources
continue
src_hash = _hash_text(text)
if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
updated_src_files.add(str(fp))
# Chunk and persist per-chunk text files for quick reading later
chunks = _chunk_text(text)
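        # short, stable per-source id derived from the file path; used to name the chunk files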
src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
for j, ch in enumerate(chunks):
chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
chunk_file.write_text(ch, encoding="utf-8")
metas.append({
"file": str(fp),
"chunk_file": str(chunk_file),
"chunk_id": f"{src_id}_{j}",
"src_hash": src_hash,
})
docs.append(ch)
if not docs:
return "Found files but no usable content after parsing."
# Embed all chunks
model = SentenceTransformer(EMBED_MODEL)
emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
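    # normalize_embeddings=True yields unit-length vectors, so the inner-product
    # index built below effectively ranks by cosine similarity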
# Build FAISS index
d = emb.shape[1]
index = faiss.IndexFlatIP(d)
index.add(emb)
# Save
faiss.write_index(index, str(INDEX_PATH))
META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."
if __name__ == "__main__":
print(build_index())