Update indexer.py
indexer.py  CHANGED  (+48 -154)

Previous version (removed):
@@ -1,29 +1,17 @@
"""
Step 7: Indexer for TXT + PDF + HTML

(Objective)
- Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now).
- Extracts text safely:
  - TXT: read as UTF-8
  - PDF: pdfminer.six
  - HTML: BeautifulSoup (remove scripts/styles/nav)
- Chunks text with overlap for better recall.
- Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like).
- Saves:
  data/index.faiss -> FAISS vector index
  data/meta.json   -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
  docs_out/*.txt   -> individual chunk files
"""

import json
import hashlib
from pathlib import Path
from typing import List, Dict

import faiss
import numpy as np
@@ -31,167 +19,73 @@ from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# ---- Supported extensions (Objective)
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}
ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT


def _read_txt(path: Path) -> str:
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _read_pdf(path: Path) -> str:
    try:
        return pdf_extract_text(str(path)) or ""
    except Exception:
        return ""


def _read_html(path: Path) -> str:
    try:
        raw = path.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        # strip non-content tags before extracting text
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            tag.decompose()
        text = soup.get_text(" ")
        return " ".join(text.split())
    except Exception:
        return ""


def _load_source(path: Path) -> str:
    """
    (Objective) Route by extension and return plain text.
    """
    ext = path.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(path)
    if ext in PDF_EXT:
        return _read_pdf(path)
    if ext in HTML_EXT:
        return _read_html(path)
    return ""


def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
    """
    (Objective) Split text into overlapping word-window chunks.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        seg = " ".join(words[i : i + max_words]).strip()
        if len(seg) >= 50:  # ignore tiny bits
            chunks.append(seg)
        i += max(1, max_words - stride)
    return chunks


def _hash_text(text: str) -> str:
    """
    (Objective) Stable hash of text to detect source changes between runs.
    """
    return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()


def _load_previous_hashes() -> Dict[str, str]:
    """
    (Objective) Load previous (file -> src_hash) mapping from meta.json if present.
    """
    if not META_PATH.exists():
        return {}
    try:
        metas = json.loads(META_PATH.read_text(encoding="utf-8"))
        hashes = {}
        for m in metas:
            # save the last seen hash per file (any chunk has same src_hash)
            hashes[m["file"]] = m.get("src_hash", "")
        return hashes
    except Exception:
        return {}


def build_index() -> str:
    """
    (Objective) Scan data/seed/, chunk, embed, and write the FAISS index plus metadata.
    """
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
    if not files:
        return "No files found under data/seed/. Supported: .txt .pdf .html"

    prev_hashes = _load_previous_hashes()

    docs: List[str] = []
    metas: List[Dict] = []
    updated_src_files = set()

    for fp in sorted(files):
        text = _load_source(fp)
        if len(text.strip()) < 50:
            # skip empty/near-empty sources
            continue

        src_hash = _hash_text(text)
        if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
            updated_src_files.add(str(fp))

        # Chunk and persist per-chunk text files for quick reading later
        chunks = _chunk_text(text)
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
            chunk_file.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(chunk_file),
                "chunk_id": f"{src_id}_{j}",
                "src_hash": src_hash,
            })
            docs.append(ch)

    if not docs:
        return "Found files but no usable content after parsing."

    # Embed all chunks
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."


if __name__ == "__main__":
    print(build_index())
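Both versions step the chunk window forward by max(1, max_words - stride) words, so consecutive chunks share their last stride words; that overlap is what the docstring means by chunking "with overlap for better recall": a passage that straddles a window boundary still appears whole in at least one chunk. Below is a minimal sketch of that stepping with toy words and illustrative window sizes (the indexer's own defaults are max_words=700, stride=200); the 50-character minimum-length filter is skipped here.

# Toy walk-through of the chunking window (illustrative sizes, not the real defaults).
words = [f"w{i}" for i in range(12)]
max_words, stride = 5, 2

starts, i = [], 0
while i < len(words):
    starts.append(i)                  # chunk covers words[i : i + max_words]
    i += max(1, max_words - stride)   # advance by max_words - stride = 3

print(starts)                         # [0, 3, 6, 9]
print(words[3:5])                     # ['w3', 'w4'] -> shared by chunk 0 and chunk 1

With the real defaults, each 700-word chunk shares its last 200 words with the next chunk.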
Updated version:

"""
Step 7: Indexer for TXT + PDF + HTML.

- Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml
- Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index
- Saves:
    data/index.faiss (vectors)
    data/meta.json   (chunk metadata)
    docs_out/*.txt   (chunk files)
"""

import json, hashlib
from pathlib import Path
from typing import List, Dict

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup

DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}

def _read_txt(p: Path) -> str:
    try: return p.read_text(encoding="utf-8", errors="ignore")
    except: return ""

def _read_pdf(p: Path) -> str:
    try: return pdf_extract_text(str(p)) or ""
    except: return ""

def _read_html(p: Path) -> str:
    try:
        raw = p.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        for t in soup(["script","style","noscript","header","footer","nav"]): t.decompose()
        return " ".join(soup.get_text(" ").split())
    except: return ""

def _load_source(p: Path) -> str:
    ext = p.suffix.lower()
    if ext in TXT_EXT: return _read_txt(p)
    if ext in PDF_EXT: return _read_pdf(p)
    if ext in HTML_EXT: return _read_html(p)
    return ""

def _chunk(text: str, max_words=700, stride=200) -> List[str]:
    w = text.split(); out=[]; i=0
    while i < len(w):
        seg = " ".join(w[i:i+max_words]).strip()
        if len(seg) >= 50: out.append(seg)
        i += max(1, max_words - stride)
    return out

def build_index() -> str:
    DATA_DIR.mkdir(exist_ok=True); SEED_DIR.mkdir(parents=True, exist_ok=True); DOCS_DIR.mkdir(exist_ok=True)
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in TXT_EXT|PDF_EXT|HTML_EXT]
    if not files: return "No files found under data/seed/. Supported: .txt .pdf .html"

    docs: List[str] = []; metas: List[Dict] = []
    for p in sorted(files):
        text = _load_source(p)
        if len(text.strip()) < 50: continue
        src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
        for j, ch in enumerate(_chunk(text)):
            cf = DOCS_DIR / f"{src_id}_{j}.txt"
            cf.write_text(ch, encoding="utf-8")
            metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
            docs.append(ch)

    if not docs: return "Found files but no usable content after parsing."

    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(emb.shape[1]); index.add(emb)
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."

if __name__ == "__main__":
    print(build_index())
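Because build_index() stores L2-normalized embeddings in a faiss.IndexFlatIP, inner-product search scores behave like cosine similarity, which is what the earlier docstring calls "cosine-like". A minimal retrieval sketch against the saved artifacts might look like the following; the search() helper, its top_k default, and the example query are illustrative assumptions, not part of this Space.

# Hypothetical query sketch (not part of this repo): loads the artifacts written by
# build_index() and runs a top-k search. Assumes data/index.faiss and data/meta.json exist.
import json
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH = Path("data/index.faiss")
META_PATH = Path("data/meta.json")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

def search(query: str, top_k: int = 3):
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)

    # Normalized query vector + IndexFlatIP => inner product equals cosine similarity.
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, top_k)

    hits = []
    for score, idx in zip(scores[0], ids[0]):
        if idx == -1:   # fewer than top_k chunks in the index
            continue
        m = metas[idx]  # meta.json rows are written in the same order as the added vectors
        chunk_text = Path(m["chunk_file"]).read_text(encoding="utf-8")
        hits.append((float(score), m["file"], m["chunk_id"], chunk_text[:200]))
    return hits

if __name__ == "__main__":
    for score, src, cid, preview in search("what does the indexer do?"):
        print(f"{score:.3f}  {cid}  {src}\n  {preview}\n")

This sketch relies on meta.json entries being appended in the same order as the vectors are added to the index, which both versions of build_index() do, so a FAISS row id can be used directly as a list index into the metadata.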