""" | |
Step 7: Indexer for TXT + PDF + HTML with a clearer status message. | |
(Objective) | |
- Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now). | |
- Extracts text safely: | |
- TXT: read as UTF-8 | |
- PDF: pdfminer.six | |
- HTML: BeautifulSoup (remove scripts/styles/nav) | |
- Chunks text with overlap for better recall. | |
- Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like). | |
- Saves: | |
data/index.faiss -> FAISS vector index | |
data/meta.json -> list of chunk metadata (file, chunk_file, chunk_id, src_hash) | |
docs_out/*.txt -> individual chunk files | |
Quality-of-life: | |
- Computes a hash per *source file content* to detect when a source changed. | |
- Returns a status string reporting files seen, chunks built, and updated file count. | |
""" | |
import os
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Tuple

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup
# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# ---- Supported extensions (Objective)
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}
ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT
def _read_txt(path: Path) -> str:
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _read_pdf(path: Path) -> str:
    try:
        return pdf_extract_text(str(path)) or ""
    except Exception:
        return ""


def _read_html(path: Path) -> str:
    try:
        raw = path.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        # remove noisy tags
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            tag.decompose()
        text = " ".join(soup.get_text(separator=" ").split())
        return text
    except Exception:
        return ""
def _load_source(path: Path) -> str:
    """
    (Objective) Route by extension and return plain text.
    """
    ext = path.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(path)
    if ext in PDF_EXT:
        return _read_pdf(path)
    if ext in HTML_EXT:
        return _read_html(path)
    return ""
def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
    """
    (Objective) Split long text into overlapping word chunks.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        seg = " ".join(words[i : i + max_words]).strip()
        if len(seg) >= 50:  # ignore tiny bits
            chunks.append(seg)
        i += max(1, max_words - stride)
    return chunks
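# With the defaults above, the window advances by max_words - stride = 500 words,
# so chunk k starts at word 500*k and consecutive chunks share roughly 200 words
# (the final window may be shorter). For example, a 1,200-word document yields
# chunks covering words 0-699, 500-1199, and 1000-1199.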
def _hash_text(text: str) -> str:
    """
    (Objective) Stable hash of the extracted text, used to detect source changes between runs.
    """
    return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()


def _load_previous_hashes() -> Dict[str, str]:
    """
    (Objective) Load the previous (file -> src_hash) mapping from meta.json, if present.
    """
    if not META_PATH.exists():
        return {}
    try:
        metas = json.loads(META_PATH.read_text(encoding="utf-8"))
        hashes: Dict[str, str] = {}
        for m in metas:
            # keep the last seen hash per file (every chunk of a file carries the same src_hash)
            hashes[m["file"]] = m.get("src_hash", "")
        return hashes
    except Exception:
        return {}
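# For reference, each entry written to meta.json by build_index() below has this
# shape (illustrative values; the chunk_id prefix is the first 10 hex chars of the
# md5 of the source path):
#   {
#     "file": "data/seed/example.pdf",
#     "chunk_file": "docs_out/0123456789_0.txt",
#     "chunk_id": "0123456789_0",
#     "src_hash": "<sha1 of the extracted text>"
#   }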
def build_index() -> str:
    """
    (Objective) Build a FAISS index from the TXT/MD + PDF + HTML files under data/seed/.
    Returns a human-friendly status string.
    """
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect files (non-recursive)
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
    if not files:
        return "No files found under data/seed/. Supported: .txt, .md, .pdf, .html/.htm/.xhtml"

    prev_hashes = _load_previous_hashes()
    docs: List[str] = []
    metas: List[Dict] = []
    updated_src_files = set()

    for fp in sorted(files):
        text = _load_source(fp)
        if len(text.strip()) < 50:
            # skip empty/near-empty sources
            continue
        src_hash = _hash_text(text)
        if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
            updated_src_files.add(str(fp))

        # Chunk and persist per-chunk text files for quick reading later
        chunks = _chunk_text(text)
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
        for j, ch in enumerate(chunks):
            chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
            chunk_file.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(chunk_file),
                "chunk_id": f"{src_id}_{j}",
                "src_hash": src_hash,
            })
            docs.append(ch)

    if not docs:
        return "Found files, but no usable content after parsing."

    # Embed all chunks (normalized, so inner product == cosine similarity)
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS inner-product index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index and metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    changed_note = (
        f"{len(updated_src_files)} source(s) updated since last index."
        if prev_hashes
        else "Initial index build."
    )
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."
if __name__ == "__main__":
    print(build_index())