""" | |
Step 7: Indexer for TXT + PDF + HTML. | |
- Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml | |
- Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index | |
- Saves: | |
data/index.faiss (vectors) | |
data/meta.json (chunk metadata) | |
docs_out/*.txt (chunk files) | |
""" | |
import hashlib
import json
from pathlib import Path
from typing import Dict, List

import faiss
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text as pdf_extract_text
from sentence_transformers import SentenceTransformer
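
# Third-party dependencies implied by the imports above (these are the usual
# PyPI distribution names; adjust for your environment):
#   pip install faiss-cpu sentence-transformers pdfminer.six beautifulsoup4 lxml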

DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}

def _read_txt(p: Path) -> str:
    try:
        return p.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""

def _read_pdf(p: Path) -> str:
    try:
        return pdf_extract_text(str(p)) or ""
    except Exception:
        return ""

def _read_html(p: Path) -> str:
    try:
        soup = BeautifulSoup(p.read_bytes(), "lxml")
        # Drop non-content elements before extracting visible text.
        for t in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            t.decompose()
        return " ".join(soup.get_text(" ").split())
    except Exception:
        return ""

def _load_source(p: Path) -> str:
    """Dispatch to the appropriate reader based on file extension."""
    ext = p.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(p)
    if ext in PDF_EXT:
        return _read_pdf(p)
    if ext in HTML_EXT:
        return _read_html(p)
    return ""

def _chunk(text: str, max_words: int = 700, overlap: int = 200) -> List[str]:
    """Split text into windows of up to max_words words, overlapping by `overlap` words."""
    words = text.split()
    out: List[str] = []
    step = max(1, max_words - overlap)
    i = 0
    while i < len(words):
        seg = " ".join(words[i:i + max_words]).strip()
        if len(seg) >= 50:  # skip tiny fragments
            out.append(seg)
        i += step
    return out

def build_index() -> str:
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    supported = TXT_EXT | PDF_EXT | HTML_EXT
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in supported]
    if not files:
        return "No files found under data/seed/. Supported: .txt .md .pdf .html .htm .xhtml"

    docs: List[str] = []
    metas: List[Dict] = []
    for p in sorted(files):
        text = _load_source(p)
        if len(text.strip()) < 50:
            continue  # skip empty or unparseable sources
        src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
        for j, ch in enumerate(_chunk(text)):
            cf = DOCS_DIR / f"{src_id}_{j}.txt"
            cf.write_text(ch, encoding="utf-8")
            metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
            docs.append(ch)
    if not docs:
        return "Found files but no usable content after parsing."

    # Embed all chunks; normalized embeddings + inner-product index = cosine similarity.
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."
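
# Hedged usage sketch: a minimal example of how the saved index and metadata
# could be queried later. It assumes the same embedding model; `_example_search`,
# `query`, and `top_k` are illustrative names, not part of the original script.
def _example_search(query: str, top_k: int = 5) -> List[Dict]:
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, top_k)
    hits = []
    for score, i in zip(scores[0], ids[0]):
        if i < 0:
            continue  # FAISS pads with -1 when fewer than top_k vectors exist
        hits.append({**metas[i], "score": float(score)})
    return hits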
if __name__ == "__main__": | |
print(build_index()) | |