"""
Step 7: Indexer for TXT + PDF + HTML.
- Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml
- Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index
- Saves:
    data/index.faiss  (vectors)
    data/meta.json    (chunk metadata)
    docs_out/*.txt    (chunk files)
"""
import json, hashlib
from pathlib import Path
from typing import List, Dict
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}

def _read_txt(p: Path) -> str:
    try:
        return p.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""

def _read_pdf(p: Path) -> str:
    try:
        return pdf_extract_text(str(p)) or ""
    except Exception:
        return ""

def _read_html(p: Path) -> str:
    try:
        raw = p.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        # Strip non-content elements before extracting visible text.
        for t in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            t.decompose()
        return " ".join(soup.get_text(" ").split())
    except Exception:
        return ""

def _load_source(p: Path) -> str:
    ext = p.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(p)
    if ext in PDF_EXT:
        return _read_pdf(p)
    if ext in HTML_EXT:
        return _read_html(p)
    return ""

def _chunk(text: str, max_words=700, stride=200) -> List[str]:
    w = text.split()
    out = []
    i = 0
    while i < len(w):
        seg = " ".join(w[i:i + max_words]).strip()
        if len(seg) >= 50:
            out.append(seg)
        i += max(1, max_words - stride)
    return out
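
# Worked example of the sliding window above, derived from the defaults rather
# than from project docs: with max_words=700 and stride=200 the window advances
# 500 words per step, so consecutive chunks share a 200-word overlap, and a
# 1,200-word document yields chunks starting at words 0, 500, and 1000.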

def build_index() -> str:
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)
    files = [p for p in SEED_DIR.iterdir()
             if p.is_file() and p.suffix.lower() in TXT_EXT | PDF_EXT | HTML_EXT]
    if not files:
        return "No files found under data/seed/. Supported: .txt .md .pdf .html .htm .xhtml"
    docs: List[str] = []
    metas: List[Dict] = []
    for p in sorted(files):
        text = _load_source(p)
        if len(text.strip()) < 50:
            continue
        src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
        for j, ch in enumerate(_chunk(text)):
            cf = DOCS_DIR / f"{src_id}_{j}.txt"
            cf.write_text(ch, encoding="utf-8")
            metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
            docs.append(ch)
    if not docs:
        return "Found files but no usable content after parsing."
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."

if __name__ == "__main__":
    print(build_index())