""" | |
Step 3: Minimal indexer for TXT files under data/seed/. | |
What it does (Objective): | |
- Scans data/seed/ for *.txt files | |
- Splits each file into overlapping chunks (for better retrieval) | |
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small) | |
- Builds a FAISS index (inner-product / cosine-like) and saves: | |
- data/index.faiss (the vector index) | |
- data/meta.json (mapping from vector -> source chunk) | |
- docs_out/*.txt (chunked text files for easy loading later) | |
- Returns a short status string for the UI | |
Design notes (Objective): | |
- TXT-only for this step to avoid parser complexity. | |
- We use normalize_embeddings=True so inner product approximates cosine. | |
- Safe defaults: max_tokens=700 words, stride=200 words. | |
""" | |
import json
import hashlib
from pathlib import Path
from typing import List, Dict

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective): small, fast, good enough for MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
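# (Reference note, not load-bearing) all-MiniLM-L6-v2 embeds text into
# 384-dimensional vectors, so the FAISS index built below will have d=384.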
def _read_txt(path: Path) -> str:
    """
    Read a UTF-8 text file safely.
    (Objective) Returns empty string on failure; we guard later.
    """
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""
def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
    """
    Split long text into overlapping word chunks.
    (Objective) Overlap helps retrieval recall.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        chunk_words = words[i : i + max_tokens]
        chunk = " ".join(chunk_words).strip()
        if len(chunk) >= 50:  # ignore tiny fragments (< 50 characters)
            chunks.append(chunk)
        # Move forward by (max_tokens - stride) so consecutive chunks
        # overlap by `stride` words.
        i += max(1, max_tokens - stride)
    return chunks
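# Worked example of the stepping above (illustrative numbers): with the
# defaults max_tokens=700 and stride=200, the loop advances 500 words per
# iteration, so chunks cover words [0, 700), [500, 1200), [1000, 1700), ...
# and each consecutive pair shares 200 words.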
def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.
    Saves index + metadata to disk. Returns a human-readable status.
    (Objective)
    """
    # Ensure folders exist
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty files
            continue
        chunks = _chunk_text(raw)
        # Stable ID per source file for nice chunk filenames
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            # metas[k] describes the k-th vector added to the index below,
            # so this list must stay in the same order as `docs`.
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)

    if not docs:
        return "Found TXT files, but no usable content (after filtering)."
    # Embed
    model = SentenceTransformer(EMBED_MODEL)
    # normalize_embeddings=True -> inner product becomes cosine-like
    emb = model.encode(
        docs,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
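    # (Why normalization matters, for reference): cos(a, b) = a.b / (|a||b|),
    # so for unit vectors the inner product *is* the cosine similarity, and
    # an inner-product index ranks results exactly as a cosine index would.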
    # Build a flat (exact, brute-force) inner-product index; fine at MVP scale
    emb = np.asarray(emb, dtype=np.float32)  # FAISS expects float32
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)
    # Save index + metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."
if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())