"""
Step 3: Minimal indexer for TXT files under data/seed/.
What it does (Objective):
- Scans data/seed/ for *.txt files
- Splits each file into overlapping chunks (for better retrieval)
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- Builds a FAISS index (inner-product / cosine-like) and saves:
- data/index.faiss (the vector index)
- data/meta.json (mapping from vector -> source chunk)
- docs_out/*.txt (chunked text files for easy loading later)
- Returns a short status string for the UI
Design notes (Objective):
- TXT-only for this step to avoid parser complexity.
- We use normalize_embeddings=True so inner product approximates cosine.
- Safe defaults: max_tokens=700 words, stride=200 words.
"""
import os
import json
from pathlib import Path
import hashlib
from typing import List, Dict
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# ---- Paths
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
# ---- Embedding model: small, fast, and good enough for an MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
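# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the FAISS
# index built below will have d = 384.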

def _read_txt(path: Path) -> str:
    """
    Read a UTF-8 text file safely.
    Returns an empty string on failure; callers guard against that.
    """
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""

def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
    """
    Split long text into overlapping word chunks.
    Overlap between consecutive chunks helps retrieval recall.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        chunk_words = words[i : i + max_tokens]
        chunk = " ".join(chunk_words).strip()
        if len(chunk) >= 50:  # ignore tiny fragments (under 50 characters)
            chunks.append(chunk)
        # Advance by (max_tokens - stride) so consecutive chunks
        # overlap by `stride` words.
        i += max(1, max_tokens - stride)
    return chunks
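
# Toy illustration of the chunking arithmetic (hypothetical numbers, not used
# at runtime): with max_tokens=5 and stride=2 the window advances by 3 words,
# so consecutive chunks share 2 words:
#
#   words:   a b c d e f g h
#   chunk 0: a b c d e
#   chunk 1:       d e f g h
#
# (The 50-character minimum above would drop chunks this small in practice;
# the numbers only illustrate the overlap.)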

def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.
    Saves the index and metadata to disk; returns a human-readable status.
    """
    # Ensure folders exist
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty files
            continue
        chunks = _chunk_text(raw)
        # Stable ID per source file for readable chunk filenames
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)
    if not docs:
        return "Found TXT files, but no usable content (after filtering)."

    # Embed all chunks in one call
    model = SentenceTransformer(EMBED_MODEL)
    # normalize_embeddings=True -> inner product equals cosine similarity
    emb = model.encode(
        docs,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True,
    )

    # Build a flat (exact, brute-force) FAISS inner-product index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index + metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."

if __name__ == "__main__":
    # Allow running `python indexer.py` locally. On HF Spaces we call
    # build_index() from the UI button in app.py.
    print(build_index())