Create indexer.py
indexer.py  ADDED  (+132, -0)
@@ -0,0 +1,132 @@
"""
Step 3: Minimal indexer for TXT files under data/seed/.

What it does (Objective):
- Scans data/seed/ for *.txt files
- Splits each file into overlapping chunks (for better retrieval)
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- Builds a FAISS index (inner-product / cosine-like) and saves:
    - data/index.faiss (the vector index)
    - data/meta.json (mapping from vector -> source chunk)
    - docs_out/*.txt (chunked text files for easy loading later)
- Returns a short status string for the UI

Design notes (Objective):
- TXT-only for this step to avoid parser complexity.
- We use normalize_embeddings=True so inner product approximates cosine.
- Safe defaults: max_tokens=700 words, stride=200 words.
"""

import os
import json
from pathlib import Path
import hashlib
from typing import List, Dict

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective): small, fast, good enough for MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def _read_txt(path: Path) -> str:
    """
    Read a UTF-8 text file safely.
    (Objective) Returns empty string on failure; we guard later.
    """
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
    """
    Split long text into overlapping word chunks.
    (Objective) Overlap helps retrieval recall.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        chunk_words = words[i : i + max_tokens]
        chunk = " ".join(chunk_words).strip()
        if len(chunk) >= 50:  # ignore tiny fragments
            chunks.append(chunk)
        # Move forward by (max - stride) to create overlap
        i += max(1, max_tokens - stride)
    return chunks


def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.
    Saves index + metadata to disk. Returns a human-readable status.
    (Objective)
    """
    # Ensure folders exist
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty
            continue

        chunks = _chunk_text(raw)
        # Stable ID per source file for nice chunk filenames
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)

    if not docs:
        return "Found TXT files, but no usable content (after filtering)."

    # Embed
    model = SentenceTransformer(EMBED_MODEL)
    # normalize_embeddings=True -> inner product becomes cosine-like
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS IP index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index + metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."


if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())