HemanM commited on
Commit
c18afb0
·
verified ·
1 Parent(s): c39e2ac

Create indexer.py

Browse files
Files changed (1) hide show
  1. indexer.py +132 -0
indexer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 3: Minimal indexer for TXT files under data/seed/.
3
+
4
+ What it does (Objective):
5
+ - Scans data/seed/ for *.txt files
6
+ - Splits each file into overlapping chunks (for better retrieval)
7
+ - Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
8
+ - Builds a FAISS index (inner-product / cosine-like) and saves:
9
+ - data/index.faiss (the vector index)
10
+ - data/meta.json (mapping from vector -> source chunk)
11
+ - docs_out/*.txt (chunked text files for easy loading later)
12
+ - Returns a short status string for the UI
13
+
14
+ Design notes (Objective):
15
+ - TXT-only for this step to avoid parser complexity.
16
+ - We use normalize_embeddings=True so inner product approximates cosine.
17
+ - Safe defaults: max_tokens=700 words, stride=200 words.
18
+ """
19
+
20
+ import os
21
+ import json
22
+ from pathlib import Path
23
+ import hashlib
24
+ from typing import List, Dict
25
+
26
+ import numpy as np
27
+ import faiss
28
+ from sentence_transformers import SentenceTransformer
29
+
30
+ # ---- Paths (Objective)
31
+ DATA_DIR = Path("data")
32
+ SEED_DIR = DATA_DIR / "seed"
33
+ DOCS_DIR = Path("docs_out")
34
+ INDEX_PATH = DATA_DIR / "index.faiss"
35
+ META_PATH = DATA_DIR / "meta.json"
36
+
37
+ # ---- Embedding model (Objective): small, fast, good enough for MVP
38
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
39
+
40
+
41
+ def _read_txt(path: Path) -> str:
42
+ """
43
+ Read a UTF-8 text file safely.
44
+ (Objective) Returns empty string on failure; we guard later.
45
+ """
46
+ try:
47
+ return path.read_text(encoding="utf-8", errors="ignore")
48
+ except Exception:
49
+ return ""
50
+
51
+
52
+ def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
53
+ """
54
+ Split long text into overlapping word chunks.
55
+ (Objective) Overlap helps retrieval recall.
56
+ """
57
+ words = text.split()
58
+ chunks: List[str] = []
59
+ i = 0
60
+ while i < len(words):
61
+ chunk_words = words[i : i + max_tokens]
62
+ chunk = " ".join(chunk_words).strip()
63
+ if len(chunk) >= 50: # ignore tiny fragments
64
+ chunks.append(chunk)
65
+ # Move forward by (max - stride) to create overlap
66
+ i += max(1, max_tokens - stride)
67
+ return chunks
68
+
69
+
70
def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.

    Pipeline (Objective):
      1. Ensure data/, data/seed/ and docs_out/ exist.
      2. Read every *.txt under data/seed/, skipping empty/near-empty files.
      3. Split each file into overlapping word chunks and persist each chunk
         under docs_out/ so it can be reloaded later without re-chunking.
      4. Embed all chunks with all-MiniLM-L6-v2; embeddings are normalized so
         inner product approximates cosine similarity.
      5. Save data/index.faiss (vectors) and data/meta.json
         (vector position -> source chunk mapping).

    Returns:
        A short, human-readable status string for the UI.
    """
    # Ensure folders exist before any reads/writes.
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files; sorted() keeps chunk/vector order deterministic.
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk. metas[i] describes the source of vector i in the index.
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty files.
            continue

        chunks = _chunk_text(raw)
        # Stable ID per source file for nice chunk filenames.
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)

    if not docs:
        return "Found TXT files, but no usable content (after filtering)."

    # Embed. normalize_embeddings=True -> inner product becomes cosine-like.
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(
        docs,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    # FAISS requires a C-contiguous float32 matrix. encode() normally returns
    # one already, but cast defensively so a dtype/layout change upstream
    # cannot break index.add() with a hard-to-diagnose error.
    emb = np.ascontiguousarray(emb, dtype=np.float32)

    # Build a flat inner-product index (exact search; fine at MVP scale).
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index + metadata side by side; vector i corresponds to metas[i].
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."
127
+
128
+
129
if __name__ == "__main__":
    # Local convenience entry point: `python indexer.py`.
    # On HF Spaces, app.py's UI button calls build_index() directly instead.
    status = build_index()
    print(status)