HemanM committed
Commit be24b1a · verified · 1 Parent(s): ef43d13

Update indexer.py

Files changed (1)
  1. indexer.py  +125 -60
indexer.py CHANGED
@@ -1,31 +1,35 @@
  """
- Step 3: Minimal indexer for TXT files under data/seed/.
-
- What it does (Objective):
- - Scans data/seed/ for *.txt files
- - Splits each file into overlapping chunks (for better retrieval)
- - Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- - Builds a FAISS index (inner-product / cosine-like) and saves:
-     - data/index.faiss (the vector index)
-     - data/meta.json (mapping from vector -> source chunk)
-     - docs_out/*.txt (chunked text files for easy loading later)
- - Returns a short status string for the UI
-
- Design notes (Objective):
- - TXT-only for this step to avoid parser complexity.
- - We use normalize_embeddings=True so inner product approximates cosine.
- - Safe defaults: max_tokens=700 words, stride=200 words.
  """

  import os
  import json
- from pathlib import Path
  import hashlib
- from typing import List, Dict

- import numpy as np
  import faiss
  from sentence_transformers import SentenceTransformer

  # ---- Paths (Objective)
  DATA_DIR = Path("data")

@@ -34,99 +38,160 @@ DOCS_DIR = Path("docs_out")
  INDEX_PATH = DATA_DIR / "index.faiss"
  META_PATH = DATA_DIR / "meta.json"

- # ---- Embedding model (Objective): small, fast, good enough for MVP
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


  def _read_txt(path: Path) -> str:
-     """
-     Read a UTF-8 text file safely.
-     (Objective) Returns empty string on failure; we guard later.
-     """
      try:
          return path.read_text(encoding="utf-8", errors="ignore")
      except Exception:
          return ""


- def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
      """
-     Split long text into overlapping word chunks.
-     (Objective) Overlap helps retrieval recall.
      """
      words = text.split()
      chunks: List[str] = []
      i = 0
      while i < len(words):
-         chunk_words = words[i : i + max_tokens]
-         chunk = " ".join(chunk_words).strip()
-         if len(chunk) >= 50:  # ignore tiny fragments
-             chunks.append(chunk)
-         # Move forward by (max - stride) to create overlap
-         i += max(1, max_tokens - stride)
      return chunks


  def build_index() -> str:
      """
-     Build the FAISS index from TXT files in data/seed/.
-     Saves index + metadata to disk. Returns a human-readable status.
-     (Objective)
      """
-     # Ensure folders exist
      DATA_DIR.mkdir(exist_ok=True)
      SEED_DIR.mkdir(parents=True, exist_ok=True)
      DOCS_DIR.mkdir(exist_ok=True)

-     # Collect all .txt files
-     txt_files = sorted(SEED_DIR.glob("*.txt"))
-     if not txt_files:
-         return "No TXT files found in data/seed/. Add *.txt and try again."

-     # Read and chunk
      docs: List[str] = []
      metas: List[Dict] = []
-     for fp in txt_files:
-         raw = _read_txt(fp)
-         if not raw or len(raw.strip()) < 50:
-             # Skip empty/near-empty
              continue

-         chunks = _chunk_text(raw)
-         # Stable ID per source file for nice chunk filenames
          src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

          for j, ch in enumerate(chunks):
-             outp = DOCS_DIR / f"{src_id}_{j}.txt"
-             outp.write_text(ch, encoding="utf-8")
              metas.append({
                  "file": str(fp),
-                 "chunk_file": str(outp),
                  "chunk_id": f"{src_id}_{j}",
              })
              docs.append(ch)

      if not docs:
-         return "Found TXT files, but no usable content (after filtering)."

-     # Embed
      model = SentenceTransformer(EMBED_MODEL)
-     # normalize_embeddings=True -> inner product becomes cosine-like
      emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

-     # Build FAISS IP index
      d = emb.shape[1]
      index = faiss.IndexFlatIP(d)
      index.add(emb)

-     # Save index + metadata
      faiss.write_index(index, str(INDEX_PATH))
      META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

-     return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."
-

  if __name__ == "__main__":
-     # Allow running `python indexer.py` locally. In HF Spaces
-     # we'll call build_index() from the UI button in app.py.
      print(build_index())
 
Updated file (added lines marked with "+"):

  """
+ Step 7: Indexer for TXT + PDF + HTML with a clearer status message.
+
+ (Objective)
+ - Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now).
+ - Extracts text safely:
+     - TXT: read as UTF-8
+     - PDF: pdfminer.six
+     - HTML: BeautifulSoup (remove scripts/styles/nav)
+ - Chunks text with overlap for better recall.
+ - Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like).
+ - Saves:
+     data/index.faiss  -> FAISS vector index
+     data/meta.json    -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
+     docs_out/*.txt    -> individual chunk files
+
+ Quality-of-life:
+ - Computes a hash per *source file content* to detect when a source changed.
+ - Returns a status string reporting files seen, chunks built, and updated file count.
  """

  import os
  import json
  import hashlib
+ from pathlib import Path
+ from typing import List, Dict, Tuple

  import faiss
+ import numpy as np
  from sentence_transformers import SentenceTransformer
+ from pdfminer.high_level import extract_text as pdf_extract_text
+ from bs4 import BeautifulSoup

  # ---- Paths (Objective)
  DATA_DIR = Path("data")

  INDEX_PATH = DATA_DIR / "index.faiss"
  META_PATH = DATA_DIR / "meta.json"

+ # ---- Embedding model (Objective)
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

+ # ---- Supported extensions (Objective)
+ TXT_EXT = {".txt", ".md"}
+ PDF_EXT = {".pdf"}
+ HTML_EXT = {".html", ".htm", ".xhtml"}
+ ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT
+

  def _read_txt(path: Path) -> str:
      try:
          return path.read_text(encoding="utf-8", errors="ignore")
      except Exception:
          return ""


+ def _read_pdf(path: Path) -> str:
+     try:
+         return pdf_extract_text(str(path)) or ""
+     except Exception:
+         return ""
+
+
+ def _read_html(path: Path) -> str:
+     try:
+         raw = path.read_bytes()
+         soup = BeautifulSoup(raw, "lxml")
+         # remove noisy tags
+         for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
+             tag.decompose()
+         text = " ".join(soup.get_text(separator=" ").split())
+         return text
+     except Exception:
+         return ""
+
+
+ def _load_source(path: Path) -> str:
      """
+     (Objective) Route by extension and return plain text.
+     """
+     ext = path.suffix.lower()
+     if ext in TXT_EXT:
+         return _read_txt(path)
+     if ext in PDF_EXT:
+         return _read_pdf(path)
+     if ext in HTML_EXT:
+         return _read_html(path)
+     return ""
+
+
+ def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
+     """
+     (Objective) Split long text into overlapping word chunks.
      """
      words = text.split()
      chunks: List[str] = []
      i = 0
      while i < len(words):
+         seg = " ".join(words[i : i + max_words]).strip()
+         if len(seg) >= 50:  # ignore tiny bits
+             chunks.append(seg)
+         i += max(1, max_words - stride)
      return chunks


+ def _hash_text(text: str) -> str:
+     """
+     (Objective) Stable hash of text to detect source changes between runs.
+     """
+     return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()
+
+
+ def _load_previous_hashes() -> Dict[str, str]:
+     """
+     (Objective) Load previous (file -> src_hash) mapping from meta.json if present.
+     """
+     if not META_PATH.exists():
+         return {}
+     try:
+         metas = json.loads(META_PATH.read_text(encoding="utf-8"))
+         hashes = {}
+         for m in metas:
+             # save the last seen hash per file (any chunk has same src_hash)
+             hashes[m["file"]] = m.get("src_hash", "")
+         return hashes
+     except Exception:
+         return {}
+
+
  def build_index() -> str:
      """
+     (Objective) Build FAISS index from TXT+PDF+HTML under data/seed/.
+     Returns a human-friendly status string.
      """
      DATA_DIR.mkdir(exist_ok=True)
      SEED_DIR.mkdir(parents=True, exist_ok=True)
      DOCS_DIR.mkdir(exist_ok=True)

+     # Collect files (non-recursive)
+     files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
+     if not files:
+         return "No files found under data/seed/. Supported: .txt, .pdf, .html"
+
+     prev_hashes = _load_previous_hashes()

      docs: List[str] = []
      metas: List[Dict] = []
+     updated_src_files = set()
+
+     for fp in sorted(files):
+         text = _load_source(fp)
+         if len(text.strip()) < 50:
+             # skip empty/near-empty sources
              continue

+         src_hash = _hash_text(text)
+         if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
+             updated_src_files.add(str(fp))
+
+         # Chunk and persist per-chunk text files for quick reading later
+         chunks = _chunk_text(text)
          src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

          for j, ch in enumerate(chunks):
+             chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
+             chunk_file.write_text(ch, encoding="utf-8")
              metas.append({
                  "file": str(fp),
+                 "chunk_file": str(chunk_file),
                  "chunk_id": f"{src_id}_{j}",
+                 "src_hash": src_hash,
              })
              docs.append(ch)

      if not docs:
+         return "Found files but no usable content after parsing."

+     # Embed all chunks
      model = SentenceTransformer(EMBED_MODEL)
      emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

+     # Build FAISS index
      d = emb.shape[1]
      index = faiss.IndexFlatIP(d)
      index.add(emb)

+     # Save
      faiss.write_index(index, str(INDEX_PATH))
      META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

+     changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
+     return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."
+

  if __name__ == "__main__":
      print(build_index())
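
For context on how the artifacts written by build_index() might be consumed at query time, here is a minimal retrieval-side sketch. It is not part of this commit: the search() helper, its top_k parameter, and the standalone script framing are assumptions for illustration. Only the file layout (data/index.faiss, data/meta.json, docs_out/*.txt) and the normalize_embeddings=True + IndexFlatIP pairing come from the code above; the actual query path in app.py may differ.

# retrieve.py (hypothetical) -- query the index written by indexer.build_index()
import json
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH = Path("data") / "index.faiss"
META_PATH = Path("data") / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # must match the indexer


def search(query: str, top_k: int = 5):
    """Return (score, metadata, chunk_text) for the top_k best-matching chunks."""
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))

    model = SentenceTransformer(EMBED_MODEL)
    # Normalize the query the same way the chunks were normalized,
    # so the inner-product score behaves like cosine similarity.
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)

    scores, ids = index.search(q, top_k)
    results = []
    for score, idx in zip(scores[0], ids[0]):
        if idx < 0:  # FAISS pads with -1 when the index holds fewer than top_k vectors
            continue
        meta = metas[idx]  # meta.json rows are stored in the same order as the vectors
        chunk_text = Path(meta["chunk_file"]).read_text(encoding="utf-8")
        results.append((float(score), meta, chunk_text))
    return results


if __name__ == "__main__":
    for score, meta, text in search("what does the indexer do?"):
        print(f"{score:.3f}  {meta['chunk_id']}  {text[:80]}...")

Because both the chunk and query embeddings are unit-normalized, the scores returned by IndexFlatIP are effectively cosine similarities, which is why the indexer can use a plain inner-product index. A later step could also compare meta["src_hash"] values against fresh hashes to decide when a rebuild is needed.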