HemanM committed on
Commit f43b958 · verified
1 Parent(s): a7a0dd9

Update indexer.py

Files changed (1)
  1. indexer.py +48 -154
indexer.py CHANGED
@@ -1,29 +1,17 @@
  """
- Step 7: Indexer for TXT + PDF + HTML with a clearer status message.
-
- (Objective)
- - Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now).
- - Extracts text safely:
-     - TXT: read as UTF-8
-     - PDF: pdfminer.six
-     - HTML: BeautifulSoup (remove scripts/styles/nav)
- - Chunks text with overlap for better recall.
- - Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like).
- - Saves:
-     data/index.faiss -> FAISS vector index
-     data/meta.json -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
-     docs_out/*.txt -> individual chunk files

- Quality-of-life:
- - Computes a hash per *source file content* to detect when a source changed.
- - Returns a status string reporting files seen, chunks built, and updated file count.
  """

- import os
- import json
- import hashlib
  from pathlib import Path
- from typing import List, Dict, Tuple

  import faiss
  import numpy as np
@@ -31,167 +19,73 @@ from sentence_transformers import SentenceTransformer
  from pdfminer.high_level import extract_text as pdf_extract_text
  from bs4 import BeautifulSoup

- # ---- Paths (Objective)
  DATA_DIR = Path("data")
  SEED_DIR = DATA_DIR / "seed"
  DOCS_DIR = Path("docs_out")
  INDEX_PATH = DATA_DIR / "index.faiss"
  META_PATH = DATA_DIR / "meta.json"

- # ---- Embedding model (Objective)
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

- # ---- Supported extensions (Objective)
  TXT_EXT = {".txt", ".md"}
  PDF_EXT = {".pdf"}
  HTML_EXT = {".html", ".htm", ".xhtml"}
- ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT


- def _read_txt(path: Path) -> str:
-     try:
-         return path.read_text(encoding="utf-8", errors="ignore")
-     except Exception:
-         return ""

-
- def _read_pdf(path: Path) -> str:
      try:
-         return pdf_extract_text(str(path)) or ""
-     except Exception:
-         return ""
-
-
- def _read_html(path: Path) -> str:
-     try:
-         raw = path.read_bytes()
          soup = BeautifulSoup(raw, "lxml")
-         # remove noisy tags
-         for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
-             tag.decompose()
-         text = " ".join(soup.get_text(separator=" ").split())
-         return text
-     except Exception:
-         return ""
-
-
- def _load_source(path: Path) -> str:
-     """
-     (Objective) Route by extension and return plain text.
-     """
-     ext = path.suffix.lower()
-     if ext in TXT_EXT:
-         return _read_txt(path)
-     if ext in PDF_EXT:
-         return _read_pdf(path)
-     if ext in HTML_EXT:
-         return _read_html(path)
      return ""

-
- def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
-     """
-     (Objective) Split long text into overlapping word chunks.
-     """
-     words = text.split()
-     chunks: List[str] = []
-     i = 0
-     while i < len(words):
-         seg = " ".join(words[i : i + max_words]).strip()
-         if len(seg) >= 50: # ignore tiny bits
-             chunks.append(seg)
          i += max(1, max_words - stride)
-     return chunks
-
-
- def _hash_text(text: str) -> str:
-     """
-     (Objective) Stable hash of text to detect source changes between runs.
-     """
-     return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()
-
-
- def _load_previous_hashes() -> Dict[str, str]:
-     """
-     (Objective) Load previous (file -> src_hash) mapping from meta.json if present.
-     """
-     if not META_PATH.exists():
-         return {}
-     try:
-         metas = json.loads(META_PATH.read_text(encoding="utf-8"))
-         hashes = {}
-         for m in metas:
-             # save the last seen hash per file (any chunk has same src_hash)
-             hashes[m["file"]] = m.get("src_hash", "")
-         return hashes
-     except Exception:
-         return {}
-

  def build_index() -> str:
-     """
-     (Objective) Build FAISS index from TXT+PDF+HTML under data/seed/.
-     Returns a human-friendly status string.
-     """
-     DATA_DIR.mkdir(exist_ok=True)
-     SEED_DIR.mkdir(parents=True, exist_ok=True)
-     DOCS_DIR.mkdir(exist_ok=True)
-
-     # Collect files (non-recursive)
-     files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
-     if not files:
-         return "No files found under data/seed/. Supported: .txt, .pdf, .html"
-
-     prev_hashes = _load_previous_hashes()
-
-     docs: List[str] = []
-     metas: List[Dict] = []
-     updated_src_files = set()
-
-     for fp in sorted(files):
-         text = _load_source(fp)
-         if len(text.strip()) < 50:
-             # skip empty/near-empty sources
-             continue
-
-         src_hash = _hash_text(text)
-         if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
-             updated_src_files.add(str(fp))
-
-         # Chunk and persist per-chunk text files for quick reading later
-         chunks = _chunk_text(text)
-         src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]
-
-         for j, ch in enumerate(chunks):
-             chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
-             chunk_file.write_text(ch, encoding="utf-8")
-             metas.append({
-                 "file": str(fp),
-                 "chunk_file": str(chunk_file),
-                 "chunk_id": f"{src_id}_{j}",
-                 "src_hash": src_hash,
-             })
              docs.append(ch)

-     if not docs:
-         return "Found files but no usable content after parsing."

-     # Embed all chunks
      model = SentenceTransformer(EMBED_MODEL)
      emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
-
-     # Build FAISS index
-     d = emb.shape[1]
-     index = faiss.IndexFlatIP(d)
-     index.add(emb)
-
-     # Save
      faiss.write_index(index, str(INDEX_PATH))
      META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
-
-     changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
-     return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."
-

  if __name__ == "__main__":
      print(build_index())
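The change detection described in the removed docstring ("a hash per source file content") boils down to hashing each source's extracted text and comparing it with the src_hash recorded in meta.json on the previous run. A minimal standalone sketch of that idea, assuming meta.json entries still carry a "src_hash" field; the changed_sources helper name is ours, not part of the repo:

    import hashlib
    import json
    from pathlib import Path
    from typing import Dict, Set

    def _hash_text(text: str) -> str:
        # Same stable content hash the removed _hash_text helper used.
        return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()

    def changed_sources(meta_path: Path, current: Dict[str, str]) -> Set[str]:
        # current maps source file path -> freshly extracted text.
        if not meta_path.exists():
            return set(current)  # first run: treat every source as new
        prev = {m["file"]: m.get("src_hash", "")
                for m in json.loads(meta_path.read_text(encoding="utf-8"))}
        return {f for f, text in current.items()
                if prev.get(f) and prev[f] != _hash_text(text)}

The new version, listed below, drops this bookkeeping and simply rebuilds the index from scratch on every run.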
 
  """
+ Step 7: Indexer for TXT + PDF + HTML.

+ - Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml
+ - Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index
+ - Saves:
+     data/index.faiss (vectors)
+     data/meta.json (chunk metadata)
+     docs_out/*.txt (chunk files)
  """

+ import json, hashlib
  from pathlib import Path
+ from typing import List, Dict

  import faiss
  import numpy as np
  from sentence_transformers import SentenceTransformer
  from pdfminer.high_level import extract_text as pdf_extract_text
  from bs4 import BeautifulSoup

  DATA_DIR = Path("data")
  SEED_DIR = DATA_DIR / "seed"
  DOCS_DIR = Path("docs_out")
  INDEX_PATH = DATA_DIR / "index.faiss"
  META_PATH = DATA_DIR / "meta.json"

  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

  TXT_EXT = {".txt", ".md"}
  PDF_EXT = {".pdf"}
  HTML_EXT = {".html", ".htm", ".xhtml"}

+ def _read_txt(p: Path) -> str:
+     try: return p.read_text(encoding="utf-8", errors="ignore")
+     except: return ""

+ def _read_pdf(p: Path) -> str:
+     try: return pdf_extract_text(str(p)) or ""
+     except: return ""

+ def _read_html(p: Path) -> str:
      try:
+         raw = p.read_bytes()
          soup = BeautifulSoup(raw, "lxml")
+         for t in soup(["script","style","noscript","header","footer","nav"]): t.decompose()
+         return " ".join(soup.get_text(" ").split())
+     except: return ""
+
+ def _load_source(p: Path) -> str:
+     ext = p.suffix.lower()
+     if ext in TXT_EXT: return _read_txt(p)
+     if ext in PDF_EXT: return _read_pdf(p)
+     if ext in HTML_EXT: return _read_html(p)
      return ""

+ def _chunk(text: str, max_words=700, stride=200) -> List[str]:
+     w = text.split(); out=[]; i=0
+     while i < len(w):
+         seg = " ".join(w[i:i+max_words]).strip()
+         if len(seg) >= 50: out.append(seg)
          i += max(1, max_words - stride)
+     return out

  def build_index() -> str:
+     DATA_DIR.mkdir(exist_ok=True); SEED_DIR.mkdir(parents=True, exist_ok=True); DOCS_DIR.mkdir(exist_ok=True)
+     files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in TXT_EXT|PDF_EXT|HTML_EXT]
+     if not files: return "No files found under data/seed/. Supported: .txt .pdf .html"
+
+     docs: List[str] = []; metas: List[Dict] = []
+     for p in sorted(files):
+         text = _load_source(p)
+         if len(text.strip()) < 50: continue
+         src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
+         for j, ch in enumerate(_chunk(text)):
+             cf = DOCS_DIR / f"{src_id}_{j}.txt"
+             cf.write_text(ch, encoding="utf-8")
+             metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
              docs.append(ch)

+     if not docs: return "Found files but no usable content after parsing."

      model = SentenceTransformer(EMBED_MODEL)
      emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
+     index = faiss.IndexFlatIP(emb.shape[1]); index.add(emb)
      faiss.write_index(index, str(INDEX_PATH))
      META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
+     return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."

  if __name__ == "__main__":
      print(build_index())
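The script only builds the index; a query-side sketch may help show how the saved artifacts fit together. This is a minimal sketch, not part of the commit: the search helper name is ours, and it assumes the defaults used above (MiniLM embeddings with normalize_embeddings=True and an IndexFlatIP index), so FAISS row ids line up with positions in meta.json and inner product behaves like cosine similarity.

    import json
    from pathlib import Path

    import faiss
    from sentence_transformers import SentenceTransformer

    def search(query: str, k: int = 5):
        # Load the artifacts written by build_index().
        index = faiss.read_index("data/index.faiss")
        metas = json.loads(Path("data/meta.json").read_text(encoding="utf-8"))
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        # Embed the query the same way the chunks were embedded.
        q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        scores, ids = index.search(q, k)
        hits = []
        for score, i in zip(scores[0], ids[0]):
            if i == -1:
                continue  # FAISS pads with -1 when fewer than k vectors exist
            m = metas[int(i)]
            text = Path(m["chunk_file"]).read_text(encoding="utf-8")
            hits.append({"score": float(score), "file": m["file"], "snippet": text[:200]})
        return hits

    if __name__ == "__main__":
        for h in search("example query"):
            print(round(h["score"], 3), h["file"])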