Update indexer.py
indexer.py  CHANGED  (+48 -154)

Previous version (removed):
@@ -1,29 +1,17 @@
"""
Step 7: Indexer for TXT + PDF + HTML

(Objective)
- Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now).
- Extracts text safely:
  - TXT: read as UTF-8
  - PDF: pdfminer.six
  - HTML: BeautifulSoup (remove scripts/styles/nav)
- Chunks text with overlap for better recall.
- Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like).
- Saves:
  data/index.faiss -> FAISS vector index
  data/meta.json   -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
  docs_out/*.txt   -> individual chunk files
"""

import json
import hashlib
from pathlib import Path
from typing import List, Dict

import faiss
import numpy as np
@@ -31,167 +19,73 @@ from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# ---- Supported extensions (Objective)
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}
ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT


def _read_txt(path: Path) -> str:
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _read_pdf(path: Path) -> str:
    try:
        return pdf_extract_text(str(path)) or ""
    except Exception:
        return ""


def _read_html(path: Path) -> str:
    try:
        raw = path.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        # strip non-content tags before extracting text
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            tag.decompose()
        text = soup.get_text(" ")
        return " ".join(text.split())
    except Exception:
        return ""


def _load_source(path: Path) -> str:
    """
    (Objective) Route by extension and return plain text.
    """
    ext = path.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(path)
    if ext in PDF_EXT:
        return _read_pdf(path)
    if ext in HTML_EXT:
        return _read_html(path)
    return ""


def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
    """
    (Objective) Split text into overlapping word-window chunks.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        seg = " ".join(words[i : i + max_words]).strip()
        if len(seg) >= 50:  # ignore tiny bits
            chunks.append(seg)
        i += max(1, max_words - stride)
    return chunks


def _hash_text(text: str) -> str:
    """
    (Objective) Stable hash of text to detect source changes between runs.
    """
    return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()


def _load_previous_hashes() -> Dict[str, str]:
    """
    (Objective) Load previous (file -> src_hash) mapping from meta.json if present.
    """
    if not META_PATH.exists():
        return {}
    try:
        metas = json.loads(META_PATH.read_text(encoding="utf-8"))
        hashes = {}
        for m in metas:
            # save the last seen hash per file (any chunk has same src_hash)
            hashes[m["file"]] = m.get("src_hash", "")
        return hashes
    except Exception:
        return {}


def build_index() -> str:
    """
    (Objective) Scan data/seed/, chunk, embed, and write the FAISS index plus metadata.
    """
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
    if not files:
        return "No files found under data/seed/. Supported: .txt .pdf .html"

    prev_hashes = _load_previous_hashes()

    docs: List[str] = []
    metas: List[Dict] = []
    updated_src_files = set()

    for fp in sorted(files):
        text = _load_source(fp)
        if len(text.strip()) < 50:
            # skip empty/near-empty sources
            continue

        src_hash = _hash_text(text)
        if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
            updated_src_files.add(str(fp))

        # Chunk and persist per-chunk text files for quick reading later
        chunks = _chunk_text(text)
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
            chunk_file.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(chunk_file),
                "chunk_id": f"{src_id}_{j}",
                "src_hash": src_hash,
            })
            docs.append(ch)

    if not docs:
        return "Found files but no usable content after parsing."

    # Embed all chunks
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."


if __name__ == "__main__":
    print(build_index())
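Both versions step the chunk window forward by max(1, max_words - stride) words, so consecutive chunks share their last stride words; that overlap is what the docstring means by chunking "with overlap for better recall": a passage that straddles a window boundary still appears whole in at least one chunk. Below is a minimal sketch of that stepping with toy words and illustrative window sizes (the indexer's own defaults are max_words=700, stride=200); the 50-character minimum-length filter is skipped here.

# Toy walk-through of the chunking window (illustrative sizes, not the real defaults).
words = [f"w{i}" for i in range(12)]
max_words, stride = 5, 2

starts, i = [], 0
while i < len(words):
    starts.append(i)                  # chunk covers words[i : i + max_words]
    i += max(1, max_words - stride)   # advance by max_words - stride = 3

print(starts)                         # [0, 3, 6, 9]
print(words[3:5])                     # ['w3', 'w4'] -> shared by chunk 0 and chunk 1

With the real defaults, each 700-word chunk shares its last 200 words with the next chunk.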
Updated version:

"""
Step 7: Indexer for TXT + PDF + HTML.

- Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml
- Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index
- Saves:
    data/index.faiss (vectors)
    data/meta.json   (chunk metadata)
    docs_out/*.txt   (chunk files)
"""

import json, hashlib
from pathlib import Path
from typing import List, Dict

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup

DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}

def _read_txt(p: Path) -> str:
    try: return p.read_text(encoding="utf-8", errors="ignore")
    except: return ""

def _read_pdf(p: Path) -> str:
    try: return pdf_extract_text(str(p)) or ""
    except: return ""

def _read_html(p: Path) -> str:
    try:
        raw = p.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        for t in soup(["script","style","noscript","header","footer","nav"]): t.decompose()
        return " ".join(soup.get_text(" ").split())
    except: return ""

def _load_source(p: Path) -> str:
    ext = p.suffix.lower()
    if ext in TXT_EXT: return _read_txt(p)
    if ext in PDF_EXT: return _read_pdf(p)
    if ext in HTML_EXT: return _read_html(p)
    return ""

def _chunk(text: str, max_words=700, stride=200) -> List[str]:
    w = text.split(); out=[]; i=0
    while i < len(w):
        seg = " ".join(w[i:i+max_words]).strip()
        if len(seg) >= 50: out.append(seg)
        i += max(1, max_words - stride)
    return out

def build_index() -> str:
    DATA_DIR.mkdir(exist_ok=True); SEED_DIR.mkdir(parents=True, exist_ok=True); DOCS_DIR.mkdir(exist_ok=True)
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in TXT_EXT|PDF_EXT|HTML_EXT]
    if not files: return "No files found under data/seed/. Supported: .txt .pdf .html"

    docs: List[str] = []; metas: List[Dict] = []
    for p in sorted(files):
        text = _load_source(p)
        if len(text.strip()) < 50: continue
        src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
        for j, ch in enumerate(_chunk(text)):
            cf = DOCS_DIR / f"{src_id}_{j}.txt"
            cf.write_text(ch, encoding="utf-8")
            metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
            docs.append(ch)

    if not docs: return "Found files but no usable content after parsing."

    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(emb.shape[1]); index.add(emb)
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."

if __name__ == "__main__":
    print(build_index())
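Because build_index() stores L2-normalized embeddings in a faiss.IndexFlatIP, inner-product search scores behave like cosine similarity, which is what the earlier docstring calls "cosine-like". A minimal retrieval sketch against the saved artifacts might look like the following; the search() helper, its top_k default, and the example query are illustrative assumptions, not part of this Space.

# Hypothetical query sketch (not part of this repo): loads the artifacts written by
# build_index() and runs a top-k search. Assumes data/index.faiss and data/meta.json exist.
import json
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH = Path("data/index.faiss")
META_PATH = Path("data/meta.json")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

def search(query: str, top_k: int = 3):
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)

    # Normalized query vector + IndexFlatIP => inner product equals cosine similarity.
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, top_k)

    hits = []
    for score, idx in zip(scores[0], ids[0]):
        if idx == -1:   # fewer than top_k chunks in the index
            continue
        m = metas[idx]  # meta.json rows are written in the same order as the added vectors
        chunk_text = Path(m["chunk_file"]).read_text(encoding="utf-8")
        hits.append((float(score), m["file"], m["chunk_id"], chunk_text[:200]))
    return hits

if __name__ == "__main__":
    for score, src, cid, preview in search("what does the indexer do?"):
        print(f"{score:.3f}  {cid}  {src}\n  {preview}\n")

This sketch relies on meta.json entries being appended in the same order as the vectors are added to the index, which both versions of build_index() do, so a FAISS row id can be used directly as a list index into the metadata.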