Update indexer.py
indexer.py (CHANGED: +125, -60)
Removed (previous version): a TXT-only indexer. It scanned data/seed/ for .txt files, read them with _read_txt(), split the text into overlapping word chunks (stepping forward by max_tokens - stride words so consecutive chunks overlap), embedded the chunks with all-MiniLM-L6-v2 (normalize_embeddings=True, so FAISS inner product behaves like cosine), built a faiss.IndexFlatIP, and wrote data/index.faiss and data/meta.json. Its __main__ block allowed running `python indexer.py` locally; in HF Spaces build_index() is called from the UI button in app.py. Most of these lines carry over unchanged into the new version below.
Added (new version of indexer.py):

"""
Step 7: Indexer for TXT + PDF + HTML with a clearer status message.

(Objective)
- Scans data/seed/ for .txt, .pdf, .html/.htm/.xhtml (non-recursive for now).
- Extracts text safely:
    - TXT: read as UTF-8
    - PDF: pdfminer.six
    - HTML: BeautifulSoup (remove scripts/styles/nav)
- Chunks text with overlap for better recall.
- Embeds chunks (all-MiniLM-L6-v2) and builds FAISS IP index (cosine-like).
- Saves:
    data/index.faiss -> FAISS vector index
    data/meta.json   -> list of chunk metadata (file, chunk_file, chunk_id, src_hash)
    docs_out/*.txt   -> individual chunk files

Quality-of-life:
- Computes a hash per *source file content* to detect when a source changed.
- Returns a status string reporting files seen, chunks built, and updated file count.
"""

import os
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Tuple

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"     # unchanged context line collapsed by the diff view; inferred from data/seed/ usage
DOCS_DIR = Path("docs_out")      # unchanged context line (visible in the hunk header)
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# ---- Supported extensions (Objective)
TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}
ALL_EXT = TXT_EXT | PDF_EXT | HTML_EXT


def _read_txt(path: Path) -> str:
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _read_pdf(path: Path) -> str:
    try:
        return pdf_extract_text(str(path)) or ""
    except Exception:
        return ""


def _read_html(path: Path) -> str:
    try:
        raw = path.read_bytes()
        soup = BeautifulSoup(raw, "lxml")
        # remove noisy tags
        for tag in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            tag.decompose()
        text = " ".join(soup.get_text(separator=" ").split())
        return text
    except Exception:
        return ""


def _load_source(path: Path) -> str:
    """
    (Objective) Route by extension and return plain text.
    """
    ext = path.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(path)
    if ext in PDF_EXT:
        return _read_pdf(path)
    if ext in HTML_EXT:
        return _read_html(path)
    return ""


def _chunk_text(text: str, max_words: int = 700, stride: int = 200) -> List[str]:
    """
    (Objective) Split long text into overlapping word chunks.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        seg = " ".join(words[i : i + max_words]).strip()
        if len(seg) >= 50:  # ignore tiny bits
            chunks.append(seg)
        i += max(1, max_words - stride)
    return chunks


def _hash_text(text: str) -> str:
    """
    (Objective) Stable hash of text to detect source changes between runs.
    """
    return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()


def _load_previous_hashes() -> Dict[str, str]:
    """
    (Objective) Load previous (file -> src_hash) mapping from meta.json if present.
    """
    if not META_PATH.exists():
        return {}
    try:
        metas = json.loads(META_PATH.read_text(encoding="utf-8"))
        hashes = {}
        for m in metas:
            # save the last seen hash per file (any chunk has same src_hash)
            hashes[m["file"]] = m.get("src_hash", "")
        return hashes
    except Exception:
        return {}


def build_index() -> str:
    """
    (Objective) Build FAISS index from TXT+PDF+HTML under data/seed/.
    Returns a human-friendly status string.
    """
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect files (non-recursive)
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXT]
    if not files:
        return "No files found under data/seed/. Supported: .txt, .pdf, .html"

    prev_hashes = _load_previous_hashes()

    docs: List[str] = []
    metas: List[Dict] = []
    updated_src_files = set()

    for fp in sorted(files):
        text = _load_source(fp)
        if len(text.strip()) < 50:
            # skip empty/near-empty sources
            continue

        src_hash = _hash_text(text)
        if prev_hashes.get(str(fp), "") and prev_hashes[str(fp)] != src_hash:
            updated_src_files.add(str(fp))

        # Chunk and persist per-chunk text files for quick reading later
        chunks = _chunk_text(text)
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            chunk_file = DOCS_DIR / f"{src_id}_{j}.txt"
            chunk_file.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(chunk_file),
                "chunk_id": f"{src_id}_{j}",
                "src_hash": src_hash,
            })
            docs.append(ch)

    if not docs:
        return "Found files but no usable content after parsing."

    # Embed all chunks
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS index
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    changed_note = f"{len(updated_src_files)} source(s) updated since last index." if prev_hashes else "Initial index build."
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). {changed_note} Saved to {INDEX_PATH.name}."


if __name__ == "__main__":
    print(build_index())
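For reference, a minimal sketch of how the artifacts written by build_index() could be searched from app.py or a local script. This is not part of the commit: the helper name search_index and its top_k default are assumptions for illustration; only the paths, model name, and metadata fields come from indexer.py above.

# query_sketch.py (hypothetical; not included in this commit)
import json
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def search_index(query: str, top_k: int = 3):
    # Load the FAISS index and the chunk metadata saved by build_index().
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))

    # Encode the query the same way the chunks were encoded (normalized),
    # so inner-product scores behave like cosine similarity.
    model = SentenceTransformer(EMBED_MODEL)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)

    scores, ids = index.search(q, top_k)
    results = []
    for score, idx in zip(scores[0], ids[0]):
        if int(idx) < 0:
            continue  # FAISS pads with -1 when fewer than top_k vectors exist
        m = metas[int(idx)]
        # Each hit points back to its per-chunk text file under docs_out/.
        text = Path(m["chunk_file"]).read_text(encoding="utf-8")
        results.append({"score": float(score), "file": m["file"], "text": text[:200]})
    return results


if __name__ == "__main__":
    for hit in search_index("example query"):
        print(hit["score"], hit["file"])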