"""
Step 7: Indexer for TXT + PDF + HTML.

- Scans data/seed/ for .txt/.md, .pdf, .html/.htm/.xhtml
- Extracts text, chunks with overlap, embeds (MiniLM), builds FAISS index
- Saves:
    data/index.faiss  (vectors)
    data/meta.json    (chunk metadata)
    docs_out/*.txt    (chunk files)
"""

import json, hashlib
from pathlib import Path
from typing import List, Dict

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text as pdf_extract_text
from bs4 import BeautifulSoup

DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
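# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings; build_index()
# reads the dimension from the encoder output (emb.shape[1]), so swapping in another
# sentence-transformers model needs no other change here.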

TXT_EXT = {".txt", ".md"}
PDF_EXT = {".pdf"}
HTML_EXT = {".html", ".htm", ".xhtml"}

def _read_txt(p: Path) -> str:
    try:
        return p.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return ""

def _read_pdf(p: Path) -> str:
    try:
        return pdf_extract_text(str(p)) or ""
    except Exception:
        return ""

def _read_html(p: Path) -> str:
    try:
        raw = p.read_bytes()
        soup = BeautifulSoup(raw, "lxml")  # the "lxml" parser requires the lxml package
        # Drop non-content elements before extracting visible text.
        for t in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            t.decompose()
        return " ".join(soup.get_text(" ").split())
    except Exception:
        return ""

def _load_source(p: Path) -> str:
    ext = p.suffix.lower()
    if ext in TXT_EXT:
        return _read_txt(p)
    if ext in PDF_EXT:
        return _read_pdf(p)
    if ext in HTML_EXT:
        return _read_html(p)
    return ""

def _chunk(text: str, max_words=700, stride=200) -> List[str]:
    """Split text into overlapping word windows; `stride` is the word overlap kept between chunks."""
    words = text.split()
    out: List[str] = []
    i = 0
    while i < len(words):
        seg = " ".join(words[i:i + max_words]).strip()
        if len(seg) >= 50:  # drop fragments too short to embed usefully
            out.append(seg)
        i += max(1, max_words - stride)
    return out
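# Example with the defaults above: window starts advance by 700 - 200 = 500 words,
# so consecutive chunks share roughly 200 words of overlapping context.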

def build_index() -> str:
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    supported = TXT_EXT | PDF_EXT | HTML_EXT
    files = [p for p in SEED_DIR.iterdir() if p.is_file() and p.suffix.lower() in supported]
    if not files:
        return "No files found under data/seed/. Supported: .txt .md .pdf .html .htm .xhtml"

    docs: List[str] = []
    metas: List[Dict] = []
    for p in sorted(files):
        text = _load_source(p)
        if len(text.strip()) < 50:
            continue  # skip empty or near-empty sources
        src_id = hashlib.md5(str(p).encode()).hexdigest()[:10]
        for j, ch in enumerate(_chunk(text)):
            cf = DOCS_DIR / f"{src_id}_{j}.txt"
            cf.write_text(ch, encoding="utf-8")
            metas.append({"file": str(p), "chunk_file": str(cf), "chunk_id": f"{src_id}_{j}"})
            docs.append(ch)

    if not docs:
        return "Found files but no usable content after parsing."

    # Embeddings are L2-normalized, so inner product search is cosine similarity.
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Indexed {len(docs)} chunks from {len(files)} file(s). Saved to {INDEX_PATH.name}."
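
# Illustrative sketch only (not part of the original indexer): one way a retrieval
# step could consume the index.faiss and meta.json written above. The function name
# `search` and the default top-k are assumptions for demonstration.
def search(query: str, k: int = 5) -> List[Dict]:
    model = SentenceTransformer(EMBED_MODEL)
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    # Normalize the query the same way as the corpus so inner product == cosine.
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, k)
    # Map FAISS row ids back to chunk metadata; an id of -1 means no match.
    return [{**metas[i], "score": float(s)} for i, s in zip(ids[0], scores[0]) if i != -1]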

if __name__ == "__main__":
    print(build_index())