"""
Step 3: Minimal indexer for TXT files under data/seed/.

What it does (Objective):
- Scans data/seed/ for *.txt files
- Splits each file into overlapping chunks (for better retrieval)
- Creates sentence embeddings with all-MiniLM-L6-v2 (fast, small)
- Builds a FAISS index (inner-product / cosine-like) and saves:
    - data/index.faiss  (the vector index)
    - data/meta.json    (mapping from vector -> source chunk)
    - docs_out/*.txt    (chunked text files for easy loading later)
- Returns a short status string for the UI

Design notes (Objective):
- TXT-only for this step to avoid parser complexity.
- We use normalize_embeddings=True so inner product equals cosine similarity.
- Safe defaults: max_tokens=700 words per chunk, stride=200 words of overlap
  between consecutive chunks.
"""

import json
from pathlib import Path
import hashlib
from typing import List, Dict

import faiss
from sentence_transformers import SentenceTransformer

# ---- Paths (Objective)
DATA_DIR = Path("data")
SEED_DIR = DATA_DIR / "seed"
DOCS_DIR = Path("docs_out")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"

# ---- Embedding model (Objective): small, fast, good enough for MVP
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def _read_txt(path: Path) -> str:
    """
    Read a UTF-8 text file safely.
    (Objective) Returns empty string on failure; we guard later.
    """
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""


def _chunk_text(text: str, max_tokens: int = 700, stride: int = 200) -> List[str]:
    """
    Split long text into overlapping word chunks.
    `max_tokens` is the chunk size in words; `stride` is the overlap in words,
    so the window advances by (max_tokens - stride) words each step.
    (Objective) Overlap helps retrieval recall.
    """
    words = text.split()
    chunks: List[str] = []
    i = 0
    while i < len(words):
        chunk_words = words[i : i + max_tokens]
        chunk = " ".join(chunk_words).strip()
        if len(chunk) >= 50:  # ignore tiny fragments
            chunks.append(chunk)
        # Advance by (max_tokens - stride) words so consecutive chunks overlap by `stride` words
        i += max(1, max_tokens - stride)
    return chunks
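
# Illustrative example (assuming the defaults above): for a 1,500-word input,
# _chunk_text() yields chunks covering words [0:700], [500:1200], [1000:1500].
# Each step advances by max_tokens - stride = 500 words, so consecutive
# chunks share a 200-word overlap.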


def build_index() -> str:
    """
    Build the FAISS index from TXT files in data/seed/.
    Saves index + metadata to disk. Returns a human-readable status.
    (Objective)
    """
    # Ensure folders exist
    DATA_DIR.mkdir(exist_ok=True)
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    DOCS_DIR.mkdir(exist_ok=True)

    # Collect all .txt files
    txt_files = sorted(SEED_DIR.glob("*.txt"))
    if not txt_files:
        return "No TXT files found in data/seed/. Add *.txt and try again."

    # Read and chunk
    docs: List[str] = []
    metas: List[Dict] = []
    for fp in txt_files:
        raw = _read_txt(fp)
        if not raw or len(raw.strip()) < 50:
            # Skip empty/near-empty
            continue

        chunks = _chunk_text(raw)
        # Stable ID per source file for nice chunk filenames
        src_id = hashlib.md5(str(fp).encode("utf-8")).hexdigest()[:10]

        for j, ch in enumerate(chunks):
            outp = DOCS_DIR / f"{src_id}_{j}.txt"
            outp.write_text(ch, encoding="utf-8")
            metas.append({
                "file": str(fp),
                "chunk_file": str(outp),
                "chunk_id": f"{src_id}_{j}",
            })
            docs.append(ch)

    if not docs:
        return "Found TXT files, but no usable content (after filtering)."

    # Embed
    model = SentenceTransformer(EMBED_MODEL)
    # normalize_embeddings=True -> inner product on unit vectors equals cosine similarity
    emb = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    # Build an exact (brute-force) inner-product FAISS index; fine for an MVP-sized corpus
    d = emb.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(emb)

    # Save index + metadata
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_text(json.dumps(metas, ensure_ascii=False, indent=2), encoding="utf-8")

    return f"Indexed {len(docs)} chunks from {len(txt_files)} file(s). Saved to {INDEX_PATH.name}."


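# ---- Retrieval sketch (assumption): build_index() only writes the index; the
# query side presumably lives in app.py. The helper below is a minimal, hedged
# example of how the saved artifacts could be used; the name `search` and the
# returned dict shape are illustrative, not part of the existing pipeline.
def search(query: str, k: int = 5) -> List[Dict]:
    """
    Load index.faiss + meta.json and return the top-k chunks for a query.
    (Illustrative) Scores are cosine similarities because the stored
    embeddings were normalized.
    """
    if not (INDEX_PATH.exists() and META_PATH.exists()):
        return []
    index = faiss.read_index(str(INDEX_PATH))
    metas = json.loads(META_PATH.read_text(encoding="utf-8"))
    model = SentenceTransformer(EMBED_MODEL)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q, k)
    results: List[Dict] = []
    for score, idx in zip(scores[0], ids[0]):
        if idx < 0:  # FAISS pads missing neighbours with -1
            continue
        meta = metas[int(idx)]
        results.append({
            "score": float(score),
            "chunk_id": meta["chunk_id"],
            "source": meta["file"],
            "text": Path(meta["chunk_file"]).read_text(encoding="utf-8"),
        })
    return results

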
if __name__ == "__main__":
    # Allow running `python indexer.py` locally. In HF Spaces
    # we'll call build_index() from the UI button in app.py.
    print(build_index())
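
# (Assumption) The UI framework for app.py is not specified here; if it is a
# Gradio app on HF Spaces, the button wiring could look roughly like this:
#
#   import gradio as gr
#   from indexer import build_index
#
#   with gr.Blocks() as demo:
#       status = gr.Textbox(label="Index status")
#       gr.Button("Build index").click(fn=build_index, outputs=status)
#   demo.launch()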