import os

import faiss
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # small, fast
index = None
doc_chunks = []


def read_pdf(path):
    # Concatenate the text of every page; pages with no extractable text yield "".
    with pdfplumber.open(path) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def chunk_text(text, chunk_size=250):
    # Split the document into fixed-size word windows for embedding.
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def build_index_from_file(file_path):
    global index, doc_chunks
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        text = read_pdf(file_path)
    else:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    doc_chunks = chunk_text(text)
    embeddings = model.encode(doc_chunks, convert_to_numpy=True)
    # Exact L2 index sized to the embedding dimension (384 for all-MiniLM-L6-v2).
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)


def retrieve(query, top_k=3):
    if index is None:
        return ""
    query_vec = model.encode([query], convert_to_numpy=True)
    D, I = index.search(query_vec, top_k)
    # FAISS pads results with -1 when top_k exceeds the number of stored vectors,
    # so skip those placeholder indices.
    return "\n\n".join(doc_chunks[i] for i in I[0] if i != -1)
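
# --- Usage sketch (not part of the original snippet) ---
# A minimal example of driving the two entry points above, assuming a local
# file exists at the hypothetical path "docs/handbook.pdf"; the path and the
# query string are placeholders for illustration only.
if __name__ == "__main__":
    build_index_from_file("docs/handbook.pdf")
    context = retrieve("What is the vacation policy?", top_k=3)
    print(context)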