HemanM committed (verified)
Commit 9ee54df · Parent(s): 1849681

Create rag_search.py

Files changed (1)
  1. rag_search.py  +106 -0
rag_search.py (new file, +106 lines):
"""
Step 4: Retrieval helper (loads FAISS + metadata and searches top-k chunks).

What this module provides:
- RAGSearcher: class that loads the FAISS index and metadata created by indexer.py
- search(query, k): returns a list of hit dicts [{score, text, meta}]
- summarize_hits(hits): tiny, extractive-style summary (placeholder for Step 5 Evo)
- format_sources(hits): collapses to a neat "Sources:" list
"""

from pathlib import Path
import json
from typing import List, Dict

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Paths must match indexer.py
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


class RAGSearcher:
    """
    Loads the FAISS index + metadata and performs semantic search.
    If files are missing, it raises a RuntimeError (the UI will catch this
    and show a friendly message).
    """

    def __init__(self):
        if not INDEX_PATH.exists() or not META_PATH.exists():
            raise RuntimeError(
                "Index not found. Build it first with the 'Build/Refresh Index' button."
            )
        # Load FAISS index and metadata
        self.index = faiss.read_index(str(INDEX_PATH))
        self.metas: List[Dict] = json.loads(META_PATH.read_text(encoding="utf-8"))
        # Load the embedding model (small + fast)
        self.model = SentenceTransformer(EMBED_MODEL)

    def search(self, query: str, k: int = 6) -> List[Dict]:
        """
        Returns top-k hits with score, text, and meta fields.
        - score ~ cosine similarity (because we normalized at indexing time)
        """
        if not query or len(query.strip()) < 3:
            return []

        # Encode the query to the same space used by the index
        qvec = self.model.encode(
            [query], convert_to_numpy=True, normalize_embeddings=True
        )
        scores, idxs = self.index.search(qvec, k)

        hits: List[Dict] = []
        for score, idx in zip(scores[0], idxs[0]):
            if idx < 0:
                continue
            meta = self.metas[int(idx)]
            text = Path(meta["chunk_file"]).read_text(encoding="utf-8")
            hits.append(
                {
                    "score": float(score),
                    "text": text,
                    "meta": meta,  # contains: file, chunk_file, chunk_id
                }
            )
        return hits


def summarize_hits(hits: List[Dict], max_points: int = 4) -> str:
    """
    Very small, safe extractive "summary":
    - Take the first few hits and slice the first ~350 chars of each as bullet points.
    - This is a placeholder. In Step 5, we'll replace it with Evo synthesis.
    """
    if not hits:
        return "I couldn't find relevant information. Try rephrasing your question."
    bullets = []
    for h in hits[:max_points]:
        snippet = " ".join(h["text"].strip().split())
        if len(snippet) > 350:
            snippet = snippet[:350] + "..."
        bullets.append(f"- {snippet}")
    return "\n".join(bullets)


def format_sources(hits: List[Dict], max_files: int = 5) -> str:
    """
    Collapses the hit list to unique source files, and returns a short bulleted list.
    """
    if not hits:
        return "Sources: (none)"
    order = []  # unique source files, in first-seen order
    for h in hits:
        f = h["meta"]["file"]
        if f not in order:
            order.append(f)
        if len(order) >= max_files:
            break
    bullets = [f"- `{Path(f).name}`" for f in order]
    return "Sources:\n" + "\n".join(bullets)