Update rag_search.py
rag_search.py CHANGED (+55 -42)
@@ -1,22 +1,21 @@
 """
-Step
+Step 6: Retrieval helper + link extraction.
 
-
--
--
--
-- format_sources(hits): collapses to a neat "Sources:" list
+New in Step 6 (Objective):
+- extract_links(hits): finds http/https URLs inside retrieved chunks
+- split_form_links(links): filters links that look like "forms" (name or path)
+- links_markdown(title, links): renders a clickable list for the UI
 """
 
 from pathlib import Path
 import json
-
+import re
+from typing import List, Dict, Tuple
 
 import faiss
 import numpy as np
 from sentence_transformers import SentenceTransformer
 
-# Paths must match indexer.py
 DATA_DIR = Path("data")
 INDEX_PATH = DATA_DIR / "index.faiss"
 META_PATH = DATA_DIR / "meta.json"
@@ -24,58 +23,31 @@ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
 
 class RAGSearcher:
-    """
-    Loads the FAISS index + metadata and performs semantic search.
-    If files are missing, it raises a RuntimeError (the UI will catch this and show a friendly message).
-    """
-
     def __init__(self):
         if not INDEX_PATH.exists() or not META_PATH.exists():
             raise RuntimeError(
                 "Index not found. Build it first with the 'Build/Refresh Index' button."
             )
-        # Load FAISS index and metadata
         self.index = faiss.read_index(str(INDEX_PATH))
         self.metas: List[Dict] = json.loads(META_PATH.read_text(encoding="utf-8"))
-        # Load the embedding model (small + fast)
         self.model = SentenceTransformer(EMBED_MODEL)
 
     def search(self, query: str, k: int = 6) -> List[Dict]:
-        """
-        Returns top-k hits with score, text, and meta fields.
-        - score ~ cosine similarity (because we normalized at indexing time)
-        """
         if not query or len(query.strip()) < 3:
             return []
-
-        # Encode the query to the same space used by the index
-        qvec = self.model.encode(
-            [query], convert_to_numpy=True, normalize_embeddings=True
-        )
+        qvec = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
         scores, idxs = self.index.search(qvec, k)
-
         hits: List[Dict] = []
         for score, idx in zip(scores[0], idxs[0]):
             if idx < 0:
                 continue
             meta = self.metas[int(idx)]
             text = Path(meta["chunk_file"]).read_text(encoding="utf-8")
-            hits.append(
-                {
-                    "score": float(score),
-                    "text": text,
-                    "meta": meta,  # contains: file, chunk_file, chunk_id
-                }
-            )
+            hits.append({"score": float(score), "text": text, "meta": meta})
         return hits
 
 
 def summarize_hits(hits: List[Dict], max_points: int = 4) -> str:
-    """
-    Very small, safe extractive "summary":
-    - Take the first few hits and slice the first ~350 chars of each as bullet points.
-    - This is a placeholder. In Step 5, we'll replace with Evo synthesis.
-    """
     if not hits:
         return "I couldn't find relevant information. Try rephrasing your question."
     bullets = []
@@ -88,13 +60,9 @@ def summarize_hits(hits: List[Dict], max_points: int = 4) -> str:
 
 
 def format_sources(hits: List[Dict], max_files: int = 5) -> str:
-    """
-    Collapses the hit list to unique source files, and returns a short bulleted list.
-    """
     if not hits:
         return "Sources: (none)"
-    seen = []
-    order = []
+    seen, order = [], []
     for h in hits:
         f = h["meta"]["file"]
         if f not in seen:
@@ -104,3 +72,48 @@
             break
     bullets = [f"- `{Path(f).name}`" for f in order]
     return "Sources:\n" + "\n".join(bullets)
+
+
+# ---- NEW: Link extraction (Objective)
+
+_URL_RE = re.compile(r"(https?://[^\s\)\]]+)", re.IGNORECASE)
+
+def extract_links(hits: List[Dict], max_links: int = 12) -> List[str]:
+    """
+    Scan the retrieved text for URLs (http/https).
+    - Deduplicate while preserving order.
+    - Return up to max_links.
+    """
+    seen = set()
+    ordered: List[str] = []
+    for h in hits:
+        for m in _URL_RE.findall(h["text"]):
+            url = m.strip().rstrip(".,);]")
+            if url not in seen:
+                seen.add(url)
+                ordered.append(url)
+            if len(ordered) >= max_links:
+                return ordered
+    return ordered
+
+_FORM_HINTS = ("form", "application", "apply", "download", "pdf")
+
+def split_form_links(links: List[str]) -> Tuple[List[str], List[str]]:
+    """
+    Separate links that look like "forms" based on common keywords in
+    the URL path or filename. Returns (form_links, other_links).
+    """
+    forms, others = [], []
+    for u in links:
+        low = u.lower()
+        if any(h in low for h in _FORM_HINTS):
+            forms.append(u)
+        else:
+            others.append(u)
+    return forms, others
+
+def links_markdown(title: str, links: List[str]) -> str:
+    if not links:
+        return f"**{title}:** (none)"
+    items = "\n".join([f"- [{u}]({u})" for u in links])
+    return f"**{title}:**\n{items}"
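
For orientation, here is a minimal usage sketch of the public surface this commit leaves in rag_search.py. The answer() wrapper and the way its outputs are joined are illustrative assumptions (the Space's actual UI code is not part of this diff); only the imported names and their documented behavior come from the file above.

# Hypothetical caller, not part of this commit: wires the existing retrieval
# helpers together with the new Step 6 link helpers.
from rag_search import (
    RAGSearcher,
    summarize_hits,
    format_sources,
    extract_links,
    split_form_links,
    links_markdown,
)

def answer(query: str) -> str:
    try:
        searcher = RAGSearcher()            # loads data/index.faiss + data/meta.json
    except RuntimeError as err:
        return str(err)                     # "Index not found. Build it first ..."

    hits = searcher.search(query, k=6)      # top-k chunks; score ~ cosine similarity
    summary = summarize_hits(hits)          # placeholder extractive summary
    sources = format_sources(hits)          # "Sources:" list of unique files

    links = extract_links(hits)             # deduped http/https URLs from the chunks
    form_links, other_links = split_form_links(links)
    forms_md = links_markdown("Forms", form_links)
    other_md = links_markdown("Other links", other_links)

    return "\n\n".join([summary, sources, forms_md, other_md])

Note that split_form_links is purely keyword-based (_FORM_HINTS = ("form", "application", "apply", "download", "pdf")), so a hypothetical URL such as https://example.org/forms/application.pdf would be grouped under "Forms", while any other extracted link falls through to the second list.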
|