""" | |
Step 6: Retrieval helper + link extraction. | |
New in Step 6 (Objective): | |
- extract_links(hits): finds http/https URLs inside retrieved chunks | |
- split_form_links(links): filters links that look like "forms" (name or path) | |
- links_markdown(title, links): renders a clickable list for the UI | |
""" | |
from pathlib import Path
import json
import re
from typing import List, Dict, Tuple

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "index.faiss"
META_PATH = DATA_DIR / "meta.json"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

class RAGSearcher:
    def __init__(self):
        if not INDEX_PATH.exists() or not META_PATH.exists():
            raise RuntimeError(
                "Index not found. Build it first with the 'Build/Refresh Index' button."
            )
        self.index = faiss.read_index(str(INDEX_PATH))
        self.metas: List[Dict] = json.loads(META_PATH.read_text(encoding="utf-8"))
        self.model = SentenceTransformer(EMBED_MODEL)

    def search(self, query: str, k: int = 6) -> List[Dict]:
        """Embed the query and return the top-k chunks with scores and metadata."""
        if not query or len(query.strip()) < 3:
            return []
        qvec = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        scores, idxs = self.index.search(qvec, k)
        hits: List[Dict] = []
        for score, idx in zip(scores[0], idxs[0]):
            if idx < 0:  # FAISS pads with -1 when fewer than k results exist
                continue
            meta = self.metas[int(idx)]
            text = Path(meta["chunk_file"]).read_text(encoding="utf-8")
            hits.append({"score": float(score), "text": text, "meta": meta})
        return hits

def summarize_hits(hits: List[Dict], max_points: int = 4) -> str:
    if not hits:
        return "I couldn't find relevant information. Try rephrasing your question."
    bullets = []
    for h in hits[:max_points]:
        snippet = " ".join(h["text"].strip().split())  # collapse whitespace/newlines
        if len(snippet) > 350:
            snippet = snippet[:350] + "..."
        bullets.append(f"- {snippet}")
    return "\n".join(bullets)

def format_sources(hits: List[Dict], max_files: int = 5) -> str:
    if not hits:
        return "Sources: (none)"
    order: List[str] = []  # unique source files, in retrieval order
    for h in hits:
        f = h["meta"]["file"]
        if f not in order:
            order.append(f)
            if len(order) >= max_files:
                break
    bullets = [f"- `{Path(f).name}`" for f in order]
    return "Sources:\n" + "\n".join(bullets)

# ---- NEW: Link extraction (Objective)
_URL_RE = re.compile(r"(https?://[^\s\)\]]+)", re.IGNORECASE)
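# Illustrative example (hypothetical URL): in the chunk text
#   "Download the form at https://example.org/apply."
# the regex captures "https://example.org/apply." and extract_links()
# below strips the trailing ".", yielding "https://example.org/apply".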

def extract_links(hits: List[Dict], max_links: int = 12) -> List[str]:
    """
    Scan the retrieved text for URLs (http/https).
    - Deduplicate while preserving order.
    - Return up to max_links.
    """
    seen = set()
    ordered: List[str] = []
    for h in hits:
        for m in _URL_RE.findall(h["text"]):
            url = m.strip().rstrip(".,);]")
            if url not in seen:
                seen.add(url)
                ordered.append(url)
                if len(ordered) >= max_links:
                    return ordered
    return ordered

_FORM_HINTS = ("form", "application", "apply", "download", "pdf")
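# Illustrative examples (hypothetical URLs): "https://example.org/visa_application.pdf"
# would be classified as a form link (matches "application" and "pdf"), while
# "https://example.org/contact" would land in the "other" bucket.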

def split_form_links(links: List[str]) -> Tuple[List[str], List[str]]:
    """
    Separate links that look like "forms" based on common keywords in
    the URL path or filename. Returns (form_links, other_links).
    """
    forms, others = [], []
    for u in links:
        low = u.lower()
        if any(h in low for h in _FORM_HINTS):
            forms.append(u)
        else:
            others.append(u)
    return forms, others

def links_markdown(title: str, links: List[str]) -> str:
    if not links:
        return f"**{title}:** (none)"
    items = "\n".join([f"- [{u}]({u})" for u in links])
    return f"**{title}:**\n{items}"