""" | |
Web Search - Feed LLMs with fresh sources | |
========================================== | |
Prerequisites | |
------------- | |
$ pip install gradio httpx trafilatura python-dateutil | |
Environment | |
----------- | |
export SERPER_API_KEY="YOUR‑KEY‑HERE" | |
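Run
---
$ python app.py   # assumes this file is saved as app.py; adjust to your filename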
""" | |
import os, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from limits import parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse

SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

# Rate limiting
app = FastAPI()
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")
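# Every Serper call from any user hits the same "global" bucket, so the app as
# a whole stays within 200 requests per moving one-hour window; `limiter.hit`
# returns False once the window is full.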
@app.exception_handler(HTTPException)  # register the handler so mounted requests get JSON 429s
async def http_exception_handler(request: Request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"message": exc.detail})

### 1 ─ Serper call -------------------------------------------------------------
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
    if not await limiter.hit(rate_limit, "global"):
        raise HTTPException(status_code=429, detail="Too Many Requests")
    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
        return resp.json().get("news", [])  # "news" may be absent when there are no results
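
# Illustrative shape of one Serper /news item (exact fields may vary by query):
#   {"title": "...", "link": "https://...", "snippet": "...",
#    "date": "21 hours ago", "source": "Reuters", "position": 1}
# Only `title`, `link`, `source`, and `date` are consumed below.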
### 2 ─ Concurrent HTML downloads ----------------------------------------------
async def fetch_html_many(urls: list[str]) -> list[str]:
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        responses = await asyncio.gather(*tasks, return_exceptions=True)
    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # keep positions aligned
        else:
            html_pages.append(r.text)
    return html_pages
### 3 ─ Main-content extraction -------------------------------------------------
def extract_main_text(html: str) -> str:
    if not html:
        return ""
    # Trafilatura auto-detects language, removes boilerplate & returns plain text.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )
### 4 ─ Orchestration -----------------------------------------------------------
async def build_context(query: str, k: int = 4) -> str:
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)
    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue  # skip if extraction failed
        # Best-effort normalisation of Serper's date to ISO; dateutil's fuzzy
        # parse cannot resolve relative dates like "21 hours ago", so fall
        # back to the raw string when parsing fails.
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")
        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)
    return "\n---\n".join(chunks) or "No extractable content found."
### 5 ─ Gradio user interface ---------------------------------------------------
async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    return await build_context(user_query, int(k))

with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\nFeed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)
if __name__ == "__main__":
    # Pass share=True for a public link when running on Colab/VMs; edit as you wish.
    demo.launch()
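
# To expose the FastAPI `app` (and its JSON 429 handler) alongside the UI, a
# sketch (not wired up here) is to mount the Blocks on `app` and serve with
# uvicorn instead of demo.launch():
#   app = gr.mount_gradio_app(app, demo, path="/")
#   # then run: uvicorn app:app --host 0.0.0.0 --port 7860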