"""
Web Search - Feed LLMs with fresh sources
==========================================

Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil limits fastapi

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
"""

import os, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from limits import parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse

SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}


app = FastAPI()
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")
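
# A moving window tracks the timestamps of recent hits, so with "200/hour"
# limiter.hit(...) returns True until 200 hits land inside the trailing hour,
# then False until old hits age out. A minimal self-check sketch
# (illustrative key and limit, not the ones used above):
#
#   async def _demo():
#       test_limit = parse("2/minute")
#       for _ in range(3):
#           print(await limiter.hit(test_limit, "demo"))  # True, True, False
#   asyncio.run(_demo())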


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    # Return rate-limit (and other) HTTP errors as a JSON message body.
    return JSONResponse(status_code=exc.status_code, content={"message": exc.detail})


@app.post("/serper-news")
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
    """Query Serper's news endpoint and return its list of result dicts."""
    # One shared moving-window budget for every caller of this function.
    if not await limiter.hit(rate_limit, "global"):
        raise HTTPException(status_code=429, detail="Too Many Requests")

    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
        # Results live under the "news" key; use .get so a result-free
        # response returns [] instead of raising KeyError.
        return resp.json().get("news", [])
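
# Because get_serper_news is a plain coroutine as well as a FastAPI route, it
# can be exercised directly (a sketch; the query below is just an example):
#
#   items = asyncio.run(get_serper_news("apple inc", num=2))
#   print(items[0]["title"], items[0]["link"])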


async def fetch_html_many(urls: list[str]) -> list[str]:
    """Fetch all URLs concurrently; a failed fetch yields an empty string."""
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        # return_exceptions=True keeps one bad URL from cancelling the rest.
        responses = await asyncio.gather(*tasks, return_exceptions=True)

    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # placeholder keeps output aligned with input
        else:
            html_pages.append(r.text)
    return html_pages
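
# Usage sketch (hypothetical URLs): the output aligns 1:1 with the input,
# with an empty string marking any URL that failed to download:
#
#   pages = asyncio.run(
#       fetch_html_many(["https://example.com", "https://bad.invalid"])
#   )
#   assert len(pages) == 2 and pages[1] == ""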


def extract_main_text(html: str) -> str:
    """Pull the main article text out of raw HTML with trafilatura."""
    if not html:
        return ""
    # trafilatura.extract returns None when it finds no main content
    # (error pages, paywall shells), so fall back to an empty string.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )
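
# Quick illustrative check (tiny inline HTML; trafilatura may reject very
# short documents, in which case the "" fallback is returned):
#
#   sample = "<html><body><article><p>Hello world.</p></article></body></html>"
#   print(repr(extract_main_text(sample)))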


async def build_context(query: str, k: int = 4) -> str:
    """Assemble a Markdown context block from the top-k news hits."""
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)

    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue

        # Serper dates are free-form ("2 days ago", "Jul 1, 2024", ...);
        # normalize to ISO where possible, else keep the raw string.
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")

        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)

    return "\n---\n".join(chunks) or "No extractable content found."
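
# Each chunk renders roughly as follows (values illustrative):
#
#   ## Some headline
#   **Source:** Reuters **Date:** 2024-07-01
#   https://example.com/article
#
#   Extracted article text...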


async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    # Gradio sliders deliver floats; Serper expects an integer count.
    return await build_context(user_query, int(k))


with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\nFeed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)
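
# Note: demo.launch() below serves only the Gradio UI; the /serper-news
# FastAPI route on `app` is never started. One way to expose both (a sketch,
# assuming Gradio's mount_gradio_app helper and uvicorn as the server):
#
#   import uvicorn
#   app = gr.mount_gradio_app(app, demo, path="/")
#   uvicorn.run(app, host="0.0.0.0", port=7860)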


if __name__ == "__main__":
    demo.launch()