"""
Web Search - Feed LLMs with fresh sources
==========================================

Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
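
Run
---
$ python app.py  # assumed filename; Gradio serves the UI on http://localhost:7860 by default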
"""
import os, json, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from pathlib import Path
from limits import RateLimitItem, parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
# Rate limiting
app = FastAPI()
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"message": exc.detail})
### 1 ─ Serper call -------------------------------------------------------------
@app.post("/serper-news")
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
    if not await limiter.hit(rate_limit, "global"):
        raise HTTPException(status_code=429, detail="Too Many Requests")

    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
    return resp.json()["news"]
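
# Shape of each item in the returned "news" list (a sketch, not a guarantee:
# only "title", "link", "source" and "date" are used later by build_context;
# other fields such as "snippet" may also be present):
#   {"title": "...", "link": "https://...", "source": "...", "date": "21 hours ago"}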
### 2 ─ Concurrent HTML downloads ----------------------------------------------
async def fetch_html_many(urls: list[str]) -> list[str]:
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        responses = await asyncio.gather(*tasks, return_exceptions=True)

    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # keep positions aligned
        else:
            html_pages.append(r.text)
    return html_pages
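
# Note: asyncio.gather preserves input order and, with return_exceptions=True,
# hands back Exception objects instead of raising, so html_pages[i] always
# corresponds to urls[i] even when some downloads fail.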
### 3 ─ Main‑content extraction -------------------------------------------------
def extract_main_text(html: str) -> str:
    if not html:
        return ""
    # Trafilatura auto‑detects language, removes boilerplate & returns plain text.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )
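
# Usage sketch (hypothetical input; trafilatura can return None for very short
# or non-article pages, which the "or ''" above maps to an empty string):
#   text = extract_main_text("<html><body><article><p>...</p></article></body></html>")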
### 4 ─ Orchestration -----------------------------------------------------------
async def build_context(query: str, k: int = 4) -> str:
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)

    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue  # skip if extraction failed

        # Normalise Serper’s relative date (“21 hours ago”) to ISO date
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")

        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)

    return "\n---\n".join(chunks) or "No extractable content found."
### 5 ─ Gradio user interface ---------------------------------------------------
async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    return await build_context(user_query, k)
with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\n" "Feed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)
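
    # Gradio supports async event handlers, so `handler` (and the httpx calls it
    # makes) is awaited directly when the button is clicked.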
if __name__ == "__main__":
    # Launch in shareable mode when running on Colab/VMs; edit as you wish.
    demo.launch()
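    # demo.launch(share=True)  # assumption: use this instead for a public link on Colab/VMs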