"""
Web Search - Feed LLMs with fresh sources
==========================================

Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil limits fastapi

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
"""

import os, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from limits import parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse

SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

# Rate limiting: one global 200-requests-per-hour moving window shared by all callers
app = FastAPI()
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"message": exc.detail})


### 1 ─ Serper call -------------------------------------------------------------
@app.post("/serper-news")
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
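    """Query Serper's news search (subject to the global rate limit) and return its "news" result list."""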
    if not await limiter.hit(rate_limit, "global"):
        raise HTTPException(status_code=429, detail="Too Many Requests")

    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
    resp.raise_for_status()
    return resp.json()["news"]


### 2 ─ Concurrent HTML downloads ----------------------------------------------
async def fetch_html_many(urls: list[str]) -> list[str]:
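    """Fetch all URLs concurrently, returning one HTML string per URL (empty string on failure)."""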
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        responses = await asyncio.gather(*tasks, return_exceptions=True)
    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # keep positions aligned
        else:
            html_pages.append(r.text)
    return html_pages


### 3 ─ Main‑content extraction -------------------------------------------------
def extract_main_text(html: str) -> str:
    if not html:
        return ""
    # Trafilatura auto‑detects language, removes boilerplate & returns plain text.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )


### 4 ─ Orchestration -----------------------------------------------------------
async def build_context(query: str, k: int = 4) -> str:
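    """Search, download and extract the top-k articles into one Markdown context string."""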
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)

    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue  # skip if extraction failed
        # Best effort: parse Serper’s date string (often relative, e.g. “21 hours ago”) into YYYY-MM-DD; fall back to the raw value
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")
        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']}   "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)

    return "\n---\n".join(chunks) or "No extractable content found."


### 5 ─ Gradio user interface ---------------------------------------------------
async def handler(user_query: str, k: int) -> str:
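    """Gradio callback: check that the API key is set, then build the context."""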
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    return await build_context(user_query, int(k))


with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\n" "Feed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)

if __name__ == "__main__":
    # Pass share=True to demo.launch() for a public link when running on Colab/VMs; edit as you wish.
    demo.launch()