"""
Web Search - Feed LLMs with fresh sources
==========================================

Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
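
Run
---
$ python app.py  # assumed filename; Gradio serves the UI on http://localhost:7860 by default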
"""
import os, json, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from pathlib import Path
from limits import RateLimitItem, parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
# Rate limiting
app = FastAPI()
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"message": exc.detail})
### 1 ─ Serper call -------------------------------------------------------------
@app.post("/serper-news")
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
    if not await limiter.hit(rate_limit, "global"):
        raise HTTPException(status_code=429, detail="Too Many Requests")

    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
    return resp.json()["news"]
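
# Shape of each item in the returned "news" list (a sketch, not a guarantee:
# only "title", "link", "source" and "date" are used later by build_context;
# other fields such as "snippet" may also be present):
#   {"title": "...", "link": "https://...", "source": "...", "date": "21 hours ago"}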
### 2 ─ Concurrent HTML downloads ----------------------------------------------
async def fetch_html_many(urls: list[str]) -> list[str]:
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        responses = await asyncio.gather(*tasks, return_exceptions=True)

    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # keep positions aligned
        else:
            html_pages.append(r.text)
    return html_pages
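
# Note: asyncio.gather preserves input order and, with return_exceptions=True,
# hands back Exception objects instead of raising, so html_pages[i] always
# corresponds to urls[i] even when some downloads fail.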
### 3 ─ Main‑content extraction -------------------------------------------------
def extract_main_text(html: str) -> str:
    if not html:
        return ""
    # Trafilatura auto‑detects language, removes boilerplate & returns plain text.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )
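
# Usage sketch (hypothetical input; trafilatura can return None for very short
# or non-article pages, which the "or ''" above maps to an empty string):
#   text = extract_main_text("<html><body><article><p>...</p></article></body></html>")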
### 4 ─ Orchestration -----------------------------------------------------------
async def build_context(query: str, k: int = 4) -> str:
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)

    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue  # skip if extraction failed

        # Normalise Serper’s relative date (“21 hours ago”) to ISO date
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")

        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)

    return "\n---\n".join(chunks) or "No extractable content found."
### 5 ─ Gradio user interface ---------------------------------------------------
async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    return await build_context(user_query, k)
with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\n" "Feed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)
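
    # Gradio supports async event handlers, so `handler` (and the httpx calls it
    # makes) is awaited directly when the button is clicked.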
if __name__ == "__main__":
    # Launch in shareable mode when running on Colab/VMs; edit as you wish.
    demo.launch()
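    # demo.launch(share=True)  # assumption: use this instead for a public link on Colab/VMs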