"""
Web Search - Feed LLMs with fresh sources
==========================================

Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil limits fastapi

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
"""

import os, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from limits import parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse

SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}


app = FastAPI()
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")
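
# A moving window tracks the timestamps of recent hits, so with "200/hour"
# limiter.hit(...) returns True until 200 hits land inside the trailing hour,
# then False until old hits age out. A minimal self-check sketch
# (illustrative key and limit, not the ones used above):
#
#   async def _demo():
#       test_limit = parse("2/minute")
#       for _ in range(3):
#           print(await limiter.hit(test_limit, "demo"))  # True, True, False
#   asyncio.run(_demo())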


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    # Return rate-limit (and other) HTTP errors as a JSON message body.
    return JSONResponse(status_code=exc.status_code, content={"message": exc.detail})


@app.post("/serper-news")
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
    """Query Serper's news endpoint and return its list of result dicts."""
    # One shared moving-window budget for every caller of this function.
    if not await limiter.hit(rate_limit, "global"):
        raise HTTPException(status_code=429, detail="Too Many Requests")

    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
        # Results live under the "news" key; use .get so a result-free
        # response returns [] instead of raising KeyError.
        return resp.json().get("news", [])
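
# Because get_serper_news is a plain coroutine as well as a FastAPI route, it
# can be exercised directly (a sketch; the query below is just an example):
#
#   items = asyncio.run(get_serper_news("apple inc", num=2))
#   print(items[0]["title"], items[0]["link"])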


async def fetch_html_many(urls: list[str]) -> list[str]:
    """Fetch all URLs concurrently; a failed fetch yields an empty string."""
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        # return_exceptions=True keeps one bad URL from cancelling the rest.
        responses = await asyncio.gather(*tasks, return_exceptions=True)

    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # placeholder keeps output aligned with input
        else:
            html_pages.append(r.text)
    return html_pages
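
# Usage sketch (hypothetical URLs): the output aligns 1:1 with the input,
# with an empty string marking any URL that failed to download:
#
#   pages = asyncio.run(
#       fetch_html_many(["https://example.com", "https://bad.invalid"])
#   )
#   assert len(pages) == 2 and pages[1] == ""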


def extract_main_text(html: str) -> str:
    """Pull the main article text out of raw HTML with trafilatura."""
    if not html:
        return ""
    # trafilatura.extract returns None when it finds no main content
    # (error pages, paywall shells), so fall back to an empty string.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )
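
# Quick illustrative check (tiny inline HTML; trafilatura may reject very
# short documents, in which case the "" fallback is returned):
#
#   sample = "<html><body><article><p>Hello world.</p></article></body></html>"
#   print(repr(extract_main_text(sample)))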


async def build_context(query: str, k: int = 4) -> str:
    """Assemble a Markdown context block from the top-k news hits."""
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)

    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue

        # Serper dates are free-form ("2 days ago", "Jul 1, 2024", ...);
        # normalize to ISO where possible, else keep the raw string.
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")

        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)

    return "\n---\n".join(chunks) or "No extractable content found."
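
# Each chunk renders roughly as follows (values illustrative):
#
#   ## Some headline
#   **Source:** Reuters **Date:** 2024-07-01
#   https://example.com/article
#
#   Extracted article text...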


async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    # Gradio sliders deliver floats; Serper expects an integer count.
    return await build_context(user_query, int(k))


with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\nFeed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)
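
# Note: demo.launch() below serves only the Gradio UI; the /serper-news
# FastAPI route on `app` is never started. One way to expose both (a sketch,
# assuming Gradio's mount_gradio_app helper and uvicorn as the server):
#
#   import uvicorn
#   app = gr.mount_gradio_app(app, demo, path="/")
#   uvicorn.run(app, host="0.0.0.0", port=7860)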


if __name__ == "__main__":
    demo.launch()