"""
Web Search - Feed LLMs with fresh sources
==========================================
Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil
Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
"""
import os, json, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from pathlib import Path
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
### 1 ─ Serper call -------------------------------------------------------------
async def get_serper_news(query: str, num: int = 4) -> list[dict]:
    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
        return resp.json()["news"]
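# Illustration only (not executed): each item in the returned "news" list is a dict
# whose keys this script relies on ("title", "link", "source", "date"); the example
# values below are assumptions about Serper's response, shown for orientation.
#
#   {
#       "title": "Apple unveils ...",
#       "link": "https://example.com/article",
#       "source": "Example News",
#       "date": "21 hours ago",
#   }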
### 2 ─ Concurrent HTML downloads ----------------------------------------------
async def fetch_html_many(urls: list[str]) -> list[str]:
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        responses = await asyncio.gather(*tasks, return_exceptions=True)
    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # keep positions aligned
        else:
            html_pages.append(r.text)
    return html_pages
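# Note: unlike the Serper call above, these downloads do not call raise_for_status(),
# so a 404/500 page is passed along as-is; trafilatura usually extracts nothing from
# such pages, and the item is then skipped in build_context().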
### 3 ─ Main‑content extraction -------------------------------------------------
def extract_main_text(html: str) -> str:
    if not html:
        return ""
    # Trafilatura auto-detects language, removes boilerplate & returns plain text.
    # extract() returns None when no main content is found, hence the `or ""`.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )
### 4 ─ Orchestration -----------------------------------------------------------
async def build_context(query: str, k: int = 4) -> str:
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)
    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue  # skip if extraction failed
        # Normalise Serper’s relative date (“21 hours ago”) to ISO date
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")
        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)
    return "\n---\n".join(chunks) or "No extractable content found."
### 5 ─ Gradio user interface ---------------------------------------------------
async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    return await build_context(user_query, k)
with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\nFeed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=4, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)
if __name__ == "__main__":
    # Pass share=True to launch() when running on Colab/VMs; edit as you wish.
    demo.launch()
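# Running locally (a sketch): set SERPER_API_KEY in the environment (see the docstring
# above), then `python app.py`; Gradio prints the local URL to open in a browser.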