victor HF Staff committed on
Commit e90574b · 1 Parent(s): add19e8

Add Gradio web search application and update README with usage instructions

Files changed (3)
  1. README.md +28 -1
  2. app.py +104 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -9,4 +9,31 @@ app_file: app.py
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Gradio News-to-Context Service
+
+ ## Prerequisites
+
+ `$ pip install gradio httpx trafilatura python-dateutil`
+
+ ## Environment
+
+ `export SERPER_API_KEY="YOUR-KEY-HERE"`
+
+ ## How it works – design notes
+
+ | Step | Technique | Why it matters |
+ |---|---|---|
+ | API search | Serper’s Google News JSON endpoint | Fast, cost-effective and immune to Google’s bot-blocking. |
+ | Concurrency | `httpx.AsyncClient` + `asyncio.gather` | Fetches 10 articles in < 2 s on typical broadband. |
+ | Extraction | Trafilatura | Consistently tops accuracy charts for main-content extraction; needs no browser or heavy ML models. |
+ | Date parsing | `python-dateutil` | Converts fuzzy strings (“16 hours ago”) into ISO YYYY-MM-DD so the LLM sees absolute dates. |
+ | LLM-friendly output | Markdown headings and horizontal rules | Chunk boundaries are explicit; hyperlinks are preserved for optional citation. |
+
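+ For reference, each article that survives extraction is emitted as a chunk shaped like this (values illustrative; see `build_context` in `app.py` below):
+
+ ```markdown
+ ## Example headline
+ **Source:** Example News **Date:** 2024-01-16
+ https://example.com/article
+
+ First paragraphs of the extracted body…
+ ```
+
+ Chunks are joined with `\n---\n`, so every article sits between explicit horizontal rules.
+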
+ ## Extending in production
+
+ * **Caching** – add `aiocache` or Redis to avoid re-fetching identical URLs within a TTL.
+ * **Long-content trimming** – if each article can exceed your LLM’s context window, pipe `body` through a sentence ranker or a GPT-based summariser before concatenation.
+ * **Paywalls / PDFs** – guard `extract_main_text` with fallback libraries (e.g. `readability-lxml` or `pymupdf`) for unusual formats.
+ * **Rate-limiting** – Serper’s free tier allows 100 req/day; wrap the call with exponential backoff on HTTP 429, as sketched below.
+
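+ A minimal sketch of that backoff guard (assumptions: a hypothetical `post_with_backoff` helper that is not part of `app.py`; four attempts with doubling waits):
+
+ ```python
+ import asyncio
+ import httpx
+
+ async def post_with_backoff(client: httpx.AsyncClient, url: str, retries: int = 4, **kw) -> httpx.Response:
+     """Hypothetical helper: POST, sleeping 1 s, 2 s, 4 s… whenever the server answers HTTP 429."""
+     for attempt in range(retries):
+         resp = await client.post(url, **kw)
+         if resp.status_code != 429:
+             resp.raise_for_status()
+             return resp
+         await asyncio.sleep(2 ** attempt)  # exponential backoff before retrying
+     resp.raise_for_status()  # out of retries: surface the final 429
+     return resp
+ ```
+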
+ Drop this file into any Python 3.10+ environment, set `SERPER_API_KEY`, pip install the four libraries, and you have a ready-to-embed “query → context” micro-service for your LLM pipeline.
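+
+ A minimal sketch of that embedding, assuming `app.py` sits in the working directory and `SERPER_API_KEY` is set:
+
+ ```python
+ import asyncio
+ from app import build_context  # importing builds the Gradio UI but does not launch it
+
+ context = asyncio.run(build_context("apple inc", k=5))
+ print(context[:500])  # pass `context` into your LLM prompt instead of printing
+ ```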
app.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Web Search - Feed LLMs with fresh sources
+ ==========================================
+
+ Prerequisites
+ -------------
+ $ pip install gradio httpx trafilatura python-dateutil
+
+ Environment
+ -----------
+ export SERPER_API_KEY="YOUR-KEY-HERE"
+ """
+
+ import os, json, asyncio, httpx, trafilatura, gradio as gr
+ from dateutil import parser as dateparser
+ from pathlib import Path
+
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
+ SERPER_ENDPOINT = "https://google.serper.dev/news"
+ HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
+
+
+ ### 1 ─ Serper call -------------------------------------------------------------
+ async def get_serper_news(query: str, num: int = 10) -> list[dict]:
+     payload = {"q": query, "type": "news", "num": num, "page": 1}
+     async with httpx.AsyncClient(timeout=15) as client:
+         resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
+         resp.raise_for_status()
+         return resp.json()["news"]
+
+
+ ### 2 ─ Concurrent HTML downloads ----------------------------------------------
+ async def fetch_html_many(urls: list[str]) -> list[str]:
+     async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
+         tasks = [client.get(u) for u in urls]
+         responses = await asyncio.gather(*tasks, return_exceptions=True)
+     html_pages = []
+     for r in responses:
+         if isinstance(r, Exception):
+             html_pages.append("")  # keep positions aligned
+         else:
+             html_pages.append(r.text)
+     return html_pages
+
+
+ ### 3 ─ Main-content extraction -------------------------------------------------
+ def extract_main_text(html: str) -> str:
+     if not html:
+         return ""
+     # Trafilatura auto-detects language, removes boilerplate & returns plain text.
+     return (
+         trafilatura.extract(html, include_formatting=False, include_comments=False)
+         or ""
+     )
+
+
+ ### 4 ─ Orchestration -----------------------------------------------------------
+ async def build_context(query: str, k: int = 10) -> str:
+     news_items = await get_serper_news(query, num=k)
+     urls = [n["link"] for n in news_items]
+     raw_pages = await fetch_html_many(urls)
+
+     chunks = []
+     for meta, html in zip(news_items, raw_pages):
+         body = extract_main_text(html)
+         if not body:
+             continue  # skip if extraction failed
+         # Normalise Serper’s relative date (“21 hours ago”) to an ISO date
+         try:
+             date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
+                 "%Y-%m-%d"
+             )
+         except Exception:
+             date_iso = meta.get("date", "")
+         chunk = (
+             f"## {meta['title']}\n"
+             f"**Source:** {meta['source']} "
+             f"**Date:** {date_iso}\n"
+             f"{meta['link']}\n\n"
+             f"{body.strip()}\n"
+         )
+         chunks.append(chunk)
+
+     return "\n---\n".join(chunks) or "No extractable content found."
+
+
+ ### 5 ─ Gradio user interface ---------------------------------------------------
+ async def handler(user_query: str, k: int) -> str:
+     if not SERPER_API_KEY:
+         return "✖️ SERPER_API_KEY is not set."
+     return await build_context(user_query, k)
+
+
+ with gr.Blocks(title="WebSearch") as demo:
+     gr.Markdown("# 🔍 Web Search\n" "Feed LLMs with fresh sources.")
+     query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
+     top_k = gr.Slider(1, 20, value=10, step=1, label="How many results?")
+     out = gr.Textbox(label="Extracted Context", lines=25)
+     run = gr.Button("Fetch")
+     run.click(handler, inputs=[query, top_k], outputs=out)
+
+ if __name__ == "__main__":
+     # Pass share=True here for a public link when running on Colab/VMs; edit as you wish.
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ httpx
+ trafilatura
+ python-dateutil