""" | |
Web Search MCP Server - Feed LLMs with fresh sources | |
==================================================== | |
Prerequisites | |
------------- | |
$ pip install "gradio[mcp]" httpx trafilatura python-dateutil limits | |
Environment | |
----------- | |
export SERPER_API_KEY="YOUR-KEY-HERE" | |
Usage | |
----- | |
python app_mcp.py | |
Then connect to: http://localhost:7860/gradio_api/mcp/sse | |
""" | |
import os
import asyncio
from typing import Optional

import httpx
import trafilatura
import gradio as gr
from dateutil import parser as dateparser
from limits import parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter

# Configuration
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

# Rate limiting
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")
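# Note: MemoryStorage keeps the moving-window counters in this process only;
# if the app is served by multiple workers, each worker enforces its own
# 200/hour budget rather than a shared global limit.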


async def search_web(query: str, num_results: Optional[int] = 4) -> str:
    """
    Search the web for recent news and information, returning extracted content.

    This tool searches for recent news articles related to your query and extracts
    the main content from each article, providing fresh, relevant information from
    the web.

    Args:
        query (str): The search query. This is REQUIRED. Examples: "apple inc earnings",
            "climate change 2024", "AI developments"
        num_results (int, optional): Number of results to fetch. Default is 4.
            Range: 1-20. More results give more context but take longer to return.

    Returns:
        str: Formatted text containing extracted article content with metadata (title,
            source, date, URL, and main text) for each result, separated by dividers.
            Returns an error message if the API key is missing or the search fails.

    Examples:
        - search_web("OpenAI news", 5) - Get 5 recent news articles about OpenAI
        - search_web("python 3.13 features") - Get 4 articles about Python 3.13
        - search_web("stock market today", 10) - Get 10 articles about today's market
    """
    if not SERPER_API_KEY:
        return "Error: SERPER_API_KEY environment variable is not set. Please set it to use this tool."

    # Validate and constrain num_results
    if num_results is None:
        num_results = 4
    num_results = max(1, min(20, num_results))

    try:
        # Check rate limit
        if not await limiter.hit(rate_limit, "global"):
            return "Error: Rate limit exceeded. Please try again later (limit: 200 requests per hour)."

        # Search for news
        payload = {"q": query, "type": "news", "num": num_results, "page": 1}
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        if resp.status_code != 200:
            return f"Error: Search API returned status {resp.status_code}. Please check your API key and try again."

        news_items = resp.json().get("news", [])
        if not news_items:
            return (
                f"No results found for query: '{query}'. Try a different search term."
            )

        # Fetch HTML content concurrently
        urls = [n["link"] for n in news_items]
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            tasks = [client.get(u) for u in urls]
            responses = await asyncio.gather(*tasks, return_exceptions=True)

        # Extract and format content
        chunks = []
        successful_extractions = 0
        for meta, response in zip(news_items, responses):
            if isinstance(response, Exception):
                continue

            # Extract main text content
            body = trafilatura.extract(
                response.text, include_formatting=False, include_comments=False
            )
            if not body:
                continue
            successful_extractions += 1

            # Parse and format date
            try:
                date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                    "%Y-%m-%d"
                )
            except Exception:
                date_iso = meta.get("date", "Unknown")

            # Format the chunk
            chunk = (
                f"## {meta['title']}\n"
                f"**Source:** {meta['source']} "
                f"**Date:** {date_iso}\n"
                f"**URL:** {meta['link']}\n\n"
                f"{body.strip()}\n"
            )
            chunks.append(chunk)

        if not chunks:
            return f"Found {len(news_items)} results for '{query}', but couldn't extract readable content from any of them. The websites might be blocking automated access."

        result = "\n---\n".join(chunks)
        summary = f"Successfully extracted content from {successful_extractions} out of {len(news_items)} search results for query: '{query}'\n\n---\n\n"
        return summary + result

    except Exception as e:
        return f"Error occurred while searching: {str(e)}. Please try again or check your query."

# Create Gradio interface
with gr.Blocks(title="Web Search MCP Server") as demo:
    gr.Markdown(
        """
        # 🔍 Web Search MCP Server

        This MCP server provides web search capabilities to LLMs. It searches for
        recent news and extracts the main content from articles.

        **Note:** This interface is primarily designed for MCP tool usage by LLMs,
        but you can also test it manually below.
        """
    )

    with gr.Row():
        query_input = gr.Textbox(
            label="Search Query",
            placeholder='e.g. "OpenAI news", "climate change 2024", "AI developments"',
            info="Required: Enter your search query",
        )
        num_results_input = gr.Slider(
            minimum=1,
            maximum=20,
            value=4,
            step=1,
            label="Number of Results",
            info="Optional: How many articles to fetch (default: 4)",
        )

    output = gr.Textbox(
        label="Extracted Content",
        lines=25,
        max_lines=50,
        info="The extracted article content will appear here",
    )

    search_button = gr.Button("Search", variant="primary")

    # Add examples
    gr.Examples(
        examples=[
            ["OpenAI GPT-5 news", 5],
            ["climate change 2024", 4],
            ["artificial intelligence breakthroughs", 8],
            ["stock market today", 6],
            ["python programming updates", 4],
        ],
        inputs=[query_input, num_results_input],
        outputs=output,
        fn=search_web,
        cache_examples=False,
    )

    search_button.click(
        fn=search_web, inputs=[query_input, num_results_input], outputs=output
    )


if __name__ == "__main__":
    # Launch with MCP server enabled
    # The MCP endpoint will be available at: http://localhost:7860/gradio_api/mcp/sse
    demo.launch(mcp_server=True, show_api=True)