File size: 7,095 Bytes
e90574b
6ef48c6
 
e90574b
 
 
6ef48c6
e90574b
 
 
6ef48c6
 
 
 
 
 
e90574b
 
6ef48c6
 
 
 
 
 
e90574b
edda836
531d6f9
 
e90574b
6ef48c6
e90574b
 
 
 
531d6f9
 
 
 
 
 
6ef48c6
 
 
 
 
 
 
 
 
 
 
 
 
e90574b
6ef48c6
 
 
 
e90574b
6ef48c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e90574b
 
6ef48c6
 
 
 
 
e90574b
6ef48c6
 
 
e90574b
6ef48c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e90574b
6ef48c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e90574b
 
 
6ef48c6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Web Search MCP Server - Feed LLMs with fresh sources
====================================================

Prerequisites
-------------
$ pip install "gradio[mcp]" httpx trafilatura python-dateutil limits

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"

Usage
-----
python app_mcp.py
Then connect to: http://localhost:7860/gradio_api/mcp/sse
"""

import os
import asyncio
from typing import Optional
import httpx
import trafilatura
import gradio as gr
from dateutil import parser as dateparser
from limits import parse
from limits.aio.storage import MemoryStorage
from limits.aio.strategies import MovingWindowRateLimiter

# Configuration
# The Serper API key is read from the environment at import time; it may be
# None, in which case search_web() returns an error message before any request
# is made with these headers.
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
# NOTE(review): when SERPER_API_KEY is unset, "X-API-KEY" maps to None here;
# this is harmless only because search_web() bails out first — confirm if
# HEADERS is ever used elsewhere.
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

# Rate limiting: in-process, in-memory moving window shared by all callers
# (resets if the process restarts; not shared across workers).
storage = MemoryStorage()
limiter = MovingWindowRateLimiter(storage)
rate_limit = parse("200/hour")  # 200 requests per rolling hour, globally


async def search_web(query: str, num_results: Optional[int] = 4) -> str:
    """
    Search the web for recent news and information, returning extracted content.

    Searches for recent news articles matching *query* via the Serper news API
    and extracts the main readable text from each article with trafilatura.

    Args:
        query (str): The search query. This is REQUIRED. Examples: "apple inc earnings",
                    "climate change 2024", "AI developments"
        num_results (int): Number of results to fetch. This is OPTIONAL. Default is 4.
                          Clamped to the range 1-20. More results = more context but
                          longer response time.

    Returns:
        str: Formatted text containing extracted article content with metadata (title,
             source, date, URL, and main text) for each result, separated by dividers
             and preceded by a short extraction summary. Returns a human-readable
             error message if the API key is missing, the rate limit is exceeded,
             or the search fails — this function never raises.

    Examples:
        - search_web("OpenAI news", 5) - Get 5 recent news articles about OpenAI
        - search_web("python 3.13 features") - Get 4 articles about Python 3.13
        - search_web("stock market today", 10) - Get 10 articles about today's market
    """
    if not SERPER_API_KEY:
        return "Error: SERPER_API_KEY environment variable is not set. Please set it to use this tool."

    # Validate and constrain num_results. Coerce to int first: Gradio sliders
    # and JSON-based MCP clients may deliver the value as a float like 5.0,
    # which would otherwise be forwarded to the API verbatim.
    if num_results is None:
        num_results = 4
    num_results = max(1, min(20, int(num_results)))

    try:
        # Enforce the shared global rate limit before spending an API call.
        if not await limiter.hit(rate_limit, "global"):
            return "Error: Rate limit exceeded. Please try again later (limit: 200 requests per hour)."

        # Query the Serper news endpoint.
        payload = {"q": query, "type": "news", "num": num_results, "page": 1}
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)

        if resp.status_code != 200:
            return f"Error: Search API returned status {resp.status_code}. Please check your API key and try again."

        news_items = resp.json().get("news", [])
        if not news_items:
            return (
                f"No results found for query: '{query}'. Try a different search term."
            )

        # Fetch all article pages concurrently; per-URL failures are tolerated
        # (return_exceptions=True turns them into skippable Exception objects).
        urls = [item.get("link", "") for item in news_items]
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            tasks = [client.get(u) for u in urls]
            responses = await asyncio.gather(*tasks, return_exceptions=True)

        # Extract readable text and format one markdown chunk per article.
        chunks = []
        for meta, response in zip(news_items, responses):
            if isinstance(response, Exception):
                continue  # network error for this one article; skip it

            # Extract main text content, dropping boilerplate and comments.
            body = trafilatura.extract(
                response.text, include_formatting=False, include_comments=False
            )
            if not body:
                continue  # page had no extractable main content

            # Normalize the article date to ISO format when parseable.
            try:
                date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                    "%Y-%m-%d"
                )
            except Exception:
                date_iso = meta.get("date", "Unknown")

            # Use .get() with fallbacks throughout: a single article missing
            # "title"/"source"/"link" must not abort the whole response with a
            # KeyError caught by the outer handler.
            chunk = (
                f"## {meta.get('title', 'Untitled')}\n"
                f"**Source:** {meta.get('source', 'Unknown')}   "
                f"**Date:** {date_iso}\n"
                f"**URL:** {meta.get('link', '')}\n\n"
                f"{body.strip()}\n"
            )
            chunks.append(chunk)

        if not chunks:
            return f"Found {len(news_items)} results for '{query}', but couldn't extract readable content from any of them. The websites might be blocking automated access."

        result = "\n---\n".join(chunks)
        # len(chunks) is exactly the number of successful extractions.
        summary = f"Successfully extracted content from {len(chunks)} out of {len(news_items)} search results for query: '{query}'\n\n---\n\n"

        return summary + result

    except Exception as e:
        return f"Error occurred while searching: {str(e)}. Please try again or check your query."


# Build the Gradio UI; the same components double as the MCP tool surface.
with gr.Blocks(title="Web Search MCP Server") as demo:
    gr.Markdown(
        """
        # πŸ” Web Search MCP Server
        
        This MCP server provides web search capabilities to LLMs. It searches for recent news 
        and extracts the main content from articles.
        
        **Note:** This interface is primarily designed for MCP tool usage by LLMs, but you can 
        also test it manually below.
        """
    )

    # Query text box and result-count slider, side by side.
    with gr.Row():
        query_box = gr.Textbox(
            label="Search Query",
            placeholder='e.g. "OpenAI news", "climate change 2024", "AI developments"',
            info="Required: Enter your search query",
        )
        results_slider = gr.Slider(
            minimum=1,
            maximum=20,
            value=4,
            step=1,
            label="Number of Results",
            info="Optional: How many articles to fetch (default: 4)",
        )

    # Large read-only area for the formatted extraction output.
    results_box = gr.Textbox(
        label="Extracted Content",
        lines=25,
        max_lines=50,
        info="The extracted article content will appear here",
    )

    run_button = gr.Button("Search", variant="primary")

    # Clickable example queries; not cached, since results are fetched live.
    gr.Examples(
        examples=[
            ["OpenAI GPT-5 news", 5],
            ["climate change 2024", 4],
            ["artificial intelligence breakthroughs", 8],
            ["stock market today", 6],
            ["python programming updates", 4],
        ],
        inputs=[query_box, results_slider],
        outputs=results_box,
        fn=search_web,
        cache_examples=False,
    )

    run_button.click(
        fn=search_web, inputs=[query_box, results_slider], outputs=results_box
    )


if __name__ == "__main__":
    # Start the app with the MCP server enabled; the SSE endpoint is served
    # at http://localhost:7860/gradio_api/mcp/sse
    demo.launch(mcp_server=True, show_api=True)