# %%
import requests
from bs4 import BeautifulSoup
import gradio as gr
def parse_news_item(html: str) -> dict:
    """
    Parse the HTML of a news item to extract its link, time, headline, and text.

    Args:
        html: The HTML string of a news item.

    Returns:
        A dictionary containing the link, time, headline, and text.

    Raises:
        Exception: For parsing errors or other unexpected errors.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        # Get the anchor tag containing the link
        link_tag = soup.find("a", href=True)
        link = link_tag["href"] if link_tag else None
        # Get the headline inside <h3 class="story__headline">
        headline_tag = soup.find("h3", class_="story__headline")
        headline = headline_tag.get_text(strip=True) if headline_tag else None
        # Get the summary text inside <p class="story__text">
        text_tag = soup.find("p", class_="story__text")
        text = text_tag.get_text(strip=True) if text_tag else None
        # Get the publication time inside <time>
        time_tag = soup.find("time")
        time = time_tag.get_text(strip=True) if time_tag else None
        return {
            "link": link,
            "time": time,
            "headline": headline,
            "text": text,
        }
    except Exception as e:
        print(f"Error parsing news item: {e}")
        raise
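# %%
# A quick sanity check for parse_news_item. The snippet below is a
# hypothetical fragment that mirrors the selectors used above (an <a> link,
# a <time> tag, h3.story__headline, p.story__text); it is not real udn.com markup.
# sample_html = (
#     '<li><a href="https://money.udn.com/money/story/0/0">'
#     '<time>2025-01-01 10:00</time>'
#     '<h3 class="story__headline">Sample headline</h3>'
#     '<p class="story__text">Sample summary text</p></a></li>'
# )
# parse_news_item(sample_html)
# # -> {'link': 'https://money.udn.com/money/story/0/0', 'time': '2025-01-01 10:00',
# #     'headline': 'Sample headline', 'text': 'Sample summary text'}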
# %%
def search_news(keyword: str, page: int = 1) -> list:
    """
    Fetch news articles matching a keyword from money.udn.com.

    Args:
        keyword: The search keyword for news articles.
        page: The page number to fetch (default is 1).

    Returns:
        A list of rows, each holding the link, time, headline, and text of one article.

    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        url = f"https://money.udn.com/search/result/1001/{keyword}/{page}"
        response = requests.get(url)
        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve data: {response.status_code}")
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.select('div > div > main > section > ul > li')
        results = []
        for article in articles:
            try:
                article_html = article.prettify()
                data = parse_news_item(article_html)
                # Flatten the dict into a row (link, time, headline, text)
                data_list = list(data.values())
                results.append(data_list)
            except Exception as e:
                print(f"Error parsing article: {e}")
                continue
        return results
    except requests.RequestException as e:
        print(f"Network error in search_news: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in search_news: {e}")
        raise
# search_news('台積電', 1) # Example usage to fetch news articles related to '台積電'
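# Each row follows the key order of parse_news_item (link, time, headline, text),
# which matches the DataFrame headers used in main(). Expected shape (illustrative values):
# # -> [['https://money.udn.com/money/story/...', '2025-01-01 10:00', '...', '...'], ...]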
# %%
# Fetch a URL and parse its article content
def get_content(url: str) -> dict:
    """
    Fetch and parse the content of a given URL.

    Args:
        url: The URL to fetch and parse.

    Returns:
        A dictionary containing the link, title, and text content of the page.

    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        response = requests.get(url)
        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve {url}: {response.status_code}")
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use select_one to get the text inside #article_body.
        # This assumes the article content lives in an element with id="article_body".
        article_body = soup.select_one('#article_body')
        text_content = ''
        if article_body:
            text_content = article_body.get_text(separator='\n', strip=True)
        return {
            'link': url,
            'title': soup.title.string if soup.title else 'No title',
            'text': text_content
        }
    except requests.RequestException as e:
        print(f"Network error in get_content: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in get_content: {e}")
        raise
# get_content('https://money.udn.com/money/story/5612/8832289?from=edn_search_result')  # Example usage to fetch content from a specific URL
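# The returned dict is rendered directly by the gr.JSON output in main(), e.g.
# (illustrative values):
# # -> {'link': 'https://money.udn.com/money/story/...', 'title': '...', 'text': '...'}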
# %%
from smolagents import Tool, CodeAgent, LiteLLMModel, ToolCollection, ActionStep, FinalAnswerStep
import os
model_name = os.environ.get("AI_MODEL", "openrouter/qwen/qwen-2.5-coder-32b-instruct:free")
model = LiteLLMModel(model_name, api_key=os.environ["OPENROUTER_API_KEY"])
url = "https://robin0307-newsmcp.hf.space/gradio_api/mcp/sse"
server_parameters = {"url": url, "transport": "sse"}
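# Environment assumptions: AI_MODEL optionally overrides the default model id
# above, and OPENROUTER_API_KEY must be set for LiteLLMModel to reach OpenRouter, e.g.:
#   export OPENROUTER_API_KEY="sk-or-..."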
def newsAgent(task: str) -> str:
    """
    News agent that handles a news-related task via the MCP tools.

    Args:
        task: The task description.

    Returns:
        The result of the task, formatted as Markdown.

    Raises:
        Exception: For errors during agent execution.
    """
    try:
        result = ""
        with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as mcp_tools:
            # Use only the first two tools exposed by the MCP server
            agent = CodeAgent(tools=[*mcp_tools.tools[:2]], model=model)
            for event in agent.run(task, stream=True, max_steps=5):
                if isinstance(event, ActionStep):
                    result += f"\n## ======Step {event.step_number}======\n### Action\n```python\n{event.code_action}\n```\n### Observation\n{event.observations}"
                    # yield result
                if isinstance(event, FinalAnswerStep):
                    result += f"\n## ======Final======\n{event.output}"
                    # yield result
        return result
    except Exception as e:
        error_msg = f"Error in newsAgent: {e}"
        print(error_msg)
        raise Exception(error_msg) from e
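# %%
# The commented-out `yield result` lines above hint at a streaming variant.
# Gradio renders generator outputs incrementally, so a sketch like the one
# below (an assumption, not wired into the UI) would update the Markdown
# panel after every agent step instead of only at the end:
def newsAgentStream(task: str):
    """Streaming variant of newsAgent: yields the accumulated Markdown after each step."""
    with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as mcp_tools:
        agent = CodeAgent(tools=[*mcp_tools.tools[:2]], model=model)
        result = ""
        for event in agent.run(task, stream=True, max_steps=5):
            if isinstance(event, ActionStep):
                result += (
                    f"\n## ======Step {event.step_number}======\n### Action\n"
                    f"```python\n{event.code_action}\n```\n### Observation\n{event.observations}"
                )
                yield result
            if isinstance(event, FinalAnswerStep):
                result += f"\n## ======Final======\n{event.output}"
                yield result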
# %%
# Use Gradio to create three tabs:
# 1. Search news
# 2. Get content from a URL
# 3. News agent
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# News Search and Content Fetcher")
        with gr.Tab("Search News"):
            keyword = gr.Textbox(label="Keyword", placeholder="Enter keyword to search news")
            page = gr.Number(label="Page Number", value=1, step=1)
            search_button = gr.Button("Search")
            search_results = gr.DataFrame(label="Search Results", headers=["Link", "Time", "Headline", "Text"])
            # Examples for the Search News tab
            gr.Examples(
                examples=[
                    ["AI", 1],
                    ["華碩", 2]
                ],
                inputs=[keyword, page],
                outputs=search_results,
                fn=search_news,
                cache_examples=False
            )
            search_button.click(search_news, inputs=[keyword, page], outputs=search_results)
        with gr.Tab("Get Content from URL"):
            url_input = gr.Textbox(label="URL", placeholder="Enter URL to fetch content")
            content_output = gr.JSON(label="Content Output")
            # Examples for the Get Content from URL tab
            gr.Examples(
                examples=[
                    ["https://money.udn.com/money/story/5722/8870335?from=edn_search_result"],
                    ["https://money.udn.com/money/story/5612/8868152?from=edn_search_result"]
                ],
                inputs=[url_input],
                outputs=content_output,
                fn=get_content,
                cache_examples=False
            )
            url_input.submit(get_content, inputs=url_input, outputs=content_output)
        with gr.Tab("News Agent"):
            agent_input = gr.Textbox(label="Task", placeholder="Enter the task")
            # run_button = gr.Button("Run")
            result_output = gr.Markdown(label="Result")
            # Examples for the News Agent tab
            gr.Examples(
                examples=[
                    ["華碩今日新聞"],
                    ["華碩和Nvidia今日新聞"]
                ],
                inputs=[agent_input],
                outputs=result_output,
                fn=newsAgent,
                cache_examples=True
            )
            agent_input.submit(newsAgent, inputs=agent_input, outputs=result_output)
    # mcp_server=True also exposes the app's API endpoints as MCP tools
    demo.launch(mcp_server=True, server_name="0.0.0.0", allowed_paths=["/"], share=True)

if __name__ == "__main__":
    main()