import os

import requests
from bs4 import BeautifulSoup
import gradio as gr

from smolagents import ActionStep, CodeAgent, FinalAnswerStep, LiteLLMModel, ToolCollection
|
|
def parse_news_item(html: str) -> dict:
    """
    Parse the HTML of a news item to extract its link, time, headline, and text.

    Args:
        html: The HTML string of a news item.

    Returns:
        A dictionary containing link, time, headline, and text.

    Raises:
        Exception: For parsing errors or other unexpected errors.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")

        # Every field is optional: fall back to None when a tag is missing.
        link_tag = soup.find("a", href=True)
        link = link_tag["href"] if link_tag else None

        headline_tag = soup.find("h3", class_="story__headline")
        headline = headline_tag.get_text(strip=True) if headline_tag else None

        text_tag = soup.find("p", class_="story__text")
        text = text_tag.get_text(strip=True) if text_tag else None

        time_tag = soup.find("time")
        time = time_tag.get_text(strip=True) if time_tag else None

        return {
            "link": link,
            "time": time,
            "headline": headline,
            "text": text,
        }
    except Exception as e:
        print(f"Error parsing news item: {e}")
        raise
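
# A minimal sketch of what parse_news_item expects. _SAMPLE_ITEM is
# hypothetical HTML that merely mirrors the selectors used above; it is not
# real udn.com markup.
_SAMPLE_ITEM = """
<li>
  <a href="https://example.com/story/1">
    <h3 class="story__headline">Sample headline</h3>
    <p class="story__text">Sample summary text.</p>
    <time>2025-01-01 12:00</time>
  </a>
</li>
"""
# parse_news_item(_SAMPLE_ITEM) would return:
# {"link": "https://example.com/story/1", "time": "2025-01-01 12:00",
#  "headline": "Sample headline", "text": "Sample summary text."}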
|
|
def search_news(keyword: str, page: int = 1) -> list:
    """
    Fetch news articles related to a keyword from money.udn.com.

    Args:
        keyword: The search keyword for news articles.
        page: The page number to fetch (default is 1).

    Returns:
        A list of [link, time, headline, text] rows, one per news article.

    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        # Gradio's Number component delivers floats, so coerce the page to an
        # int before building the URL path.
        url = f"https://money.udn.com/search/result/1001/{keyword}/{int(page)}"
        response = requests.get(url, timeout=10)

        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve data: {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("div > div > main > section > ul > li")

        results = []
        for article in articles:
            try:
                # Re-serialize each <li> and reuse the single-item parser.
                article_html = article.prettify()
                data = parse_news_item(article_html)

                # Flatten the dict into a row for the Gradio DataFrame.
                results.append(list(data.values()))
            except Exception as e:
                print(f"Error parsing article: {e}")
                continue

        return results
    except requests.RequestException as e:
        print(f"Network error in search_news: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in search_news: {e}")
        raise
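
# Illustrative usage (commented out because it performs a live network request):
#
#   rows = search_news("AI", page=1)
#   for link, time_, headline, text in rows:
#       print(headline, "->", link)
#
# Each row lines up with the DataFrame headers used in main():
# Link, Time, Headline, Text.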
|
|
def get_content(url: str) -> dict:
    """
    Fetch and parse the content of a given URL.

    Args:
        url: The URL to fetch and parse.

    Returns:
        A dictionary containing the link, title, and text content of the page.

    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        response = requests.get(url, timeout=10)

        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve {url}: {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")

        # udn.com article pages keep the body text under #article_body.
        article_body = soup.select_one("#article_body")
        text_content = ""
        if article_body:
            text_content = article_body.get_text(separator="\n", strip=True)

        return {
            "link": url,
            "title": soup.title.string if soup.title else "No title",
            "text": text_content,
        }
    except requests.RequestException as e:
        print(f"Network error in get_content: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in get_content: {e}")
        raise
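
# Illustrative usage (commented out; it fetches a live page). The URL is one of
# the examples wired into the Gradio tab in main():
#
#   article = get_content("https://money.udn.com/money/story/5722/8870335?from=edn_search_result")
#   print(article["title"])
#   print(article["text"][:200])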
|
|
# Model and MCP server configuration. OPENROUTER_API_KEY must be set in the
# environment; AI_MODEL optionally overrides the default OpenRouter model.
model_name = os.environ.get("AI_MODEL", "openrouter/qwen/qwen-2.5-coder-32b-instruct:free")
model = LiteLLMModel(model_name, api_key=os.environ["OPENROUTER_API_KEY"])
url = "https://robin0307-newsmcp.hf.space/gradio_api/mcp/sse"
server_parameters = {"url": url, "transport": "sse"}
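
# A quick, optional sanity check of the MCP connection; tool names depend on
# what the deployed Gradio app exposes, so none are assumed here:
#
#   with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as tc:
#       print([t.name for t in tc.tools])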
|
|
def newsAgent(task: str) -> str:
    """
    Run the news agent on a news-related task using the remote MCP tools.

    Args:
        task: The task description.

    Returns:
        A Markdown log of the agent's intermediate steps and its final answer.

    Raises:
        Exception: For errors during agent execution.
    """
    try:
        result = ""
        with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as mcp_tools:
            # Only the first two tools exposed by the MCP server are handed to
            # the agent.
            agent = CodeAgent(tools=[*mcp_tools.tools[:2]], model=model)
            for event in agent.run(task, stream=True, max_steps=5):
                if isinstance(event, ActionStep):
                    result += (
                        f"\n## ======Step {event.step_number}======\n"
                        f"### Action\n```python\n{event.code_action}\n```\n"
                        f"### Observation\n{event.observations}"
                    )
                if isinstance(event, FinalAnswerStep):
                    result += f"\n## ======Final======\n{event.output}"
        return result
    except Exception as e:
        error_msg = f"Error in newsAgent: {e}"
        print(error_msg)
        raise Exception(error_msg) from e
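
# Illustrative call (requires OPENROUTER_API_KEY and a reachable MCP server):
#
#   report = newsAgent("華碩今日新聞")  # "ASUS news today"
#
# The returned Markdown contains one "======Step N======" block per ActionStep
# (generated code plus its observation) and ends with a "======Final======"
# block holding the agent's answer.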
|
|
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# News Search and Content Fetcher")

        with gr.Tab("Search News"):
            keyword = gr.Textbox(label="Keyword", placeholder="Enter keyword to search news")
            # precision=0 makes the component return an integer page number.
            page = gr.Number(label="Page Number", value=1, step=1, precision=0)
            search_button = gr.Button("Search")
            search_results = gr.DataFrame(label="Search Results", headers=["Link", "Time", "Headline", "Text"])

            gr.Examples(
                examples=[
                    ["AI", 1],
                    ["華碩", 2],  # "ASUS"
                ],
                inputs=[keyword, page],
                outputs=search_results,
                fn=search_news,
                cache_examples=False,
            )
            search_button.click(search_news, inputs=[keyword, page], outputs=search_results)

        with gr.Tab("Get Content from URL"):
            url_input = gr.Textbox(label="URL", placeholder="Enter URL to fetch content")
            content_output = gr.JSON(label="Content Output")

            gr.Examples(
                examples=[
                    ["https://money.udn.com/money/story/5722/8870335?from=edn_search_result"],
                    ["https://money.udn.com/money/story/5612/8868152?from=edn_search_result"],
                ],
                inputs=[url_input],
                outputs=content_output,
                fn=get_content,
                cache_examples=False,
            )
            url_input.submit(get_content, inputs=url_input, outputs=content_output)

        with gr.Tab("News Agent"):
            agent_input = gr.Textbox(label="Task", placeholder="Enter the task")
            result_output = gr.Markdown(label="Result")

            gr.Examples(
                examples=[
                    ["華碩今日新聞"],  # "ASUS news today"
                    ["華碩和Nvidia今日新聞"],  # "ASUS and Nvidia news today"
                ],
                inputs=[agent_input],
                outputs=result_output,
                fn=newsAgent,
                # Cached examples are generated by running newsAgent once at
                # startup, which needs OPENROUTER_API_KEY and a reachable MCP
                # server.
                cache_examples=True,
            )
            agent_input.submit(newsAgent, inputs=agent_input, outputs=result_output)

    # Note: allowed_paths=["/"] permits Gradio to serve any file on the host
    # filesystem; narrow this in production.
    demo.launch(mcp_server=True, server_name="0.0.0.0", allowed_paths=["/"], share=True)
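
# With mcp_server=True, Gradio additionally exposes this app's functions as MCP
# tools at <host>/gradio_api/mcp/sse, the same endpoint shape as the remote
# server used in server_parameters above (assuming a Gradio version with MCP
# support installed).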
|
|
if __name__ == "__main__":
    main()