# %%
import requests
from bs4 import BeautifulSoup
import gradio as gr
def parse_news_item(html: str) -> dict:
    """
    Parse the HTML of a news item to extract its link, time, headline, and text.

    Args:
        html: The HTML string of a news item.

    Returns:
        A dictionary containing the link, time, headline, and text.

    Raises:
        Exception: For parsing errors or other unexpected errors.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        # Get the anchor tag containing the link
        link_tag = soup.find("a", href=True)
        link = link_tag["href"] if link_tag else None
        # Get the headline inside <h3 class="story__headline">
        headline_tag = soup.find("h3", class_="story__headline")
        headline = headline_tag.get_text(strip=True) if headline_tag else None
        # Get the summary text inside <p class="story__text">
        text_tag = soup.find("p", class_="story__text")
        text = text_tag.get_text(strip=True) if text_tag else None
        # Get the publication time inside <time>
        time_tag = soup.find("time")
        time = time_tag.get_text(strip=True) if time_tag else None
        return {
            "link": link,
            "time": time,
            "headline": headline,
            "text": text,
        }
    except Exception as e:
        print(f"Error parsing news item: {e}")
        raise
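# %%
# A quick sanity check for parse_news_item. The snippet below is a
# hypothetical fragment that mirrors the selectors used above (an <a> link,
# a <time> tag, h3.story__headline, p.story__text); it is not real udn.com markup.
# sample_html = (
#     '<li><a href="https://money.udn.com/money/story/0/0">'
#     '<time>2025-01-01 10:00</time>'
#     '<h3 class="story__headline">Sample headline</h3>'
#     '<p class="story__text">Sample summary text</p></a></li>'
# )
# parse_news_item(sample_html)
# # -> {'link': 'https://money.udn.com/money/story/0/0', 'time': '2025-01-01 10:00',
# #     'headline': 'Sample headline', 'text': 'Sample summary text'}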
# %%
def search_news(keyword: str, page: int = 1) -> list:
    """
    Fetch news articles matching a keyword from money.udn.com.

    Args:
        keyword: The search keyword for news articles.
        page: The page number to fetch (default is 1).

    Returns:
        A list of rows, each holding the link, time, headline, and text of one article.

    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        url = f"https://money.udn.com/search/result/1001/{keyword}/{page}"
        response = requests.get(url)
        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve data: {response.status_code}")
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.select('div > div > main > section > ul > li')
        results = []
        for article in articles:
            try:
                article_html = article.prettify()
                data = parse_news_item(article_html)
                # Flatten the dict into a row (link, time, headline, text)
                data_list = list(data.values())
                results.append(data_list)
            except Exception as e:
                print(f"Error parsing article: {e}")
                continue
        return results
    except requests.RequestException as e:
        print(f"Network error in search_news: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in search_news: {e}")
        raise
# search_news('台積電', 1) # Example usage to fetch news articles related to '台積電'
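# Each row follows the key order of parse_news_item (link, time, headline, text),
# which matches the DataFrame headers used in main(). Expected shape (illustrative values):
# # -> [['https://money.udn.com/money/story/...', '2025-01-01 10:00', '...', '...'], ...]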
# %%
# Fetch a URL and parse its article content
def get_content(url: str) -> dict:
    """
    Fetch and parse the content of a given URL.

    Args:
        url: The URL to fetch and parse.

    Returns:
        A dictionary containing the link, title, and text content of the page.

    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        response = requests.get(url)
        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve {url}: {response.status_code}")
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use select_one to get the text inside #article_body.
        # This assumes the article content lives in an element with id="article_body".
        article_body = soup.select_one('#article_body')
        text_content = ''
        if article_body:
            text_content = article_body.get_text(separator='\n', strip=True)
        return {
            'link': url,
            'title': soup.title.string if soup.title else 'No title',
            'text': text_content
        }
    except requests.RequestException as e:
        print(f"Network error in get_content: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in get_content: {e}")
        raise
# get_content('https://money.udn.com/money/story/5612/8832289?from=edn_search_result')  # Example usage to fetch content from a specific URL
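# The returned dict is rendered directly by the gr.JSON output in main(), e.g.
# (illustrative values):
# # -> {'link': 'https://money.udn.com/money/story/...', 'title': '...', 'text': '...'}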
# %%
from smolagents import Tool, CodeAgent, LiteLLMModel, ToolCollection, ActionStep, FinalAnswerStep
import os
model_name = os.environ.get("AI_MODEL", "openrouter/qwen/qwen-2.5-coder-32b-instruct:free")
model = LiteLLMModel(model_name, api_key=os.environ["OPENROUTER_API_KEY"])
url = "https://robin0307-newsmcp.hf.space/gradio_api/mcp/sse"
server_parameters = {"url": url, "transport": "sse"}
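# Environment assumptions: AI_MODEL optionally overrides the default model id
# above, and OPENROUTER_API_KEY must be set for LiteLLMModel to reach OpenRouter, e.g.:
#   export OPENROUTER_API_KEY="sk-or-..."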
def newsAgent(task: str) -> str:
    """
    News agent that handles a news-related task via the MCP tools.

    Args:
        task: The task description.

    Returns:
        The result of the task, formatted as Markdown.

    Raises:
        Exception: For errors during agent execution.
    """
    try:
        result = ""
        with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as mcp_tools:
            # Use only the first two tools exposed by the MCP server
            agent = CodeAgent(tools=[*mcp_tools.tools[:2]], model=model)
            for event in agent.run(task, stream=True, max_steps=5):
                if isinstance(event, ActionStep):
                    result += f"\n## ======Step {event.step_number}======\n### Action\n```python\n{event.code_action}\n```\n### Observation\n{event.observations}"
                    # yield result
                if isinstance(event, FinalAnswerStep):
                    result += f"\n## ======Final======\n{event.output}"
                    # yield result
        return result
    except Exception as e:
        error_msg = f"Error in newsAgent: {e}"
        print(error_msg)
        raise Exception(error_msg) from e
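# %%
# The commented-out `yield result` lines above hint at a streaming variant.
# Gradio renders generator outputs incrementally, so a sketch like the one
# below (an assumption, not wired into the UI) would update the Markdown
# panel after every agent step instead of only at the end:
def newsAgentStream(task: str):
    """Streaming variant of newsAgent: yields the accumulated Markdown after each step."""
    with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as mcp_tools:
        agent = CodeAgent(tools=[*mcp_tools.tools[:2]], model=model)
        result = ""
        for event in agent.run(task, stream=True, max_steps=5):
            if isinstance(event, ActionStep):
                result += (
                    f"\n## ======Step {event.step_number}======\n### Action\n"
                    f"```python\n{event.code_action}\n```\n### Observation\n{event.observations}"
                )
                yield result
            if isinstance(event, FinalAnswerStep):
                result += f"\n## ======Final======\n{event.output}"
                yield result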
# %%
# Use Gradio to create three tabs:
# 1. Search news
# 2. Get content from a URL
# 3. News agent
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# News Search and Content Fetcher")
        with gr.Tab("Search News"):
            keyword = gr.Textbox(label="Keyword", placeholder="Enter keyword to search news")
            page = gr.Number(label="Page Number", value=1, step=1)
            search_button = gr.Button("Search")
            search_results = gr.DataFrame(label="Search Results", headers=["Link", "Time", "Headline", "Text"])
            # Examples for the Search News tab
            gr.Examples(
                examples=[
                    ["AI", 1],
                    ["華碩", 2]
                ],
                inputs=[keyword, page],
                outputs=search_results,
                fn=search_news,
                cache_examples=False
            )
            search_button.click(search_news, inputs=[keyword, page], outputs=search_results)
        with gr.Tab("Get Content from URL"):
            url_input = gr.Textbox(label="URL", placeholder="Enter URL to fetch content")
            content_output = gr.JSON(label="Content Output")
            # Examples for the Get Content from URL tab
            gr.Examples(
                examples=[
                    ["https://money.udn.com/money/story/5722/8870335?from=edn_search_result"],
                    ["https://money.udn.com/money/story/5612/8868152?from=edn_search_result"]
                ],
                inputs=[url_input],
                outputs=content_output,
                fn=get_content,
                cache_examples=False
            )
            url_input.submit(get_content, inputs=url_input, outputs=content_output)
        with gr.Tab("News Agent"):
            agent_input = gr.Textbox(label="Task", placeholder="Enter the task")
            # run_button = gr.Button("Run")
            result_output = gr.Markdown(label="Result")
            # Examples for the News Agent tab
            gr.Examples(
                examples=[
                    ["華碩今日新聞"],
                    ["華碩和Nvidia今日新聞"]
                ],
                inputs=[agent_input],
                outputs=result_output,
                fn=newsAgent,
                cache_examples=True
            )
            agent_input.submit(newsAgent, inputs=agent_input, outputs=result_output)
    # mcp_server=True also exposes the app's API endpoints as MCP tools
    demo.launch(mcp_server=True, server_name="0.0.0.0", allowed_paths=["/"], share=True)

if __name__ == "__main__":
    main()