dygoo committed on
Commit
ae517af
·
verified ·
1 Parent(s): 2344737

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -44
app.py CHANGED
@@ -14,59 +14,75 @@ from Gradio_UI import GradioUI
14
 
15
  search_tool = DuckDuckGoSearchTool()
16
 
17
-
18
  @tool
19
  def get_latest_news() -> Dict[str, List[str]]:
20
  """
21
- Tool returns news headlines from news sites.
22
-
23
  Returns:
24
- Dict[str, List[str]]: A dictionary where the keys are the news site URLs and the values are lists of headlines.
25
-
26
- Notes:
27
- The function uses a predefined `news_sites` list and a `site_config` dictionary to determine the HTML tag and class
28
- to extract headlines from each site. The `site_config` dictionary should have the following structure:
29
- {
30
- "site_url": {'tag': 'html_tag', 'class': 'css_class'}
31
- }
32
- If a site is not found in `site_config`, it defaults to {'tag': 'h2', 'class': 'headline'}.
33
  """
34
- news_sites = ["https://www.cnn.com/", "https://www.politico.com/"]
35
  site_config = {
36
- "https://www.cnn.com/": {'tag': 'h2', 'class': 'headline'},
37
- "https://www.politico.com/": {'tag': 'h2', 'class': 'headline'}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  }
39
-
40
  headlines = {}
41
- for site in news_sites:
 
 
 
 
42
  try:
43
- config = site_config.get(site, {'tag': 'h2', 'class': 'headline'})
44
- response = requests.get(site)
45
  response.raise_for_status()
46
  soup = BeautifulSoup(response.content, 'html.parser')
47
- site_headlines = soup.find_all(config['tag'], class_=config['class'])
48
- headlines[site] = [headline.text for headline in site_headlines]
49
- except requests.RequestException as e:
50
- headlines[site] = f"Error fetching news: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  return headlines
52
-
53
-
54
-
55
-
56
- @tool
57
- def get_current_time_in_timezone(timezone: str) -> str:
58
- """A tool that fetches the current local time in a specified timezone.
59
- Args:
60
- timezone: A string representing a valid timezone (e.g., 'America/New_York').
61
- """
62
- try:
63
- # Create timezone object
64
- tz = pytz.timezone(timezone)
65
- # Get current time in that timezone
66
- local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
67
- return f"The current local time in {timezone} is: {local_time}"
68
- except Exception as e:
69
- return f"Error fetching time for timezone '{timezone}': {str(e)}"
70
 
71
 
72
  final_answer = FinalAnswerTool()
@@ -83,15 +99,13 @@ custom_role_conversions=None,
83
  )
84
 
85
 
86
- # Import tool from Hub
87
- image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
88
 
89
  with open("prompts.yaml", 'r') as stream:
90
  prompt_templates = yaml.safe_load(stream)
91
 
92
  agent = CodeAgent(
93
  model=model,
94
- tools=[final_answer, get_current_time_in_timezone,image_generation_tool, search_tool, get_latest_news], ## add your tools here (don't remove final answer)
95
  max_steps=6,
96
  verbosity_level=1,
97
  grammar=None,
 
14
 
15
  search_tool = DuckDuckGoSearchTool()
16
 
 
17
  @tool
18
  def get_latest_news() -> Dict[str, List[str]]:
19
  """
20
+ Tool returns news headlines from news sites with improved scraping configuration.
 
21
  Returns:
22
+ Dict[str, List[str]]: A dictionary where keys are news site URLs and values are lists of headlines.
 
 
 
 
 
 
 
 
23
  """
24
+ # More specific configuration for each site
25
  site_config = {
26
+ "https://www.cnn.com/": {
27
+ 'container_tag': 'div',
28
+ 'container_class': 'container__headline', # Main headline container class
29
+ 'headline_tag': 'span',
30
+ 'headline_class': 'container__headline-text', # Actual headline text class
31
+ 'alternative_tags': [
32
+ {'tag': 'h3', 'class': 'cd__headline'},
33
+ {'tag': 'span', 'class': 'card-text'}
34
+ ]
35
+ },
36
+ "https://www.politico.com/": {
37
+ 'container_tag': 'div',
38
+ 'container_class': 'headline', # Main headline container
39
+ 'headline_tag': 'h3',
40
+ 'headline_class': 'headline__text', # Actual headline text class
41
+ 'alternative_tags': [
42
+ {'tag': 'h2', 'class': 'story-card__title'},
43
+ {'tag': 'h3', 'class': 'media-item__headline'}
44
+ ]
45
+ }
46
  }
47
+
48
  headlines = {}
49
+ headers = {
50
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
51
+ }
52
+
53
+ for site_url, config in site_config.items():
54
  try:
55
+ response = requests.get(site_url, headers=headers, timeout=10)
 
56
  response.raise_for_status()
57
  soup = BeautifulSoup(response.content, 'html.parser')
58
+ site_headlines = set() # Using set to avoid duplicates
59
+
60
+ # Try primary method
61
+ for container in soup.find_all(config['container_tag'], class_=config['container_class']):
62
+ headline = container.find(config['headline_tag'], class_=config['headline_class'])
63
+ if headline and headline.text.strip():
64
+ site_headlines.add(headline.text.strip())
65
+
66
+ # Try alternative tags if primary method didn't work
67
+ if not site_headlines:
68
+ for alt_config in config['alternative_tags']:
69
+ headlines_elements = soup.find_all(alt_config['tag'], class_=alt_config['class'])
70
+ for headline in headlines_elements:
71
+ if headline.text.strip():
72
+ site_headlines.add(headline.text.strip())
73
+
74
+ # Clean up and store results
75
+ headlines[site_url] = list(site_headlines)[:10] # Limit to top 10 headlines
76
+
77
+ # If no headlines found, try a more generic approach
78
+ if not headlines[site_url]:
79
+ all_headlines = soup.find_all(['h1', 'h2', 'h3'], class_=lambda x: x and ('headline' in x.lower() or 'title' in x.lower()))
80
+ headlines[site_url] = [h.text.strip() for h in all_headlines[:10] if h.text.strip()]
81
+
82
+ except Exception as e:
83
+ headlines[site_url] = [f"Error fetching news: {str(e)}"]
84
+
85
  return headlines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
 
88
  final_answer = FinalAnswerTool()
 
99
  )
100
 
101
 
 
 
102
 
103
  with open("prompts.yaml", 'r') as stream:
104
  prompt_templates = yaml.safe_load(stream)
105
 
106
  agent = CodeAgent(
107
  model=model,
108
+ tools=[final_answer, search_tool, get_latest_news], ## add your tools here (don't remove final answer)
109
  max_steps=6,
110
  verbosity_level=1,
111
  grammar=None,