dygoo committed on
Commit 0125826 · verified · 1 Parent(s): ae517af

Update app.py

Files changed (1)
  1. app.py +84 -39
app.py CHANGED
@@ -17,34 +17,70 @@ search_tool = DuckDuckGoSearchTool()
 @tool
 def get_latest_news() -> Dict[str, List[str]]:
     """
-    Tool returns news headlines from news sites with improved scraping configuration.
+    Tool returns legitimate news headlines while filtering out ads and sponsored content.
     Returns:
         Dict[str, List[str]]: A dictionary where keys are news site URLs and values are lists of headlines.
     """
-    # More specific configuration for each site
     site_config = {
         "https://www.cnn.com/": {
-            'container_tag': 'div',
-            'container_class': 'container__headline',  # Main headline container class
-            'headline_tag': 'span',
-            'headline_class': 'container__headline-text',  # Actual headline text class
-            'alternative_tags': [
-                {'tag': 'h3', 'class': 'cd__headline'},
-                {'tag': 'span', 'class': 'card-text'}
-            ]
+            'sections': [
+                {
+                    'container': {'tag': 'div', 'class': 'zone__content'},
+                    'headline': {'tag': 'span', 'class': 'container__headline-text'}
+                },
+                {
+                    'container': {'tag': 'div', 'class': 'card'},
+                    'headline': {'tag': 'span', 'class': 'headline__text'}
+                }
+            ],
+            'exclude_classes': ['ad', 'sponsor', 'paid', 'promoted', 'advertisement'],
+            'exclude_parents': ['div.ad-feedback', 'div.commercial']
         },
         "https://www.politico.com/": {
-            'container_tag': 'div',
-            'container_class': 'headline',  # Main headline container
-            'headline_tag': 'h3',
-            'headline_class': 'headline__text',  # Actual headline text class
-            'alternative_tags': [
-                {'tag': 'h2', 'class': 'story-card__title'},
-                {'tag': 'h3', 'class': 'media-item__headline'}
-            ]
+            'sections': [
+                {
+                    'container': {'tag': 'div', 'class': 'story-grid'},
+                    'headline': {'tag': 'h3', 'class': 'headline'}
+                },
+                {
+                    'container': {'tag': 'article', 'class': 'story-feed__story'},
+                    'headline': {'tag': 'h3', 'class': 'story-feed__headline'}
+                }
+            ],
+            'exclude_classes': ['ad', 'sponsor', 'paid-content', 'promoted'],
+            'exclude_parents': ['div.advertisement', 'div.sponsor-content']
         }
     }
 
+    def is_valid_headline(element, site_config) -> bool:
+        """Check if the headline is legitimate news and not an advertisement."""
+        # Check if element or its parents have excluded classes
+        for excluded in site_config['exclude_classes']:
+            if any(excluded in cls.lower() for cls in element.get('class', [])):
+                return False
+
+        # Check parent elements
+        parent = element.parent
+        while parent:
+            if parent.get('class'):
+                # Check for excluded parent classes
+                if any(excluded in ' '.join(parent.get('class', [])).lower()
+                       for excluded in site_config['exclude_classes']):
+                    return False
+            parent = parent.parent
+
+        # Check for common ad indicators in text
+        text = element.text.lower().strip()
+        ad_indicators = ['sponsored', 'advertisement', 'promoted', 'paid content', 'partner content']
+        if any(indicator in text for indicator in ad_indicators):
+            return False
+
+        # Check minimum length to filter out potential button text or incomplete headlines
+        if len(text) < 20:
+            return False
+
+        return True
+
     headlines = {}
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
@@ -55,29 +91,38 @@ def get_latest_news() -> Dict[str, List[str]]:
             response = requests.get(site_url, headers=headers, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
-            site_headlines = set()  # Using set to avoid duplicates
-
-            # Try primary method
-            for container in soup.find_all(config['container_tag'], class_=config['container_class']):
-                headline = container.find(config['headline_tag'], class_=config['headline_class'])
-                if headline and headline.text.strip():
-                    site_headlines.add(headline.text.strip())
-
-            # Try alternative tags if primary method didn't work
-            if not site_headlines:
-                for alt_config in config['alternative_tags']:
-                    headlines_elements = soup.find_all(alt_config['tag'], class_=alt_config['class'])
+            site_headlines = set()
+
+            # Try each configured section
+            for section in config['sections']:
+                containers = soup.find_all(section['container']['tag'],
+                                           class_=section['container']['class'])
+
+                for container in containers:
+                    headlines_elements = container.find_all(
+                        section['headline']['tag'],
+                        class_=section['headline']['class']
+                    )
+
                     for headline in headlines_elements:
-                        if headline.text.strip():
-                            site_headlines.add(headline.text.strip())
+                        if is_valid_headline(headline, config):
+                            clean_text = ' '.join(headline.text.strip().split())
+                            if clean_text:
+                                site_headlines.add(clean_text)
 
-            # Clean up and store results
-            headlines[site_url] = list(site_headlines)[:10]  # Limit to top 10 headlines
-
-            # If no headlines found, try a more generic approach
-            if not headlines[site_url]:
-                all_headlines = soup.find_all(['h1', 'h2', 'h3'], class_=lambda x: x and ('headline' in x.lower() or 'title' in x.lower()))
-                headlines[site_url] = [h.text.strip() for h in all_headlines[:10] if h.text.strip()]
+            # If no headlines found, try main content area with stricter filtering
+            if not site_headlines:
+                main_content = soup.find('main') or soup.find('div', id='main-content')
+                if main_content:
+                    for headline_tag in ['h1', 'h2', 'h3']:
+                        headlines_elements = main_content.find_all(headline_tag)
+                        for headline in headlines_elements:
+                            if is_valid_headline(headline, config):
+                                clean_text = ' '.join(headline.text.strip().split())
+                                if clean_text:
+                                    site_headlines.add(clean_text)
+
+            headlines[site_url] = list(site_headlines)[:10]
 
         except Exception as e:
            headlines[site_url] = [f"Error fetching news: {str(e)}"]