dygoo committed on
Commit 0125826 · verified · 1 Parent(s): ae517af

Update app.py

Files changed (1)
  1. app.py +84 -39
app.py CHANGED
@@ -17,34 +17,70 @@ search_tool = DuckDuckGoSearchTool()
 @tool
 def get_latest_news() -> Dict[str, List[str]]:
     """
-    Tool returns news headlines from news sites with improved scraping configuration.
+    Tool returns legitimate news headlines while filtering out ads and sponsored content.
     Returns:
         Dict[str, List[str]]: A dictionary where keys are news site URLs and values are lists of headlines.
     """
-    # More specific configuration for each site
     site_config = {
         "https://www.cnn.com/": {
-            'container_tag': 'div',
-            'container_class': 'container__headline',  # Main headline container class
-            'headline_tag': 'span',
-            'headline_class': 'container__headline-text',  # Actual headline text class
-            'alternative_tags': [
-                {'tag': 'h3', 'class': 'cd__headline'},
-                {'tag': 'span', 'class': 'card-text'}
-            ]
+            'sections': [
+                {
+                    'container': {'tag': 'div', 'class': 'zone__content'},
+                    'headline': {'tag': 'span', 'class': 'container__headline-text'}
+                },
+                {
+                    'container': {'tag': 'div', 'class': 'card'},
+                    'headline': {'tag': 'span', 'class': 'headline__text'}
+                }
+            ],
+            'exclude_classes': ['ad', 'sponsor', 'paid', 'promoted', 'advertisement'],
+            'exclude_parents': ['div.ad-feedback', 'div.commercial']
         },
         "https://www.politico.com/": {
-            'container_tag': 'div',
-            'container_class': 'headline',  # Main headline container
-            'headline_tag': 'h3',
-            'headline_class': 'headline__text',  # Actual headline text class
-            'alternative_tags': [
-                {'tag': 'h2', 'class': 'story-card__title'},
-                {'tag': 'h3', 'class': 'media-item__headline'}
-            ]
+            'sections': [
+                {
+                    'container': {'tag': 'div', 'class': 'story-grid'},
+                    'headline': {'tag': 'h3', 'class': 'headline'}
+                },
+                {
+                    'container': {'tag': 'article', 'class': 'story-feed__story'},
+                    'headline': {'tag': 'h3', 'class': 'story-feed__headline'}
+                }
+            ],
+            'exclude_classes': ['ad', 'sponsor', 'paid-content', 'promoted'],
+            'exclude_parents': ['div.advertisement', 'div.sponsor-content']
         }
     }
 
+    def is_valid_headline(element, site_config) -> bool:
+        """Check if the headline is legitimate news and not an advertisement."""
+        # Check if element or its parents have excluded classes
+        for excluded in site_config['exclude_classes']:
+            if any(excluded in cls.lower() for cls in element.get('class', [])):
+                return False
+
+        # Check parent elements
+        parent = element.parent
+        while parent:
+            if parent.get('class'):
+                # Check for excluded parent classes
+                if any(excluded in ' '.join(parent.get('class', [])).lower()
+                       for excluded in site_config['exclude_classes']):
+                    return False
+            parent = parent.parent
+
+        # Check for common ad indicators in text
+        text = element.text.lower().strip()
+        ad_indicators = ['sponsored', 'advertisement', 'promoted', 'paid content', 'partner content']
+        if any(indicator in text for indicator in ad_indicators):
+            return False
+
+        # Check minimum length to filter out potential button text or incomplete headlines
+        if len(text) < 20:
+            return False
+
+        return True
+
     headlines = {}
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
@@ -55,29 +91,38 @@ def get_latest_news() -> Dict[str, List[str]]:
             response = requests.get(site_url, headers=headers, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
-            site_headlines = set()  # Using set to avoid duplicates
-
-            # Try primary method
-            for container in soup.find_all(config['container_tag'], class_=config['container_class']):
-                headline = container.find(config['headline_tag'], class_=config['headline_class'])
-                if headline and headline.text.strip():
-                    site_headlines.add(headline.text.strip())
-
-            # Try alternative tags if primary method didn't work
-            if not site_headlines:
-                for alt_config in config['alternative_tags']:
-                    headlines_elements = soup.find_all(alt_config['tag'], class_=alt_config['class'])
+            site_headlines = set()
+
+            # Try each configured section
+            for section in config['sections']:
+                containers = soup.find_all(section['container']['tag'],
+                                           class_=section['container']['class'])
+
+                for container in containers:
+                    headlines_elements = container.find_all(
+                        section['headline']['tag'],
+                        class_=section['headline']['class']
+                    )
+
                     for headline in headlines_elements:
-                        if headline.text.strip():
-                            site_headlines.add(headline.text.strip())
+                        if is_valid_headline(headline, config):
+                            clean_text = ' '.join(headline.text.strip().split())
+                            if clean_text:
+                                site_headlines.add(clean_text)
 
-            # Clean up and store results
-            headlines[site_url] = list(site_headlines)[:10]  # Limit to top 10 headlines
-
-            # If no headlines found, try a more generic approach
-            if not headlines[site_url]:
-                all_headlines = soup.find_all(['h1', 'h2', 'h3'], class_=lambda x: x and ('headline' in x.lower() or 'title' in x.lower()))
-                headlines[site_url] = [h.text.strip() for h in all_headlines[:10] if h.text.strip()]
+            # If no headlines found, try main content area with stricter filtering
+            if not site_headlines:
+                main_content = soup.find('main') or soup.find('div', id='main-content')
+                if main_content:
+                    for headline_tag in ['h1', 'h2', 'h3']:
+                        headlines_elements = main_content.find_all(headline_tag)
+                        for headline in headlines_elements:
+                            if is_valid_headline(headline, config):
+                                clean_text = ' '.join(headline.text.strip().split())
+                                if clean_text:
+                                    site_headlines.add(clean_text)
+
+            headlines[site_url] = list(site_headlines)[:10]
 
         except Exception as e:
            headlines[site_url] = [f"Error fetching news: {str(e)}"]