Update scraper.py
scraper.py  (+70 −62)  CHANGED
@@ -2,10 +2,10 @@ import requests
 from bs4 import BeautifulSoup
 import feedparser
 import trafilatura
-from urllib.parse import
+from urllib.parse import urlparse
 import time
 import logging
-from datetime import datetime
+from datetime import datetime
 from typing import List, Dict, Optional, Set
 import hashlib
 import re
@@ -46,8 +46,11 @@ class NewsletterScraper:
 
         logger.info("NewsletterScraper initialized")
 
+    # -------------------------------------------------------------------------
+    # Session + headers
+    # -------------------------------------------------------------------------
     def _create_session(self) -> requests.Session:
-        """Create a session with retry strategy"""
+        """Create a session with retry strategy and default timeouts."""
         session = requests.Session()
 
         retry_strategy = Retry(
@@ -76,7 +79,7 @@ class NewsletterScraper:
         return wrapper
 
     def _get_random_headers(self) -> Dict[str, str]:
-        """Get randomized headers to avoid blocking"""
+        """Get randomized headers to avoid blocking."""
         return {
             'User-Agent': random.choice(self.user_agents),
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
@@ -90,7 +93,7 @@ class NewsletterScraper:
     # Public entrypoint
     # -------------------------------------------------------------------------
     def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
-        """Main scraping function"""
+        """Main scraping function."""
         logger.info(f"Starting news scraping for query: {query}")
         all_articles: List[Dict] = []
         self.scraped_urls.clear()
@@ -98,10 +101,12 @@ class NewsletterScraper:
 
         try:
             # Primary: Google News RSS
-
+            from requests.utils import quote
+            google_url = self.rss_sources['google_news'].format(quote(query))
+            google_articles = self._scrape_google_news(google_url, query, max_articles // 2 or 5)
             all_articles.extend(google_articles)
 
-            # Secondary:
+            # Secondary: a few other RSS sources
             for source_name, rss_url in list(self.rss_sources.items())[1:4]:
                 if len(all_articles) >= max_articles:
                     break
@@ -145,10 +150,9 @@ class NewsletterScraper:
     # -------------------------------------------------------------------------
     # Source-specific + generic RSS
    # -------------------------------------------------------------------------
-    def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
-        """Scrape Google News RSS and resolve to publisher URLs"""
+    def _scrape_google_news(self, url: str, query: str, max_articles: int) -> List[Dict]:
+        """Scrape Google News RSS and resolve to publisher URLs."""
         try:
-            url = self.rss_sources['google_news'].format(requests.utils.quote(query))
             headers = self._get_random_headers()
             response = self.session.get(url, headers=headers)
             if response.status_code != 200:
@@ -157,6 +161,7 @@ class NewsletterScraper:
 
             feed = feedparser.parse(response.content)
             articles: List[Dict] = []
+            q = query.lower()
 
             for entry in feed.entries[:max_articles * 2]:  # extra for filtering
                 try:
@@ -168,6 +173,10 @@ class NewsletterScraper:
                     title = clean_html(raw_title) if '<' in raw_title else raw_title
                     summary = clean_html(raw_summary) if '<' in raw_summary else raw_summary
 
+                    # Soft query match
+                    if not (q in title.lower() or q in summary.lower()):
+                        continue
+
                     article = {
                         'title': title.strip(),
                         'url': link,
@@ -202,7 +211,7 @@ class NewsletterScraper:
             return []
 
     def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
-        """Scrape a generic RSS source and soft-filter by query"""
+        """Scrape a generic RSS source and soft-filter by query."""
         try:
             headers = self._get_random_headers()
             response = self.session.get(rss_url, headers=headers)
@@ -222,7 +231,7 @@ class NewsletterScraper:
                 if not (q in title.lower() or q in summary.lower()):
                     continue
 
-                # Clean
+                # Clean HTML artifacts
                 title = clean_html(title) if '<' in title else title
                 summary = clean_html(summary) if '<' in summary else summary
 
@@ -267,60 +276,59 @@ class NewsletterScraper:
     # Extraction + cleaning
     # -------------------------------------------------------------------------
     def _extract_full_content(self, url: str) -> Optional[str]:
-
-
-
-
-
+        """
+        Extract full article content; resolve redirects; parse with trafilatura.
+        """
+        try:
+            headers = self._get_random_headers()
 
-
-
-
+            # If it's a Google News link, follow redirects to the publisher URL
+            parsed = urlparse(url)
+            if parsed.netloc.endswith("news.google.com"):
+                try:
+                    resp = self.session.get(url, headers=headers, allow_redirects=True)
+                    if resp is not None and resp.url and resp.status_code in (200, 301, 302):
+                        url = resp.url
+                except Exception as e:
+                    logger.warning(f"Failed to resolve Google News redirect: {e}")
+
+            # 1st try: fetch HTML with our session (so we control headers)
+            downloaded_html = None
             try:
-
-                if
-
+                r = self.session.get(url, headers=headers, allow_redirects=True)
+                if r is not None and r.status_code == 200:
+                    downloaded_html = r.text
             except Exception as e:
-                logger.
+                logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")
+
+            # Fallback: let trafilatura fetch the URL itself (no headers param)
+            if not downloaded_html:
+                downloaded_html = trafilatura.fetch_url(url)
+
+            if not downloaded_html:
+                return None
+
+            # Extract readable text
+            text = trafilatura.extract(
+                downloaded_html,
+                include_comments=False,
+                include_tables=False,
+                include_formatting=False,
+                no_fallback=False,
+            )
+            if text and len(text.strip()) > 100:
+                return text.strip()
+            return None
 
-            # 1st try: fetch HTML with our session (so we control headers)
-            downloaded_html = None
-            try:
-                r = self.session.get(url, headers=headers, allow_redirects=True)
-                if r is not None and r.status_code == 200:
-                    downloaded_html = r.text
         except Exception as e:
-            logger.
-
-            # Fallback: let trafilatura fetch the URL itself (no headers param)
-            if not downloaded_html:
-                downloaded_html = trafilatura.fetch_url(url)
-
-            if not downloaded_html:
+            logger.warning(f"Error extracting content from {url}: {str(e)}")
             return None
 
-            # Extract readable text
-            text = trafilatura.extract(
-                downloaded_html,
-                include_comments=False,
-                include_tables=False,
-                include_formatting=False,
-                no_fallback=False,
-            )
-            if text and len(text.strip()) > 100:
-                return text.strip()
-            return None
-
-            except Exception as e:
-                logger.warning(f"Error extracting content from {url}: {str(e)}")
-                return None
-
-
     # -------------------------------------------------------------------------
     # Post-processing helpers
     # -------------------------------------------------------------------------
     def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
-        """Remove duplicate articles based on title+summary similarity"""
+        """Remove duplicate articles based on title+summary similarity."""
         unique_articles: List[Dict] = []
 
         for article in articles:
@@ -335,7 +343,7 @@ class NewsletterScraper:
         return unique_articles
 
     def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
-        """Filter articles for relevance and quality"""
+        """Filter articles for relevance and quality."""
         filtered: List[Dict] = []
         q = query.lower()
 
@@ -354,7 +362,7 @@ class NewsletterScraper:
         return filtered
 
     def _is_english(self, text: str) -> bool:
-        """Check if text is in English using language detection"""
+        """Check if text is in English using language detection."""
         try:
            if len(text.strip()) < 20:
                 return True  # too short to decide; keep it
@@ -365,7 +373,7 @@ class NewsletterScraper:
             return True
 
     def _parse_date(self, date_str: str) -> Optional[datetime]:
-        """Parse date from RSS feed"""
+        """Parse date from RSS feed."""
         if not date_str:
             return datetime.now()
         try:
@@ -384,7 +392,7 @@ class NewsletterScraper:
             return datetime.now()
 
     def _extract_source_name(self, url: str) -> str:
-        """Extract source name from URL"""
+        """Extract source name from URL."""
         try:
             domain = urlparse(url).netloc
             domain = domain.replace('www.', '').replace('feeds.', '')
@@ -406,7 +414,7 @@ class NewsletterScraper:
             return 'Unknown'
 
     def get_available_sources(self) -> List[str]:
-        """Get list of available news sources"""
+        """Get list of available news sources."""
         return list(self.rss_sources.keys())
 
 
@@ -414,7 +422,7 @@ class NewsletterScraper:
 # Module-level helpers
 # -------------------------------------------------------------------------
 def clean_html(html_content: str) -> str:
-    """Clean HTML content and extract readable text"""
+    """Clean HTML content and extract readable text."""
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
 
@@ -438,7 +446,7 @@ def clean_html(html_content: str) -> str:
         return ""
 
 def is_valid_article_url(url: str) -> bool:
-    """Check if URL is likely to be a valid article URL"""
+    """Check if URL is likely to be a valid article URL."""
     try:
         parsed = urlparse(url)
 
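
Note: the diff is truncated immediately after `retry_strategy = Retry(`, so the actual retry configuration used by `_create_session` is not visible in this commit. For reference, a minimal sketch of what such a session factory typically looks like, assuming urllib3's `Retry` and requests' `HTTPAdapter`; the concrete numbers and status codes below are illustrative assumptions, not taken from this commit:

# Sketch only: retry-enabled requests.Session, values are assumed, not from this commit
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=3,                                      # assumed retry budget
        backoff_factor=1,                             # exponential backoff between attempts
        status_forcelist=[429, 500, 502, 503, 504],   # retry on throttling/server errors
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session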
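For orientation, a hypothetical usage sketch of the public entrypoint. Only the `scrape_news(query, max_articles)` signature and the `'title'`/`'url'` keys are visible in this diff; the module path and the no-argument constructor are assumptions:

# Hypothetical usage; module path and constructor signature are assumed
from scraper import NewsletterScraper

scraper = NewsletterScraper()
articles = scraper.scrape_news("renewable energy", max_articles=10)
for article in articles:
    print(article['title'], article['url'])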