Spaces:

wekey1998
/

news-sentiment-project

Running

App Files Files Community

wekey1998 committed 5 days ago

Commit

9b1ab7e

verified ·

1 Parent(s): b02cc7b

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +40 -30

scraper.py CHANGED Viewed

@@ -267,41 +267,51 @@ class NewsletterScraper:
     # Extraction + cleaning
     # -------------------------------------------------------------------------
     def _extract_full_content(self, url: str) -> Optional[str]:
-        """Extract full article content; resolves Google News redirect first."""
-        try:
-            headers = self._get_random_headers()
-            # If it's a Google News link, follow redirects to the publisher URL
-            parsed = urlparse(url)
-            if parsed.netloc.endswith("news.google.com"):
-                try:
-                    resp = self.session.get(url, headers=headers, allow_redirects=True)
-                    if resp is not None and resp.url and resp.status_code in (200, 301, 302):
-                        url = resp.url
-                except Exception as e:
-                    logger.warning(f"Failed to resolve Google News redirect: {e}")
-            # Fetch with trafilatura at the publisher URL
-            downloaded = trafilatura.fetch_url(url, headers=headers)
-            if not downloaded:
-                return None
-            text = trafilatura.extract(
-                downloaded,
-                include_comments=False,
-                include_tables=False,
-                include_formatting=False,
-                no_fallback=False
-            )
-            if text and len(text.strip()) > 100:
-                return text.strip()
             return None
-        except Exception as e:
-            logger.warning(f"Error extracting content from {url}: {str(e)}")
-            return None
     # -------------------------------------------------------------------------
     # Post-processing helpers

     # Extraction + cleaning
     # -------------------------------------------------------------------------
     def _extract_full_content(self, url: str) -> Optional[str]:
+        """Extract the readable article text found at ``url``.
+
+        A ``news.google.com`` link is first resolved to the URL the session
+        ends up on after following redirects. The HTML is fetched with
+        ``self.session`` (so the randomized headers are sent), falling back
+        to ``trafilatura.fetch_url``, and is then parsed with
+        ``trafilatura.extract``.
+
+        Args:
+            url: Article URL, possibly a Google News redirect link.
+
+        Returns:
+            The stripped article text when more than 100 characters were
+            extracted, otherwise ``None``. Exceptions are caught and logged
+            rather than propagated.
+        """
+        # requests applies no timeout unless one is passed explicitly; without
+        # it a stalled server would block the scraper indefinitely.
+        request_timeout = 15  # seconds
+        try:
+            headers = self._get_random_headers()
+            downloaded_html = None
+            # If it's a Google News link, follow redirects to the publisher URL
+            parsed = urlparse(url)
+            if parsed.netloc.endswith("news.google.com"):
+                try:
+                    resp = self.session.get(
+                        url,
+                        headers=headers,
+                        allow_redirects=True,
+                        timeout=request_timeout,
+                    )
+                    if resp is not None and resp.url and resp.status_code in (200, 301, 302):
+                        url = resp.url
+                        if resp.status_code == 200:
+                            # This response already is the page at the resolved
+                            # URL; reuse it instead of requesting it again.
+                            downloaded_html = resp.text
+                except Exception as e:
+                    logger.warning(f"Failed to resolve Google News redirect: {e}")
+            # 1st try: fetch HTML with our session (so we control headers)
+            if not downloaded_html:
+                try:
+                    r = self.session.get(
+                        url,
+                        headers=headers,
+                        allow_redirects=True,
+                        timeout=request_timeout,
+                    )
+                    if r is not None and r.status_code == 200:
+                        downloaded_html = r.text
+                except Exception as e:
+                    logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")
+            # Fallback: let trafilatura fetch the URL itself (no headers param)
+            if not downloaded_html:
+                downloaded_html = trafilatura.fetch_url(url)
+            if not downloaded_html:
+                return None
+            # Extract readable text
+            text = trafilatura.extract(
+                downloaded_html,
+                include_comments=False,
+                include_tables=False,
+                include_formatting=False,
+                no_fallback=False,
+            )
+            if text and len(text.strip()) > 100:
+                return text.strip()
+            return None
+        except Exception as e:
+            logger.warning(f"Error extracting content from {url}: {str(e)}")
+            return None
     # -------------------------------------------------------------------------
     # Post-processing helpers