Update scraper.py
Browse files- scraper.py +40 -30
scraper.py
CHANGED
@@ -267,41 +267,51 @@ class NewsletterScraper:
|
|
267 |
# Extraction + cleaning
|
268 |
# -------------------------------------------------------------------------
|
269 |
def _extract_full_content(self, url: str) -> Optional[str]:
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
# If it's a Google News link, follow redirects to the publisher URL
|
275 |
-
parsed = urlparse(url)
|
276 |
-
if parsed.netloc.endswith("news.google.com"):
|
277 |
-
try:
|
278 |
-
resp = self.session.get(url, headers=headers, allow_redirects=True)
|
279 |
-
if resp is not None and resp.url and resp.status_code in (200, 301, 302):
|
280 |
-
url = resp.url
|
281 |
-
except Exception as e:
|
282 |
-
logger.warning(f"Failed to resolve Google News redirect: {e}")
|
283 |
-
|
284 |
-
# Fetch with trafilatura at the publisher URL
|
285 |
-
downloaded = trafilatura.fetch_url(url, headers=headers)
|
286 |
-
if not downloaded:
|
287 |
-
return None
|
288 |
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
|
297 |
-
|
298 |
-
|
|
|
299 |
|
|
|
300 |
return None
|
301 |
|
302 |
-
|
303 |
-
|
304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
# -------------------------------------------------------------------------
|
307 |
# Post-processing helpers
|
|
|
267 |
# Extraction + cleaning
|
268 |
# -------------------------------------------------------------------------
|
269 |
def _extract_full_content(self, url: str) -> Optional[str]:
    """Fetch an article page and return its readable text.

    Google News links are first resolved to the publisher URL by following
    HTTP redirects. The HTML is fetched with ``self.session`` so that the
    headers from ``self._get_random_headers()`` are sent; if that yields
    nothing, ``trafilatura.fetch_url`` is used as a fallback. The HTML is
    then passed to ``trafilatura.extract``.

    Args:
        url: Article URL, possibly a ``news.google.com`` link.

    Returns:
        The stripped article text when more than 100 characters were
        extracted, otherwise ``None``. Any exception raised while
        downloading or extracting is logged and also results in ``None``.
    """
    try:
        headers = self._get_random_headers()
        downloaded_html = None

        # NOTE(review): no timeout is passed to session.get below; confirm
        # the session enforces one, otherwise a stalled server can block
        # the scraper indefinitely.

        # Compare the hostname rather than the raw netloc so that a port or
        # userinfo does not defeat the check, and lookalike hosts such as
        # "fakenews.google.com" are not treated as Google News.
        host = urlparse(url).hostname or ""
        if host == "news.google.com" or host.endswith(".news.google.com"):
            try:
                resp = self.session.get(url, headers=headers, allow_redirects=True)
                if resp is not None and resp.url and resp.status_code in (200, 301, 302):
                    url = resp.url
                    # Redirects were already followed, so a 200 body is the
                    # page at `url`; keep it instead of downloading the
                    # same URL a second time.
                    if resp.status_code == 200:
                        downloaded_html = resp.text
            except Exception as e:
                logger.warning(f"Failed to resolve Google News redirect: {e}")

        # 1st try: fetch HTML with our session (so we control headers)
        if not downloaded_html:
            try:
                r = self.session.get(url, headers=headers, allow_redirects=True)
                if r is not None and r.status_code == 200:
                    downloaded_html = r.text
            except Exception as e:
                logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")

        # Fallback: let trafilatura fetch the URL itself (no headers param)
        if not downloaded_html:
            downloaded_html = trafilatura.fetch_url(url)

        if not downloaded_html:
            return None

        # Extract readable text
        text = trafilatura.extract(
            downloaded_html,
            include_comments=False,
            include_tables=False,
            include_formatting=False,
            no_fallback=False,
        )
        if not text:
            return None

        # Very short results are discarded rather than returned.
        cleaned = text.strip()
        return cleaned if len(cleaned) > 100 else None

    except Exception as e:
        # Best-effort boundary: a single bad article must not abort the run.
        logger.warning(f"Error extracting content from {url}: {str(e)}")
        return None
|
315 |
|
316 |
# -------------------------------------------------------------------------
|
317 |
# Post-processing helpers
|