wekey1998 committed on
Commit
9b1ab7e
·
verified ·
1 Parent(s): b02cc7b

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +40 -30
scraper.py CHANGED
@@ -267,41 +267,51 @@ class NewsletterScraper:
267
  # Extraction + cleaning
268
  # -------------------------------------------------------------------------
269
  def _extract_full_content(self, url: str) -> Optional[str]:
270
- """Extract full article content; resolves Google News redirect first."""
271
- try:
272
- headers = self._get_random_headers()
273
-
274
- # If it's a Google News link, follow redirects to the publisher URL
275
- parsed = urlparse(url)
276
- if parsed.netloc.endswith("news.google.com"):
277
- try:
278
- resp = self.session.get(url, headers=headers, allow_redirects=True)
279
- if resp is not None and resp.url and resp.status_code in (200, 301, 302):
280
- url = resp.url
281
- except Exception as e:
282
- logger.warning(f"Failed to resolve Google News redirect: {e}")
283
-
284
- # Fetch with trafilatura at the publisher URL
285
- downloaded = trafilatura.fetch_url(url, headers=headers)
286
- if not downloaded:
287
- return None
288
 
289
- text = trafilatura.extract(
290
- downloaded,
291
- include_comments=False,
292
- include_tables=False,
293
- include_formatting=False,
294
- no_fallback=False
295
- )
 
 
 
 
 
 
 
 
 
 
 
296
 
297
- if text and len(text.strip()) > 100:
298
- return text.strip()
 
299
 
 
300
  return None
301
 
302
- except Exception as e:
303
- logger.warning(f"Error extracting content from {url}: {str(e)}")
304
- return None
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  # -------------------------------------------------------------------------
307
  # Post-processing helpers
 
267
  # Extraction + cleaning
268
  # -------------------------------------------------------------------------
269
  def _extract_full_content(self, url: str) -> Optional[str]:
270
+ """Extract full article content; resolve redirects; parse with trafilatura."""
271
+ try:
272
+ headers = self._get_random_headers()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
+ # If it's a Google News link, follow redirects to the publisher URL
275
+ parsed = urlparse(url)
276
+ if parsed.netloc.endswith("news.google.com"):
277
+ try:
278
+ resp = self.session.get(url, headers=headers, allow_redirects=True)
279
+ if resp is not None and resp.url and resp.status_code in (200, 301, 302):
280
+ url = resp.url
281
+ except Exception as e:
282
+ logger.warning(f"Failed to resolve Google News redirect: {e}")
283
+
284
+ # 1st try: fetch HTML with our session (so we control headers)
285
+ downloaded_html = None
286
+ try:
287
+ r = self.session.get(url, headers=headers, allow_redirects=True)
288
+ if r is not None and r.status_code == 200:
289
+ downloaded_html = r.text
290
+ except Exception as e:
291
+ logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")
292
 
293
+ # Fallback: let trafilatura fetch the URL itself (no headers param)
294
+ if not downloaded_html:
295
+ downloaded_html = trafilatura.fetch_url(url)
296
 
297
+ if not downloaded_html:
298
  return None
299
 
300
+ # Extract readable text
301
+ text = trafilatura.extract(
302
+ downloaded_html,
303
+ include_comments=False,
304
+ include_tables=False,
305
+ include_formatting=False,
306
+ no_fallback=False,
307
+ )
308
+ if text and len(text.strip()) > 100:
309
+ return text.strip()
310
+ return None
311
+
312
+ except Exception as e:
313
+ logger.warning(f"Error extracting content from {url}: {str(e)}")
314
+ return None
315
 
316
  # -------------------------------------------------------------------------
317
  # Post-processing helpers