wekey1998 committed
Commit 44fda85 · verified · 1 Parent(s): a453533

Update scraper.py

Files changed (1): scraper.py (+70 -62)
scraper.py CHANGED
@@ -2,10 +2,10 @@ import requests
 from bs4 import BeautifulSoup
 import feedparser
 import trafilatura
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urlparse
 import time
 import logging
-from datetime import datetime, timedelta
+from datetime import datetime
 from typing import List, Dict, Optional, Set
 import hashlib
 import re
@@ -46,8 +46,11 @@ class NewsletterScraper:
 
 logger.info("NewsletterScraper initialized")
 
+# -------------------------------------------------------------------------
+# Session + headers
+# -------------------------------------------------------------------------
 def _create_session(self) -> requests.Session:
-"""Create a session with retry strategy"""
+"""Create a session with retry strategy and default timeouts."""
 session = requests.Session()
 
 retry_strategy = Retry(
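
Note on the hunk above: the Retry arguments themselves sit outside this diff. A minimal sketch of how such a session is typically wired up with urllib3's Retry and requests' HTTPAdapter; make_session is a stand-alone stand-in for the class method, and the counts, backoff, and status codes below are illustrative, not taken from the commit.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=3,                                      # illustrative: retry up to 3 times
        backoff_factor=0.5,                           # exponential backoff between attempts
        status_forcelist=[429, 500, 502, 503, 504],   # retry on transient HTTP errors
        allowed_methods=["GET", "HEAD"],              # only retry idempotent requests (urllib3 >= 1.26)
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
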
@@ -76,7 +79,7 @@ class NewsletterScraper:
 return wrapper
 
 def _get_random_headers(self) -> Dict[str, str]:
-"""Get randomized headers to avoid blocking"""
+"""Get randomized headers to avoid blocking."""
 return {
 'User-Agent': random.choice(self.user_agents),
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
@@ -90,7 +93,7 @@ class NewsletterScraper:
 # Public entrypoint
 # -------------------------------------------------------------------------
 def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
-"""Main scraping function"""
+"""Main scraping function."""
 logger.info(f"Starting news scraping for query: {query}")
 all_articles: List[Dict] = []
 self.scraped_urls.clear()
@@ -98,10 +101,12 @@ class NewsletterScraper:
 
 try:
 # Primary: Google News RSS
-google_articles = self._scrape_google_news(query, max_articles // 2 or 5)
+from requests.utils import quote
+google_url = self.rss_sources['google_news'].format(quote(query))
+google_articles = self._scrape_google_news(google_url, query, max_articles // 2 or 5)
 all_articles.extend(google_articles)
 
-# Secondary: Other RSS sources (limit a few to reduce timeouts on free CPU)
+# Secondary: a few other RSS sources
 for source_name, rss_url in list(self.rss_sources.items())[1:4]:
 if len(all_articles) >= max_articles:
 break
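
For reference, the new code above percent-encodes the query before formatting it into the Google News feed URL. A small sketch of that step, assuming a search-feed template of the usual form; the real template lives in self.rss_sources['google_news'] outside this diff.

from requests.utils import quote

# Assumed template for illustration only; the commit defines its own in rss_sources.
GOOGLE_NEWS_RSS = "https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en"

query = "solid state batteries"
google_url = GOOGLE_NEWS_RSS.format(quote(query))
# google_url -> "https://news.google.com/rss/search?q=solid%20state%20batteries&hl=en-US&gl=US&ceid=US:en"
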
@@ -145,10 +150,9 @@ class NewsletterScraper:
 # -------------------------------------------------------------------------
 # Source-specific + generic RSS
 # -------------------------------------------------------------------------
-def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
-"""Scrape Google News RSS and resolve to publisher URLs"""
+def _scrape_google_news(self, url: str, query: str, max_articles: int) -> List[Dict]:
+"""Scrape Google News RSS and resolve to publisher URLs."""
 try:
-url = self.rss_sources['google_news'].format(requests.utils.quote(query))
 headers = self._get_random_headers()
 response = self.session.get(url, headers=headers)
 if response.status_code != 200:
@@ -157,6 +161,7 @@ class NewsletterScraper:
 
 feed = feedparser.parse(response.content)
 articles: List[Dict] = []
+q = query.lower()
 
 for entry in feed.entries[:max_articles * 2]: # extra for filtering
 try:
@@ -168,6 +173,10 @@ class NewsletterScraper:
 title = clean_html(raw_title) if '<' in raw_title else raw_title
 summary = clean_html(raw_summary) if '<' in raw_summary else raw_summary
 
+# Soft query match
+if not (q in title.lower() or q in summary.lower()):
+continue
+
 article = {
 'title': title.strip(),
 'url': link,
@@ -202,7 +211,7 @@ class NewsletterScraper:
 return []
 
 def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
-"""Scrape a generic RSS source and soft-filter by query"""
+"""Scrape a generic RSS source and soft-filter by query."""
 try:
 headers = self._get_random_headers()
 response = self.session.get(rss_url, headers=headers)
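
As context for _scrape_rss_source, a minimal sketch of the parse-and-soft-filter pattern the method follows, using feedparser's usual entry fields (title, summary, link); the real method adds error handling, date parsing, and deduplication that sit outside this hunk.

import feedparser
from typing import Dict, List

def soft_filter_feed(feed_xml: bytes, query: str, max_articles: int) -> List[Dict]:
    feed = feedparser.parse(feed_xml)
    q = query.lower()
    matches: List[Dict] = []
    for entry in feed.entries:
        title = entry.get("title", "")
        summary = entry.get("summary", "")
        # Soft match: keep entries that mention the query in the title or summary
        if q not in title.lower() and q not in summary.lower():
            continue
        matches.append({"title": title, "url": entry.get("link", ""), "summary": summary})
        if len(matches) >= max_articles:
            break
    return matches
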
@@ -222,7 +231,7 @@ class NewsletterScraper:
 if not (q in title.lower() or q in summary.lower()):
 continue
 
-# Clean any HTML artifacts
+# Clean HTML artifacts
 title = clean_html(title) if '<' in title else title
 summary = clean_html(summary) if '<' in summary else summary
 
@@ -267,60 +276,59 @@ class NewsletterScraper:
 # Extraction + cleaning
 # -------------------------------------------------------------------------
 def _extract_full_content(self, url: str) -> Optional[str]:
-"""
-Extract full article content; resolve redirects; parse with trafilatura.
-"""
-try:
-headers = self._get_random_headers()
+"""
+Extract full article content; resolve redirects; parse with trafilatura.
+"""
+try:
+headers = self._get_random_headers()
 
-# If it's a Google News link, follow redirects to the publisher URL
-parsed = urlparse(url)
-if parsed.netloc.endswith("news.google.com"):
+# If it's a Google News link, follow redirects to the publisher URL
+parsed = urlparse(url)
+if parsed.netloc.endswith("news.google.com"):
+try:
+resp = self.session.get(url, headers=headers, allow_redirects=True)
+if resp is not None and resp.url and resp.status_code in (200, 301, 302):
+url = resp.url
+except Exception as e:
+logger.warning(f"Failed to resolve Google News redirect: {e}")
+
+# 1st try: fetch HTML with our session (so we control headers)
+downloaded_html = None
 try:
-resp = self.session.get(url, headers=headers, allow_redirects=True)
-if resp is not None and resp.url and resp.status_code in (200, 301, 302):
-url = resp.url
+r = self.session.get(url, headers=headers, allow_redirects=True)
+if r is not None and r.status_code == 200:
+downloaded_html = r.text
 except Exception as e:
-logger.warning(f"Failed to resolve Google News redirect: {e}")
+logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")
+
+# Fallback: let trafilatura fetch the URL itself (no headers param)
+if not downloaded_html:
+downloaded_html = trafilatura.fetch_url(url)
+
+if not downloaded_html:
+return None
+
+# Extract readable text
+text = trafilatura.extract(
+downloaded_html,
+include_comments=False,
+include_tables=False,
+include_formatting=False,
+no_fallback=False,
+)
+if text and len(text.strip()) > 100:
+return text.strip()
+return None
 
-# 1st try: fetch HTML with our session (so we control headers)
-downloaded_html = None
-try:
-r = self.session.get(url, headers=headers, allow_redirects=True)
-if r is not None and r.status_code == 200:
-downloaded_html = r.text
 except Exception as e:
-logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")
-
-# Fallback: let trafilatura fetch the URL itself (no headers param)
-if not downloaded_html:
-downloaded_html = trafilatura.fetch_url(url)
-
-if not downloaded_html:
+logger.warning(f"Error extracting content from {url}: {str(e)}")
 return None
 
-# Extract readable text
-text = trafilatura.extract(
-downloaded_html,
-include_comments=False,
-include_tables=False,
-include_formatting=False,
-no_fallback=False,
-)
-if text and len(text.strip()) > 100:
-return text.strip()
-return None
-
-except Exception as e:
-logger.warning(f"Error extracting content from {url}: {str(e)}")
-return None
-
-
 # -------------------------------------------------------------------------
 # Post-processing helpers
 # -------------------------------------------------------------------------
 def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
-"""Remove duplicate articles based on title+summary similarity"""
+"""Remove duplicate articles based on title+summary similarity."""
 unique_articles: List[Dict] = []
 
 for article in articles:
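
The rewritten _extract_full_content above fetches the HTML itself first (so it controls headers) and only falls back to trafilatura's own fetcher. A stand-alone sketch of the same fetch-then-extract flow; the extract() arguments and the 100-character threshold mirror the diff, while the timeout value is an assumption.

from typing import Optional
import requests
import trafilatura

def extract_article_text(url: str) -> Optional[str]:
    html = None
    try:
        r = requests.get(url, timeout=15)  # timeout is illustrative; the class session handles retries
        if r.status_code == 200:
            html = r.text
    except requests.RequestException:
        pass
    if not html:
        # Fallback: let trafilatura fetch the URL itself (no custom headers)
        html = trafilatura.fetch_url(url)
    if not html:
        return None
    text = trafilatura.extract(
        html,
        include_comments=False,
        include_tables=False,
        include_formatting=False,
        no_fallback=False,
    )
    if text and len(text.strip()) > 100:
        return text.strip()
    return None
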
@@ -335,7 +343,7 @@ class NewsletterScraper:
 return unique_articles
 
 def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
-"""Filter articles for relevance and quality"""
+"""Filter articles for relevance and quality."""
 filtered: List[Dict] = []
 q = query.lower()
 
@@ -354,7 +362,7 @@ class NewsletterScraper:
 return filtered
 
 def _is_english(self, text: str) -> bool:
-"""Check if text is in English using language detection"""
+"""Check if text is in English using language detection."""
 try:
 if len(text.strip()) < 20:
 return True # too short to decide; keep it
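
Only the docstring of _is_english changes here, and the hunk does not show which detector the class uses. A hedged sketch of the kind of check it performs, using langdetect purely as an example dependency.

from langdetect import detect  # assumed library; the diff does not confirm which detector is used

def looks_english(text: str) -> bool:
    text = text.strip()
    if len(text) < 20:
        return True  # too short to classify reliably; keep the article
    try:
        return detect(text) == "en"
    except Exception:
        return True  # if detection fails, err on the side of keeping the article
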
@@ -365,7 +373,7 @@ class NewsletterScraper:
 return True
 
 def _parse_date(self, date_str: str) -> Optional[datetime]:
-"""Parse date from RSS feed"""
+"""Parse date from RSS feed."""
 if not date_str:
 return datetime.now()
 try:
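
RSS feeds usually carry RFC 2822 timestamps such as "Tue, 01 Jul 2025 09:30:00 GMT". One common way to parse them, with the same fall-back-to-now behaviour the method above shows; the exact formats the commit handles are not visible in this hunk.

from datetime import datetime
from email.utils import parsedate_to_datetime

def parse_rss_date(date_str: str) -> datetime:
    if not date_str:
        return datetime.now()
    try:
        # Handles RFC 2822 dates, the common format in RSS <pubDate> elements
        return parsedate_to_datetime(date_str)
    except (TypeError, ValueError):
        return datetime.now()
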
@@ -384,7 +392,7 @@ class NewsletterScraper:
 return datetime.now()
 
 def _extract_source_name(self, url: str) -> str:
-"""Extract source name from URL"""
+"""Extract source name from URL."""
 try:
 domain = urlparse(url).netloc
 domain = domain.replace('www.', '').replace('feeds.', '')
@@ -406,7 +414,7 @@ class NewsletterScraper:
 return 'Unknown'
 
 def get_available_sources(self) -> List[str]:
-"""Get list of available news sources"""
+"""Get list of available news sources."""
 return list(self.rss_sources.keys())
 
 
@@ -414,7 +422,7 @@ class NewsletterScraper:
 # Module-level helpers
 # -------------------------------------------------------------------------
 def clean_html(html_content: str) -> str:
-"""Clean HTML content and extract readable text"""
+"""Clean HTML content and extract readable text."""
 try:
 soup = BeautifulSoup(html_content, 'html.parser')
 
@@ -438,7 +446,7 @@ def clean_html(html_content: str) -> str:
 return ""
 
 def is_valid_article_url(url: str) -> bool:
-"""Check if URL is likely to be a valid article URL"""
+"""Check if URL is likely to be a valid article URL."""
 try:
 parsed = urlparse(url)
 
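The two module-level helpers touched above only gain docstring punctuation in this commit; their bodies are largely outside the hunks. A rough sketch of what they do, assuming the usual BeautifulSoup get_text() approach and a simple urlparse check (the _sketch names are placeholders, and the real validation rules may be stricter).

import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup

def clean_html_sketch(html_content: str) -> str:
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Drop script/style blocks, then collapse whitespace in the remaining text
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator=" ")
        return re.sub(r"\s+", " ", text).strip()
    except Exception:
        return ""

def is_valid_article_url_sketch(url: str) -> bool:
    try:
        parsed = urlparse(url)
        # Require an http(s) scheme and a hostname
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)
    except Exception:
        return False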
 