wekey1998 committed
Commit b02cc7b · verified · 1 Parent(s): aa675fc

Update scraper.py

Files changed (1): scraper.py (+207 -158)
scraper.py CHANGED
@@ -17,13 +17,13 @@ from urllib3.util.retry import Retry
 logger = logging.getLogger(__name__)

 class NewsletterScraper:
-    """Robust news scraper with multiple sources and deduplication"""
-
     def __init__(self):
         self.session = self._create_session()
         self.scraped_urls: Set[str] = set()
         self.content_hashes: Set[str] = set()
-
         # News sources configuration
         self.rss_sources = {
             'google_news': 'https://news.google.com/rss/search?q={}&hl=en&gl=US&ceid=US:en',
@@ -35,204 +35,257 @@ class NewsletterScraper:
             'financial_times': 'https://www.ft.com/rss/home',
             'bloomberg': 'https://feeds.bloomberg.com/politics/news.rss'
         }
-
         self.user_agents = [
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
         ]
-
         logger.info("NewsletterScraper initialized")
-
     def _create_session(self) -> requests.Session:
         """Create a session with retry strategy"""
         session = requests.Session()
-
-        # Retry strategy
         retry_strategy = Retry(
             total=3,
             backoff_factor=1,
             status_forcelist=[429, 500, 502, 503, 504],
         )
-
-        adapter = HTTPAdapter(max_retries=retry_strategy)
         session.mount("http://", adapter)
         session.mount("https://", adapter)
-
         return session
-
     def _get_random_headers(self) -> Dict[str, str]:
         """Get randomized headers to avoid blocking"""
         return {
             'User-Agent': random.choice(self.user_agents),
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
         }
-
     def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
         """Main scraping function"""
         logger.info(f"Starting news scraping for query: {query}")
-
-        all_articles = []
         self.scraped_urls.clear()
         self.content_hashes.clear()
-
         try:
             # Primary: Google News RSS
-            google_articles = self._scrape_google_news(query, max_articles // 2)
             all_articles.extend(google_articles)
-
-            # Secondary: Other RSS sources
-            for source_name, rss_url in list(self.rss_sources.items())[1:4]:  # Limit to avoid timeouts
                 if len(all_articles) >= max_articles:
                     break
-
                 try:
                     source_articles = self._scrape_rss_source(rss_url, query, 5)
                     all_articles.extend(source_articles)
                 except Exception as e:
                     logger.warning(f"Failed to scrape {source_name}: {str(e)}")
                     continue
-
             # Deduplicate and filter
             articles = self._deduplicate_articles(all_articles)
             articles = self._filter_articles(articles, query)
             articles = articles[:max_articles]
-
-            # Extract full content
             for article in articles:
                 try:
                     full_content = self._extract_full_content(article['url'])
                     if full_content and len(full_content) > 200:
                         article['content'] = full_content
                     else:
-                        article['content'] = article.get('summary', article.get('title', ''))
                 except Exception as e:
                     logger.warning(f"Failed to extract content from {article['url']}: {str(e)}")
-                    article['content'] = article.get('summary', article.get('title', ''))
-
             # Filter by language (English only)
-            articles = [article for article in articles if self._is_english(article['content'])]
-
             logger.info(f"Successfully scraped {len(articles)} articles")
             return articles
-
         except Exception as e:
             logger.error(f"Error in scrape_news: {str(e)}")
             return []
-
     def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
-        """Scrape Google News RSS"""
         try:
-            url = self.rss_sources['google_news'].format(query.replace(' ', '%20'))
-
             headers = self._get_random_headers()
-            response = self.session.get(url, headers=headers, timeout=10)
-
             if response.status_code != 200:
                 logger.warning(f"Google News RSS returned status {response.status_code}")
                 return []
-
             feed = feedparser.parse(response.content)
-            articles = []
-
-            for entry in feed.entries[:max_articles * 2]:  # Get extra for filtering
                 try:
                     article = {
-                        'title': entry.title,
-                        'url': entry.link,
-                        'summary': entry.get('summary', ''),
                         'date': self._parse_date(entry.get('published', '')),
                         'source': 'Google News'
                     }
-
-                    # Skip if already seen
                     if article['url'] in self.scraped_urls:
                         continue
-
                     self.scraped_urls.add(article['url'])
                     articles.append(article)
-
                 except Exception as e:
                     logger.warning(f"Error parsing Google News entry: {str(e)}")
                     continue
-
             return articles
-
         except Exception as e:
             logger.error(f"Error scraping Google News: {str(e)}")
             return []
-
     def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
-        """Scrape a generic RSS source"""
         try:
             headers = self._get_random_headers()
-            response = self.session.get(rss_url, headers=headers, timeout=10)
-
             if response.status_code != 200:
                 return []
-
             feed = feedparser.parse(response.content)
-            articles = []
-            query_lower = query.lower()
-
-            for entry in feed.entries[:max_articles * 3]:  # Get extra for filtering
                 try:
-                    title = entry.get('title', '')
-                    summary = entry.get('summary', '')
-
-                    # Check if article is relevant to query
-                    if not (query_lower in title.lower() or query_lower in summary.lower()):
                         continue
-
                     article = {
-                        'title': title,
-                        'url': entry.get('link', ''),
-                        'summary': summary,
                         'date': self._parse_date(entry.get('published', '')),
                         'source': self._extract_source_name(rss_url)
                     }
-
-                    # Skip if already seen
                     if article['url'] in self.scraped_urls:
                         continue
-
                     self.scraped_urls.add(article['url'])
                     articles.append(article)
-
                     if len(articles) >= max_articles:
                         break
-
                 except Exception as e:
                     logger.warning(f"Error parsing RSS entry: {str(e)}")
                     continue
-
-            # Small delay to be respectful
-            time.sleep(0.5)
-
             return articles
-
         except Exception as e:
             logger.error(f"Error scraping RSS {rss_url}: {str(e)}")
             return []
-
     def _extract_full_content(self, url: str) -> Optional[str]:
-        """Extract full article content using trafilatura"""
         try:
             headers = self._get_random_headers()
-
-            # Download the page
             downloaded = trafilatura.fetch_url(url, headers=headers)
-
             if not downloaded:
                 return None
-
-            # Extract text content
             text = trafilatura.extract(
                 downloaded,
                 include_comments=False,
@@ -240,137 +293,133 @@ class NewsletterScraper:
                 include_formatting=False,
                 no_fallback=False
             )
-
             if text and len(text.strip()) > 100:
                 return text.strip()
-
             return None
-
         except Exception as e:
             logger.warning(f"Error extracting content from {url}: {str(e)}")
             return None
-
     def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
-        """Remove duplicate articles based on content similarity"""
-        unique_articles = []
-
         for article in articles:
-            # Create content hash
-            content_for_hash = f"{article['title']} {article.get('summary', '')}"
             content_hash = hashlib.md5(content_for_hash.encode()).hexdigest()
-
             if content_hash not in self.content_hashes:
                 self.content_hashes.add(content_hash)
                 unique_articles.append(article)
-
         logger.info(f"Deduplicated {len(articles)} -> {len(unique_articles)} articles")
         return unique_articles
-
     def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
         """Filter articles for relevance and quality"""
-        filtered_articles = []
-        query_lower = query.lower()
-
-        for article in articles:
-            # Check minimum content length
-            title_summary = f"{article['title']} {article.get('summary', '')}"
-            if len(title_summary.strip()) < 50:
                 continue
-
-            # Check relevance (more flexible than RSS filtering)
-            if (query_lower in article['title'].lower() or
-                query_lower in article.get('summary', '').lower() or
-                any(word in article['title'].lower() for word in query_lower.split())):
-
-                filtered_articles.append(article)
-
-        logger.info(f"Filtered {len(articles)} -> {len(filtered_articles)} articles for relevance")
-        return filtered_articles
-
     def _is_english(self, text: str) -> bool:
         """Check if text is in English using language detection"""
         try:
             if len(text.strip()) < 20:
-                return True  # Assume short text is English
-
-            detected_lang = detect(text[:1000])  # Check first 1000 chars
             return detected_lang == 'en'
-
         except Exception:
             # If detection fails, assume English
             return True
-
     def _parse_date(self, date_str: str) -> Optional[datetime]:
         """Parse date from RSS feed"""
         if not date_str:
             return datetime.now()
-
         try:
-            # Try common RSS date formats
-            for fmt in ['%a, %d %b %Y %H:%M:%S %Z',
-                        '%Y-%m-%dT%H:%M:%SZ',
-                        '%Y-%m-%d %H:%M:%S']:
                 try:
                     return datetime.strptime(date_str.strip(), fmt)
                 except ValueError:
                     continue
-
-            # If all fails, return current time
             return datetime.now()
-
         except Exception:
             return datetime.now()
-
     def _extract_source_name(self, url: str) -> str:
         """Extract source name from URL"""
         try:
             domain = urlparse(url).netloc
-
-            # Clean up common domain patterns
             domain = domain.replace('www.', '').replace('feeds.', '')
-
-            # Map known domains to clean names
             domain_mapping = {
                 'news.google.com': 'Google News',
                 'finance.yahoo.com': 'Yahoo Finance',
                 'reuters.com': 'Reuters',
                 'reutersagency.com': 'Reuters',
                 'bbc.co.uk': 'BBC',
                 'cnbc.com': 'CNBC',
                 'marketwatch.com': 'MarketWatch',
                 'ft.com': 'Financial Times',
                 'bloomberg.com': 'Bloomberg'
             }
-
             return domain_mapping.get(domain, domain.title())
-
         except Exception:
             return 'Unknown'
-
     def get_available_sources(self) -> List[str]:
         """Get list of available news sources"""
         return list(self.rss_sources.keys())

-# Additional utility functions for scraping
 def clean_html(html_content: str) -> str:
-    """Clean HTML content and extract text"""
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
-
         # Remove script and style elements
         for script in soup(["script", "style"]):
             script.extract()
-
-        # Get text
-        text = soup.get_text()
-
         # Clean up whitespace
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
         text = ' '.join(chunk for chunk in chunks if chunk)
-
-        return text
-
     except Exception as e:
         logger.error(f"Error cleaning HTML: {str(e)}")
         return ""
@@ -379,18 +428,18 @@ def is_valid_article_url(url: str) -> bool:
     """Check if URL is likely to be a valid article URL"""
     try:
         parsed = urlparse(url)
-
         # Skip certain file types
-        skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3']
         if any(url.lower().endswith(ext) for ext in skip_extensions):
             return False
-
         # Skip obvious non-article URLs
         skip_patterns = ['login', 'register', 'subscribe', 'newsletter', 'sitemap']
         if any(pattern in url.lower() for pattern in skip_patterns):
             return False
-
         return True
-
     except Exception:
-        return False
 
 logger = logging.getLogger(__name__)

 class NewsletterScraper:
+    """Robust news scraper with multiple sources, redirect resolution, and deduplication"""
+
     def __init__(self):
         self.session = self._create_session()
         self.scraped_urls: Set[str] = set()
         self.content_hashes: Set[str] = set()
+
         # News sources configuration
         self.rss_sources = {
             'google_news': 'https://news.google.com/rss/search?q={}&hl=en&gl=US&ceid=US:en',

             'financial_times': 'https://www.ft.com/rss/home',
             'bloomberg': 'https://feeds.bloomberg.com/politics/news.rss'
         }
+
         self.user_agents = [
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15',
+            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 12.6; rv:121.0) Gecko/20100101 Firefox/121.0'
         ]
+
         logger.info("NewsletterScraper initialized")
+
     def _create_session(self) -> requests.Session:
         """Create a session with retry strategy"""
         session = requests.Session()
+
         retry_strategy = Retry(
             total=3,
             backoff_factor=1,
             status_forcelist=[429, 500, 502, 503, 504],
+            allowed_methods=["GET", "HEAD"],
+            raise_on_status=False,
         )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=20, pool_maxsize=20)
         session.mount("http://", adapter)
         session.mount("https://", adapter)
+
+        # Default timeouts for all requests via session
+        session.request = self._with_timeout(session.request, timeout=10)
         return session
+
+    @staticmethod
+    def _with_timeout(func, timeout: int = 10):
+        """Wrap session.request to always include a timeout unless explicitly provided."""
+        def wrapper(method, url, **kwargs):
+            if "timeout" not in kwargs:
+                kwargs["timeout"] = timeout
+            return func(method, url, **kwargs)
+        return wrapper
+
     def _get_random_headers(self) -> Dict[str, str]:
         """Get randomized headers to avoid blocking"""
         return {
             'User-Agent': random.choice(self.user_agents),
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
         }
+
+    # -------------------------------------------------------------------------
+    # Public entrypoint
+    # -------------------------------------------------------------------------
     def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
         """Main scraping function"""
         logger.info(f"Starting news scraping for query: {query}")
+        all_articles: List[Dict] = []
         self.scraped_urls.clear()
         self.content_hashes.clear()
+
         try:
             # Primary: Google News RSS
+            google_articles = self._scrape_google_news(query, max_articles // 2 or 5)
             all_articles.extend(google_articles)
+
+            # Secondary: Other RSS sources (limit a few to reduce timeouts on free CPU)
+            for source_name, rss_url in list(self.rss_sources.items())[1:4]:
                 if len(all_articles) >= max_articles:
                     break
                 try:
                     source_articles = self._scrape_rss_source(rss_url, query, 5)
                     all_articles.extend(source_articles)
                 except Exception as e:
                     logger.warning(f"Failed to scrape {source_name}: {str(e)}")
                     continue
+
             # Deduplicate and filter
             articles = self._deduplicate_articles(all_articles)
             articles = self._filter_articles(articles, query)
             articles = articles[:max_articles]
+
+            # Extract full content (after resolving redirects)
             for article in articles:
                 try:
                     full_content = self._extract_full_content(article['url'])
                     if full_content and len(full_content) > 200:
                         article['content'] = full_content
                     else:
+                        # Fallback to cleaned summary/title
+                        fallback = article.get('summary') or article.get('title', '')
+                        article['content'] = clean_html(fallback) if '<' in fallback else fallback
                 except Exception as e:
                     logger.warning(f"Failed to extract content from {article['url']}: {str(e)}")
+                    fallback = article.get('summary') or article.get('title', '')
+                    article['content'] = clean_html(fallback) if '<' in fallback else fallback
+
             # Filter by language (English only)
+            articles = [a for a in articles if self._is_english(a.get('content', ''))]
+
             logger.info(f"Successfully scraped {len(articles)} articles")
             return articles
+
         except Exception as e:
             logger.error(f"Error in scrape_news: {str(e)}")
             return []
+
+    # -------------------------------------------------------------------------
+    # Source-specific + generic RSS
+    # -------------------------------------------------------------------------
     def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
+        """Scrape Google News RSS and resolve to publisher URLs"""
         try:
+            url = self.rss_sources['google_news'].format(requests.utils.quote(query))
             headers = self._get_random_headers()
+            response = self.session.get(url, headers=headers)
             if response.status_code != 200:
                 logger.warning(f"Google News RSS returned status {response.status_code}")
                 return []
+
             feed = feedparser.parse(response.content)
+            articles: List[Dict] = []
+
+            for entry in feed.entries[:max_articles * 2]:  # extra for filtering
                 try:
+                    raw_title = entry.get('title', '') or ''
+                    raw_summary = entry.get('summary', '') or ''
+                    link = entry.get('link', '') or ''
+
+                    # Strip HTML if present in title/summary
+                    title = clean_html(raw_title) if '<' in raw_title else raw_title
+                    summary = clean_html(raw_summary) if '<' in raw_summary else raw_summary
+
                     article = {
+                        'title': title.strip(),
+                        'url': link,
+                        'summary': summary.strip(),
                         'date': self._parse_date(entry.get('published', '')),
                         'source': 'Google News'
                     }
+
+                    # Try to resolve redirect to publisher and set clean source
+                    try:
+                        r = self.session.get(link, headers=headers, allow_redirects=True)
+                        final_url = r.url if r is not None and r.url else link
+                        article['url'] = final_url
+                        article['source'] = self._extract_source_name(final_url)
+                    except Exception as e:
+                        logger.debug(f"Redirect resolution failed for Google link: {e}")
+
                     if article['url'] in self.scraped_urls:
                         continue
+
                     self.scraped_urls.add(article['url'])
                     articles.append(article)
+
                 except Exception as e:
                     logger.warning(f"Error parsing Google News entry: {str(e)}")
                     continue
+
             return articles
+
         except Exception as e:
             logger.error(f"Error scraping Google News: {str(e)}")
             return []
+
     def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
+        """Scrape a generic RSS source and soft-filter by query"""
         try:
             headers = self._get_random_headers()
+            response = self.session.get(rss_url, headers=headers)
             if response.status_code != 200:
                 return []
+
             feed = feedparser.parse(response.content)
+            articles: List[Dict] = []
+            q = query.lower()
+
+            for entry in feed.entries[:max_articles * 3]:
                 try:
+                    title = entry.get('title', '') or ''
+                    summary = entry.get('summary', '') or ''
+                    link = entry.get('link', '') or ''
+
+                    if not (q in title.lower() or q in summary.lower()):
                         continue
+
+                    # Clean any HTML artifacts
+                    title = clean_html(title) if '<' in title else title
+                    summary = clean_html(summary) if '<' in summary else summary
+
                     article = {
+                        'title': title.strip(),
+                        'url': link,
+                        'summary': summary.strip(),
                         'date': self._parse_date(entry.get('published', '')),
                         'source': self._extract_source_name(rss_url)
                     }
+
+                    # Resolve potential redirects to get publisher domain
+                    try:
+                        r = self.session.get(link, headers=headers, allow_redirects=True)
+                        final_url = r.url if r is not None and r.url else link
+                        article['url'] = final_url
+                        article['source'] = self._extract_source_name(final_url)
+                    except Exception:
+                        pass
+
                     if article['url'] in self.scraped_urls:
                         continue
+
                     self.scraped_urls.add(article['url'])
                     articles.append(article)
+
                     if len(articles) >= max_articles:
                         break
+
                 except Exception as e:
                     logger.warning(f"Error parsing RSS entry: {str(e)}")
                     continue
+
+            time.sleep(0.4)  # be polite
             return articles
+
         except Exception as e:
             logger.error(f"Error scraping RSS {rss_url}: {str(e)}")
             return []
+
+    # -------------------------------------------------------------------------
+    # Extraction + cleaning
+    # -------------------------------------------------------------------------
     def _extract_full_content(self, url: str) -> Optional[str]:
+        """Extract full article content; resolves Google News redirect first."""
         try:
             headers = self._get_random_headers()
+
+            # If it's a Google News link, follow redirects to the publisher URL
+            parsed = urlparse(url)
+            if parsed.netloc.endswith("news.google.com"):
+                try:
+                    resp = self.session.get(url, headers=headers, allow_redirects=True)
+                    if resp is not None and resp.url and resp.status_code in (200, 301, 302):
+                        url = resp.url
+                except Exception as e:
+                    logger.warning(f"Failed to resolve Google News redirect: {e}")
+
+            # Fetch with trafilatura at the publisher URL
             downloaded = trafilatura.fetch_url(url, headers=headers)
             if not downloaded:
                 return None
+
             text = trafilatura.extract(
                 downloaded,
                 include_comments=False,

                 include_formatting=False,
                 no_fallback=False
             )
+
             if text and len(text.strip()) > 100:
                 return text.strip()
+
             return None
+
         except Exception as e:
             logger.warning(f"Error extracting content from {url}: {str(e)}")
             return None
+
+    # -------------------------------------------------------------------------
+    # Post-processing helpers
+    # -------------------------------------------------------------------------
     def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
+        """Remove duplicate articles based on title+summary similarity"""
+        unique_articles: List[Dict] = []
+
         for article in articles:
+            content_for_hash = f"{article.get('title','')} {article.get('summary','')}"
             content_hash = hashlib.md5(content_for_hash.encode()).hexdigest()
+
             if content_hash not in self.content_hashes:
                 self.content_hashes.add(content_hash)
                 unique_articles.append(article)
+
         logger.info(f"Deduplicated {len(articles)} -> {len(unique_articles)} articles")
         return unique_articles
+
     def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
         """Filter articles for relevance and quality"""
+        filtered: List[Dict] = []
+        q = query.lower()
+
+        for a in articles:
+            title_summary = f"{a.get('title','')} {a.get('summary','')}".strip()
+            if len(title_summary) < 50:
                 continue
+
+            title_l = a.get('title', '').lower()
+            summary_l = a.get('summary', '').lower()
+
+            if (q in title_l) or (q in summary_l) or any(word in title_l for word in q.split()):
+                filtered.append(a)
+
+        logger.info(f"Filtered {len(articles)} -> {len(filtered)} articles for relevance")
+        return filtered
+
     def _is_english(self, text: str) -> bool:
         """Check if text is in English using language detection"""
         try:
             if len(text.strip()) < 20:
+                return True  # too short to decide; keep it
+            detected_lang = detect(text[:1000])
             return detected_lang == 'en'
         except Exception:
             # If detection fails, assume English
             return True
+
     def _parse_date(self, date_str: str) -> Optional[datetime]:
         """Parse date from RSS feed"""
         if not date_str:
             return datetime.now()
         try:
+            for fmt in [
+                '%a, %d %b %Y %H:%M:%S %Z',
+                '%a, %d %b %Y %H:%M:%S %z',
+                '%Y-%m-%dT%H:%M:%SZ',
+                '%Y-%m-%d %H:%M:%S',
+            ]:
                 try:
                     return datetime.strptime(date_str.strip(), fmt)
                 except ValueError:
                     continue
             return datetime.now()
         except Exception:
             return datetime.now()
+
     def _extract_source_name(self, url: str) -> str:
         """Extract source name from URL"""
         try:
             domain = urlparse(url).netloc
             domain = domain.replace('www.', '').replace('feeds.', '')
+
             domain_mapping = {
                 'news.google.com': 'Google News',
                 'finance.yahoo.com': 'Yahoo Finance',
                 'reuters.com': 'Reuters',
                 'reutersagency.com': 'Reuters',
                 'bbc.co.uk': 'BBC',
+                'bbc.com': 'BBC',
                 'cnbc.com': 'CNBC',
                 'marketwatch.com': 'MarketWatch',
                 'ft.com': 'Financial Times',
                 'bloomberg.com': 'Bloomberg'
             }
             return domain_mapping.get(domain, domain.title())
         except Exception:
             return 'Unknown'
+
     def get_available_sources(self) -> List[str]:
         """Get list of available news sources"""
         return list(self.rss_sources.keys())

+
+# -------------------------------------------------------------------------
+# Module-level helpers
+# -------------------------------------------------------------------------
 def clean_html(html_content: str) -> str:
+    """Clean HTML content and extract readable text"""
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
+
         # Remove script and style elements
         for script in soup(["script", "style"]):
             script.extract()
+
+        text = soup.get_text(separator=" ")
+
         # Clean up whitespace
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
         text = ' '.join(chunk for chunk in chunks if chunk)
+
+        # Remove leftover html entities / excessive spaces
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
+
     except Exception as e:
         logger.error(f"Error cleaning HTML: {str(e)}")
         return ""

     """Check if URL is likely to be a valid article URL"""
     try:
         parsed = urlparse(url)
+
         # Skip certain file types
+        skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3', '.webp']
         if any(url.lower().endswith(ext) for ext in skip_extensions):
             return False
+
         # Skip obvious non-article URLs
         skip_patterns = ['login', 'register', 'subscribe', 'newsletter', 'sitemap']
         if any(pattern in url.lower() for pattern in skip_patterns):
             return False
+
         return True
+
     except Exception:
+        return False
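
For orientation, a minimal usage sketch of the class this commit updates (not part of the commit itself). It assumes the file is importable as `scraper` and that its third-party dependencies (requests, feedparser, trafilatura, beautifulsoup4, and a langdetect-style `detect`) are installed; the query string and article limit below are placeholders.

    import logging

    from scraper import NewsletterScraper  # assumed import path; the commit only touches scraper.py

    logging.basicConfig(level=logging.INFO)

    scraper = NewsletterScraper()

    # scrape_news() clears the per-run URL/content caches, pulls Google News RSS plus a few
    # secondary feeds, deduplicates, filters for relevance and English, then extracts full text.
    articles = scraper.scrape_news("semiconductor supply chain", max_articles=10)

    for article in articles:
        print(f"{article['date']:%Y-%m-%d} | {article['source']} | {article['title']}")
        print(article['url'])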