Update scraper.py
scraper.py  (+70 −62)  CHANGED
@@ -2,10 +2,10 @@ import requests
 from bs4 import BeautifulSoup
 import feedparser
 import trafilatura
-from urllib.parse import
+from urllib.parse import urlparse
 import time
 import logging
-from datetime import datetime
+from datetime import datetime
 from typing import List, Dict, Optional, Set
 import hashlib
 import re
@@ -46,8 +46,11 @@ class NewsletterScraper:
 
         logger.info("NewsletterScraper initialized")
 
+    # -------------------------------------------------------------------------
+    # Session + headers
+    # -------------------------------------------------------------------------
     def _create_session(self) -> requests.Session:
-        """Create a session with retry strategy"""
+        """Create a session with retry strategy and default timeouts."""
         session = requests.Session()
 
         retry_strategy = Retry(
@@ -76,7 +79,7 @@ class NewsletterScraper:
         return wrapper
 
     def _get_random_headers(self) -> Dict[str, str]:
-        """Get randomized headers to avoid blocking"""
+        """Get randomized headers to avoid blocking."""
         return {
             'User-Agent': random.choice(self.user_agents),
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
@@ -90,7 +93,7 @@ class NewsletterScraper:
     # Public entrypoint
     # -------------------------------------------------------------------------
     def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
-        """Main scraping function"""
+        """Main scraping function."""
         logger.info(f"Starting news scraping for query: {query}")
         all_articles: List[Dict] = []
         self.scraped_urls.clear()
@@ -98,10 +101,12 @@ class NewsletterScraper:
 
         try:
             # Primary: Google News RSS
-
+            from requests.utils import quote
+            google_url = self.rss_sources['google_news'].format(quote(query))
+            google_articles = self._scrape_google_news(google_url, query, max_articles // 2 or 5)
             all_articles.extend(google_articles)
 
-            # Secondary:
+            # Secondary: a few other RSS sources
             for source_name, rss_url in list(self.rss_sources.items())[1:4]:
                 if len(all_articles) >= max_articles:
                     break
@@ -145,10 +150,9 @@ class NewsletterScraper:
     # -------------------------------------------------------------------------
     # Source-specific + generic RSS
    # -------------------------------------------------------------------------
-    def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
-        """Scrape Google News RSS and resolve to publisher URLs"""
+    def _scrape_google_news(self, url: str, query: str, max_articles: int) -> List[Dict]:
+        """Scrape Google News RSS and resolve to publisher URLs."""
         try:
-            url = self.rss_sources['google_news'].format(requests.utils.quote(query))
             headers = self._get_random_headers()
             response = self.session.get(url, headers=headers)
             if response.status_code != 200:
@@ -157,6 +161,7 @@ class NewsletterScraper:
 
             feed = feedparser.parse(response.content)
             articles: List[Dict] = []
+            q = query.lower()
 
             for entry in feed.entries[:max_articles * 2]:  # extra for filtering
                 try:
@@ -168,6 +173,10 @@ class NewsletterScraper:
                     title = clean_html(raw_title) if '<' in raw_title else raw_title
                     summary = clean_html(raw_summary) if '<' in raw_summary else raw_summary
 
+                    # Soft query match
+                    if not (q in title.lower() or q in summary.lower()):
+                        continue
+
                     article = {
                         'title': title.strip(),
                         'url': link,
@@ -202,7 +211,7 @@ class NewsletterScraper:
             return []
 
     def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
-        """Scrape a generic RSS source and soft-filter by query"""
+        """Scrape a generic RSS source and soft-filter by query."""
         try:
             headers = self._get_random_headers()
             response = self.session.get(rss_url, headers=headers)
@@ -222,7 +231,7 @@ class NewsletterScraper:
                 if not (q in title.lower() or q in summary.lower()):
                     continue
 
-                # Clean
+                # Clean HTML artifacts
                 title = clean_html(title) if '<' in title else title
                 summary = clean_html(summary) if '<' in summary else summary
 
@@ -267,60 +276,59 @@ class NewsletterScraper:
     # Extraction + cleaning
     # -------------------------------------------------------------------------
     def _extract_full_content(self, url: str) -> Optional[str]:
-
-
-
-
-
+        """
+        Extract full article content; resolve redirects; parse with trafilatura.
+        """
+        try:
+            headers = self._get_random_headers()
 
-
-
-
+            # If it's a Google News link, follow redirects to the publisher URL
+            parsed = urlparse(url)
+            if parsed.netloc.endswith("news.google.com"):
+                try:
+                    resp = self.session.get(url, headers=headers, allow_redirects=True)
+                    if resp is not None and resp.url and resp.status_code in (200, 301, 302):
+                        url = resp.url
+                except Exception as e:
+                    logger.warning(f"Failed to resolve Google News redirect: {e}")
+
+            # 1st try: fetch HTML with our session (so we control headers)
+            downloaded_html = None
             try:
-
-                if
-
+                r = self.session.get(url, headers=headers, allow_redirects=True)
+                if r is not None and r.status_code == 200:
+                    downloaded_html = r.text
             except Exception as e:
-                logger.
+                logger.debug(f"requests fetch failed, will try trafilatura.fetch_url: {e}")
+
+            # Fallback: let trafilatura fetch the URL itself (no headers param)
+            if not downloaded_html:
+                downloaded_html = trafilatura.fetch_url(url)
+
+            if not downloaded_html:
+                return None
+
+            # Extract readable text
+            text = trafilatura.extract(
+                downloaded_html,
+                include_comments=False,
+                include_tables=False,
+                include_formatting=False,
+                no_fallback=False,
+            )
+            if text and len(text.strip()) > 100:
+                return text.strip()
+            return None
 
-            # 1st try: fetch HTML with our session (so we control headers)
-            downloaded_html = None
-            try:
-                r = self.session.get(url, headers=headers, allow_redirects=True)
-                if r is not None and r.status_code == 200:
-                    downloaded_html = r.text
         except Exception as e:
-            logger.
-
-            # Fallback: let trafilatura fetch the URL itself (no headers param)
-            if not downloaded_html:
-                downloaded_html = trafilatura.fetch_url(url)
-
-            if not downloaded_html:
+            logger.warning(f"Error extracting content from {url}: {str(e)}")
             return None
 
-            # Extract readable text
-            text = trafilatura.extract(
-                downloaded_html,
-                include_comments=False,
-                include_tables=False,
-                include_formatting=False,
-                no_fallback=False,
-            )
-            if text and len(text.strip()) > 100:
-                return text.strip()
-            return None
-
-            except Exception as e:
-                logger.warning(f"Error extracting content from {url}: {str(e)}")
-                return None
-
-
     # -------------------------------------------------------------------------
     # Post-processing helpers
     # -------------------------------------------------------------------------
     def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
-        """Remove duplicate articles based on title+summary similarity"""
+        """Remove duplicate articles based on title+summary similarity."""
         unique_articles: List[Dict] = []
 
         for article in articles:
@@ -335,7 +343,7 @@ class NewsletterScraper:
         return unique_articles
 
     def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
-        """Filter articles for relevance and quality"""
+        """Filter articles for relevance and quality."""
         filtered: List[Dict] = []
         q = query.lower()
 
@@ -354,7 +362,7 @@ class NewsletterScraper:
         return filtered
 
     def _is_english(self, text: str) -> bool:
-        """Check if text is in English using language detection"""
+        """Check if text is in English using language detection."""
         try:
            if len(text.strip()) < 20:
                 return True  # too short to decide; keep it
@@ -365,7 +373,7 @@ class NewsletterScraper:
             return True
 
     def _parse_date(self, date_str: str) -> Optional[datetime]:
-        """Parse date from RSS feed"""
+        """Parse date from RSS feed."""
         if not date_str:
             return datetime.now()
         try:
@@ -384,7 +392,7 @@ class NewsletterScraper:
             return datetime.now()
 
     def _extract_source_name(self, url: str) -> str:
-        """Extract source name from URL"""
+        """Extract source name from URL."""
         try:
             domain = urlparse(url).netloc
             domain = domain.replace('www.', '').replace('feeds.', '')
@@ -406,7 +414,7 @@ class NewsletterScraper:
             return 'Unknown'
 
     def get_available_sources(self) -> List[str]:
-        """Get list of available news sources"""
+        """Get list of available news sources."""
         return list(self.rss_sources.keys())
 
 
@@ -414,7 +422,7 @@ class NewsletterScraper:
 # Module-level helpers
 # -------------------------------------------------------------------------
 def clean_html(html_content: str) -> str:
-    """Clean HTML content and extract readable text"""
+    """Clean HTML content and extract readable text."""
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
 
@@ -438,7 +446,7 @@ def clean_html(html_content: str) -> str:
         return ""
 
 def is_valid_article_url(url: str) -> bool:
-    """Check if URL is likely to be a valid article URL"""
+    """Check if URL is likely to be a valid article URL."""
     try:
         parsed = urlparse(url)
 
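
Note: the diff is truncated immediately after `retry_strategy = Retry(`, so the actual retry configuration used by `_create_session` is not visible in this commit. For reference, a minimal sketch of what such a session factory typically looks like, assuming urllib3's `Retry` and requests' `HTTPAdapter`; the concrete numbers and status codes below are illustrative assumptions, not taken from this commit:

# Sketch only: retry-enabled requests.Session, values are assumed, not from this commit
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=3,                                      # assumed retry budget
        backoff_factor=1,                             # exponential backoff between attempts
        status_forcelist=[429, 500, 502, 503, 504],   # retry on throttling/server errors
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session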
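For orientation, a hypothetical usage sketch of the public entrypoint. Only the `scrape_news(query, max_articles)` signature and the `'title'`/`'url'` keys are visible in this diff; the module path and the no-argument constructor are assumptions:

# Hypothetical usage; module path and constructor signature are assumed
from scraper import NewsletterScraper

scraper = NewsletterScraper()
articles = scraper.scrape_news("renewable energy", max_articles=10)
for article in articles:
    print(article['title'], article['url'])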