Update scraper.py
scraper.py · +207 −158
Removed lines (old side of each hunk):

@@ -17,13 +17,13 @@ from urllib3.util.retry import Retry
-    """Robust news scraper with multiple sources and deduplication"""

@@ -35,204 +35,257 @@ class NewsletterScraper:
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X
-            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X
-        # Retry strategy
-        adapter = HTTPAdapter(max_retries=retry_strategy)
-            'Accept-Language': 'en-US,en;q=0.
-            'Accept-Encoding': 'gzip, deflate',
-        all_articles = []
-            google_articles = self._scrape_google_news(query, max_articles // 2)
-            # Secondary: Other RSS sources
-            for source_name, rss_url in list(self.rss_sources.items())[1:4]:
-            # Extract full content
-            articles = [
-        """Scrape Google News RSS"""
-            url = self.rss_sources['google_news'].format(
-            response = self.session.get(url, headers=headers
-            articles = []
-            for entry in feed.entries[:max_articles * 2]: #
-                        'title':
-                        'url':
-                        'summary':
-        """Scrape a generic RSS source"""
-            response = self.session.get(rss_url, headers=headers
-            articles = []
-            for entry in feed.entries[:max_articles * 3]:
-                    title = entry.get('title', '')
-                    summary = entry.get('summary', '')
-                    if not (
-                        'title': title,
-                        'url':
-                        'summary': summary,
-            time.sleep(0.5)
-        """Extract full article content
-            # Extract text content

@@ -240,137 +293,133 @@ class NewsletterScraper:
-        """Remove duplicate articles based on
-        unique_articles = []
-            content_for_hash = f"{article['title']} {article.get('summary', '')}"
-            if len(title_summary.strip()) < 50:
-            detected_lang = detect(text[:1000]) # Check first 1000 chars
-            # If all fails, return current time
-            # Clean up common domain patterns
-            # Map known domains to clean names
-    """Clean HTML content and extract text"""

@@ -379,18 +428,18 @@ def is_valid_article_url(url: str) -> bool:
-    skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3']
Updated scraper.py (new side of each hunk):

@@ -17,13 +17,13 @@ from urllib3.util.retry import Retry
logger = logging.getLogger(__name__)

class NewsletterScraper:
    """Robust news scraper with multiple sources, redirect resolution, and deduplication"""

    def __init__(self):
        self.session = self._create_session()
        self.scraped_urls: Set[str] = set()
        self.content_hashes: Set[str] = set()

        # News sources configuration
        self.rss_sources = {
            'google_news': 'https://news.google.com/rss/search?q={}&hl=en&gl=US&ceid=US:en',

@@ -35,204 +35,257 @@ class NewsletterScraper:
            'financial_times': 'https://www.ft.com/rss/home',
            'bloomberg': 'https://feeds.bloomberg.com/politics/news.rss'
        }

        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 12.6; rv:121.0) Gecko/20100101 Firefox/121.0'
        ]

        logger.info("NewsletterScraper initialized")

    def _create_session(self) -> requests.Session:
        """Create a session with retry strategy"""
        session = requests.Session()

        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
            raise_on_status=False,
        )

        adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=20, pool_maxsize=20)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        # Apply a default timeout to every request made through this session
        session.request = self._with_timeout(session.request, timeout=10)
        return session

    @staticmethod
    def _with_timeout(func, timeout: int = 10):
        """Wrap session.request to always include a timeout unless explicitly provided."""
        def wrapper(method, url, **kwargs):
            if "timeout" not in kwargs:
                kwargs["timeout"] = timeout
            return func(method, url, **kwargs)
        return wrapper

    def _get_random_headers(self) -> Dict[str, str]:
        """Get randomized headers to avoid blocking"""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    # -------------------------------------------------------------------------
    # Public entrypoint
    # -------------------------------------------------------------------------
    def scrape_news(self, query: str, max_articles: int = 20) -> List[Dict]:
        """Main scraping function"""
        logger.info(f"Starting news scraping for query: {query}")
        all_articles: List[Dict] = []
        self.scraped_urls.clear()
        self.content_hashes.clear()

        try:
            # Primary: Google News RSS
            google_articles = self._scrape_google_news(query, max_articles // 2 or 5)
            all_articles.extend(google_articles)

            # Secondary: other RSS sources (limit to a few to reduce timeouts on free CPU)
            for source_name, rss_url in list(self.rss_sources.items())[1:4]:
                if len(all_articles) >= max_articles:
                    break
                try:
                    source_articles = self._scrape_rss_source(rss_url, query, 5)
                    all_articles.extend(source_articles)
                except Exception as e:
                    logger.warning(f"Failed to scrape {source_name}: {str(e)}")
                    continue

            # Deduplicate and filter
            articles = self._deduplicate_articles(all_articles)
            articles = self._filter_articles(articles, query)
            articles = articles[:max_articles]

            # Extract full content (after resolving redirects)
            for article in articles:
                try:
                    full_content = self._extract_full_content(article['url'])
                    if full_content and len(full_content) > 200:
                        article['content'] = full_content
                    else:
                        # Fallback to cleaned summary/title
                        fallback = article.get('summary') or article.get('title', '')
                        article['content'] = clean_html(fallback) if '<' in fallback else fallback
                except Exception as e:
                    logger.warning(f"Failed to extract content from {article['url']}: {str(e)}")
                    fallback = article.get('summary') or article.get('title', '')
                    article['content'] = clean_html(fallback) if '<' in fallback else fallback

            # Filter by language (English only)
            articles = [a for a in articles if self._is_english(a.get('content', ''))]

            logger.info(f"Successfully scraped {len(articles)} articles")
            return articles

        except Exception as e:
            logger.error(f"Error in scrape_news: {str(e)}")
            return []

    # -------------------------------------------------------------------------
    # Source-specific + generic RSS
    # -------------------------------------------------------------------------
    def _scrape_google_news(self, query: str, max_articles: int) -> List[Dict]:
        """Scrape Google News RSS and resolve to publisher URLs"""
        try:
            url = self.rss_sources['google_news'].format(requests.utils.quote(query))
            headers = self._get_random_headers()
            response = self.session.get(url, headers=headers)
            if response.status_code != 200:
                logger.warning(f"Google News RSS returned status {response.status_code}")
                return []

            feed = feedparser.parse(response.content)
            articles: List[Dict] = []

            for entry in feed.entries[:max_articles * 2]:  # extra for filtering
                try:
                    raw_title = entry.get('title', '') or ''
                    raw_summary = entry.get('summary', '') or ''
                    link = entry.get('link', '') or ''

                    # Strip HTML if present in title/summary
                    title = clean_html(raw_title) if '<' in raw_title else raw_title
                    summary = clean_html(raw_summary) if '<' in raw_summary else raw_summary

                    article = {
                        'title': title.strip(),
                        'url': link,
                        'summary': summary.strip(),
                        'date': self._parse_date(entry.get('published', '')),
                        'source': 'Google News'
                    }

                    # Try to resolve redirect to publisher and set clean source
                    try:
                        r = self.session.get(link, headers=headers, allow_redirects=True)
                        final_url = r.url if r is not None and r.url else link
                        article['url'] = final_url
                        article['source'] = self._extract_source_name(final_url)
                    except Exception as e:
                        logger.debug(f"Redirect resolution failed for Google link: {e}")

                    if article['url'] in self.scraped_urls:
                        continue

                    self.scraped_urls.add(article['url'])
                    articles.append(article)

                except Exception as e:
                    logger.warning(f"Error parsing Google News entry: {str(e)}")
                    continue

            return articles

        except Exception as e:
            logger.error(f"Error scraping Google News: {str(e)}")
            return []

    def _scrape_rss_source(self, rss_url: str, query: str, max_articles: int) -> List[Dict]:
        """Scrape a generic RSS source and soft-filter by query"""
        try:
            headers = self._get_random_headers()
            response = self.session.get(rss_url, headers=headers)
            if response.status_code != 200:
                return []

            feed = feedparser.parse(response.content)
            articles: List[Dict] = []
            q = query.lower()

            for entry in feed.entries[:max_articles * 3]:
                try:
                    title = entry.get('title', '') or ''
                    summary = entry.get('summary', '') or ''
                    link = entry.get('link', '') or ''

                    if not (q in title.lower() or q in summary.lower()):
                        continue

                    # Clean any HTML artifacts
                    title = clean_html(title) if '<' in title else title
                    summary = clean_html(summary) if '<' in summary else summary

                    article = {
                        'title': title.strip(),
                        'url': link,
                        'summary': summary.strip(),
                        'date': self._parse_date(entry.get('published', '')),
                        'source': self._extract_source_name(rss_url)
                    }

                    # Resolve potential redirects to get the publisher domain
                    try:
                        r = self.session.get(link, headers=headers, allow_redirects=True)
                        final_url = r.url if r is not None and r.url else link
                        article['url'] = final_url
                        article['source'] = self._extract_source_name(final_url)
                    except Exception:
                        pass

                    if article['url'] in self.scraped_urls:
                        continue

                    self.scraped_urls.add(article['url'])
                    articles.append(article)

                    if len(articles) >= max_articles:
                        break

                except Exception as e:
                    logger.warning(f"Error parsing RSS entry: {str(e)}")
                    continue

            time.sleep(0.4)  # be polite
            return articles

        except Exception as e:
            logger.error(f"Error scraping RSS {rss_url}: {str(e)}")
            return []

    # -------------------------------------------------------------------------
    # Extraction + cleaning
    # -------------------------------------------------------------------------
    def _extract_full_content(self, url: str) -> Optional[str]:
        """Extract full article content; resolves Google News redirect first."""
        try:
            headers = self._get_random_headers()

            # If it's a Google News link, follow redirects to the publisher URL
            parsed = urlparse(url)
            if parsed.netloc.endswith("news.google.com"):
                try:
                    resp = self.session.get(url, headers=headers, allow_redirects=True)
                    if resp is not None and resp.url and resp.status_code in (200, 301, 302):
                        url = resp.url
                except Exception as e:
                    logger.warning(f"Failed to resolve Google News redirect: {e}")

            # Fetch the publisher page through the session so the retry/timeout
            # settings and randomized headers apply, then hand the HTML to trafilatura
            page = self.session.get(url, headers=headers)
            if page.status_code != 200 or not page.text:
                return None
            downloaded = page.text

            text = trafilatura.extract(
                downloaded,
                include_comments=False,

@@ -240,137 +293,133 @@ class NewsletterScraper:
                include_formatting=False,
                no_fallback=False
            )

            if text and len(text.strip()) > 100:
                return text.strip()

            return None

        except Exception as e:
            logger.warning(f"Error extracting content from {url}: {str(e)}")
            return None

    # -------------------------------------------------------------------------
    # Post-processing helpers
    # -------------------------------------------------------------------------
    def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
        """Remove duplicate articles based on title+summary similarity"""
        unique_articles: List[Dict] = []

        for article in articles:
            content_for_hash = f"{article.get('title','')} {article.get('summary','')}"
            content_hash = hashlib.md5(content_for_hash.encode()).hexdigest()

            if content_hash not in self.content_hashes:
                self.content_hashes.add(content_hash)
                unique_articles.append(article)

        logger.info(f"Deduplicated {len(articles)} -> {len(unique_articles)} articles")
        return unique_articles

    def _filter_articles(self, articles: List[Dict], query: str) -> List[Dict]:
        """Filter articles for relevance and quality"""
        filtered: List[Dict] = []
        q = query.lower()

        for a in articles:
            title_summary = f"{a.get('title','')} {a.get('summary','')}".strip()
            if len(title_summary) < 50:
                continue

            title_l = a.get('title', '').lower()
            summary_l = a.get('summary', '').lower()

            if (q in title_l) or (q in summary_l) or any(word in title_l for word in q.split()):
                filtered.append(a)

        logger.info(f"Filtered {len(articles)} -> {len(filtered)} articles for relevance")
        return filtered

    def _is_english(self, text: str) -> bool:
        """Check if text is in English using language detection"""
        try:
            if len(text.strip()) < 20:
                return True  # too short to decide; keep it
            detected_lang = detect(text[:1000])
            return detected_lang == 'en'
        except Exception:
            # If detection fails, assume English
            return True

    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """Parse date from RSS feed"""
        if not date_str:
            return datetime.now()
        try:
            for fmt in [
                '%a, %d %b %Y %H:%M:%S %Z',
                '%a, %d %b %Y %H:%M:%S %z',
                '%Y-%m-%dT%H:%M:%SZ',
                '%Y-%m-%d %H:%M:%S',
            ]:
                try:
                    return datetime.strptime(date_str.strip(), fmt)
                except ValueError:
                    continue
            return datetime.now()
        except Exception:
            return datetime.now()

    def _extract_source_name(self, url: str) -> str:
        """Extract source name from URL"""
        try:
            domain = urlparse(url).netloc
            domain = domain.replace('www.', '').replace('feeds.', '')

            domain_mapping = {
                'news.google.com': 'Google News',
                'finance.yahoo.com': 'Yahoo Finance',
                'reuters.com': 'Reuters',
                'reutersagency.com': 'Reuters',
                'bbc.co.uk': 'BBC',
                'bbc.com': 'BBC',
                'cnbc.com': 'CNBC',
                'marketwatch.com': 'MarketWatch',
                'ft.com': 'Financial Times',
                'bloomberg.com': 'Bloomberg'
            }
            return domain_mapping.get(domain, domain.title())
        except Exception:
            return 'Unknown'

    def get_available_sources(self) -> List[str]:
        """Get list of available news sources"""
        return list(self.rss_sources.keys())


# -------------------------------------------------------------------------
# Module-level helpers
# -------------------------------------------------------------------------
def clean_html(html_content: str) -> str:
    """Clean HTML content and extract readable text"""
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        text = soup.get_text(separator=" ")

        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        # Collapse any remaining runs of whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    except Exception as e:
        logger.error(f"Error cleaning HTML: {str(e)}")
        return ""

@@ -379,18 +428,18 @@ def is_valid_article_url(url: str) -> bool:
    """Check if URL is likely to be a valid article URL"""
    try:
        parsed = urlparse(url)

        # Skip certain file types
        skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.mp4', '.mp3', '.webp']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False

        # Skip obvious non-article URLs
        skip_patterns = ['login', 'register', 'subscribe', 'newsletter', 'sitemap']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False

        return True

    except Exception:
        return False