|
import requests |
|
import trafilatura |
|
from newspaper import Article |
|
from typing import Optional |
|
from bs4 import BeautifulSoup |
|
|
|
# Browser-like User-Agent sent with every fetch: some sites refuse or
# degrade responses for default library UAs (e.g. "python-requests/x.y").
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}
|
|
|
|
|
def clean_text(text: str) -> str:
    """Strip any HTML markup and normalize runs of whitespace to single spaces."""
    plain = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
    return " ".join(plain.split())
|
|
|
|
|
def is_low_quality(text: str) -> bool:
    """Heuristic to detect low-value content like navbars, footers, etc.

    Args:
        text: Candidate article text (plain text, already HTML-stripped).

    Returns:
        True when the text is empty, shorter than 50 words, or contains
        any boilerplate junk marker (case-insensitive); False otherwise.
    """
    if not text or len(text.split()) < 50:
        return True

    junk_markers = (
        "subscribe", "click here", "latest headlines", "more from",
        "privacy policy", "video", "terms of service", "back to top",
        "all rights reserved", "advertisement", "read more", "sign in",
    )

    # Lowercase once instead of once per marker inside the generator.
    lowered = text.lower()
    return any(marker in lowered for marker in junk_markers)
|
|
|
|
|
def fallback_html_extract(html: str) -> Optional[str]:
    """Last-resort extractor: concatenate all <p> tag text and clean it.

    Returns the cleaned text, or None when parsing fails or the result
    is shorter than 50 words.
    """
    try:
        parsed = BeautifulSoup(html, "html.parser")
        joined = " ".join(node.get_text(strip=True) for node in parsed.find_all("p"))
        result = clean_text(joined)
        if len(result.split()) >= 50:
            return result
        return None
    except Exception as e:
        print(f"⚠️ Fallback extract failed: {e}")
        return None
|
|
|
|
|
def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
    """Extract meaningful text from a given URL using multiple methods.

    Strategy (first success wins):
      1. Fetch the page and run Trafilatura on the HTML.
      2. Newspaper3k (it performs its own download).
      3. Naive <p>-tag fallback on the HTML fetched in step 1.

    Args:
        url: Page to scrape.
        timeout: Seconds to wait for the initial HTTP request.

    Returns:
        Cleaned article text, or None when every method fails or yields
        only low-quality text. A non-200 response aborts immediately.
    """
    # BUG FIX: initialize `html` up front. Previously it was bound only
    # inside the first try, so if requests.get() itself raised, the final
    # `if html:` hit a NameError that the broad except silently swallowed.
    html: Optional[str] = None

    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        if response.status_code != 200:
            print(f"⚠️ Bad status ({response.status_code}) for {url}")
            return None

        html = response.text

        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False
        )

        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
        else:
            print(f"⚠️ Trafilatura extraction failed or empty: {url}")

    except Exception as e:
        print(f"⚠️ Trafilatura failed for {url}: {e}")

    # Second attempt: Newspaper3k downloads the page itself, so it can
    # still succeed even when the direct fetch above failed.
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
        else:
            print(f"⚠️ Newspaper3k extracted no text: {url}")
    except Exception as e:
        print(f"⚠️ Newspaper3k failed for {url}: {e}")

    # Final attempt: crude <p>-tag extraction — only possible when the
    # initial fetch actually produced HTML.
    try:
        if html:
            fallback = fallback_html_extract(html)
            if fallback:
                print(f"✅ Used fallback extractor for: {url}")
                return fallback
    except Exception as e:
        print(f"⚠️ Final fallback failed for {url}: {e}")

    return None
|
|