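"""Scrape the main article text from a web page.

Extraction strategy: try trafilatura first, fall back to newspaper3k,
and as a last resort pull <p> tags straight from the HTML, filtering
out low-value boilerplate (navbars, footers, ads) at each step.
"""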
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article

# Browser-like User-Agent so requests are less likely to be blocked.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}


def clean_text(text: str) -> str:
    """Remove HTML tags and collapse whitespace."""
    soup = BeautifulSoup(text, "html.parser")
    cleaned = soup.get_text(separator=" ", strip=True)
    return " ".join(cleaned.split())


def is_low_quality(text: str) -> bool:
    """Heuristic to detect low-value content like navbars, footers, etc."""
    if not text or len(text.split()) < 50:
        return True
    junk_markers = [
        "subscribe", "click here", "latest headlines", "more from",
        "privacy policy", "video", "terms of service", "back to top",
        "all rights reserved", "advertisement", "read more", "sign in",
    ]
    return any(marker in text.lower() for marker in junk_markers)
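
# e.g. is_low_quality("Subscribe now") -> True (too short and contains a junk marker)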


def fallback_html_extract(html: str) -> Optional[str]:
    """Very basic content extractor as a last resort."""
    try:
        soup = BeautifulSoup(html, "html.parser")
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text(strip=True) for p in paragraphs)
        cleaned = clean_text(text)
        return cleaned if len(cleaned.split()) >= 50 else None
    except Exception as e:
        print(f"⚠️ Fallback extract failed: {e}")
        return None


def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
    """Extract meaningful text from a given URL using multiple methods."""
    html: Optional[str] = None  # Kept for the final fallback extractor.
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        if response.status_code != 200:
            print(f"⚠️ Bad status ({response.status_code}) for {url}")
            return None
        html = response.text
        # Attempt trafilatura
        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False,
        )
        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                return text
            print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
        else:
            print(f"⚠️ Trafilatura extraction failed or empty: {url}")
    except Exception as e:
        print(f"⚠️ Trafilatura failed for {url}: {e}")

    # Fallback to newspaper3k
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                return text
            print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
        else:
            print(f"⚠️ Newspaper3k extracted no text: {url}")
    except Exception as e:
        print(f"⚠️ Newspaper3k failed for {url}: {e}")

    # Final fallback to basic HTML parsing. fallback_html_extract handles its
    # own exceptions, and html is initialized to None above, so a bare check
    # is enough here.
    if html:
        fallback = fallback_html_extract(html)
        if fallback:
            print(f"✅ Used fallback extractor for: {url}")
            return fallback

    return None
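

# Minimal usage sketch; the URL below is a hypothetical placeholder.
if __name__ == "__main__":
    demo_url = "https://example.com/some-article"  # hypothetical article URL
    text = scrape_url(demo_url)
    if text:
        print(text[:500])  # preview the first 500 characters
    else:
        print("No meaningful text extracted.")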