ragV98's picture
improvements 1
3521e98
raw
history blame
3.37 kB
import requests
import trafilatura
from newspaper import Article
from typing import Optional
from bs4 import BeautifulSoup
# Desktop-Chrome User-Agent sent with the fetch in scrape_url —
# presumably to avoid sites that reject the default python-requests UA.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}
def clean_text(text: str) -> str:
    """Strip any markup from *text* and collapse runs of whitespace to single spaces."""
    plain = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
    return " ".join(plain.split())
def is_low_quality(text: str) -> bool:
    """Flag text that is empty, shorter than 50 words, or contains
    boilerplate phrases typical of navigation bars, footers and ads."""
    if not text:
        return True
    if len(text.split()) < 50:
        return True
    junk_markers = (
        "subscribe", "click here", "latest headlines", "more from",
        "privacy policy", "video", "terms of service", "back to top",
        "all rights reserved", "advertisement", "read more", "sign in",
    )
    lowered = text.lower()
    for marker in junk_markers:
        if marker in lowered:
            return True
    return False
def fallback_html_extract(html: str) -> Optional[str]:
    """Last-resort extractor: join the text of every <p> tag in *html*.

    Returns the cleaned paragraph text, or None when parsing fails or the
    result is shorter than 50 words.
    """
    try:
        parsed = BeautifulSoup(html, "html.parser")
        joined = " ".join(tag.get_text(strip=True) for tag in parsed.find_all("p"))
        result = clean_text(joined)
    except Exception as e:
        print(f"⚠️ Fallback extract failed: {e}")
        return None
    return result if len(result.split()) >= 50 else None
def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
    """Extract meaningful text from a given URL using multiple methods.

    Tries, in order: trafilatura on the fetched HTML, newspaper3k's own
    download/parse pipeline, and finally a naive <p>-tag extractor on the
    HTML already fetched.  Each candidate is rejected if it looks like
    boilerplate (see is_low_quality).

    Args:
        url: Page to scrape.
        timeout: Timeout in seconds for the initial HTTP fetch.

    Returns:
        Cleaned article text, or None if every extraction method fails
        (including a non-200 status on the initial fetch, which aborts
        all fallbacks).
    """
    # Bug fix: html must be bound even when the initial request raises;
    # previously the final fallback's `if html:` raised NameError in that
    # case, which was caught and misreported as a fallback failure.
    html: Optional[str] = None
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        if response.status_code != 200:
            print(f"⚠️ Bad status ({response.status_code}) for {url}")
            return None
        html = response.text
        # Attempt trafilatura first — usually the best article extractor.
        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False
        )
        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
        else:
            print(f"⚠️ Trafilatura extraction failed or empty: {url}")
    except Exception as e:
        print(f"⚠️ Trafilatura failed for {url}: {e}")
    # Fallback to newspaper3k (does its own download, so it can succeed
    # even when the requests fetch above raised).
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
        else:
            print(f"⚠️ Newspaper3k extracted no text: {url}")
    except Exception as e:
        print(f"⚠️ Newspaper3k failed for {url}: {e}")
    # Final fallback: basic <p>-tag parsing of the HTML we already have.
    try:
        if html:
            fallback = fallback_html_extract(html)
            if fallback:
                print(f"✅ Used fallback extractor for: {url}")
                return fallback
    except Exception as e:
        print(f"⚠️ Final fallback failed for {url}: {e}")
    return None