File size: 3,370 Bytes
6d24925
 
 
 
 
 
 
 
 
 
 
 
 
 
3521e98
6d24925
3521e98
a62e0f6
 
3521e98
 
6d24925
a62e0f6
3521e98
 
a62e0f6
3521e98
a62e0f6
3521e98
 
 
a62e0f6
3521e98
a62e0f6
 
3521e98
 
 
 
 
 
 
 
 
 
 
 
 
 
6d24925
3521e98
6d24925
 
3521e98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d24925
 
 
a62e0f6
6d24925
 
 
 
a62e0f6
 
 
 
 
 
3521e98
 
6d24925
 
 
3521e98
 
 
 
 
 
 
 
 
 
6d24925
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import requests
import trafilatura
from newspaper import Article
from typing import Optional
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every outgoing request — presumably to
# avoid bot-blocking of the default `requests` UA (NOTE(review): confirm
# target sites actually require this).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}


def clean_text(text: str) -> str:
    """Strip HTML markup from *text* and collapse all whitespace runs to single spaces."""
    # Parse and flatten in one pass: tags are dropped by get_text, then any
    # run of whitespace (tabs, newlines, repeated spaces) becomes one space.
    stripped = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
    return " ".join(stripped.split())


def is_low_quality(text: str) -> bool:
    """Heuristic to detect low-value content like navbars, footers, etc.

    Args:
        text: Candidate article text (already HTML-stripped).

    Returns:
        True when *text* is empty, shorter than 50 words, or contains any
        boilerplate phrase typical of site chrome (subscribe prompts,
        footer legalese, ad markers, ...); False otherwise.
    """
    if not text or len(text.split()) < 50:
        return True

    junk_markers = (
        "subscribe", "click here", "latest headlines", "more from",
        "privacy policy", "video", "terms of service", "back to top",
        "all rights reserved", "advertisement", "read more", "sign in",
    )

    # Lowercase once up front — the original re-ran text.lower() for every
    # marker inside the generator, an O(markers * len) amount of rework.
    lowered = text.lower()
    return any(marker in lowered for marker in junk_markers)


def fallback_html_extract(html: str) -> Optional[str]:
    """Very basic content extractor as a last resort.

    Concatenates the text of every <p> tag, cleans it, and returns it.
    Returns None when parsing fails or the result is under 50 words.
    """
    try:
        parsed = BeautifulSoup(html, "html.parser")
        raw = " ".join(tag.get_text(strip=True) for tag in parsed.find_all("p"))
        result = clean_text(raw)
        if len(result.split()) >= 50:
            return result
        return None
    except Exception as e:
        print(f"⚠️ Fallback extract failed: {e}")
        return None


def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
    """Extract meaningful text from a given URL using multiple methods.

    Strategy (first acceptable result wins):
      1. trafilatura on the fetched HTML,
      2. newspaper3k (performs its own download),
      3. naive <p>-tag extraction via fallback_html_extract.

    Args:
        url: Page to scrape.
        timeout: Seconds to wait for the initial HTTP request.

    Returns:
        Cleaned article text, or None when the page returns a non-200
        status or every extraction method fails / yields low-quality text.
    """
    # BUGFIX: initialize up front. Previously, if requests.get raised before
    # `html = response.text`, the final fallback's `if html:` hit an unbound
    # local NameError that the broad except silently swallowed.
    html = None

    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        if response.status_code != 200:
            print(f"⚠️ Bad status ({response.status_code}) for {url}")
            return None

        html = response.text

        # Attempt trafilatura
        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False
        )

        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
        else:
            print(f"⚠️ Trafilatura extraction failed or empty: {url}")

    except Exception as e:
        print(f"⚠️ Trafilatura failed for {url}: {e}")

    # Fallback to newspaper3k
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
        else:
            print(f"⚠️ Newspaper3k extracted no text: {url}")
    except Exception as e:
        print(f"⚠️ Newspaper3k failed for {url}: {e}")

    # Final fallback to basic HTML parsing (only if the first fetch got HTML)
    try:
        if html:
            fallback = fallback_html_extract(html)
            if fallback:
                print(f"✅ Used fallback extractor for: {url}")
                return fallback
    except Exception as e:
        print(f"⚠️ Final fallback failed for {url}: {e}")

    return None