File size: 3,370 Bytes
6d24925
 
 
 
 
 
 
 
 
 
 
 
 
 
3521e98
6d24925
3521e98
a62e0f6
 
3521e98
 
6d24925
a62e0f6
3521e98
 
a62e0f6
3521e98
a62e0f6
3521e98
 
 
a62e0f6
3521e98
a62e0f6
 
3521e98
 
 
 
 
 
 
 
 
 
 
 
 
 
6d24925
3521e98
6d24925
 
3521e98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d24925
 
 
a62e0f6
6d24925
 
 
 
a62e0f6
 
 
 
 
 
3521e98
 
6d24925
 
 
3521e98
 
 
 
 
 
 
 
 
 
6d24925
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import requests
import trafilatura
from newspaper import Article
from typing import Optional
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every outgoing request — presumably to
# avoid bot-blocking of the default `requests` UA (NOTE(review): confirm
# target sites actually require this).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}


def clean_text(text: str) -> str:
    """Strip HTML markup from *text* and collapse all whitespace runs to single spaces."""
    # Parse and flatten in one pass: tags are dropped by get_text, then any
    # run of whitespace (tabs, newlines, repeated spaces) becomes one space.
    stripped = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
    return " ".join(stripped.split())


def is_low_quality(text: str) -> bool:
    """Heuristic to detect low-value content like navbars, footers, etc.

    Args:
        text: Candidate article text (already HTML-stripped).

    Returns:
        True when *text* is empty, shorter than 50 words, or contains any
        boilerplate phrase typical of site chrome (subscribe prompts,
        footer legalese, ad markers, ...); False otherwise.
    """
    if not text or len(text.split()) < 50:
        return True

    junk_markers = (
        "subscribe", "click here", "latest headlines", "more from",
        "privacy policy", "video", "terms of service", "back to top",
        "all rights reserved", "advertisement", "read more", "sign in",
    )

    # Lowercase once up front — the original re-ran text.lower() for every
    # marker inside the generator, an O(markers * len) amount of rework.
    lowered = text.lower()
    return any(marker in lowered for marker in junk_markers)


def fallback_html_extract(html: str) -> Optional[str]:
    """Very basic content extractor as a last resort.

    Concatenates the text of every <p> tag, cleans it, and returns it.
    Returns None when parsing fails or the result is under 50 words.
    """
    try:
        parsed = BeautifulSoup(html, "html.parser")
        raw = " ".join(tag.get_text(strip=True) for tag in parsed.find_all("p"))
        result = clean_text(raw)
        if len(result.split()) >= 50:
            return result
        return None
    except Exception as e:
        print(f"⚠️ Fallback extract failed: {e}")
        return None


def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
    """Extract meaningful text from a given URL using multiple methods.

    Strategy (first acceptable result wins):
      1. trafilatura on the fetched HTML,
      2. newspaper3k (performs its own download),
      3. naive <p>-tag extraction via fallback_html_extract.

    Args:
        url: Page to scrape.
        timeout: Seconds to wait for the initial HTTP request.

    Returns:
        Cleaned article text, or None when the page returns a non-200
        status or every extraction method fails / yields low-quality text.
    """
    # BUGFIX: initialize up front. Previously, if requests.get raised before
    # `html = response.text`, the final fallback's `if html:` hit an unbound
    # local NameError that the broad except silently swallowed.
    html = None

    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        if response.status_code != 200:
            print(f"⚠️ Bad status ({response.status_code}) for {url}")
            return None

        html = response.text

        # Attempt trafilatura
        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False
        )

        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
        else:
            print(f"⚠️ Trafilatura extraction failed or empty: {url}")

    except Exception as e:
        print(f"⚠️ Trafilatura failed for {url}: {e}")

    # Fallback to newspaper3k
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
        else:
            print(f"⚠️ Newspaper3k extracted no text: {url}")
    except Exception as e:
        print(f"⚠️ Newspaper3k failed for {url}: {e}")

    # Final fallback to basic HTML parsing (only if the first fetch got HTML)
    try:
        if html:
            fallback = fallback_html_extract(html)
            if fallback:
                print(f"✅ Used fallback extractor for: {url}")
                return fallback
    except Exception as e:
        print(f"⚠️ Final fallback failed for {url}: {e}")

    return None