|
import requests |
|
import trafilatura |
|
from newspaper import Article |
|
from typing import Optional |
|
from bs4 import BeautifulSoup |
|
|
|
# Browser-like User-Agent sent with every fetch: some sites refuse or
# degrade responses for default library UAs (e.g. "python-requests/x.y").
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}
|
|
|
|
|
def clean_text(text: str) -> str:
    """Strip any HTML markup and normalize runs of whitespace to single spaces."""
    plain = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
    return " ".join(plain.split())
|
|
|
|
|
def is_low_quality(text: str) -> bool:
    """Heuristic to detect low-value content like navbars, footers, etc.

    Args:
        text: Candidate article text (plain text, already HTML-stripped).

    Returns:
        True when the text is empty, shorter than 50 words, or contains
        any boilerplate junk marker (case-insensitive); False otherwise.
    """
    if not text or len(text.split()) < 50:
        return True

    junk_markers = (
        "subscribe", "click here", "latest headlines", "more from",
        "privacy policy", "video", "terms of service", "back to top",
        "all rights reserved", "advertisement", "read more", "sign in",
    )

    # Lowercase once instead of once per marker inside the generator.
    lowered = text.lower()
    return any(marker in lowered for marker in junk_markers)
|
|
|
|
|
def fallback_html_extract(html: str) -> Optional[str]:
    """Last-resort extractor: concatenate all <p> tag text and clean it.

    Returns the cleaned text, or None when parsing fails or the result
    is shorter than 50 words.
    """
    try:
        parsed = BeautifulSoup(html, "html.parser")
        joined = " ".join(node.get_text(strip=True) for node in parsed.find_all("p"))
        result = clean_text(joined)
        if len(result.split()) >= 50:
            return result
        return None
    except Exception as e:
        print(f"⚠️ Fallback extract failed: {e}")
        return None
|
|
|
|
|
def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
    """Extract meaningful text from a given URL using multiple methods.

    Strategy (first success wins):
      1. Fetch the page and run Trafilatura on the HTML.
      2. Newspaper3k (it performs its own download).
      3. Naive <p>-tag fallback on the HTML fetched in step 1.

    Args:
        url: Page to scrape.
        timeout: Seconds to wait for the initial HTTP request.

    Returns:
        Cleaned article text, or None when every method fails or yields
        only low-quality text. A non-200 response aborts immediately.
    """
    # BUG FIX: initialize `html` up front. Previously it was bound only
    # inside the first try, so if requests.get() itself raised, the final
    # `if html:` hit a NameError that the broad except silently swallowed.
    html: Optional[str] = None

    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        if response.status_code != 200:
            print(f"⚠️ Bad status ({response.status_code}) for {url}")
            return None

        html = response.text

        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False
        )

        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
        else:
            print(f"⚠️ Trafilatura extraction failed or empty: {url}")

    except Exception as e:
        print(f"⚠️ Trafilatura failed for {url}: {e}")

    # Second attempt: Newspaper3k downloads the page itself, so it can
    # still succeed even when the direct fetch above failed.
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                return text
            else:
                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
        else:
            print(f"⚠️ Newspaper3k extracted no text: {url}")
    except Exception as e:
        print(f"⚠️ Newspaper3k failed for {url}: {e}")

    # Final attempt: crude <p>-tag extraction — only possible when the
    # initial fetch actually produced HTML.
    try:
        if html:
            fallback = fallback_html_extract(html)
            if fallback:
                print(f"✅ Used fallback extractor for: {url}")
                return fallback
    except Exception as e:
        print(f"⚠️ Final fallback failed for {url}: {e}")

    return None
|
|