"""Scrape visible text content and image URLs from web pages."""
import random
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Pool of realistic browser User-Agent strings; one is chosen at random per
# request so repeated scrapes don't present a single fingerprint.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
]
def clean_text(text):
    """Clean and normalize text content.

    Strips any leftover <script>/<style> blocks and HTML tags, drops
    characters outside a conservative word/punctuation set, and collapses
    all runs of whitespace to single spaces.

    Args:
        text: Raw text, possibly still containing HTML markup.

    Returns:
        The cleaned, single-spaced, stripped string.
    """
    # Remove embedded JavaScript and CSS blocks (DOTALL: content may span lines).
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL)
    # Replace remaining HTML tags with a space so adjacent words don't fuse.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Keep word characters, whitespace, and common punctuation only.
    text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
    # Collapse whitespace LAST so spaces introduced by tag removal are
    # normalized too (the original collapsed first and could return
    # double spaces).
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def scrape_url(url):
    """Fetch visible text and up to five absolute image URLs from a webpage.

    Args:
        url: The page URL to fetch.

    Returns:
        A ``(text, images)`` tuple where ``text`` is the cleaned page text
        and ``images`` is a list of at most 5 absolute http(s) image URLs.
        On any failure, ``text`` is an ``"[Error scraping ...]"`` message
        and ``images`` is an empty list (best-effort contract preserved).
    """
    try:
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        # Strip non-content elements before extracting text.
        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
            element.decompose()
        # Extract from semantic content containers only. The original also
        # matched every <div>, so nested divs re-contributed their full text
        # at each nesting level, duplicating page content many times over.
        main_content = soup.find_all(['main', 'article'])
        if main_content:
            text = ' '.join(clean_text(elem.get_text()) for elem in main_content)
        else:
            # No semantic containers: fall back to the whole document's text.
            text = clean_text(soup.get_text())
        # Collect absolute image URLs; lazy-loaded images often use data-src.
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                abs_url = urljoin(url, src)
                # Skip non-http(s) results such as data: URIs.
                if abs_url.startswith('http'):
                    images.append(abs_url)
        return text, images[:5]  # Return max 5 images
    except Exception as e:
        # Deliberate catch-all: scraping is best-effort and callers expect
        # a (message, []) tuple rather than an exception.
        return f"[Error scraping {url}: {str(e)}]", []