import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from readability import Document
import re


def clean_text(text):
    """Clean and normalize text content"""
    text = re.sub(r'\s+', ' ', text)        # Remove extra whitespace
    text = re.sub(r'\[[^\]]*\]', '', text)  # Remove footnotes
    return text.strip()
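
# Illustrative example (hypothetical input, not from the original code):
# clean_text collapses whitespace runs and strips bracketed footnote markers, e.g.
#   clean_text("Intro   text[1] continues\n\nhere.")  ->  "Intro text continues here."
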
def scrape_url(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()

        # Extract main content
        doc = Document(res.content)
        soup = BeautifulSoup(doc.summary(), 'html.parser')

        # Clean text
        text = clean_text(soup.get_text(separator='\n', strip=True))

        # Get image URLs
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                abs_url = urljoin(url, src)
                if abs_url.startswith('http') and any(abs_url.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
                    images.append(abs_url)

        return text, images[:10]  # Return max 10 images
    except Exception as e:
        return f"[Error scraping {url}: {str(e)}]", []