import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from readability import Document


def clean_text(text):
    """Clean and normalize text content"""
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'\[[^\]]*\]', '', text)  # Remove footnotes
    return text.strip()


def scrape_url(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36'
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()

        # Extract main content
        doc = Document(res.content)
        soup = BeautifulSoup(doc.summary(), 'html.parser')

        # Clean text
        text = clean_text(soup.get_text(separator='\n', strip=True))

        # Get image URLs
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                abs_url = urljoin(url, src)
                if abs_url.startswith('http') and any(
                    abs_url.endswith(ext)
                    for ext in ['.jpg', '.jpeg', '.png', '.gif']
                ):
                    images.append(abs_url)

        return text, images[:10]  # Return max 10 images
    except Exception as e:
        return f"[Error scraping {url}: {str(e)}]", []
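
# Minimal usage sketch, not part of the original script: the URL below is a
# hypothetical placeholder, and this assumes the third-party dependencies
# (requests, beautifulsoup4, readability-lxml) are installed.
if __name__ == '__main__':
    text, images = scrape_url('https://example.com/article')
    print(text[:500])  # Preview the first 500 characters of extracted text
    for img_url in images:
        print(img_url)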