Spaces:
Runtime error
Runtime error
File size: 1,403 Bytes
1349210 b5cde6a 1b58f37 1349210 1b58f37 1349210 b5cde6a 1b58f37 b5cde6a 1b58f37 b5cde6a 1b58f37 1349210 1b58f37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from readability import Document
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Removes bracketed spans (footnote markers such as ``[1]``) and collapses
    every run of whitespace, newlines included, into a single space.

    Args:
        text: Raw text to normalize.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Strip bracketed spans first: doing it after the whitespace pass leaves
    # a double space wherever a marker sat between two words.
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def scrape_url(url):
    """Fetch a web page and extract its main text and image URLs.

    The page is downloaded with a browser-like User-Agent, reduced to its
    main content with readability's ``Document``, and parsed with
    BeautifulSoup.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the cleaned main content and
        ``images`` holds at most 10 unique absolute image URLs in document
        order. On any failure ``text`` is an ``[Error scraping ...]`` message
        and ``images`` is empty; this function does not raise.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    max_images = 10
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()

        # Extract main content
        doc = Document(res.content)
        soup = BeautifulSoup(doc.summary(), 'html.parser')

        # Clean text
        text = clean_text(soup.get_text(separator='\n', strip=True))

        # Get image URLs
        images = []
        seen = set()
        for img in soup.find_all('img'):
            # Fall back to data-src when src is missing or empty.
            src = img.get('src') or img.get('data-src')
            if not src:
                continue
            abs_url = urljoin(url, src)
            if abs_url in seen:
                continue
            parsed = urlparse(abs_url)
            if parsed.scheme not in ('http', 'https'):
                continue
            # Test the extension on the path only, case-insensitively, so
            # "pic.JPG" and "pic.jpg?w=800" are both recognized.
            if not parsed.path.lower().endswith(image_extensions):
                continue
            seen.add(abs_url)
            images.append(abs_url)
            if len(images) == max_images:
                break
        return text, images
    except Exception as e:
        # Best-effort boundary: callers always receive a (text, images)
        # tuple, so every failure is reported through the text slot.
        return f"[Error scraping {url}: {str(e)}]", []