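"""Scrape the main article text and image URLs from a web page.

Uses readability-lxml to isolate the main content and BeautifulSoup to
parse it; returns the cleaned text plus up to 10 absolute image URLs.
"""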
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from readability import Document
import re

def clean_text(text):
    """Clean and normalize text content."""
    text = re.sub(r'\[[^\]]*\]', '', text)  # Remove bracketed footnotes, e.g. [1]
    text = re.sub(r'\s+', ' ', text)        # Collapse whitespace runs (also absorbs gaps left by removed footnotes)
    return text.strip()

def scrape_url(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()
        
        # Extract main content
        doc = Document(res.content)
        soup = BeautifulSoup(doc.summary(), 'html.parser')
        
        # Clean text
        text = clean_text(soup.get_text(separator='\n', strip=True))
        
        # Get image URLs
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')  # data-src covers lazy-loaded images
            if src:
                abs_url = urljoin(url, src)
                # Match the extension against the URL path so query strings
                # (e.g. photo.jpg?w=800) don't defeat the check
                path = urlparse(abs_url).path.lower()
                if abs_url.startswith('http') and any(path.endswith(ext) for ext in ('.jpg', '.jpeg', '.png', '.gif')):
                    images.append(abs_url)
        
        return text, images[:10]  # Return max 10 images
    
    except Exception as e:
        return f"[Error scraping {url}: {str(e)}]", []