File size: 2,359 Bytes
1349210
 
b5cde6a
a60b17f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1349210
 
863f6b8
1349210
a60b17f
 
 
 
 
 
 
1349210
a60b17f
863f6b8
b5cde6a
a60b17f
 
 
 
 
 
 
 
 
 
 
 
b5cde6a
 
a60b17f
b5cde6a
a60b17f
 
 
 
 
 
1349210
a60b17f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import random

# Pool of common desktop browser User-Agent strings. scrape_url() picks one
# at random per request to reduce trivial bot blocking by servers that
# reject the default requests UA.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
]

def clean_text(text):
    """Strip HTML markup from *text* and normalize its whitespace.

    Pass order matters: script/style blocks and tags must be removed
    BEFORE whitespace is collapsed — tags are replaced with a space, and
    collapsing first (as the old code did) left double spaces behind.

    Args:
        text: Raw HTML or plain text.

    Returns:
        Plain text with single spaces, common punctuation preserved, and
        no leading/trailing whitespace.
    """
    # Drop script/style blocks wholesale. IGNORECASE because HTML tag
    # names are case-insensitive (<SCRIPT> must not leak into output).
    text = re.sub(r'<script[^>]*>.*?</script>', '', text,
                  flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Replace remaining tags with a space so adjacent words don't fuse.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Keep word chars, whitespace, and common punctuation only.
    text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
    # Collapse whitespace LAST, after tag removal has injected spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def scrape_url(url):
    """Fetch a web page and return its text content and image URLs.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        Tuple of (cleaned page text, list of up to 5 absolute http(s)
        image URLs). On any failure, returns an "[Error scraping ...]"
        message string and an empty list instead of raising, so callers
        never have to handle exceptions.
    """
    try:
        headers = {
            # Rotate user agents to reduce trivial bot blocking.
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()

        soup = BeautifulSoup(res.text, 'html.parser')

        # Remove boilerplate/noise elements before extracting text.
        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
            element.decompose()

        # Extract from ONE container. The old find_all(['main','article',
        # 'div']) approach returned every nested div, so each ancestor's
        # text was repeated once per nesting level — the page text was
        # duplicated many times over. Prefer a semantic container; fall
        # back to the whole (already de-noised) document.
        container = soup.find('main') or soup.find('article')
        text = clean_text((container or soup).get_text())

        # Collect absolute http(s) image URLs, deduplicated in order.
        images = []
        seen = set()
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if not src:
                continue
            abs_url = urljoin(url, src)
            if abs_url.startswith('http') and abs_url not in seen:
                seen.add(abs_url)
                images.append(abs_url)

        return text, images[:5]  # Return max 5 images

    except Exception as e:
        # Best-effort contract: never raise, report the failure in-band.
        return f"[Error scraping {url}: {str(e)}]", []