Spaces:
Runtime error
Runtime error
Update scraper.py
Browse files- scraper.py +4 -55
scraper.py
CHANGED
@@ -1,63 +1,12 @@
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
-
from urllib.parse import urljoin
|
4 |
-
import re
|
5 |
-
import random
|
6 |
-
|
7 |
-
USER_AGENTS = [
|
8 |
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
9 |
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
|
10 |
-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
|
11 |
-
]
|
12 |
-
|
13 |
-
def clean_text(text):
    """Normalize raw HTML text into readable plain text.

    Collapses whitespace runs, removes <script>/<style> blocks and any
    remaining HTML tags, drops characters outside the word/punctuation
    set, and trims surrounding whitespace.
    """
    # Collapse every run of whitespace (newlines, tabs, ...) to one space.
    cleaned = re.sub(r'\s+', ' ', text)
    # Drop embedded JavaScript and CSS blocks wholesale (DOTALL so the
    # pattern spans what used to be multiple lines).
    for block_pattern in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
        cleaned = re.sub(block_pattern, '', cleaned, flags=re.DOTALL)
    # Replace any remaining markup tags with a space so adjacent words
    # from different elements do not fuse together.
    cleaned = re.sub(r'<[^>]+>', ' ', cleaned)
    # Keep only word characters, whitespace, and common punctuation.
    cleaned = re.sub(r'[^\w\s.,!?;:\'"-]', '', cleaned)
    return cleaned.strip()
|
25 |
|
26 |
def scrape_url(url):
|
27 |
-
"""Fetch
|
28 |
try:
|
29 |
-
|
30 |
-
'User-Agent': random.choice(USER_AGENTS),
|
31 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
32 |
-
'Accept-Language': 'en-US,en;q=0.5',
|
33 |
-
}
|
34 |
-
|
35 |
-
res = requests.get(url, headers=headers, timeout=15)
|
36 |
res.raise_for_status()
|
37 |
-
|
38 |
soup = BeautifulSoup(res.text, 'html.parser')
|
39 |
-
|
40 |
-
# Remove unwanted elements
|
41 |
-
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
|
42 |
-
element.decompose()
|
43 |
-
|
44 |
-
# Get text from main content areas
|
45 |
-
main_content = soup.find_all(['main', 'article', 'div'])
|
46 |
-
if main_content:
|
47 |
-
text = ' '.join([clean_text(elem.get_text()) for elem in main_content])
|
48 |
-
else:
|
49 |
-
text = clean_text(soup.get_text())
|
50 |
-
|
51 |
-
# Get image URLs (absolute)
|
52 |
-
images = []
|
53 |
-
for img in soup.find_all('img'):
|
54 |
-
src = img.get('src') or img.get('data-src')
|
55 |
-
if src:
|
56 |
-
abs_url = urljoin(url, src)
|
57 |
-
if abs_url.startswith('http'):
|
58 |
-
images.append(abs_url)
|
59 |
-
|
60 |
-
return text, images[:5] # Return max 5 images
|
61 |
-
|
62 |
except Exception as e:
|
63 |
-
return f"[Error scraping {url}: {
|
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def scrape_url(url):
    """Fetch a webpage and return its visible text.

    Downloads *url* with a 10-second timeout, raises into the handler on
    non-2xx status, and extracts the page's text nodes one per line. Any
    failure (network error, bad status, parse problem) is reported as a
    bracketed error string instead of propagating an exception.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')
        extracted = page.get_text(separator='\n', strip=True)
    except Exception as exc:
        # Best-effort contract: callers always receive a string.
        return f"[Error scraping {url}: {exc}]"
    return extracted
|