# scraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
]


def clean_text(text):
    """Clean and normalize text content."""
    # Remove JavaScript and CSS blocks first, before stripping tags
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Replace remaining HTML tags with spaces so adjacent words do not merge
    text = re.sub(r'<[^>]+>', ' ', text)
    # Drop special characters, keeping word characters and basic punctuation
    text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
    # Collapse whitespace last, since tag removal can introduce extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
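
# Illustrative only: a quick sanity check of clean_text. The input string is
# a made-up example, not from the original script.
# >>> clean_text("<p>Hello,\n   world!</p>")
# 'Hello, world!'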


def scrape_url(url):
    """Fetch text + image URLs from a webpage."""
    try:
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        # Remove unwanted elements
        for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
            element.decompose()
        # Prefer a dedicated main-content element; matching every <div> as well
        # would duplicate text, since nested containers each repeat their children.
        main_content = soup.find('main') or soup.find('article')
        if main_content:
            text = clean_text(main_content.get_text())
        else:
            text = clean_text(soup.get_text())
        # Collect image URLs, resolved to absolute form
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                abs_url = urljoin(url, src)
                if abs_url.startswith('http'):
                    images.append(abs_url)
        return text, images[:5]  # Return at most 5 images
    except Exception as e:
        return f"[Error scraping {url}: {e}]", []