# scraper.py — fetch a URL, extract the readable article text and up to 10 image URLs.
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from readability import Document
def clean_text(text):
    """Clean and normalize scraped text content.

    Removes square-bracket footnote markers such as ``[1]``, collapses
    every run of whitespace to a single space, and strips leading and
    trailing whitespace.

    Footnotes are removed *before* whitespace is collapsed; the original
    order left a double space wherever a footnote had been (e.g.
    ``"a [1] b"`` became ``"a  b"``).
    """
    text = re.sub(r'\[[^\]]*\]', '', text)  # Remove [..] footnote markers first
    text = re.sub(r'\s+', ' ', text)        # Then collapse whitespace, closing the gaps left behind
    return text.strip()
def scrape_url(url):
    """Fetch *url* and return its readable text plus up to 10 image URLs.

    Returns a tuple ``(text, images)``. On any failure the first element
    is an ``"[Error scraping ...]"`` string and the second is an empty
    list, so callers never see an exception (best-effort contract).
    """
    try:
        # Browser-like User-Agent avoids trivial bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()
        # readability isolates the main article content from page chrome.
        doc = Document(res.content)
        soup = BeautifulSoup(doc.summary(), 'html.parser')
        # Clean text
        text = clean_text(soup.get_text(separator='\n', strip=True))
        # Collect absolute, de-duplicated image URLs in document order.
        images = []
        seen = set()
        for img in soup.find_all('img'):
            # Lazy-loaded images often carry the real URL in data-src.
            src = img.get('src') or img.get('data-src')
            if not src:
                continue
            abs_url = urljoin(url, src)
            if not abs_url.startswith('http') or abs_url in seen:
                continue
            # Test the extension on the URL *path* only, so query strings
            # like photo.jpg?w=100 are still recognized (previously the
            # check ran on the full URL and rejected them).
            path = urlparse(abs_url).path.lower()
            if path.endswith(('.jpg', '.jpeg', '.png', '.gif')):
                seen.add(abs_url)
                images.append(abs_url)
        return text, images[:10]  # Return max 10 images
    except Exception as e:
        # Report the failure in-band rather than raising; callers rely on
        # always receiving a (text, images) pair.
        return f"[Error scraping {url}: {str(e)}]", []