gaur3009 committed
Commit daac110 · verified · 1 Parent(s): 9dad5aa

Update scraper.py

Files changed (1)
1. scraper.py +4 -55
scraper.py CHANGED
@@ -1,63 +1,12 @@
  import requests
  from bs4 import BeautifulSoup
- from urllib.parse import urljoin
- import re
- import random
-
- USER_AGENTS = [
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
-     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
- ]
-
- def clean_text(text):
-     """Clean and normalize text content"""
-     # Remove excessive whitespace
-     text = re.sub(r'\s+', ' ', text)
-     # Remove JavaScript and CSS
-     text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL)
-     text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL)
-     # Remove HTML tags
-     text = re.sub(r'<[^>]+>', ' ', text)
-     # Remove special characters
-     text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
-     return text.strip()
 
  def scrape_url(url):
-     """Fetch text + image URLs from webpage."""
+     """Fetch and extract text from a webpage."""
      try:
-         headers = {
-             'User-Agent': random.choice(USER_AGENTS),
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.5',
-         }
-
-         res = requests.get(url, headers=headers, timeout=15)
+         res = requests.get(url, timeout=10)
          res.raise_for_status()
-
          soup = BeautifulSoup(res.text, 'html.parser')
-
-         # Remove unwanted elements
-         for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
-             element.decompose()
-
-         # Get text from main content areas
-         main_content = soup.find_all(['main', 'article', 'div'])
-         if main_content:
-             text = ' '.join([clean_text(elem.get_text()) for elem in main_content])
-         else:
-             text = clean_text(soup.get_text())
-
-         # Get image URLs (absolute)
-         images = []
-         for img in soup.find_all('img'):
-             src = img.get('src') or img.get('data-src')
-             if src:
-                 abs_url = urljoin(url, src)
-                 if abs_url.startswith('http'):
-                     images.append(abs_url)
-
-         return text, images[:5]  # Return max 5 images
-
+         return soup.get_text(separator='\n', strip=True)
      except Exception as e:
-         return f"[Error scraping {url}: {str(e)}]", []
+         return f"[Error scraping {url}: {e}]"