gaur3009 committed on
Commit
a60b17f
Β·
verified Β·
1 Parent(s): 3be11bc

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +49 -10
scraper.py CHANGED
@@ -1,24 +1,63 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def scrape_url(url):
6
  """Fetch text + image URLs from webpage."""
7
  try:
8
- res = requests.get(url, timeout=10)
 
 
 
 
 
 
9
  res.raise_for_status()
 
10
  soup = BeautifulSoup(res.text, 'html.parser')
11
 
12
- # get text
13
- text = soup.get_text(separator='\n', strip=True)
14
-
15
- # get image URLs (absolute)
 
 
 
 
 
 
 
 
16
  images = []
17
  for img in soup.find_all('img'):
18
- src = img.get('src')
19
  if src:
20
- images.append(urljoin(url, src))
21
-
22
- return text, images
 
 
 
23
  except Exception as e:
24
- return f"[Error scraping {url}: {e}]", []
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
4
+ import re
5
+ import random
6
+
7
+ USER_AGENTS = [
8
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
9
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
10
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
11
+ ]
12
+
13
+ def clean_text(text):
14
+ """Clean and normalize text content"""
15
+ # Remove excessive whitespace
16
+ text = re.sub(r'\s+', ' ', text)
17
+ # Remove JavaScript and CSS
18
+ text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL)
19
+ text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL)
20
+ # Remove HTML tags
21
+ text = re.sub(r'<[^>]+>', ' ', text)
22
+ # Remove special characters
23
+ text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
24
+ return text.strip()
25
 
26
  def scrape_url(url):
27
  """Fetch text + image URLs from webpage."""
28
  try:
29
+ headers = {
30
+ 'User-Agent': random.choice(USER_AGENTS),
31
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
32
+ 'Accept-Language': 'en-US,en;q=0.5',
33
+ }
34
+
35
+ res = requests.get(url, headers=headers, timeout=15)
36
  res.raise_for_status()
37
+
38
  soup = BeautifulSoup(res.text, 'html.parser')
39
 
40
+ # Remove unwanted elements
41
+ for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
42
+ element.decompose()
43
+
44
+ # Get text from main content areas
45
+ main_content = soup.find_all(['main', 'article', 'div'])
46
+ if main_content:
47
+ text = ' '.join([clean_text(elem.get_text()) for elem in main_content])
48
+ else:
49
+ text = clean_text(soup.get_text())
50
+
51
+ # Get image URLs (absolute)
52
  images = []
53
  for img in soup.find_all('img'):
54
+ src = img.get('src') or img.get('data-src')
55
  if src:
56
+ abs_url = urljoin(url, src)
57
+ if abs_url.startswith('http'):
58
+ images.append(abs_url)
59
+
60
+ return text, images[:5] # Return max 5 images
61
+
62
  except Exception as e:
63
+ return f"[Error scraping {url}: {str(e)}]", []