gaur3009 committed
Commit 1b58f37 · verified · 1 Parent(s): ace7592

Update scraper.py

Files changed (1)
  1. scraper.py +28 -13
scraper.py CHANGED
@@ -1,25 +1,40 @@
-# scraper.py
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+from readability import Document
+import re
+
+def clean_text(text):
+    """Clean and normalize text content"""
+    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
+    text = re.sub(r'\[[^\]]*\]', '', text)  # Remove footnotes
+    return text.strip()
 
 def scrape_url(url):
-    """Fetch text + image URLs from webpage."""
     try:
-        res = requests.get(url, timeout=10)
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        res = requests.get(url, headers=headers, timeout=15)
         res.raise_for_status()
-        soup = BeautifulSoup(res.text, 'html.parser')
 
-        # get text
-        text = soup.get_text(separator='\n', strip=True)
-
-        # get image URLs (absolute)
+        # Extract main content
+        doc = Document(res.content)
+        soup = BeautifulSoup(doc.summary(), 'html.parser')
+
+        # Clean text
+        text = clean_text(soup.get_text(separator='\n', strip=True))
+
+        # Get image URLs
         images = []
         for img in soup.find_all('img'):
-            src = img.get('src')
+            src = img.get('src') or img.get('data-src')
             if src:
-                images.append(urljoin(url, src))
-
-        return text, images
+                abs_url = urljoin(url, src)
+                if abs_url.startswith('http') and any(abs_url.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
+                    images.append(abs_url)
+
+        return text, images[:10]  # Return max 10 images
+
     except Exception as e:
-        return f"[Error scraping {url}: {e}]", []
+        return f"[Error scraping {url}: {str(e)}]", []
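
For context, a minimal usage sketch of the function this commit updates. The target URL is a placeholder, and it assumes the module is importable as scraper and that requests, beautifulsoup4, and readability-lxml (which provides the Document import) are installed:

    # Hypothetical usage example, not part of the commit.
    # Assumes: pip install requests beautifulsoup4 readability-lxml
    from scraper import scrape_url

    text, images = scrape_url("https://example.com/article")  # placeholder URL
    if text.startswith("[Error"):
        print(text)               # scrape_url reports failures as a string
    else:
        print(text[:200])         # first 200 characters of the cleaned main content
        for img_url in images:    # at most 10 filtered, absolute image URLs
            print(img_url)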