gaur3009 commited on
Commit
863f6b8
·
verified ·
1 Parent(s): 01f6a5f

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +12 -28
scraper.py CHANGED
@@ -1,40 +1,24 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
4
- from readability import Document
5
- import re
6
-
7
- def clean_text(text):
8
- """Clean and normalize text content"""
9
- text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
10
- text = re.sub(r'\[[^\]]*\]', '', text) # Remove footnotes
11
- return text.strip()
12
 
13
  def scrape_url(url):
 
14
  try:
15
- headers = {
16
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
17
- }
18
- res = requests.get(url, headers=headers, timeout=15)
19
  res.raise_for_status()
 
20
 
21
- # Extract main content
22
- doc = Document(res.content)
23
- soup = BeautifulSoup(doc.summary(), 'html.parser')
24
-
25
- # Clean text
26
- text = clean_text(soup.get_text(separator='\n', strip=True))
27
-
28
- # Get image URLs
29
  images = []
30
  for img in soup.find_all('img'):
31
- src = img.get('src') or img.get('data-src')
32
  if src:
33
- abs_url = urljoin(url, src)
34
- if abs_url.startswith('http') and any(abs_url.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
35
- images.append(abs_url)
36
-
37
- return text, images[:10] # Return max 10 images
38
-
39
  except Exception as e:
40
- return f"[Error scraping {url}: {str(e)}]", []
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
 
 
 
 
 
 
 
 
4
 
5
def scrape_url(url):
    """Fetch visible text and absolute image URLs from a webpage.

    Args:
        url: The page URL to fetch.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the page's visible text with
        newline separators; ``images`` is a list of absolute image URLs.
        On any failure an error-message string and an empty list are
        returned instead — this function never raises, so callers can
        process a batch of URLs without wrapping each call.
    """
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Remove script/style/noscript subtrees first: get_text() on the
        # raw document would otherwise include JavaScript and CSS source
        # in the extracted "visible" text.
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()

        # get text
        text = soup.get_text(separator='\n', strip=True)

        # get image URLs (absolute); fall back to data-src so
        # lazy-loaded images are not silently missed.
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                images.append(urljoin(url, src))

        return text, images
    except Exception as e:
        # Intentionally broad: any network/parse failure is reported
        # in-band as the text result rather than propagated.
        return f"[Error scraping {url}: {e}]", []