gaur3009 committed on
Commit
a60b17f
Β·
verified Β·
1 Parent(s): 3be11bc

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +49 -10
scraper.py CHANGED
@@ -1,24 +1,63 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def scrape_url(url):
6
  """Fetch text + image URLs from webpage."""
7
  try:
8
- res = requests.get(url, timeout=10)
 
 
 
 
 
 
9
  res.raise_for_status()
 
10
  soup = BeautifulSoup(res.text, 'html.parser')
11
 
12
- # get text
13
- text = soup.get_text(separator='\n', strip=True)
14
-
15
- # get image URLs (absolute)
 
 
 
 
 
 
 
 
16
  images = []
17
  for img in soup.find_all('img'):
18
- src = img.get('src')
19
  if src:
20
- images.append(urljoin(url, src))
21
-
22
- return text, images
 
 
 
23
  except Exception as e:
24
- return f"[Error scraping {url}: {e}]", []
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
4
+ import re
5
+ import random
6
+
7
+ USER_AGENTS = [
8
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
9
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
10
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
11
+ ]
12
+
13
+ def clean_text(text):
14
+ """Clean and normalize text content"""
15
+ # Remove excessive whitespace
16
+ text = re.sub(r'\s+', ' ', text)
17
+ # Remove JavaScript and CSS
18
+ text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL)
19
+ text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL)
20
+ # Remove HTML tags
21
+ text = re.sub(r'<[^>]+>', ' ', text)
22
+ # Remove special characters
23
+ text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
24
+ return text.strip()
25
 
26
  def scrape_url(url):
27
  """Fetch text + image URLs from webpage."""
28
  try:
29
+ headers = {
30
+ 'User-Agent': random.choice(USER_AGENTS),
31
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
32
+ 'Accept-Language': 'en-US,en;q=0.5',
33
+ }
34
+
35
+ res = requests.get(url, headers=headers, timeout=15)
36
  res.raise_for_status()
37
+
38
  soup = BeautifulSoup(res.text, 'html.parser')
39
 
40
+ # Remove unwanted elements
41
+ for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
42
+ element.decompose()
43
+
44
+ # Get text from main content areas
45
+ main_content = soup.find_all(['main', 'article', 'div'])
46
+ if main_content:
47
+ text = ' '.join([clean_text(elem.get_text()) for elem in main_content])
48
+ else:
49
+ text = clean_text(soup.get_text())
50
+
51
+ # Get image URLs (absolute)
52
  images = []
53
  for img in soup.find_all('img'):
54
+ src = img.get('src') or img.get('data-src')
55
  if src:
56
+ abs_url = urljoin(url, src)
57
+ if abs_url.startswith('http'):
58
+ images.append(abs_url)
59
+
60
+ return text, images[:5] # Return max 5 images
61
+
62
  except Exception as e:
63
+ return f"[Error scraping {url}: {str(e)}]", []