gaur3009 commited on
Commit
863f6b8
·
verified ·
1 Parent(s): 01f6a5f

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +12 -28
scraper.py CHANGED
@@ -1,40 +1,24 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
4
- from readability import Document
5
- import re
6
-
7
- def clean_text(text):
8
- """Clean and normalize text content"""
9
- text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
10
- text = re.sub(r'\[[^\]]*\]', '', text) # Remove footnotes
11
- return text.strip()
12
 
13
  def scrape_url(url):
 
14
  try:
15
- headers = {
16
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
17
- }
18
- res = requests.get(url, headers=headers, timeout=15)
19
  res.raise_for_status()
 
20
 
21
- # Extract main content
22
- doc = Document(res.content)
23
- soup = BeautifulSoup(doc.summary(), 'html.parser')
24
-
25
- # Clean text
26
- text = clean_text(soup.get_text(separator='\n', strip=True))
27
-
28
- # Get image URLs
29
  images = []
30
  for img in soup.find_all('img'):
31
- src = img.get('src') or img.get('data-src')
32
  if src:
33
- abs_url = urljoin(url, src)
34
- if abs_url.startswith('http') and any(abs_url.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
35
- images.append(abs_url)
36
-
37
- return text, images[:10] # Return max 10 images
38
-
39
  except Exception as e:
40
- return f"[Error scraping {url}: {str(e)}]", []
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
 
 
 
 
 
 
 
 
4
 
5
def scrape_url(url):
    """Fetch visible text and absolute image URLs from a webpage.

    Args:
        url: The page URL to fetch.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the page's visible text with
        newline separators; ``images`` is a list of absolute image URLs.
        On any failure an error-message string and an empty list are
        returned instead — this function never raises, so callers can
        process a batch of URLs without wrapping each call.
    """
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Remove script/style/noscript subtrees first: get_text() on the
        # raw document would otherwise include JavaScript and CSS source
        # in the extracted "visible" text.
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()

        # get text
        text = soup.get_text(separator='\n', strip=True)

        # get image URLs (absolute); fall back to data-src so
        # lazy-loaded images are not silently missed.
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                images.append(urljoin(url, src))

        return text, images
    except Exception as e:
        # Intentionally broad: any network/parse failure is reported
        # in-band as the text result rather than propagated.
        return f"[Error scraping {url}: {e}]", []