gaur3009 committed
Commit 1b58f37 · verified · 1 Parent(s): ace7592

Update scraper.py

Files changed (1)
  1. scraper.py +28 -13
scraper.py CHANGED
@@ -1,25 +1,40 @@
-# scraper.py
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+from readability import Document
+import re
+
+def clean_text(text):
+    """Clean and normalize text content"""
+    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
+    text = re.sub(r'\[[^\]]*\]', '', text)  # Remove footnotes
+    return text.strip()
 
 def scrape_url(url):
-    """Fetch text + image URLs from webpage."""
     try:
-        res = requests.get(url, timeout=10)
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        res = requests.get(url, headers=headers, timeout=15)
         res.raise_for_status()
-        soup = BeautifulSoup(res.text, 'html.parser')
 
-        # get text
-        text = soup.get_text(separator='\n', strip=True)
-
-        # get image URLs (absolute)
+        # Extract main content
+        doc = Document(res.content)
+        soup = BeautifulSoup(doc.summary(), 'html.parser')
+
+        # Clean text
+        text = clean_text(soup.get_text(separator='\n', strip=True))
+
+        # Get image URLs
         images = []
         for img in soup.find_all('img'):
-            src = img.get('src')
+            src = img.get('src') or img.get('data-src')
             if src:
-                images.append(urljoin(url, src))
-
-        return text, images
+                abs_url = urljoin(url, src)
+                if abs_url.startswith('http') and any(abs_url.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
+                    images.append(abs_url)
+
+        return text, images[:10]  # Return max 10 images
+
     except Exception as e:
-        return f"[Error scraping {url}: {e}]", []
+        return f"[Error scraping {url}: {str(e)}]", []
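
For context, a minimal usage sketch of the function this commit updates. The target URL is a placeholder, and it assumes the module is importable as scraper and that requests, beautifulsoup4, and readability-lxml (which provides the Document import) are installed:

    # Hypothetical usage example, not part of the commit.
    # Assumes: pip install requests beautifulsoup4 readability-lxml
    from scraper import scrape_url

    text, images = scrape_url("https://example.com/article")  # placeholder URL
    if text.startswith("[Error"):
        print(text)               # scrape_url reports failures as a string
    else:
        print(text[:200])         # first 200 characters of the cleaned main content
        for img_url in images:    # at most 10 filtered, absolute image URLs
            print(img_url)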