gaur3009 committed on
Commit
ee2e25a
Β·
verified Β·
1 Parent(s): 1ac9fd6

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +31 -7
scraper.py CHANGED
@@ -1,13 +1,37 @@
1
  import requests
2
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
3
 
4
  def scrape_url(url):
 
5
  try:
6
- res = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
7
- res.raise_for_status()
8
- soup = BeautifulSoup(res.text, 'html.parser')
9
- for tag in soup(['script','style','']): tag.decompose()
10
- text = soup.get_text(separator='\n', strip=True)
11
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  except Exception as e:
13
- return f"[Error scraping {url}: {e}]"
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ import re
4
+
5
# Pre-compiled pattern matching anything shaped like an HTML/XML tag.
TAG_CLEANER = re.compile(r"<[^>]+>")


def clean_text(text):
    """Strip tag-like markup from *text* and collapse whitespace.

    All runs of whitespace (spaces, tabs, newlines) become a single
    space, and leading/trailing whitespace is removed.
    """
    without_tags = TAG_CLEANER.sub('', text)
    return re.sub(r'\s+', ' ', without_tags).strip()
13
 
14
def scrape_url(url):
    """Fetch *url* and return its main textual content, capped at 5000 chars.

    Extraction strategy, in order:
      1. First matching semantic container (article/main/common content divs).
      2. Fallback: concatenation of all <p> paragraph texts.

    On any failure (network error, bad HTTP status, parse error) a
    human-readable error string is returned instead of raising.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=8, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # get_text() would otherwise include embedded JS/CSS source text,
        # so drop non-content tags up front (the pre-rewrite version did
        # this too; it was lost in the refactor).
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()

        # Try semantic tags first; cap length consistently with the
        # paragraph fallback below.
        for selector in ['article', 'main', '.article-body', '.post-content']:
            if element := soup.select_one(selector):
                return clean_text(element.get_text())[:5000]

        # Fallback to paragraph aggregation.
        paragraphs = soup.find_all('p')
        content = " ".join(p.get_text().strip() for p in paragraphs)
        return clean_text(content)[:5000]

    except Exception:
        # Deliberate broad catch: scraping is best-effort and callers
        # always expect a string back.
        return f"⚠️ Error: Could not retrieve content from {url}"