gaur3009 committed on
Commit
ee2e25a
Β·
verified Β·
1 Parent(s): 1ac9fd6

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +31 -7
scraper.py CHANGED
@@ -1,13 +1,37 @@
1
  import requests
2
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
3
 
4
  def scrape_url(url):
 
5
  try:
6
- res = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
7
- res.raise_for_status()
8
- soup = BeautifulSoup(res.text, 'html.parser')
9
- for tag in soup(['script','style','']): tag.decompose()
10
- text = soup.get_text(separator='\n', strip=True)
11
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  except Exception as e:
13
- return f"[Error scraping {url}: {e}]"
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ import re
4
+
5
# Pre-compiled pattern matching anything shaped like an HTML/XML tag.
TAG_CLEANER = re.compile(r"<[^>]+>")


def clean_text(text):
    """Strip tag-like markup from *text* and collapse whitespace.

    All runs of whitespace (spaces, tabs, newlines) become a single
    space, and leading/trailing whitespace is removed.
    """
    without_tags = TAG_CLEANER.sub('', text)
    return re.sub(r'\s+', ' ', without_tags).strip()
13
 
14
def scrape_url(url):
    """Fetch *url* and return its main textual content, capped at 5000 chars.

    Extraction strategy, in order:
      1. First matching semantic container (article/main/common content divs).
      2. Fallback: concatenation of all <p> paragraph texts.

    On any failure (network error, bad HTTP status, parse error) a
    human-readable error string is returned instead of raising.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=8, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # get_text() would otherwise include embedded JS/CSS source text,
        # so drop non-content tags up front (the pre-rewrite version did
        # this too; it was lost in the refactor).
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()

        # Try semantic tags first; cap length consistently with the
        # paragraph fallback below.
        for selector in ['article', 'main', '.article-body', '.post-content']:
            if element := soup.select_one(selector):
                return clean_text(element.get_text())[:5000]

        # Fallback to paragraph aggregation.
        paragraphs = soup.find_all('p')
        content = " ".join(p.get_text().strip() for p in paragraphs)
        return clean_text(content)[:5000]

    except Exception:
        # Deliberate broad catch: scraping is best-effort and callers
        # always expect a string back.
        return f"⚠️ Error: Could not retrieve content from {url}"