PyQuarX committed on
Commit
f64f0d6
·
verified ·
1 Parent(s): de8b38a

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +13 -3
scraper.py CHANGED
@@ -2,6 +2,7 @@ from selenium import webdriver
2
  from selenium.webdriver.chrome.service import Service
3
  import time
4
  from bs4 import BeautifulSoup
 
5
 
6
 
7
 
@@ -10,8 +11,17 @@ def scrape_website(website):
10
 
11
  chrome_driver_path = "/usr/bin/chromedriver"
12
  options = webdriver.ChromeOptions()
13
- driver = webdriver.Chrome(service=Service(chrome_driver_path, options=options))
14
-
 
 
 
 
 
 
 
 
 
15
 
16
  try:
17
  driver.get(website)
@@ -19,7 +29,7 @@ def scrape_website(website):
19
  html = driver.page_source
20
 
21
  return html
22
-
23
  finally:
24
  driver.quit()
25
 
 
2
  from selenium.webdriver.chrome.service import Service
3
  import time
4
  from bs4 import BeautifulSoup
5
+ import tempfile
6
 
7
 
8
 
 
11
 
12
  chrome_driver_path = "/usr/bin/chromedriver"
13
  options = webdriver.ChromeOptions()
14
+
15
+ # Essential arguments for headless environments like Hugging Face
16
+ options.add_argument("--headless")
17
+ options.add_argument("--no-sandbox")
18
+ options.add_argument("--disable-dev-shm-usage")
19
+
20
+ # Use a unique user data directory to prevent session conflicts
21
+ unique_user_data_dir = tempfile.mkdtemp()
22
+ options.add_argument(f"--user-data-dir={unique_user_data_dir}")
23
+
24
+ driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
25
 
26
  try:
27
  driver.get(website)
 
29
  html = driver.page_source
30
 
31
  return html
32
+
33
  finally:
34
  driver.quit()
35