PyQuarX committed on
Commit
c7ebd2b
·
verified ·
1 Parent(s): d692aee

Update scraper.py (#1)

Browse files

- Update scraper.py (23d758b91524129d2af7141e05e382d942adae47)

Files changed (1) hide show
  1. scraper.py +10 -3
scraper.py CHANGED
@@ -9,9 +9,15 @@ def scrape_website(website):
9
  print("Launching chrome browser...")
10
 
11
  chrome_driver_path = "/usr/bin/chromedriver"
 
 
12
  options = webdriver.ChromeOptions()
13
- driver = webdriver.Chrome(service=Service(chrome_driver_path, options=options))
14
-
 
 
 
 
15
 
16
  try:
17
  driver.get(website)
@@ -19,11 +25,12 @@ def scrape_website(website):
19
  html = driver.page_source
20
 
21
  return html
22
-
23
  finally:
24
  driver.quit()
25
 
26
 
 
27
  def extract_body_content(html_content):
28
  soup = BeautifulSoup(html_content,"html.parser")
29
  body_content = soup.body
 
9
  print("Launching chrome browser...")
10
 
11
  chrome_driver_path = "/usr/bin/chromedriver"
12
+ chrome_binary_path = "/usr/bin/chromium-browser"
13
+
14
  options = webdriver.ChromeOptions()
15
+ options.binary_location = chrome_binary_path
16
+ options.add_argument("--headless")
17
+ options.add_argument("--no-sandbox")
18
+ options.add_argument("--disable-dev-shm-usage")
19
+
20
+ driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
21
 
22
  try:
23
  driver.get(website)
 
25
  html = driver.page_source
26
 
27
  return html
28
+
29
  finally:
30
  driver.quit()
31
 
32
 
33
+
34
  def extract_body_content(html_content):
35
  soup = BeautifulSoup(html_content,"html.parser")
36
  body_content = soup.body