Spaces:

PyQuarX
/

scrape-with-ai

Running

PyQuarX committed on Apr 13

Commit

c7ebd2b

verified ·

1 Parent(s): d692aee

Update scraper.py (#1)

- Update scraper.py (23d758b91524129d2af7141e05e382d942adae47)

Files changed (1) hide show

scraper.py CHANGED Viewed

@@ -9,9 +9,15 @@ def scrape_website(website):
     print("Launching chrome browser...")
     chrome_driver_path = "/usr/bin/chromedriver"
     options = webdriver.ChromeOptions()
-    driver = webdriver.Chrome(service=Service(chrome_driver_path, options=options))
     try:
         driver.get(website)
@@ -19,11 +25,12 @@ def scrape_website(website):
         html = driver.page_source
         return html
     finally:
         driver.quit()
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content,"html.parser")
     body_content = soup.body

     print("Launching chrome browser...")
     chrome_driver_path = "/usr/bin/chromedriver"
+    chrome_binary_path = "/usr/bin/chromium-browser"
     options = webdriver.ChromeOptions()
+    options.binary_location = chrome_binary_path
+    options.add_argument("--headless")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")
+    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
     try:
         driver.get(website)
         html = driver.page_source
         return html
     finally:
         driver.quit()
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content,"html.parser")
     body_content = soup.body