Spaces:

PyQuarX
/

scrape-with-ai

Running

PyQuarX committed on Apr 15

Commit

646a14d

verified ·

1 Parent(s): 578c42d

Update scraper.py

Files changed (1) hide show

scraper.py CHANGED Viewed

@@ -3,16 +3,15 @@ from selenium.webdriver.chrome.service import Service
 from bs4 import BeautifulSoup
 import os
 def scrape_website(website):
     print("Launching chrome browser...")
-    chrome_driver_path = "../chromedriver"
-    chrome_binary_path = "../chromium-browser"
-    if not os.path.exists(chrome_driver_path):
-        raise FileNotFoundError(f"Chromedriver not found at {chrome_driver_path}")
-    if not os.path.exists(chrome_binary_path):
-        raise FileNotFoundError(f"Chromium not found at {chrome_binary_path}")
     options = webdriver.ChromeOptions()
     options.binary_location = chrome_binary_path
@@ -25,12 +24,12 @@ def scrape_website(website):
     try:
         driver.get(website)
-        print("Page Loaded...")
         html = driver.page_source
         return html
     finally:
         driver.quit()
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
     body_content = soup.body

 from bs4 import BeautifulSoup
 import os
 def scrape_website(website):
     print("Launching chrome browser...")
+    chrome_driver_path = which("chromedriver")
+    chrome_binary_path = which("chromium-browser") or which("chromium")
+    if not chrome_driver_path or not chrome_binary_path:
+        raise EnvironmentError("chromedriver or chromium-browser not found in PATH")
     options = webdriver.ChromeOptions()
     options.binary_location = chrome_binary_path
     try:
         driver.get(website)
         html = driver.page_source
         return html
     finally:
         driver.quit()
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
     body_content = soup.body