Spaces:

PyQuarX
/

scrape-with-ai

Running

App Files Community

PyQuarX committed on Apr 13

Commit

fed0ea1

verified ·

1 Parent(s): 64528a3

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +15 -25

scraper.py CHANGED Viewed

@@ -1,9 +1,7 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
-import time
 from bs4 import BeautifulSoup
 def scrape_website(website):
     print("Launching chrome browser...")
@@ -11,47 +9,39 @@ def scrape_website(website):
     chrome_driver_path = "/usr/bin/chromedriver"
     chrome_binary_path = "/usr/bin/chromium-browser"
     options = webdriver.ChromeOptions()
     options.binary_location = chrome_binary_path
     options.add_argument("--headless")
     options.add_argument("--no-sandbox")
     options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
     try:
         driver.get(website)
         print("Page Loaded...")
         html = driver.page_source
         return html
     finally:
         driver.quit()
 def extract_body_content(html_content):
-    soup = BeautifulSoup(html_content,"html.parser")
     body_content = soup.body
-    if body_content:
-        return str(body_content)
-    return ""
 def clean_body_content(body_content):
-    soup = BeautifulSoup(body_content,"html.parser")
-    for script_or_style in soup(["script","style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
-    cleaned_content = "\n".join(
-        line.strip() for line in cleaned_content.splitlines() if line.strip()
-    )
-    return cleaned_content
-def split_dom_content(dom_content,max_length=60000):
-    return [
-        dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
-    ]

 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from bs4 import BeautifulSoup
+import os
 def scrape_website(website):
     print("Launching chrome browser...")
     chrome_driver_path = "/usr/bin/chromedriver"
     chrome_binary_path = "/usr/bin/chromium-browser"
+    if not os.path.exists(chrome_driver_path):
+        raise FileNotFoundError(f"Chromedriver not found at {chrome_driver_path}")
+    if not os.path.exists(chrome_binary_path):
+        raise FileNotFoundError(f"Chromium not found at {chrome_binary_path}")
     options = webdriver.ChromeOptions()
     options.binary_location = chrome_binary_path
     options.add_argument("--headless")
     options.add_argument("--no-sandbox")
     options.add_argument("--disable-dev-shm-usage")
+    service = Service(executable_path=chrome_driver_path)
+    driver = webdriver.Chrome(service=service, options=options)
     try:
         driver.get(website)
         print("Page Loaded...")
         html = driver.page_source
         return html
     finally:
         driver.quit()
 def extract_body_content(html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
     body_content = soup.body
+    return str(body_content) if body_content else ""
 def clean_body_content(body_content):
+    soup = BeautifulSoup(body_content, "html.parser")
+    for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
+    return "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
+def split_dom_content(dom_content, max_length=60000):
+    return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]