Spaces:

PyQuarX
/

scrape-with-ai

Paused

PyQuarX committed on Apr 15

Commit

1ab0ddc

verified ·

1 Parent(s): 636189b

Update scraper.py

Files changed (1) hide show

scraper.py CHANGED Viewed

@@ -1,25 +1,25 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from bs4 import BeautifulSoup
-import os
-from shutil import which
 def scrape_website(website):
-    print("Launching chrome browser...")
     chrome_driver_path = "/usr/lib/chromium/chromedriver"
-    options = webdriver.ChromeOptions()
     options.binary_location = "/usr/bin/chromium"
-    options.add_argument('--no-sandbox')
-    options.add_argument('--headless')
-    options.add_argument('--disable-dev-shm-usage')
     driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
     try:
         driver.get(website)
         html = driver.page_source
         return html
     finally:
@@ -35,8 +35,9 @@ def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
     return "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
 def split_dom_content(dom_content, max_length=60000):
-    return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]

 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+import time
 from bs4 import BeautifulSoup
 def scrape_website(website):
+    print("Launching chromium browser...")
     chrome_driver_path = "/usr/lib/chromium/chromedriver"
+    options = Options()
     options.binary_location = "/usr/bin/chromium"
+    options.add_argument("--headless")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")
     driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
     try:
         driver.get(website)
+        print("Page Loaded...")
         html = driver.page_source
         return html
     finally:
     soup = BeautifulSoup(body_content, "html.parser")
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
     return "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
 def split_dom_content(dom_content, max_length=60000):
+    return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]