Spaces:

michelerussoAA
/

thingiverse_scraper

Runtime error

App Files Community

michelerussoAA committed on Apr 17

Commit

b5b43c5

verified ·

1 Parent(s): befcec8

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +17 -28

scraper.py CHANGED Viewed

@@ -7,7 +7,7 @@ from datetime import datetime
 import pandas as pd
 from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
@@ -50,27 +50,16 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
 def initialize_driver():
-    options = Options()
-    # use new headless mode
-    options.add_argument("--headless=new")
-    # container flags
-    options.add_argument("--disable-dev-shm-usage")
-    options.add_argument("--no-sandbox")
-    options.add_argument("--disable-gpu")
-    options.add_argument("--disable-software-rasterizer")
-    options.add_argument("--disable-setuid-sandbox")
-    # profile & cache in /tmp
-    options.add_argument("--remote-debugging-port=9222")
-    options.add_argument("--user-data-dir=/tmp/chrome-user-data")
-    options.add_argument("--window-size=1920,1080")
-    options.add_argument(
-        "user-agent=Mozilla/5.0 (X11; Linux x86_64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/135.0.0.0 Safari/537.36"
-    )
-    # Selenium 4 Manager will auto‑download matching driver into SE_CACHE_PATH
-    # (ensure you set ENV SE_CACHE_PATH=/tmp/.cache/selenium in your Dockerfile)
-    return webdriver.Chrome(options=options)
 def process_batch(start_id, end_id, worker_id):
@@ -101,9 +90,9 @@ def process_batch(start_id, end_id, worker_id):
             # wait for title link
             try:
-                wait.until(EC.presence_of_element_located(
-                    (By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
-                ))
             except TimeoutException:
                 continue
@@ -120,9 +109,9 @@ def process_batch(start_id, end_id, worker_id):
                 except NoSuchElementException:
                     return ""
-            remixes = extract_aria("Remixes")
-            files   = extract_aria("Files")
-            makes   = extract_aria("Makes")
             comments = extract_aria("Comments")
             tags = []

 import pandas as pd
 from selenium import webdriver
+from selenium.webdriver.firefox.options import Options as FFOptions
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 def initialize_driver():
+    options = FFOptions()
+    options.headless = True
+    # point Firefox cache & profile into /tmp
+    options.set_preference("browser.cache.disk.parent_directory", "/tmp")
+    options.set_preference("browser.cache.memory.enable", False)
+    options.set_preference("browser.download.dir", "/tmp")
+    # launch
+    driver = webdriver.Firefox(options=options)
+    driver.set_window_size(1920, 1080)
+    return driver
 def process_batch(start_id, end_id, worker_id):
             # wait for title link
             try:
+                wait.until(EC.presence_of_element_located((
+                    By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
+                )))
             except TimeoutException:
                 continue
                 except NoSuchElementException:
                     return ""
+            remixes  = extract_aria("Remixes")
+            files    = extract_aria("Files")
+            makes    = extract_aria("Makes")
             comments = extract_aria("Comments")
             tags = []