Spaces:

michelerussoAA
/

thingiverse_scraper

Runtime error

App Files Files Community

michelerussoAA committed on Apr 17

Commit

e7192f8

verified ·

1 Parent(s): f9dd994

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +21 -9

scraper.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import io
 import time
@@ -10,18 +11,26 @@ from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import (
-    NoSuchElementException, TimeoutException, StaleElementReferenceException
 )
-from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.service import Service
 from huggingface_hub import HfApi, HfFolder
 # Configuration: set via Space secrets
 HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
-HF_TOKEN    = HfFolder.get_token()
 def upload_df_to_hf(df: pd.DataFrame, filename: str):
     buffer = io.StringIO()
     df.to_csv(buffer, index=False)
     buffer.seek(0)
@@ -32,10 +41,11 @@ def upload_df_to_hf(df: pd.DataFrame, filename: str):
         repo_id=HF_REPO_ID,
         repo_type="dataset",
         token=HF_TOKEN,
-        create_pr=False
     )
     print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
 def safe_get_text(driver, xpath, retries=1, delay=0.5):
     for _ in range(retries):
         try:
@@ -44,23 +54,25 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
             time.sleep(delay)
     return ""
 def initialize_driver():
-    # point at the geckodriver we baked into the image
     service = Service("/usr/local/bin/geckodriver")
     opts = Options()
-    opts.add_argument("--headless")              # Firefox headless
     opts.add_argument("--no-sandbox")
     opts.add_argument("--disable-gpu")
     opts.add_argument("--window-size=1920,1080")
-    # use a tmp profile directory
-    opts.add_argument(f"--profile=/tmp/firefox-profile-{os.getpid()}")
     driver = webdriver.Firefox(service=service, options=opts)
     return driver
 def process_batch(start_id, end_id, worker_id):
     print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
     try:

 import os
 import io
 import time
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.expected_conditions import staleness_of
 from selenium.common.exceptions import (
+    NoSuchElementException,
+    TimeoutException,
+    StaleElementReferenceException,
 )
 from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.firefox.options import Options
 from huggingface_hub import HfApi, HfFolder
 # Configuration: set via Space secrets
 HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
+HF_TOKEN   = HfFolder.get_token()
 def upload_df_to_hf(df: pd.DataFrame, filename: str):
+    """
+    Upload a pandas DataFrame directly to HF dataset without writing to disk.
+    """
     buffer = io.StringIO()
     df.to_csv(buffer, index=False)
     buffer.seek(0)
         repo_id=HF_REPO_ID,
         repo_type="dataset",
         token=HF_TOKEN,
+        create_pr=False,
     )
     print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
 def safe_get_text(driver, xpath, retries=1, delay=0.5):
     for _ in range(retries):
         try:
             time.sleep(delay)
     return ""
 def initialize_driver():
     """
     Build and return a headless Firefox WebDriver driven by the geckodriver
     binary at /usr/local/bin/geckodriver.

     The caller owns the returned driver and is responsible for calling
     driver.quit() when finished with it.
     """
     # point to the geckodriver binary you installed
     service = Service("/usr/local/bin/geckodriver")
     opts = Options()
     # Request headless mode through a browser argument. The Options.headless
     # setter was deprecated in Selenium 4.8 and removed in later releases,
     # where assigning to it silently does nothing and Firefox then fails to
     # start on a machine without a display.
     opts.add_argument("-headless")
     # Chromium-style flags carried over unchanged from the previous version.
     # NOTE(review): Firefox is not documented to act on these - confirm
     # whether they can be dropped.
     opts.add_argument("--no-sandbox")
     opts.add_argument("--disable-gpu")
     # ensure a full window so responsive pages load properly; Firefox takes
     # its initial window size from --width/--height rather than the
     # Chromium-only --window-size flag
     opts.add_argument("--width=1920")
     opts.add_argument("--height=1080")
     driver = webdriver.Firefox(service=service, options=opts)
     return driver
 def process_batch(start_id, end_id, worker_id):
     print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
     try: