import os
import io
import time
import math
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
)
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from huggingface_hub import HfApi, HfFolder

# Configuration: set via Space secrets
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
HF_TOKEN = HfFolder.get_token()


def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """
    Upload a pandas DataFrame directly to the HF dataset repo without
    writing to disk.

    Args:
        df: DataFrame to serialize as CSV.
        filename: Destination path inside the dataset repo.
    """
    # HfApi.upload_file expects bytes or a binary file object; the original
    # passed a text-mode StringIO, which the hub client rejects. Encode the
    # CSV once and wrap it in BytesIO instead.
    buffer = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
    api = HfApi()
    api.upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    # BUG FIX: the original printed the literal "(unknown)" instead of the
    # uploaded filename.
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")


def safe_get_text(driver, xpath: str, retries: int = 1, delay: float = 0.5) -> str:
    """
    Return the .text of the element at `xpath`, retrying on stale/missing
    elements. Returns "" if every attempt fails.
    """
    for attempt in range(retries):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            # Don't burn `delay` after the final attempt — nothing follows it.
            if attempt < retries - 1:
                time.sleep(delay)
    return ""


def initialize_driver():
    """Create a headless Firefox WebDriver using the local geckodriver binary."""
    # point to the geckodriver binary you installed
    service = Service("/usr/local/bin/geckodriver")
    opts = Options()
    # Options.headless is deprecated (and removed in recent Selenium 4.x);
    # pass the Firefox flag directly instead.
    opts.add_argument("-headless")
    # disable sandbox (needed in many container environments)
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    # ensure a full window so responsive pages load properly
    opts.add_argument("--window-size=1920,1080")
    return webdriver.Firefox(service=service, options=opts)


def process_batch(start_id, end_id, worker_id):
    """
    Scrape Thingiverse "thing" pages with IDs in [start_id, end_id]
    (iterated descending), uploading checkpoint CSVs every 10 rows and a
    final CSV at the end.

    Returns:
        List of per-thing row dicts, or [] if the worker hit a fatal error.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []
        total = end_id - start_id + 1
        count = 0
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                # navigation failure (timeout, crash) — skip this ID
                continue
            time.sleep(1)

            # skip error pages (deleted / nonexistent things)
            try:
                driver.find_element(
                    By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]"
                )
                continue
            except NoSuchElementException:
                pass

            # wait for the title link; bail on this ID if it never appears
            try:
                wait.until(EC.presence_of_element_located((
                    By.XPATH,
                    "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                )))
            except TimeoutException:
                continue

            title = safe_get_text(
                driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]"
            )
            author = safe_get_text(
                driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
            )
            date_posted = safe_get_text(
                driver,
                "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                "/following-sibling::div"
            )

            def extract_aria(label):
                # counters (remixes/files/makes/comments) are exposed via
                # aria-label on the detail page
                try:
                    return driver.find_element(
                        By.XPATH, f"//*[@aria-label='{label}']"
                    ).text
                except NoSuchElementException:
                    return ""

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")

            tags = []
            try:
                tags_el = driver.find_element(
                    By.XPATH, "//*[contains(@class,'TagList__tagList')]"
                )
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass

            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags,
            })

            # checkpoint every 10 items so progress survives crashes
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = (
                    f"worker_{worker_id}_{start_id}_{end_id}"
                    f"_chk_{len(results)}.csv"
                )
                upload_df_to_hf(df_chk, chk_name)

        # final batch upload
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # BUG FIX: the original only called driver.quit() on the success
        # path, leaking a headless Firefox process whenever an exception
        # escaped the scrape loop.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass


def main():
    """Split the configured ID range across worker threads and merge results."""
    # configure your range & parallelism
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # split work into contiguous, non-overlapping ID sub-ranges
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = []
    for i in range(num_workers):
        s = start_thing + i * per
        e = min(s + per - 1, end_thing)
        batches.append((s, e, i + 1))

    all_results = []
    # Threads are appropriate here: each worker is I/O-bound (browser waits).
    with ThreadPoolExecutor(max_workers=num_workers) as ex:
        futures = {
            ex.submit(process_batch, s, e, wid): (s, e, wid)
            for s, e, wid in batches
        }
        for fut in as_completed(futures):
            all_results.extend(fut.result())

    # upload combined file
    if all_results:
        df_all = pd.DataFrame(all_results)
        upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")


if __name__ == "__main__":
    main()