import os
import io
import time
import math
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FFOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
)
from huggingface_hub import HfApi, HfFolder

# Configuration: set via Space secrets
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
HF_TOKEN = HfFolder.get_token()  # reads the locally cached token (e.g. from `huggingface-cli login`)


def upload_df_to_hf(df: pd.DataFrame, filename: str):
    """Upload a pandas DataFrame directly to the HF dataset repo without writing to disk."""
    # upload_file expects a path, bytes, or a *binary* file object, so encode the CSV.
    buffer = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
    api = HfApi()
    api.upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")


def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the first element matching xpath, or "" if it never appears."""
    for _ in range(retries):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
    return ""


def initialize_driver():
    options = FFOptions()
    # Run without a display; the old `options.headless = True` setter is
    # deprecated and removed in recent Selenium 4 releases.
    options.add_argument("-headless")
    # Point the Firefox cache & downloads into /tmp (writable on a Space).
    options.set_preference("browser.cache.disk.parent_directory", "/tmp")
    options.set_preference("browser.cache.memory.enable", False)
    options.set_preference("browser.download.dir", "/tmp")
    # launch
    driver = webdriver.Firefox(options=options)
    driver.set_window_size(1920, 1080)
    return driver


def process_batch(start_id, end_id, worker_id):
    """Scrape thing pages in [start_id, end_id] (newest first) and upload CSVs to HF."""
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []
        total = end_id - start_id + 1
        count = 0

        def extract_aria(label):
            """Read the counter text from the element carrying the given aria-label."""
            try:
                return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
            except NoSuchElementException:
                return ""

        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                continue
            time.sleep(1)  # give client-side rendering a moment to settle

            # Skip deleted/missing things, which render an error card.
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass

            # Wait for the title link; skip this ID if it never appears.
            try:
                wait.until(EC.presence_of_element_located((
                    By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                )))
            except TimeoutException:
                continue

            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(
                driver,
                "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div",
            )

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")

            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass

            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags,
            })

            # Checkpoint every 10 items so a crash loses at most 10 rows.
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)

        # final batch upload
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # Always release the browser, even if the worker dies mid-batch.
        if driver is not None:
            driver.quit()


def main():
    # configure your range & parallelism
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # Split the ID range into contiguous, near-equal batches, one per worker.
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = []
    for i in range(num_workers):
        s = start_thing + i * per
        e = min(s + per - 1, end_thing)
        batches.append((s, e, i + 1))

    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as ex:
        futures = {ex.submit(process_batch, s, e, wid): (s, e, wid) for s, e, wid in batches}
        for fut in as_completed(futures):
            all_results.extend(fut.result())

    # upload combined file
    if all_results:
        df_all = pd.DataFrame(all_results)
        upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")


if __name__ == "__main__":
    main()