# NOTE: the three lines below this comment originally read "Spaces:" /
# "Runtime error" / "Runtime error" — status text pasted in from the
# Hugging Face Spaces UI, not Python. Converted to a comment so the file parses.
import os | |
import io | |
import time | |
import math | |
import traceback | |
from datetime import datetime | |
import pandas as pd | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.expected_conditions import staleness_of | |
from selenium.common.exceptions import ( | |
NoSuchElementException, | |
TimeoutException, | |
StaleElementReferenceException, | |
) | |
from selenium.webdriver.firefox.service import Service | |
from selenium.webdriver.firefox.options import Options | |
from huggingface_hub import HfApi, HfFolder | |
# Configuration: set via Space secrets
# Target HF dataset repo for scraped CSVs; overridable via the
# HF_DATASET_REPO environment variable, with a placeholder default.
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
# Token read from the local HF credentials store (e.g. set by `huggingface-cli login`).
HF_TOKEN = HfFolder.get_token()
def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """
    Upload a pandas DataFrame as a CSV directly to the HF dataset repo
    without writing to disk.

    Args:
        df: DataFrame to serialize as CSV (index is dropped).
        filename: Destination path inside the dataset repo.
    """
    # Serialize to bytes: the Hub client uploads raw bytes over HTTP, so
    # encode the CSV text instead of handing it a text-mode StringIO
    # (whose reads yield str, not bytes).
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    api = HfApi()
    api.upload_file(
        path_or_fileobj=io.BytesIO(csv_bytes),
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    # Bug fix: report the actual uploaded filename (was hard-coded "(unknown)").
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the element located by *xpath*, or "" on failure.

    Makes up to *retries* attempts; after each stale/missing-element
    failure it sleeps *delay* seconds before the next attempt (or before
    giving up and returning the empty string).
    """
    attempts_left = retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            # Both the lookup and the .text access stay inside the try so a
            # stale reference at either step triggers a retry.
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
    return ""
def initialize_driver():
    """Create and return a headless Firefox WebDriver.

    Expects the geckodriver binary at /usr/local/bin/geckodriver.
    """
    # Point to the geckodriver binary installed in the container.
    service = Service("/usr/local/bin/geckodriver")
    opts = Options()
    # Bug fix: the `Options.headless` setter was deprecated and removed in
    # Selenium 4.10+ (assigning it raises AttributeError); pass the browser
    # flag explicitly instead.
    opts.add_argument("--headless")
    # NOTE(review): --no-sandbox / --disable-gpu are Chromium-style flags;
    # Firefox most likely ignores them, but they are kept for parity with
    # the original container setup — confirm they are still wanted.
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    # Full-size window so responsive pages render their desktop layout.
    opts.add_argument("--window-size=1920,1080")
    driver = webdriver.Firefox(service=service, options=opts)
    return driver
def process_batch(start_id, end_id, worker_id):
    """Scrape Thingiverse thing pages with IDs in [start_id, end_id].

    Iterates from end_id down to start_id, collects per-thing metadata,
    uploads a checkpoint CSV every 10 scraped items and a final CSV for
    the batch, and returns the list of scraped records. Returns [] if an
    unrecoverable error escapes the loop.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)

        def extract_aria(label):
            # Text of the stats element carrying aria-label `label`, "" if absent.
            # Hoisted out of the loop (the original redefined it per iteration).
            try:
                return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
            except NoSuchElementException:
                return ""

        results = []
        total = end_id - start_id + 1
        count = 0
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                # Navigation failure (timeout, DNS, ...): skip this ID.
                continue
            time.sleep(1)
            # Skip error pages, which render a dedicated error card.
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass
            # Wait for the title link as a proxy for "detail page loaded".
            try:
                wait.until(EC.presence_of_element_located((
                    By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                )))
            except TimeoutException:
                continue
            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(
                driver,
                "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div"
            )
            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")
            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass
            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags
            })
            # Checkpoint every 10 items so progress survives crashes.
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)
        # Final batch upload.
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # Bug fix: always release the browser. The original only called
        # driver.quit() on the fully-successful path, leaking a Firefox
        # process whenever any exception escaped the loop.
        if driver is not None:
            driver.quit()
def main():
    """Split the configured ID range across workers, scrape the batches in
    parallel threads, and upload one combined CSV at the end."""
    # Configure your range & parallelism.
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # Partition [start_thing, end_thing] into num_workers contiguous chunks,
    # clamping the last chunk's upper bound to end_thing.
    total_ids = end_thing - start_thing + 1
    chunk = math.ceil(total_ids / num_workers)
    batches = [
        (
            start_thing + i * chunk,
            min(start_thing + i * chunk + chunk - 1, end_thing),
            i + 1,
        )
        for i in range(num_workers)
    ]

    from concurrent.futures import ThreadPoolExecutor, as_completed
    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = {pool.submit(process_batch, lo, hi, wid): (lo, hi, wid)
                   for lo, hi, wid in batches}
        for future in as_completed(pending):
            all_results.extend(future.result())

    # Upload the combined file.
    if all_results:
        combined = pd.DataFrame(all_results)
        upload_df_to_hf(combined, f"thingiverse_{start_thing}_{end_thing}_all.csv")
if __name__ == "__main__":
    main()