# thingiverse_scraper/scraper.py
import os
import time
import math
import traceback
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
NoSuchElementException, TimeoutException, StaleElementReferenceException
)
from huggingface_hub import HfApi, HfFolder
from concurrent.futures import ThreadPoolExecutor, as_completed
# Configuration: set via Space secrets
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
HF_TOKEN = HfFolder.get_token()
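# e.g. in a Space, set HF_DATASET_REPO=<user>/<dataset> as a secret and make a
# token available (for instance via `huggingface-cli login`); otherwise
# HfFolder.get_token() returns None and uploads fail with 401 Unauthorized.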
def upload_df_to_hf(df: pd.DataFrame, filename: str):
"""
    Upload a pandas DataFrame directly to the HF dataset repo without touching disk.
"""
    # upload_file expects a path, raw bytes, or a binary file object; a text
    # StringIO is none of those, so serialize the CSV to UTF-8 bytes first
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    api = HfApi()
    api.upload_file(
        path_or_fileobj=csv_bytes,
path_in_repo=filename,
repo_id=HF_REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
create_pr=False
)
print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
def safe_get_text(driver, xpath, retries=1, delay=0.5):
for _ in range(retries):
try:
return driver.find_element(By.XPATH, xpath).text
except (StaleElementReferenceException, NoSuchElementException):
time.sleep(delay)
return ""
def initialize_driver():
options = Options()
    # the options.headless setter is deprecated in Selenium 4; pass the flag explicitly
    options.add_argument("--headless=new")
# Linux container flags
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--disable-software-rasterizer")
options.add_argument("--disable-setuid-sandbox")
# Use /tmp for profile & remote-debugging
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--user-data-dir=/tmp/chrome-user-data")
options.add_argument("window-size=1920,1080")
options.add_argument(
"user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
)
    # webdriver-manager downloads a ChromeDriver matching the installed Chrome;
    # note: the path= kwarg was dropped in webdriver-manager 4.x, so pin an
    # older release or configure the cache location via its newer API
    service = Service(ChromeDriverManager(path="/tmp/.wdm").install())
driver = webdriver.Chrome(service=service, options=options)
return driver
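# Minimal smoke test (illustrative, not run by default):
#   drv = initialize_driver()
#   drv.get("https://www.thingiverse.com")
#   print(drv.title)
#   drv.quit()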
def process_batch(start_id, end_id, worker_id):
print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
try:
driver = initialize_driver()
wait = WebDriverWait(driver, 10)
results = []
total = end_id - start_id + 1
count = 0
        for thing_id in range(end_id, start_id - 1, -1):  # newest IDs first
count += 1
url = f"https://www.thingiverse.com/thing:{thing_id}"
print(f"[{worker_id}] ({count}/{total}) {url}")
try:
driver.get(url)
except Exception:
continue
time.sleep(1)
# skip error pages
try:
driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
continue
except NoSuchElementException:
pass
# wait for title link
try:
wait.until(EC.presence_of_element_located((By.XPATH,
"//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
))
except TimeoutException:
continue
title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
date_posted = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div")
def extract_aria(label):
try:
return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
except NoSuchElementException:
return ""
remixes = extract_aria("Remixes")
files = extract_aria("Files")
makes = extract_aria("Makes")
comments = extract_aria("Comments")
tags = []
try:
tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
except NoSuchElementException:
pass
results.append({
"URL": url,
"Title": title,
"Author": author,
"Date": date_posted,
"Remixes": remixes,
"Files": files,
"Makes": makes,
"Comments": comments,
"Tags": tags
})
# checkpoint every 10 items
if len(results) % 10 == 0:
df_chk = pd.DataFrame(results)
chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
upload_df_to_hf(df_chk, chk_name)
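            # each checkpoint re-uploads the full partial batch under a new name;
            # with ~2000 IDs per worker (see main) that is at most ~200 small
            # CSVs per worker, trading upload traffic for crash resilience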
# final batch upload
if results:
df_final = pd.DataFrame(results)
final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
upload_df_to_hf(df_final, final_name)
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # always release the browser, even when a worker dies mid-batch
        try:
            driver.quit()
        except Exception:
            pass
def main():
# configure your range & parallelism
start_thing = 6993281
end_thing = 7003281
num_workers = 5
# split work
total = end_thing - start_thing + 1
per = math.ceil(total / num_workers)
batches = []
for i in range(num_workers):
s = start_thing + i * per
e = min(s + per - 1, end_thing)
batches.append((s, e, i+1))
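    # e.g. start_thing=6993281, end_thing=7003281, num_workers=5 gives
    # total=10001 and per=2001, so batches are (6993281, 6995281, 1),
    # (6995282, 6997282, 2), ... with the last clamped to (7001285, 7003281, 5)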
all_results = []
with ThreadPoolExecutor(max_workers=num_workers) as ex:
futures = {ex.submit(process_batch, s, e, wid): (s,e,wid) for s,e,wid in batches}
for fut in as_completed(futures):
res = fut.result()
all_results.extend(res)
# upload combined file
if all_results:
df_all = pd.DataFrame(all_results)
upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")
if __name__ == "__main__":
main()