# thingiverse_scraper / scraper.py
# Author: michelerussoAA — revision e7192f8 ("Update scraper.py"), 6.27 kB
# NOTE: the Hugging Face file-viewer chrome ("raw / history / blame") was
# converted to comments so this module parses as valid Python.
import os
import io
import time
import math
import traceback
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.common.exceptions import (
NoSuchElementException,
TimeoutException,
StaleElementReferenceException,
)
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from huggingface_hub import HfApi, HfFolder
# Configuration: set via Space secrets
# Target HF dataset repo; HF_DATASET_REPO overrides the placeholder default.
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
# Auth token read from the local HF cache (populated by login / Space secret).
HF_TOKEN = HfFolder.get_token()
def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """Upload a DataFrame as a CSV file to the HF dataset repo, without disk I/O.

    Args:
        df: Rows to serialize.
        filename: Destination path inside the dataset repository.
    """
    # upload_file expects a path, bytes, or a *binary* file-like object, so
    # encode the CSV to UTF-8 bytes (the original passed a text StringIO).
    payload = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
    api = HfApi()
    api.upload_file(
        path_or_fileobj=payload,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    # Bug fix: the original printed the literal "(unknown)" instead of the name.
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the element at *xpath*, tolerating transient DOM churn.

    Makes up to *retries* attempts, sleeping *delay* seconds after each failed
    lookup, and returns "" if the element never resolves.
    """
    attempts_left = retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            element = driver.find_element(By.XPATH, xpath)
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
        else:
            return element.text
    return ""
def initialize_driver():
    """Create and return a headless Firefox WebDriver for container use."""
    # Point to the geckodriver binary installed in the image.
    service = Service("/usr/local/bin/geckodriver")
    opts = Options()
    # Bug fix: the writable `Options.headless` property was deprecated and
    # removed in Selenium 4; pass Firefox's headless flag explicitly.
    opts.add_argument("-headless")
    # Disable sandbox/GPU (required in many container environments).
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    # Full window so responsive pages load the desktop layout.
    opts.add_argument("--window-size=1920,1080")
    driver = webdriver.Firefox(service=service, options=opts)
    return driver
def process_batch(start_id, end_id, worker_id):
    """Scrape Thingiverse pages for thing IDs in [start_id, end_id].

    Walks the range newest-first, collects per-thing metadata, uploads a
    checkpoint CSV to the HF dataset every 10 scraped rows, and uploads a
    final CSV when the batch completes.

    Args:
        start_id: Lowest thing ID (inclusive).
        end_id: Highest thing ID (inclusive).
        worker_id: Label used in log lines and uploaded filenames.

    Returns:
        List of scraped row dicts, or [] if a fatal error aborted the batch.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []
        total = end_id - start_id + 1
        count = 0
        # Iterate newest-first so the most recent things are captured earliest.
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                # Navigation failure (timeout, DNS, ...): skip this ID.
                continue
            time.sleep(1)  # let client-side rendering settle
            # Skip 404/error pages, which render a dedicated error card.
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass
            # Wait for the title link; give up on this ID after the 10 s wait.
            try:
                wait.until(EC.presence_of_element_located((
                    By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                )))
            except TimeoutException:
                continue
            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(
                driver,
                "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div"
            )

            def extract_aria(label):
                # Counter widgets (remixes/files/makes/comments) expose their
                # value via an aria-label; a missing widget yields "".
                try:
                    return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
                except NoSuchElementException:
                    return ""

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")
            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass
            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags
            })
            # Checkpoint every 10 rows so progress survives a crash.
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)
        # Final batch upload (covers the tail not caught by a checkpoint).
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # Bug fix: always release the browser; the original only called
        # driver.quit() on the success path, leaking the Firefox process
        # whenever an exception escaped the scrape loop.
        if driver is not None:
            driver.quit()
def main():
    """Split the configured thing-ID range across worker threads and scrape it."""
    # Configure your range & parallelism here.
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # Divide the inclusive ID range into contiguous, evenly sized batches.
    total_ids = end_thing - start_thing + 1
    batch_size = math.ceil(total_ids / num_workers)
    batches = [
        (
            start_thing + idx * batch_size,
            min(start_thing + idx * batch_size + batch_size - 1, end_thing),
            idx + 1,
        )
        for idx in range(num_workers)
    ]

    from concurrent.futures import ThreadPoolExecutor, as_completed

    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = {
            pool.submit(process_batch, lo, hi, wid): (lo, hi, wid)
            for lo, hi, wid in batches
        }
        for future in as_completed(pending):
            all_results.extend(future.result())

    # Merge every worker's rows into one combined upload.
    if all_results:
        combined = pd.DataFrame(all_results)
        upload_df_to_hf(combined, f"thingiverse_{start_thing}_{end_thing}_all.csv")
# Run the scrape only when executed as a script (not on import).
if __name__ == "__main__":
    main()