import io
import math
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from huggingface_hub import HfApi, HfFolder
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Configuration: set via Space secrets
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
HF_TOKEN = HfFolder.get_token()  # cached token from `huggingface-cli login` or the Space secret

def upload_df_to_hf(df: pd.DataFrame, filename: str):
    """
    Upload a pandas DataFrame directly to the HF dataset repo without
    writing to disk.
    """
    # upload_file expects bytes or a binary file object, so encode the CSV
    buffer = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
    api = HfApi()
    api.upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")

def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the first element matching xpath, or "" if it
    stays missing/stale after all retries."""
    for _ in range(retries):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
    return ""

def initialize_driver():
    # point to the geckodriver binary installed in the image
    service = Service("/usr/local/bin/geckodriver")
    opts = Options()
    # run headless (the Options.headless attribute was removed in Selenium 4.10+)
    opts.add_argument("--headless")
    # Chromium-style flags kept for container environments; Firefox may treat
    # them as no-ops
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    # full-size viewport so responsive pages render the desktop layout
    # (Firefox uses --width/--height rather than Chrome's --window-size)
    opts.add_argument("--width=1920")
    opts.add_argument("--height=1080")
    return webdriver.Firefox(service=service, options=opts)
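
# Quick smoke test (a sketch, not called by the pipeline): confirms headless
# Firefox starts and can load a page before all the workers are spawned.
def smoke_test_driver():
    d = initialize_driver()
    try:
        d.get("https://www.thingiverse.com")
        print(f"Loaded: {d.title}")
    finally:
        d.quit()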

def process_batch(start_id, end_id, worker_id):
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []
        total = end_id - start_id + 1
        count = 0
        # walk the ID range newest-first
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                continue
            time.sleep(1)
            # skip error pages (deleted or nonexistent things)
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass
            # wait for the title link before scraping the rest of the page
            try:
                wait.until(EC.presence_of_element_located((
                    By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                )))
            except TimeoutException:
                continue
            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(
                driver,
                "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div",
            )

            # the stat counters are exposed via aria-labels on the detail page
            def extract_aria(label):
                try:
                    return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
                except NoSuchElementException:
                    return ""

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")
            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass
            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags,
            })
            # checkpoint every 10 items so progress survives a crash
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)
        # final batch upload
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # always release the browser, even if the worker crashed
        if driver is not None:
            driver.quit()
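
# Example (IDs illustrative): scrape a tiny range in a single worker to
# validate the selectors before launching the full parallel run.
# process_batch(6993281, 6993290, worker_id=0)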

def main():
    # configure your ID range & parallelism
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5
    # split the range into one contiguous batch per worker
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = []
    for i in range(num_workers):
        s = start_thing + i * per
        e = min(s + per - 1, end_thing)
        batches.append((s, e, i + 1))
    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as ex:
        futures = {
            ex.submit(process_batch, s, e, wid): (s, e, wid)
            for s, e, wid in batches
        }
        for fut in as_completed(futures):
            all_results.extend(fut.result())
    # upload combined file
    if all_results:
        df_all = pd.DataFrame(all_results)
        upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")


if __name__ == "__main__":
    main()