Create scraper.py
scraper.py
ADDED
@@ -0,0 +1,177 @@
import os
import time
import math
import traceback
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException, TimeoutException, StaleElementReferenceException
)
from huggingface_hub import HfApi, HfFolder

# Configuration: set via Space secrets (prefer the HF_TOKEN secret,
# fall back to any token cached by huggingface-cli login)
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
HF_TOKEN = os.environ.get("HF_TOKEN") or HfFolder.get_token()


def upload_df_to_hf(df: pd.DataFrame, filename: str):
    """
    Upload a pandas DataFrame directly to the HF dataset without writing to disk.
    """
    # upload_file() accepts a path, bytes, or a *binary* file object, so encode
    # the CSV to bytes rather than passing a text-mode StringIO.
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    api = HfApi()
    api.upload_file(
        path_or_fileobj=csv_bytes,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")


def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the first element matching xpath, or "" if unavailable."""
    for _ in range(retries):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
    return ""


def initialize_driver():
    options = Options()
    # The Options.headless property is deprecated/removed in newer Selenium 4
    # releases; pass the flag explicitly instead.
    options.add_argument("--headless=new")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    return webdriver.Chrome(options=options)


def process_batch(start_id, end_id, worker_id):
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []

        total = end_id - start_id + 1
        count = 0
        # walk the ID range from newest to oldest
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                continue
            time.sleep(1)

            # skip error pages (deleted or private things)
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass

            # wait for the title link to confirm the page actually rendered
            try:
                wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
                ))
            except TimeoutException:
                continue

            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div")

            def extract_aria(label):
                try:
                    return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
                except NoSuchElementException:
                    return ""

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")
            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass

            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags,
            })

            # checkpoint every 10 items
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)

        # final batch upload
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)

        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # always release the browser, even when the batch fails
        if driver is not None:
            driver.quit()


def main():
    # configure your range & parallelism
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # split the ID range into one contiguous batch per worker
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = []
    for i in range(num_workers):
        s = start_thing + i * per
        e = min(s + per - 1, end_thing)
        batches.append((s, e, i + 1))

    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as ex:
        futures = [ex.submit(process_batch, s, e, wid) for s, e, wid in batches]
        for fut in as_completed(futures):
            all_results.extend(fut.result())

    # upload combined file
    if all_results:
        df_all = pd.DataFrame(all_results)
        upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")


if __name__ == "__main__":
    main()
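
A quick way to sanity-check this before pointing it at the full 10,000-ID range is a small single-worker run. The sketch below is a hypothetical helper, not part of the commit; it assumes scraper.py is importable from the working directory and that the HF_DATASET_REPO and HF_TOKEN secrets are set, since process_batch still attempts the final CSV upload.

# smoke_test.py — hypothetical helper, not part of this commit
from scraper import process_batch

# Five IDs at the top of the range, one worker: small enough to finish
# quickly but still exercises the full extract-and-upload path.
rows = process_batch(7003277, 7003281, worker_id=1)
for row in rows:
    print(row["URL"], row["Title"] or "(no title)")

Note that the Space also needs selenium, pandas, and huggingface_hub installed, and its runtime image must provide a Chrome/Chromium binary with a matching chromedriver; Selenium failing to start the browser is a common cause of a Space crashing at startup.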