michelerussoAA commited on
Commit
654e71a
·
verified ·
1 Parent(s): 512aebf

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +177 -0
scraper.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import time
4
+ import math
5
+ import traceback
6
+ from datetime import datetime
7
+ import pandas as pd
8
+ from selenium import webdriver
9
+ from selenium.webdriver.chrome.options import Options
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.support.ui import WebDriverWait
12
+ from selenium.webdriver.support import expected_conditions as EC
13
+ from selenium.common.exceptions import (
14
+ NoSuchElementException, TimeoutException, StaleElementReferenceException
15
+ )
16
+ from huggingface_hub import HfApi, HfFolder
17
+
18
+ # Configuration: set via Space secrets
19
+ HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
20
+ HF_TOKEN = HfFolder.get_token()
21
+
22
+
23
def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """
    Upload a pandas DataFrame directly to the HF dataset repo without
    writing to disk.

    Args:
        df: DataFrame to serialize as CSV (index column omitted).
        filename: Destination path inside the dataset repository.
    """
    # Serialize in memory and encode to bytes: huggingface_hub expects a
    # path or a *binary* file object, so a text-mode StringIO (as the
    # original used) is not a safe payload.
    buffer = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
    api = HfApi()
    api.upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False
    )
    # Bug fix: the original printed the literal "(unknown)" instead of the
    # name of the file that was just uploaded.
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
40
+
41
+
42
def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """
    Return the text of the element located by *xpath*, or "" if it
    cannot be read.

    Each stale/missing-element failure is followed by a *delay*-second
    sleep; up to *retries* attempts are made before giving up.
    """
    attempts_remaining = retries
    while attempts_remaining > 0:
        attempts_remaining -= 1
        try:
            element = driver.find_element(By.XPATH, xpath)
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
        else:
            return element.text
    return ""
49
+
50
+
51
def initialize_driver():
    """
    Build and return a headless Chrome WebDriver configured for use
    inside a container/CI sandbox.
    """
    opts = Options()
    opts.headless = True
    # Required when Chrome runs in a restricted container environment.
    for flag in ("--disable-dev-shm-usage", "--no-sandbox"):
        opts.add_argument(flag)
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    return webdriver.Chrome(options=opts)
59
+
60
+
61
def process_batch(start_id, end_id, worker_id):
    """
    Scrape Thingiverse "thing" pages for every ID in [start_id, end_id],
    walking from newest (end_id) down to oldest (start_id).

    A checkpoint CSV is uploaded to the HF dataset every 10 scraped items,
    plus one final CSV for the whole batch.

    Args:
        start_id: Lowest thing ID in the batch (inclusive).
        end_id: Highest thing ID in the batch (inclusive).
        worker_id: Numeric label used in log lines and output filenames.

    Returns:
        The list of scraped row dicts, or [] if the worker failed fatally.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []

        total = end_id - start_id + 1
        count = 0
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                # Navigation failure (timeout, crash): skip this ID.
                continue
            time.sleep(1)

            # Skip error/404 pages, which render a distinctive card.
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass

            # Wait for the title link as a signal the page fully rendered.
            try:
                wait.until(EC.presence_of_element_located((By.XPATH,
                    "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
                ))
            except TimeoutException:
                continue

            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div")

            def extract_aria(label):
                # Counter widgets (Remixes/Files/Makes/Comments) are
                # addressable only by their aria-label.
                try:
                    return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
                except NoSuchElementException:
                    return ""

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")
            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass

            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags
            })

            # Checkpoint every 10 items so progress survives crashes.
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)

        # Final batch upload of everything collected.
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)

        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # Bug fix: the original only called driver.quit() on the success
        # path, leaking a Chrome process whenever the worker errored.
        if driver is not None:
            driver.quit()
146
+
147
+
148
def main():
    """
    Split the configured thing-ID range into contiguous batches, scrape
    them in parallel worker threads, and upload one combined CSV.
    """
    # Range and parallelism configuration.
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # Carve the inclusive ID range into num_workers contiguous batches.
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = []
    for idx in range(num_workers):
        lo = start_thing + idx * per
        hi = min(lo + per - 1, end_thing)
        batches.append((lo, hi, idx + 1))

    from concurrent.futures import ThreadPoolExecutor, as_completed

    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = {
            pool.submit(process_batch, lo, hi, wid): (lo, hi, wid)
            for lo, hi, wid in batches
        }
        for done in as_completed(pending):
            all_results.extend(done.result())

    # One combined file covering every worker's results.
    if all_results:
        upload_df_to_hf(
            pd.DataFrame(all_results),
            f"thingiverse_{start_thing}_{end_thing}_all.csv",
        )

if __name__ == "__main__":
    main()