File size: 6,268 Bytes
e7192f8
654e71a
 
 
 
 
 
042a08c
654e71a
 
 
 
 
e7192f8
654e71a
e7192f8
 
 
654e71a
f9dd994
e7192f8
 
654e71a
 
 
 
e7192f8
 
654e71a
 
e7192f8
 
 
654e71a
 
 
 
 
 
 
 
 
 
e7192f8
654e71a
 
 
e7192f8
654e71a
 
 
 
 
 
 
 
e7192f8
654e71a
e7192f8
f9dd994
e7192f8
f9dd994
e7192f8
 
f9dd994
 
e7192f8
f9dd994
 
 
b5b43c5
8dd6168
d6da54f
f9dd994
e7192f8
654e71a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5b43c5
 
 
654e71a
 
 
 
 
042a08c
 
 
 
654e71a
 
 
 
 
 
 
b5b43c5
 
 
654e71a
042a08c
654e71a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
042a08c
654e71a
 
 
 
 
 
 
 
 
042a08c
654e71a
 
 
 
042a08c
654e71a
 
 
 
 
 
 
 
 
042a08c
 
654e71a
042a08c
654e71a
 
 
 
 
 
042a08c
654e71a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

import os
import io
import time
import math
import traceback
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
)
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options

from huggingface_hub import HfApi, HfFolder

# Configuration: set via Space secrets.
# Target dataset repo in "user/name" form; falls back to a placeholder when
# the HF_DATASET_REPO environment variable is unset.
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
# Token from the local HF credential store; may be None if not logged in.
HF_TOKEN   = HfFolder.get_token()


def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """
    Upload a pandas DataFrame as a CSV directly to the HF dataset repo
    without writing to disk.

    Args:
        df: The DataFrame to serialize and upload.
        filename: Destination path inside the dataset repository.
    """
    # upload_file expects bytes (or a binary file object); a text-mode
    # StringIO is the wrong type, so encode the CSV once up front.
    payload = df.to_csv(index=False).encode("utf-8")
    api = HfApi()
    api.upload_file(
        path_or_fileobj=payload,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    # Report the actual file uploaded (the original printed "(unknown)").
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")


def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the element matched by *xpath*, retrying on failure.

    Each failed attempt (missing or stale element) sleeps *delay* seconds.
    Returns an empty string when all *retries* attempts fail.
    """
    remaining = retries
    while remaining > 0:
        remaining -= 1
        try:
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
    return ""


def initialize_driver():
    """Create and return a headless Firefox WebDriver.

    Returns:
        A configured ``selenium.webdriver.Firefox``; the caller owns it and
        is responsible for calling ``quit()``.
    """
    # Point to the geckodriver binary installed in the image.
    service = Service("/usr/local/bin/geckodriver")

    opts = Options()
    # Selenium 4 removed the Options.headless attribute; pass the flag instead.
    opts.add_argument("--headless")
    # Disable sandbox (needed in many container environments).
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    # Firefox uses --width/--height (not Chromium's --window-size) to set a
    # full-size window so responsive pages load the desktop layout.
    opts.add_argument("--width=1920")
    opts.add_argument("--height=1080")

    driver = webdriver.Firefox(service=service, options=opts)
    return driver




def process_batch(start_id, end_id, worker_id):
    """Scrape Thingiverse thing pages for IDs in [start_id, end_id].

    Iterates IDs from end_id down to start_id, extracting title, author,
    date, counters and tags from each page. Uploads a checkpoint CSV to the
    HF dataset every 10 scraped items and a final CSV at the end.

    Args:
        start_id: Lowest thing ID to scrape (inclusive).
        end_id: Highest thing ID to scrape (inclusive).
        worker_id: Numeric label used in log lines and upload filenames.

    Returns:
        A list of per-thing dicts; empty list on any unrecoverable error.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []

        def extract_aria(label):
            # Counters (Remixes/Files/Makes/Comments) are exposed via
            # aria-label attributes; missing counter -> empty string.
            try:
                return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
            except NoSuchElementException:
                return ""

        total = end_id - start_id + 1
        # Iterate newest-first so recently published things are captured early.
        for count, thing_id in enumerate(range(end_id, start_id - 1, -1), start=1):
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                # Navigation failure (timeout, DNS, ...): skip this ID.
                continue
            time.sleep(1)

            # Skip 404/error pages (deleted or never-assigned IDs).
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass

            # Wait until the title link renders; give up on this ID otherwise.
            try:
                wait.until(EC.presence_of_element_located((
                    By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
                )))
            except TimeoutException:
                continue

            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(
                driver,
                "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div"
            )

            remixes  = extract_aria("Remixes")
            files    = extract_aria("Files")
            makes    = extract_aria("Makes")
            comments = extract_aria("Comments")

            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass

            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags
            })

            # Checkpoint every 10 items so progress survives a crash.
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)

        # Final batch upload for this worker's full result set.
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)

        return results

    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # Always tear down the browser — the original leaked the Firefox
        # process whenever an exception escaped the scraping loop.
        if driver is not None:
            driver.quit()


def main():
    """Partition the configured ID range across threads and upload results.

    Splits [start_thing, end_thing] into num_workers near-equal batches,
    runs process_batch on each in a thread pool, then uploads one combined
    CSV of every scraped record to the HF dataset.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Range and parallelism configuration.
    start_thing = 6993281
    end_thing   = 7003281
    num_workers = 5

    # Split the ID range into contiguous, near-equal chunks.
    span = end_thing - start_thing + 1
    chunk = math.ceil(span / num_workers)
    batches = [
        (
            start_thing + i * chunk,
            min(start_thing + i * chunk + chunk - 1, end_thing),
            i + 1,
        )
        for i in range(num_workers)
    ]

    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = [pool.submit(process_batch, s, e, wid) for s, e, wid in batches]
        for fut in as_completed(pending):
            all_results.extend(fut.result())

    # Upload the combined file covering every worker's output.
    if all_results:
        upload_df_to_hf(
            pd.DataFrame(all_results),
            f"thingiverse_{start_thing}_{end_thing}_all.csv",
        )


# Script entry point: run the scrape only when executed directly, not on import.
if __name__ == "__main__":
    main()