michelerussoAA commited on
Commit
654e71a
·
verified ·
1 Parent(s): 512aebf

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +177 -0
scraper.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import time
4
+ import math
5
+ import traceback
6
+ from datetime import datetime
7
+ import pandas as pd
8
+ from selenium import webdriver
9
+ from selenium.webdriver.chrome.options import Options
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.support.ui import WebDriverWait
12
+ from selenium.webdriver.support import expected_conditions as EC
13
+ from selenium.common.exceptions import (
14
+ NoSuchElementException, TimeoutException, StaleElementReferenceException
15
+ )
16
+ from huggingface_hub import HfApi, HfFolder
17
+
18
+ # Configuration: set via Space secrets
19
+ HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
20
+ HF_TOKEN = HfFolder.get_token()
21
+
22
+
23
def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """
    Upload a pandas DataFrame directly to the HF dataset repo without
    writing to disk.

    Args:
        df: DataFrame to serialize as CSV (index column omitted).
        filename: Destination path inside the dataset repository.
    """
    # Serialize in memory and encode to bytes: huggingface_hub expects a
    # path or a *binary* file object, so a text-mode StringIO (as the
    # original used) is not a safe payload.
    buffer = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
    api = HfApi()
    api.upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False
    )
    # Bug fix: the original printed the literal "(unknown)" instead of the
    # name of the file that was just uploaded.
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
40
+
41
+
42
def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """
    Return the text of the element located by *xpath*, or "" if it
    cannot be read.

    Each stale/missing-element failure is followed by a *delay*-second
    sleep; up to *retries* attempts are made before giving up.
    """
    attempts_remaining = retries
    while attempts_remaining > 0:
        attempts_remaining -= 1
        try:
            element = driver.find_element(By.XPATH, xpath)
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
        else:
            return element.text
    return ""
49
+
50
+
51
def initialize_driver():
    """
    Build and return a headless Chrome WebDriver configured for use
    inside a container/CI sandbox.
    """
    opts = Options()
    opts.headless = True
    # Required when Chrome runs in a restricted container environment.
    for flag in ("--disable-dev-shm-usage", "--no-sandbox"):
        opts.add_argument(flag)
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    return webdriver.Chrome(options=opts)
59
+
60
+
61
def process_batch(start_id, end_id, worker_id):
    """
    Scrape Thingiverse "thing" pages for every ID in [start_id, end_id],
    walking from newest (end_id) down to oldest (start_id).

    A checkpoint CSV is uploaded to the HF dataset every 10 scraped items,
    plus one final CSV for the whole batch.

    Args:
        start_id: Lowest thing ID in the batch (inclusive).
        end_id: Highest thing ID in the batch (inclusive).
        worker_id: Numeric label used in log lines and output filenames.

    Returns:
        The list of scraped row dicts, or [] if the worker failed fatally.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    driver = None
    try:
        driver = initialize_driver()
        wait = WebDriverWait(driver, 10)
        results = []

        total = end_id - start_id + 1
        count = 0
        for thing_id in range(end_id, start_id - 1, -1):
            count += 1
            url = f"https://www.thingiverse.com/thing:{thing_id}"
            print(f"[{worker_id}] ({count}/{total}) {url}")
            try:
                driver.get(url)
            except Exception:
                # Navigation failure (timeout, crash): skip this ID.
                continue
            time.sleep(1)

            # Skip error/404 pages, which render a distinctive card.
            try:
                driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
                continue
            except NoSuchElementException:
                pass

            # Wait for the title link as a signal the page fully rendered.
            try:
                wait.until(EC.presence_of_element_located((By.XPATH,
                    "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
                ))
            except TimeoutException:
                continue

            title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
            author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
            date_posted = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div")

            def extract_aria(label):
                # Counter widgets (Remixes/Files/Makes/Comments) are
                # addressable only by their aria-label.
                try:
                    return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
                except NoSuchElementException:
                    return ""

            remixes = extract_aria("Remixes")
            files = extract_aria("Files")
            makes = extract_aria("Makes")
            comments = extract_aria("Comments")
            tags = []
            try:
                tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
                tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
            except NoSuchElementException:
                pass

            results.append({
                "URL": url,
                "Title": title,
                "Author": author,
                "Date": date_posted,
                "Remixes": remixes,
                "Files": files,
                "Makes": makes,
                "Comments": comments,
                "Tags": tags
            })

            # Checkpoint every 10 items so progress survives crashes.
            if len(results) % 10 == 0:
                df_chk = pd.DataFrame(results)
                chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                upload_df_to_hf(df_chk, chk_name)

        # Final batch upload of everything collected.
        if results:
            df_final = pd.DataFrame(results)
            final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
            upload_df_to_hf(df_final, final_name)

        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
    finally:
        # Bug fix: the original only called driver.quit() on the success
        # path, leaking a Chrome process whenever the worker errored.
        if driver is not None:
            driver.quit()
146
+
147
+
148
def main():
    """
    Split the configured thing-ID range into contiguous batches, scrape
    them in parallel worker threads, and upload one combined CSV.
    """
    # Range and parallelism configuration.
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5

    # Carve the inclusive ID range into num_workers contiguous batches.
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = []
    for idx in range(num_workers):
        lo = start_thing + idx * per
        hi = min(lo + per - 1, end_thing)
        batches.append((lo, hi, idx + 1))

    from concurrent.futures import ThreadPoolExecutor, as_completed

    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = {
            pool.submit(process_batch, lo, hi, wid): (lo, hi, wid)
            for lo, hi, wid in batches
        }
        for done in as_completed(pending):
            all_results.extend(done.result())

    # One combined file covering every worker's results.
    if all_results:
        upload_df_to_hf(
            pd.DataFrame(all_results),
            f"thingiverse_{start_thing}_{end_thing}_all.csv",
        )

if __name__ == "__main__":
    main()