Spaces:
Runtime error
Runtime error
Update scraper.py
Browse files- scraper.py +23 -22
scraper.py
CHANGED
@@ -4,12 +4,11 @@ import time
|
|
4 |
import math
|
5 |
import traceback
|
6 |
from datetime import datetime
|
|
|
7 |
import pandas as pd
|
8 |
from selenium import webdriver
|
9 |
from selenium.webdriver.chrome.options import Options
|
10 |
from selenium.webdriver.common.by import By
|
11 |
-
from selenium.webdriver.chrome.service import Service
|
12 |
-
from webdriver_manager.chrome import ChromeDriverManager
|
13 |
from selenium.webdriver.support.ui import WebDriverWait
|
14 |
from selenium.webdriver.support import expected_conditions as EC
|
15 |
from selenium.common.exceptions import (
|
@@ -49,19 +48,18 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
|
|
49 |
time.sleep(delay)
|
50 |
return ""
|
51 |
|
|
|
52 |
def initialize_driver():
|
53 |
options = Options()
|
54 |
-
# new headless mode
|
55 |
options.add_argument("--headless=new")
|
56 |
-
|
57 |
# container flags
|
58 |
options.add_argument("--disable-dev-shm-usage")
|
59 |
options.add_argument("--no-sandbox")
|
60 |
options.add_argument("--disable-gpu")
|
61 |
options.add_argument("--disable-software-rasterizer")
|
62 |
options.add_argument("--disable-setuid-sandbox")
|
63 |
-
|
64 |
-
# use /tmp for profile & cache
|
65 |
options.add_argument("--remote-debugging-port=9222")
|
66 |
options.add_argument("--user-data-dir=/tmp/chrome-user-data")
|
67 |
options.add_argument("--window-size=1920,1080")
|
@@ -70,12 +68,9 @@ def initialize_driver():
|
|
70 |
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
71 |
"Chrome/135.0.0.0 Safari/537.36"
|
72 |
)
|
73 |
-
|
74 |
-
#
|
75 |
-
|
76 |
-
return driver
|
77 |
-
|
78 |
-
|
79 |
|
80 |
|
81 |
def process_batch(start_id, end_id, worker_id):
|
@@ -106,15 +101,18 @@ def process_batch(start_id, end_id, worker_id):
|
|
106 |
|
107 |
# wait for title link
|
108 |
try:
|
109 |
-
wait.until(EC.presence_of_element_located(
|
110 |
-
"//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
|
111 |
))
|
112 |
except TimeoutException:
|
113 |
continue
|
114 |
|
115 |
title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
|
116 |
author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
|
117 |
-
date_posted = safe_get_text(
|
|
|
|
|
|
|
118 |
|
119 |
def extract_aria(label):
|
120 |
try:
|
@@ -123,9 +121,10 @@ def process_batch(start_id, end_id, worker_id):
|
|
123 |
return ""
|
124 |
|
125 |
remixes = extract_aria("Remixes")
|
126 |
-
files
|
127 |
-
makes
|
128 |
comments = extract_aria("Comments")
|
|
|
129 |
tags = []
|
130 |
try:
|
131 |
tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
|
@@ -159,6 +158,7 @@ def process_batch(start_id, end_id, worker_id):
|
|
159 |
|
160 |
driver.quit()
|
161 |
return results
|
|
|
162 |
except Exception as e:
|
163 |
print(f"Worker {worker_id} error: {e}")
|
164 |
traceback.print_exc()
|
@@ -168,12 +168,12 @@ def process_batch(start_id, end_id, worker_id):
|
|
168 |
def main():
|
169 |
# configure your range & parallelism
|
170 |
start_thing = 6993281
|
171 |
-
end_thing
|
172 |
num_workers = 5
|
173 |
|
174 |
# split work
|
175 |
total = end_thing - start_thing + 1
|
176 |
-
per
|
177 |
batches = []
|
178 |
for i in range(num_workers):
|
179 |
s = start_thing + i * per
|
@@ -183,15 +183,16 @@ def main():
|
|
183 |
all_results = []
|
184 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
185 |
with ThreadPoolExecutor(max_workers=num_workers) as ex:
|
186 |
-
futures = {ex.submit(process_batch, s, e, wid): (s,e,wid)
|
|
|
187 |
for fut in as_completed(futures):
|
188 |
-
|
189 |
-
all_results.extend(res)
|
190 |
|
191 |
# upload combined file
|
192 |
if all_results:
|
193 |
df_all = pd.DataFrame(all_results)
|
194 |
upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")
|
195 |
|
|
|
196 |
if __name__ == "__main__":
|
197 |
main()
|
|
|
4 |
import math
|
5 |
import traceback
|
6 |
from datetime import datetime
|
7 |
+
|
8 |
import pandas as pd
|
9 |
from selenium import webdriver
|
10 |
from selenium.webdriver.chrome.options import Options
|
11 |
from selenium.webdriver.common.by import By
|
|
|
|
|
12 |
from selenium.webdriver.support.ui import WebDriverWait
|
13 |
from selenium.webdriver.support import expected_conditions as EC
|
14 |
from selenium.common.exceptions import (
|
|
|
48 |
time.sleep(delay)
|
49 |
return ""
|
50 |
|
51 |
+
|
52 |
def initialize_driver():
|
53 |
options = Options()
|
54 |
+
# use new headless mode
|
55 |
options.add_argument("--headless=new")
|
|
|
56 |
# container flags
|
57 |
options.add_argument("--disable-dev-shm-usage")
|
58 |
options.add_argument("--no-sandbox")
|
59 |
options.add_argument("--disable-gpu")
|
60 |
options.add_argument("--disable-software-rasterizer")
|
61 |
options.add_argument("--disable-setuid-sandbox")
|
62 |
+
# profile & cache in /tmp
|
|
|
63 |
options.add_argument("--remote-debugging-port=9222")
|
64 |
options.add_argument("--user-data-dir=/tmp/chrome-user-data")
|
65 |
options.add_argument("--window-size=1920,1080")
|
|
|
68 |
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
69 |
"Chrome/135.0.0.0 Safari/537.36"
|
70 |
)
|
71 |
+
# Selenium 4 Manager will auto‑download matching driver into SE_CACHE_PATH
|
72 |
+
# (ensure you set ENV SE_CACHE_PATH=/tmp/.cache/selenium in your Dockerfile)
|
73 |
+
return webdriver.Chrome(options=options)
|
|
|
|
|
|
|
74 |
|
75 |
|
76 |
def process_batch(start_id, end_id, worker_id):
|
|
|
101 |
|
102 |
# wait for title link
|
103 |
try:
|
104 |
+
wait.until(EC.presence_of_element_located(
|
105 |
+
(By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
|
106 |
))
|
107 |
except TimeoutException:
|
108 |
continue
|
109 |
|
110 |
title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
|
111 |
author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
|
112 |
+
date_posted = safe_get_text(
|
113 |
+
driver,
|
114 |
+
"//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div"
|
115 |
+
)
|
116 |
|
117 |
def extract_aria(label):
|
118 |
try:
|
|
|
121 |
return ""
|
122 |
|
123 |
remixes = extract_aria("Remixes")
|
124 |
+
files = extract_aria("Files")
|
125 |
+
makes = extract_aria("Makes")
|
126 |
comments = extract_aria("Comments")
|
127 |
+
|
128 |
tags = []
|
129 |
try:
|
130 |
tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
|
|
|
158 |
|
159 |
driver.quit()
|
160 |
return results
|
161 |
+
|
162 |
except Exception as e:
|
163 |
print(f"Worker {worker_id} error: {e}")
|
164 |
traceback.print_exc()
|
|
|
168 |
def main():
|
169 |
# configure your range & parallelism
|
170 |
start_thing = 6993281
|
171 |
+
end_thing = 7003281
|
172 |
num_workers = 5
|
173 |
|
174 |
# split work
|
175 |
total = end_thing - start_thing + 1
|
176 |
+
per = math.ceil(total / num_workers)
|
177 |
batches = []
|
178 |
for i in range(num_workers):
|
179 |
s = start_thing + i * per
|
|
|
183 |
all_results = []
|
184 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
185 |
with ThreadPoolExecutor(max_workers=num_workers) as ex:
|
186 |
+
futures = {ex.submit(process_batch, s, e, wid): (s,e,wid)
|
187 |
+
for s,e,wid in batches}
|
188 |
for fut in as_completed(futures):
|
189 |
+
all_results.extend(fut.result())
|
|
|
190 |
|
191 |
# upload combined file
|
192 |
if all_results:
|
193 |
df_all = pd.DataFrame(all_results)
|
194 |
upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")
|
195 |
|
196 |
+
|
197 |
if __name__ == "__main__":
|
198 |
main()
|