michelerussoAA committed on
Commit
042a08c
·
verified ·
1 Parent(s): 1c936b3

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +23 -22
scraper.py CHANGED
@@ -4,12 +4,11 @@ import time
4
  import math
5
  import traceback
6
  from datetime import datetime
 
7
  import pandas as pd
8
  from selenium import webdriver
9
  from selenium.webdriver.chrome.options import Options
10
  from selenium.webdriver.common.by import By
11
- from selenium.webdriver.chrome.service import Service
12
- from webdriver_manager.chrome import ChromeDriverManager
13
  from selenium.webdriver.support.ui import WebDriverWait
14
  from selenium.webdriver.support import expected_conditions as EC
15
  from selenium.common.exceptions import (
@@ -49,19 +48,18 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
49
  time.sleep(delay)
50
  return ""
51
 
 
52
  def initialize_driver():
53
  options = Options()
54
- # new headless mode in recent Chrome
55
  options.add_argument("--headless=new")
56
-
57
  # container flags
58
  options.add_argument("--disable-dev-shm-usage")
59
  options.add_argument("--no-sandbox")
60
  options.add_argument("--disable-gpu")
61
  options.add_argument("--disable-software-rasterizer")
62
  options.add_argument("--disable-setuid-sandbox")
63
-
64
- # use /tmp for profile & cache
65
  options.add_argument("--remote-debugging-port=9222")
66
  options.add_argument("--user-data-dir=/tmp/chrome-user-data")
67
  options.add_argument("--window-size=1920,1080")
@@ -70,12 +68,9 @@ def initialize_driver():
70
  "AppleWebKit/537.36 (KHTML, like Gecko) "
71
  "Chrome/135.0.0.0 Safari/537.36"
72
  )
73
-
74
- # now ChromeDriver is already at /usr/local/bin/chromedriver
75
- driver = webdriver.Chrome(options=options)
76
- return driver
77
-
78
-
79
 
80
 
81
  def process_batch(start_id, end_id, worker_id):
@@ -106,15 +101,18 @@ def process_batch(start_id, end_id, worker_id):
106
 
107
  # wait for title link
108
  try:
109
- wait.until(EC.presence_of_element_located((By.XPATH,
110
- "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
111
  ))
112
  except TimeoutException:
113
  continue
114
 
115
  title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
116
  author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
117
- date_posted = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div")
 
 
 
118
 
119
  def extract_aria(label):
120
  try:
@@ -123,9 +121,10 @@ def process_batch(start_id, end_id, worker_id):
123
  return ""
124
 
125
  remixes = extract_aria("Remixes")
126
- files = extract_aria("Files")
127
- makes = extract_aria("Makes")
128
  comments = extract_aria("Comments")
 
129
  tags = []
130
  try:
131
  tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
@@ -159,6 +158,7 @@ def process_batch(start_id, end_id, worker_id):
159
 
160
  driver.quit()
161
  return results
 
162
  except Exception as e:
163
  print(f"Worker {worker_id} error: {e}")
164
  traceback.print_exc()
@@ -168,12 +168,12 @@ def process_batch(start_id, end_id, worker_id):
168
  def main():
169
  # configure your range & parallelism
170
  start_thing = 6993281
171
- end_thing = 7003281
172
  num_workers = 5
173
 
174
  # split work
175
  total = end_thing - start_thing + 1
176
- per = math.ceil(total / num_workers)
177
  batches = []
178
  for i in range(num_workers):
179
  s = start_thing + i * per
@@ -183,15 +183,16 @@ def main():
183
  all_results = []
184
  from concurrent.futures import ThreadPoolExecutor, as_completed
185
  with ThreadPoolExecutor(max_workers=num_workers) as ex:
186
- futures = {ex.submit(process_batch, s, e, wid): (s,e,wid) for s,e,wid in batches}
 
187
  for fut in as_completed(futures):
188
- res = fut.result()
189
- all_results.extend(res)
190
 
191
  # upload combined file
192
  if all_results:
193
  df_all = pd.DataFrame(all_results)
194
  upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")
195
 
 
196
  if __name__ == "__main__":
197
  main()
 
4
  import math
5
  import traceback
6
  from datetime import datetime
7
+
8
  import pandas as pd
9
  from selenium import webdriver
10
  from selenium.webdriver.chrome.options import Options
11
  from selenium.webdriver.common.by import By
 
 
12
  from selenium.webdriver.support.ui import WebDriverWait
13
  from selenium.webdriver.support import expected_conditions as EC
14
  from selenium.common.exceptions import (
 
48
  time.sleep(delay)
49
  return ""
50
 
51
+
52
  def initialize_driver():
53
  options = Options()
54
+ # use new headless mode
55
  options.add_argument("--headless=new")
 
56
  # container flags
57
  options.add_argument("--disable-dev-shm-usage")
58
  options.add_argument("--no-sandbox")
59
  options.add_argument("--disable-gpu")
60
  options.add_argument("--disable-software-rasterizer")
61
  options.add_argument("--disable-setuid-sandbox")
62
+ # profile & cache in /tmp
 
63
  options.add_argument("--remote-debugging-port=9222")
64
  options.add_argument("--user-data-dir=/tmp/chrome-user-data")
65
  options.add_argument("--window-size=1920,1080")
 
68
  "AppleWebKit/537.36 (KHTML, like Gecko) "
69
  "Chrome/135.0.0.0 Safari/537.36"
70
  )
71
+ # Selenium 4 Manager will auto-download matching driver into SE_CACHE_PATH
72
+ # (ensure you set ENV SE_CACHE_PATH=/tmp/.cache/selenium in your Dockerfile)
73
+ return webdriver.Chrome(options=options)
 
 
 
74
 
75
 
76
  def process_batch(start_id, end_id, worker_id):
 
101
 
102
  # wait for title link
103
  try:
104
+ wait.until(EC.presence_of_element_located(
105
+ (By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
106
  ))
107
  except TimeoutException:
108
  continue
109
 
110
  title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
111
  author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
112
+ date_posted = safe_get_text(
113
+ driver,
114
+ "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div"
115
+ )
116
 
117
  def extract_aria(label):
118
  try:
 
121
  return ""
122
 
123
  remixes = extract_aria("Remixes")
124
+ files = extract_aria("Files")
125
+ makes = extract_aria("Makes")
126
  comments = extract_aria("Comments")
127
+
128
  tags = []
129
  try:
130
  tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
 
158
 
159
  driver.quit()
160
  return results
161
+
162
  except Exception as e:
163
  print(f"Worker {worker_id} error: {e}")
164
  traceback.print_exc()
 
168
  def main():
169
  # configure your range & parallelism
170
  start_thing = 6993281
171
+ end_thing = 7003281
172
  num_workers = 5
173
 
174
  # split work
175
  total = end_thing - start_thing + 1
176
+ per = math.ceil(total / num_workers)
177
  batches = []
178
  for i in range(num_workers):
179
  s = start_thing + i * per
 
183
  all_results = []
184
  from concurrent.futures import ThreadPoolExecutor, as_completed
185
  with ThreadPoolExecutor(max_workers=num_workers) as ex:
186
+ futures = {ex.submit(process_batch, s, e, wid): (s,e,wid)
187
+ for s,e,wid in batches}
188
  for fut in as_completed(futures):
189
+ all_results.extend(fut.result())
 
190
 
191
  # upload combined file
192
  if all_results:
193
  df_all = pd.DataFrame(all_results)
194
  upload_df_to_hf(df_all, f"thingiverse_{start_thing}_{end_thing}_all.csv")
195
 
196
+
197
  if __name__ == "__main__":
198
  main()