michelerussoAA committed on
Commit
b5b43c5
·
verified ·
1 Parent(s): befcec8

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +17 -28
scraper.py CHANGED
@@ -7,7 +7,7 @@ from datetime import datetime
7
 
8
  import pandas as pd
9
  from selenium import webdriver
10
- from selenium.webdriver.chrome.options import Options
11
  from selenium.webdriver.common.by import By
12
  from selenium.webdriver.support.ui import WebDriverWait
13
  from selenium.webdriver.support import expected_conditions as EC
@@ -50,27 +50,16 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
50
 
51
 
52
  def initialize_driver():
53
- options = Options()
54
- # use new headless mode
55
- options.add_argument("--headless=new")
56
- # container flags
57
- options.add_argument("--disable-dev-shm-usage")
58
- options.add_argument("--no-sandbox")
59
- options.add_argument("--disable-gpu")
60
- options.add_argument("--disable-software-rasterizer")
61
- options.add_argument("--disable-setuid-sandbox")
62
- # profile & cache in /tmp
63
- options.add_argument("--remote-debugging-port=9222")
64
- options.add_argument("--user-data-dir=/tmp/chrome-user-data")
65
- options.add_argument("--window-size=1920,1080")
66
- options.add_argument(
67
- "user-agent=Mozilla/5.0 (X11; Linux x86_64) "
68
- "AppleWebKit/537.36 (KHTML, like Gecko) "
69
- "Chrome/135.0.0.0 Safari/537.36"
70
- )
71
- # Selenium 4 Manager will auto‑download matching driver into SE_CACHE_PATH
72
- # (ensure you set ENV SE_CACHE_PATH=/tmp/.cache/selenium in your Dockerfile)
73
- return webdriver.Chrome(options=options)
74
 
75
 
76
  def process_batch(start_id, end_id, worker_id):
@@ -101,9 +90,9 @@ def process_batch(start_id, end_id, worker_id):
101
 
102
  # wait for title link
103
  try:
104
- wait.until(EC.presence_of_element_located(
105
- (By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
106
- ))
107
  except TimeoutException:
108
  continue
109
 
@@ -120,9 +109,9 @@ def process_batch(start_id, end_id, worker_id):
120
  except NoSuchElementException:
121
  return ""
122
 
123
- remixes = extract_aria("Remixes")
124
- files = extract_aria("Files")
125
- makes = extract_aria("Makes")
126
  comments = extract_aria("Comments")
127
 
128
  tags = []
 
7
 
8
  import pandas as pd
9
  from selenium import webdriver
10
+ from selenium.webdriver.firefox.options import Options as FFOptions
11
  from selenium.webdriver.common.by import By
12
  from selenium.webdriver.support.ui import WebDriverWait
13
  from selenium.webdriver.support import expected_conditions as EC
 
50
 
51
 
52
  def initialize_driver():
53
+ options = FFOptions()
54
+ options.headless = True
55
+ # point Firefox cache & profile into /tmp
56
+ options.set_preference("browser.cache.disk.parent_directory", "/tmp")
57
+ options.set_preference("browser.cache.memory.enable", False)
58
+ options.set_preference("browser.download.dir", "/tmp")
59
+ # launch
60
+ driver = webdriver.Firefox(options=options)
61
+ driver.set_window_size(1920, 1080)
62
+ return driver
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  def process_batch(start_id, end_id, worker_id):
 
90
 
91
  # wait for title link
92
  try:
93
+ wait.until(EC.presence_of_element_located((
94
+ By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
95
+ )))
96
  except TimeoutException:
97
  continue
98
 
 
109
  except NoSuchElementException:
110
  return ""
111
 
112
+ remixes = extract_aria("Remixes")
113
+ files = extract_aria("Files")
114
+ makes = extract_aria("Makes")
115
  comments = extract_aria("Comments")
116
 
117
  tags = []