Spaces:
Runtime error
Runtime error
Update scraper.py
Browse files- scraper.py +17 -28
scraper.py
CHANGED
@@ -7,7 +7,7 @@ from datetime import datetime
|
|
7 |
|
8 |
import pandas as pd
|
9 |
from selenium import webdriver
|
10 |
-
from selenium.webdriver.
|
11 |
from selenium.webdriver.common.by import By
|
12 |
from selenium.webdriver.support.ui import WebDriverWait
|
13 |
from selenium.webdriver.support import expected_conditions as EC
|
@@ -50,27 +50,16 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
|
|
50 |
|
51 |
|
52 |
def initialize_driver():
|
53 |
-
options =
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
options.
|
58 |
-
options.
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
options.add_argument("--remote-debugging-port=9222")
|
64 |
-
options.add_argument("--user-data-dir=/tmp/chrome-user-data")
|
65 |
-
options.add_argument("--window-size=1920,1080")
|
66 |
-
options.add_argument(
|
67 |
-
"user-agent=Mozilla/5.0 (X11; Linux x86_64) "
|
68 |
-
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
69 |
-
"Chrome/135.0.0.0 Safari/537.36"
|
70 |
-
)
|
71 |
-
# Selenium 4 Manager will auto‑download matching driver into SE_CACHE_PATH
|
72 |
-
# (ensure you set ENV SE_CACHE_PATH=/tmp/.cache/selenium in your Dockerfile)
|
73 |
-
return webdriver.Chrome(options=options)
|
74 |
|
75 |
|
76 |
def process_batch(start_id, end_id, worker_id):
|
@@ -101,9 +90,9 @@ def process_batch(start_id, end_id, worker_id):
|
|
101 |
|
102 |
# wait for title link
|
103 |
try:
|
104 |
-
wait.until(EC.presence_of_element_located(
|
105 |
-
|
106 |
-
))
|
107 |
except TimeoutException:
|
108 |
continue
|
109 |
|
@@ -120,9 +109,9 @@ def process_batch(start_id, end_id, worker_id):
|
|
120 |
except NoSuchElementException:
|
121 |
return ""
|
122 |
|
123 |
-
remixes
|
124 |
-
files
|
125 |
-
makes
|
126 |
comments = extract_aria("Comments")
|
127 |
|
128 |
tags = []
|
|
|
7 |
|
8 |
import pandas as pd
|
9 |
from selenium import webdriver
|
10 |
+
from selenium.webdriver.firefox.options import Options as FFOptions
|
11 |
from selenium.webdriver.common.by import By
|
12 |
from selenium.webdriver.support.ui import WebDriverWait
|
13 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
50 |
|
51 |
|
52 |
def initialize_driver():
    """Create a headless Firefox WebDriver with cache and downloads in /tmp.

    Returns:
        A ``webdriver.Firefox`` instance with its window sized to 1920x1080.

    Raises:
        Whatever Selenium raises if the browser cannot be started or the
        window cannot be resized; in the latter case the browser is shut
        down before the exception propagates.
    """
    options = FFOptions()
    # The ``options.headless`` setter was deprecated in Selenium 4.8 and
    # removed in 4.13; on newer releases the assignment merely creates an
    # unused attribute and Firefox tries to open a real window. The
    # command-line flag works on every Selenium 4 release.
    options.add_argument("-headless")

    # Keep the disk cache and any downloads under /tmp.
    options.set_preference("browser.cache.disk.parent_directory", "/tmp")
    options.set_preference("browser.cache.memory.enable", False)
    # folderList=2 tells Firefox to honour the custom browser.download.dir;
    # without it the directory preference below is ignored.
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.dir", "/tmp")

    driver = webdriver.Firefox(options=options)
    try:
        driver.set_window_size(1920, 1080)
    except Exception:
        # Do not leave an orphaned browser process behind if setup fails.
        driver.quit()
        raise
    return driver
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
|
65 |
def process_batch(start_id, end_id, worker_id):
|
|
|
90 |
|
91 |
# wait for title link
|
92 |
try:
|
93 |
+
wait.until(EC.presence_of_element_located((
|
94 |
+
By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
|
95 |
+
)))
|
96 |
except TimeoutException:
|
97 |
continue
|
98 |
|
|
|
109 |
except NoSuchElementException:
|
110 |
return ""
|
111 |
|
112 |
+
remixes = extract_aria("Remixes")
|
113 |
+
files = extract_aria("Files")
|
114 |
+
makes = extract_aria("Makes")
|
115 |
comments = extract_aria("Comments")
|
116 |
|
117 |
tags = []
|