Spaces:
Runtime error
Runtime error
Update scraper.py
Browse files- scraper.py +11 -11
scraper.py
CHANGED
@@ -49,35 +49,35 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
|
|
49 |
time.sleep(delay)
|
50 |
return ""
|
51 |
|
52 |
-
|
53 |
-
|
54 |
def initialize_driver():
|
55 |
options = Options()
|
56 |
-
|
|
|
57 |
|
58 |
-
#
|
59 |
options.add_argument("--disable-dev-shm-usage")
|
60 |
options.add_argument("--no-sandbox")
|
61 |
options.add_argument("--disable-gpu")
|
62 |
options.add_argument("--disable-software-rasterizer")
|
63 |
options.add_argument("--disable-setuid-sandbox")
|
64 |
|
65 |
-
#
|
66 |
options.add_argument("--remote-debugging-port=9222")
|
67 |
options.add_argument("--user-data-dir=/tmp/chrome-user-data")
|
68 |
-
options.add_argument("window-size=1920,1080")
|
69 |
options.add_argument(
|
70 |
-
"user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
71 |
-
"(KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
|
|
|
72 |
)
|
73 |
|
74 |
-
#
|
75 |
-
|
76 |
-
driver = webdriver.Chrome(service=service, options=options)
|
77 |
return driver
|
78 |
|
79 |
|
80 |
|
|
|
81 |
def process_batch(start_id, end_id, worker_id):
|
82 |
print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
|
83 |
try:
|
|
|
49 |
time.sleep(delay)
|
50 |
return ""
|
51 |
|
|
|
|
|
52 |
def initialize_driver():
    """Create and return a Chrome WebDriver configured via command-line flags.

    Every flag below is handed to ``Options.add_argument`` in the listed
    order, after which ``webdriver.Chrome`` is constructed from those options
    alone (no explicit ``service`` argument).

    Returns:
        The ``webdriver.Chrome`` instance that was created.

    NOTE(review): the remote-debugging port (9222) and the user-data
    directory (/tmp/chrome-user-data) are fixed values, so every driver
    built here is given the same ones -- confirm that callers never need
    two drivers alive at the same time.
    """
    chrome_flags = (
        # new headless mode in recent Chrome
        "--headless=new",
        # container flags
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-software-rasterizer",
        "--disable-setuid-sandbox",
        # fixed DevTools port; profile & cache live under /tmp
        "--remote-debugging-port=9222",
        "--user-data-dir=/tmp/chrome-user-data",
        "--window-size=1920,1080",
        # one argument, assembled by implicit string concatenation
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/135.0.0.0 Safari/537.36",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    # No Service object is passed: per the original note, ChromeDriver is
    # expected to already be at /usr/local/bin/chromedriver.
    return webdriver.Chrome(options=options)
|
77 |
|
78 |
|
79 |
|
80 |
+
|
81 |
def process_batch(start_id, end_id, worker_id):
|
82 |
print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
|
83 |
try:
|