michelerussoAA committed on
Commit
55a067d
·
verified ·
1 Parent(s): e7192f8

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +16 -18
scraper.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import io
4
  import time
@@ -11,14 +10,13 @@ from selenium import webdriver
11
  from selenium.webdriver.common.by import By
12
  from selenium.webdriver.support.ui import WebDriverWait
13
  from selenium.webdriver.support import expected_conditions as EC
14
- from selenium.webdriver.support.expected_conditions import staleness_of
15
  from selenium.common.exceptions import (
16
  NoSuchElementException,
17
  TimeoutException,
18
  StaleElementReferenceException,
19
  )
20
- from selenium.webdriver.firefox.service import Service
21
- from selenium.webdriver.firefox.options import Options
22
 
23
  from huggingface_hub import HfApi, HfFolder
24
 
@@ -56,23 +54,23 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
56
 
57
 
58
  def initialize_driver():
59
- # point to the geckodriver binary you installed
60
- service = Service("/usr/local/bin/geckodriver")
61
-
62
- opts = Options()
63
- opts.headless = True
64
- # disable sandbox (in many container environments)
65
- opts.add_argument("--no-sandbox")
66
- opts.add_argument("--disable-gpu")
67
- # ensure a full window so responsive pages load properly
68
- opts.add_argument("--window-size=1920,1080")
69
-
70
- driver = webdriver.Firefox(service=service, options=opts)
 
 
71
  return driver
72
 
73
 
74
-
75
-
76
  def process_batch(start_id, end_id, worker_id):
77
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
78
  try:
 
 
1
  import os
2
  import io
3
  import time
 
10
  from selenium.webdriver.common.by import By
11
  from selenium.webdriver.support.ui import WebDriverWait
12
  from selenium.webdriver.support import expected_conditions as EC
 
13
  from selenium.common.exceptions import (
14
  NoSuchElementException,
15
  TimeoutException,
16
  StaleElementReferenceException,
17
  )
18
+ from selenium.webdriver.chrome.options import Options
19
+ from selenium.webdriver.chrome.service import Service
20
 
21
  from huggingface_hub import HfApi, HfFolder
22
 
 
54
 
55
 
56
  def initialize_driver():
57
+ # path to the chromedriver you installed in Dockerfile
58
+ service = Service("/usr/local/bin/chromedriver")
59
+
60
+ options = Options()
61
+ # tell Selenium exactly where Chrome itself lives
62
+ options.binary_location = "/usr/bin/google-chrome-stable"
63
+ options.add_argument("--headless=new")
64
+ options.add_argument("--disable-gpu")
65
+ options.add_argument("--no-sandbox")
66
+ options.add_argument("--disable-dev-shm-usage")
67
+ options.add_argument("--window-size=1920,1080")
68
+
69
+ # launch Chrome via the Service
70
+ driver = webdriver.Chrome(service=service, options=options)
71
  return driver
72
 
73
 
 
 
74
  def process_batch(start_id, end_id, worker_id):
75
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
76
  try: