michelerussoAA committed on
Commit
d6da54f
·
verified ·
1 Parent(s): 1324411

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +11 -11
scraper.py CHANGED
@@ -49,35 +49,35 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
49
  time.sleep(delay)
50
  return ""
51
 
52
-
53
-
54
  def initialize_driver():
55
  options = Options()
56
- options.headless = True
 
57
 
58
- # Linux container flags
59
  options.add_argument("--disable-dev-shm-usage")
60
  options.add_argument("--no-sandbox")
61
  options.add_argument("--disable-gpu")
62
  options.add_argument("--disable-software-rasterizer")
63
  options.add_argument("--disable-setuid-sandbox")
64
 
65
- # Use /tmp for profile & remote-debugging
66
  options.add_argument("--remote-debugging-port=9222")
67
  options.add_argument("--user-data-dir=/tmp/chrome-user-data")
68
- options.add_argument("window-size=1920,1080")
69
  options.add_argument(
70
- "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
71
- "(KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
 
72
  )
73
 
74
- # webdriver-manager will download the correct ChromeDriver version
75
- service = Service(ChromeDriverManager(path="/tmp/.wdm").install())
76
- driver = webdriver.Chrome(service=service, options=options)
77
  return driver
78
 
79
 
80
 
 
81
  def process_batch(start_id, end_id, worker_id):
82
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
83
  try:
 
49
  time.sleep(delay)
50
  return ""
51
 
 
 
52
  def initialize_driver():
53
  options = Options()
54
+ # new headless mode in recent Chrome
55
+ options.add_argument("--headless=new")
56
 
57
+ # container flags
58
  options.add_argument("--disable-dev-shm-usage")
59
  options.add_argument("--no-sandbox")
60
  options.add_argument("--disable-gpu")
61
  options.add_argument("--disable-software-rasterizer")
62
  options.add_argument("--disable-setuid-sandbox")
63
 
64
+ # use /tmp for profile & cache
65
  options.add_argument("--remote-debugging-port=9222")
66
  options.add_argument("--user-data-dir=/tmp/chrome-user-data")
67
+ options.add_argument("--window-size=1920,1080")
68
  options.add_argument(
69
+ "user-agent=Mozilla/5.0 (X11; Linux x86_64) "
70
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
71
+ "Chrome/135.0.0.0 Safari/537.36"
72
  )
73
 
74
+ # now ChromeDriver is already at /usr/local/bin/chromedriver
75
+ driver = webdriver.Chrome(options=options)
 
76
  return driver
77
 
78
 
79
 
80
+
81
  def process_batch(start_id, end_id, worker_id):
82
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
83
  try: