michelerussoAA committed on
Commit
f9dd994
·
verified ·
1 Parent(s): 0f0ec44

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +16 -17
scraper.py CHANGED
@@ -7,24 +7,21 @@ from datetime import datetime
7
 
8
  import pandas as pd
9
  from selenium import webdriver
10
- from selenium.webdriver.firefox.options import Options as FFOptions
11
  from selenium.webdriver.common.by import By
12
  from selenium.webdriver.support.ui import WebDriverWait
13
  from selenium.webdriver.support import expected_conditions as EC
14
  from selenium.common.exceptions import (
15
  NoSuchElementException, TimeoutException, StaleElementReferenceException
16
  )
 
 
17
  from huggingface_hub import HfApi, HfFolder
18
 
19
  # Configuration: set via Space secrets
20
  HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
21
- HF_TOKEN = HfFolder.get_token()
22
-
23
 
24
  def upload_df_to_hf(df: pd.DataFrame, filename: str):
25
- """
26
- Upload a pandas DataFrame directly to HF dataset without writing to disk.
27
- """
28
  buffer = io.StringIO()
29
  df.to_csv(buffer, index=False)
30
  buffer.seek(0)
@@ -39,7 +36,6 @@ def upload_df_to_hf(df: pd.DataFrame, filename: str):
39
  )
40
  print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
41
 
42
-
43
  def safe_get_text(driver, xpath, retries=1, delay=0.5):
44
  for _ in range(retries):
45
  try:
@@ -48,20 +44,23 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
48
  time.sleep(delay)
49
  return ""
50
 
51
-
52
  def initialize_driver():
53
- options = FFOptions()
54
- options.headless = True
55
- # point Firefox cache & profile into /tmp
56
- options.set_preference("browser.cache.disk.parent_directory", "/tmp")
57
- options.set_preference("browser.cache.memory.enable", False)
58
- options.set_preference("browser.download.dir", "/tmp")
59
- # launch
60
- driver = webdriver.Firefox(options=options)
61
- driver.set_window_size(1920, 1080)
 
 
 
62
  return driver
63
 
64
 
 
65
  def process_batch(start_id, end_id, worker_id):
66
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
67
  try:
 
7
 
8
  import pandas as pd
9
  from selenium import webdriver
 
10
  from selenium.webdriver.common.by import By
11
  from selenium.webdriver.support.ui import WebDriverWait
12
  from selenium.webdriver.support import expected_conditions as EC
13
  from selenium.common.exceptions import (
14
  NoSuchElementException, TimeoutException, StaleElementReferenceException
15
  )
16
+ from selenium.webdriver.firefox.options import Options
17
+ from selenium.webdriver.firefox.service import Service
18
  from huggingface_hub import HfApi, HfFolder
19
 
20
  # Configuration: set via Space secrets
21
  HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
22
+ HF_TOKEN = HfFolder.get_token()
 
23
 
24
  def upload_df_to_hf(df: pd.DataFrame, filename: str):
 
 
 
25
  buffer = io.StringIO()
26
  df.to_csv(buffer, index=False)
27
  buffer.seek(0)
 
36
  )
37
  print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
38
 
 
39
  def safe_get_text(driver, xpath, retries=1, delay=0.5):
40
  for _ in range(retries):
41
  try:
 
44
  time.sleep(delay)
45
  return ""
46
 
 
47
  def initialize_driver():
48
+ # point at the geckodriver we baked into the image
49
+ service = Service("/usr/local/bin/geckodriver")
50
+ opts = Options()
51
+ opts.add_argument("--headless") # Firefox headless
52
+ opts.add_argument("--no-sandbox")
53
+ opts.add_argument("--disable-gpu")
54
+ opts.add_argument("--window-size=1920,1080")
55
+
56
+ # use a tmp profile directory
57
+ opts.add_argument(f"--profile=/tmp/firefox-profile-{os.getpid()}")
58
+
59
+ driver = webdriver.Firefox(service=service, options=opts)
60
  return driver
61
 
62
 
63
+
64
  def process_batch(start_id, end_id, worker_id):
65
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
66
  try: