michelerussoAA committed on
Commit
e7192f8
·
verified ·
1 Parent(s): f9dd994

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +21 -9
scraper.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import io
3
  import time
@@ -10,18 +11,26 @@ from selenium import webdriver
10
  from selenium.webdriver.common.by import By
11
  from selenium.webdriver.support.ui import WebDriverWait
12
  from selenium.webdriver.support import expected_conditions as EC
 
13
  from selenium.common.exceptions import (
14
- NoSuchElementException, TimeoutException, StaleElementReferenceException
 
 
15
  )
16
- from selenium.webdriver.firefox.options import Options
17
  from selenium.webdriver.firefox.service import Service
 
 
18
  from huggingface_hub import HfApi, HfFolder
19
 
20
  # Configuration: set via Space secrets
21
  HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
22
- HF_TOKEN = HfFolder.get_token()
 
23
 
24
  def upload_df_to_hf(df: pd.DataFrame, filename: str):
 
 
 
25
  buffer = io.StringIO()
26
  df.to_csv(buffer, index=False)
27
  buffer.seek(0)
@@ -32,10 +41,11 @@ def upload_df_to_hf(df: pd.DataFrame, filename: str):
32
  repo_id=HF_REPO_ID,
33
  repo_type="dataset",
34
  token=HF_TOKEN,
35
- create_pr=False
36
  )
37
  print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
38
 
 
39
  def safe_get_text(driver, xpath, retries=1, delay=0.5):
40
  for _ in range(retries):
41
  try:
@@ -44,23 +54,25 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
44
  time.sleep(delay)
45
  return ""
46
 
 
47
  def initialize_driver():
48
- # point at the geckodriver we baked into the image
49
  service = Service("/usr/local/bin/geckodriver")
 
50
  opts = Options()
51
- opts.add_argument("--headless") # Firefox headless
 
52
  opts.add_argument("--no-sandbox")
53
  opts.add_argument("--disable-gpu")
 
54
  opts.add_argument("--window-size=1920,1080")
55
 
56
- # use a tmp profile directory
57
- opts.add_argument(f"--profile=/tmp/firefox-profile-{os.getpid()}")
58
-
59
  driver = webdriver.Firefox(service=service, options=opts)
60
  return driver
61
 
62
 
63
 
 
64
  def process_batch(start_id, end_id, worker_id):
65
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
66
  try:
 
1
+
2
  import os
3
  import io
4
  import time
 
11
  from selenium.webdriver.common.by import By
12
  from selenium.webdriver.support.ui import WebDriverWait
13
  from selenium.webdriver.support import expected_conditions as EC
14
+ from selenium.webdriver.support.expected_conditions import staleness_of
15
  from selenium.common.exceptions import (
16
+ NoSuchElementException,
17
+ TimeoutException,
18
+ StaleElementReferenceException,
19
  )
 
20
  from selenium.webdriver.firefox.service import Service
21
+ from selenium.webdriver.firefox.options import Options
22
+
23
  from huggingface_hub import HfApi, HfFolder
24
 
25
  # Configuration: set via Space secrets
26
  HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
27
+ HF_TOKEN = HfFolder.get_token()
28
+
29
 
30
  def upload_df_to_hf(df: pd.DataFrame, filename: str):
31
+ """
32
+ Upload a pandas DataFrame directly to HF dataset without writing to disk.
33
+ """
34
  buffer = io.StringIO()
35
  df.to_csv(buffer, index=False)
36
  buffer.seek(0)
 
41
  repo_id=HF_REPO_ID,
42
  repo_type="dataset",
43
  token=HF_TOKEN,
44
+ create_pr=False,
45
  )
46
  print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
47
 
48
+
49
  def safe_get_text(driver, xpath, retries=1, delay=0.5):
50
  for _ in range(retries):
51
  try:
 
54
  time.sleep(delay)
55
  return ""
56
 
57
+
58
  def initialize_driver():
59
+ # point to the geckodriver binary you installed
60
  service = Service("/usr/local/bin/geckodriver")
61
+
62
  opts = Options()
63
+ opts.headless = True
64
+ # disable sandbox (in many container environments)
65
  opts.add_argument("--no-sandbox")
66
  opts.add_argument("--disable-gpu")
67
+ # ensure a full window so responsive pages load properly
68
  opts.add_argument("--window-size=1920,1080")
69
 
 
 
 
70
  driver = webdriver.Firefox(service=service, options=opts)
71
  return driver
72
 
73
 
74
 
75
+
76
  def process_batch(start_id, end_id, worker_id):
77
  print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
78
  try: