Spaces:
Runtime error
Runtime error
Update scraper.py
Browse files- scraper.py +21 -9
scraper.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import os
|
2 |
import io
|
3 |
import time
|
@@ -10,18 +11,26 @@ from selenium import webdriver
|
|
10 |
from selenium.webdriver.common.by import By
|
11 |
from selenium.webdriver.support.ui import WebDriverWait
|
12 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
13 |
from selenium.common.exceptions import (
|
14 |
-
NoSuchElementException,
|
|
|
|
|
15 |
)
|
16 |
-
from selenium.webdriver.firefox.options import Options
|
17 |
from selenium.webdriver.firefox.service import Service
|
|
|
|
|
18 |
from huggingface_hub import HfApi, HfFolder
|
19 |
|
20 |
# Configuration: set via Space secrets
|
21 |
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
|
22 |
-
HF_TOKEN
|
|
|
23 |
|
24 |
def upload_df_to_hf(df: pd.DataFrame, filename: str):
|
|
|
|
|
|
|
25 |
buffer = io.StringIO()
|
26 |
df.to_csv(buffer, index=False)
|
27 |
buffer.seek(0)
|
@@ -32,10 +41,11 @@ def upload_df_to_hf(df: pd.DataFrame, filename: str):
|
|
32 |
repo_id=HF_REPO_ID,
|
33 |
repo_type="dataset",
|
34 |
token=HF_TOKEN,
|
35 |
-
create_pr=False
|
36 |
)
|
37 |
print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
|
38 |
|
|
|
39 |
def safe_get_text(driver, xpath, retries=1, delay=0.5):
|
40 |
for _ in range(retries):
|
41 |
try:
|
@@ -44,23 +54,25 @@ def safe_get_text(driver, xpath, retries=1, delay=0.5):
|
|
44 |
time.sleep(delay)
|
45 |
return ""
|
46 |
|
|
|
47 |
def initialize_driver():
|
48 |
-
# point
|
49 |
service = Service("/usr/local/bin/geckodriver")
|
|
|
50 |
opts = Options()
|
51 |
-
opts.
|
|
|
52 |
opts.add_argument("--no-sandbox")
|
53 |
opts.add_argument("--disable-gpu")
|
|
|
54 |
opts.add_argument("--window-size=1920,1080")
|
55 |
|
56 |
-
# use a tmp profile directory
|
57 |
-
opts.add_argument(f"--profile=/tmp/firefox-profile-{os.getpid()}")
|
58 |
-
|
59 |
driver = webdriver.Firefox(service=service, options=opts)
|
60 |
return driver
|
61 |
|
62 |
|
63 |
|
|
|
64 |
def process_batch(start_id, end_id, worker_id):
|
65 |
print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
|
66 |
try:
|
|
|
1 |
+
|
2 |
import os
|
3 |
import io
|
4 |
import time
|
|
|
11 |
from selenium.webdriver.common.by import By
|
12 |
from selenium.webdriver.support.ui import WebDriverWait
|
13 |
from selenium.webdriver.support import expected_conditions as EC
|
14 |
+
from selenium.webdriver.support.expected_conditions import staleness_of
|
15 |
from selenium.common.exceptions import (
|
16 |
+
NoSuchElementException,
|
17 |
+
TimeoutException,
|
18 |
+
StaleElementReferenceException,
|
19 |
)
|
|
|
20 |
from selenium.webdriver.firefox.service import Service
|
21 |
+
from selenium.webdriver.firefox.options import Options
|
22 |
+
|
23 |
from huggingface_hub import HfApi, HfFolder
|
24 |
|
25 |
# Configuration: set via Space secrets
|
26 |
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
|
27 |
+
HF_TOKEN = HfFolder.get_token()
|
28 |
+
|
29 |
|
30 |
def upload_df_to_hf(df: pd.DataFrame, filename: str):
|
31 |
+
"""
|
32 |
+
Upload a pandas DataFrame directly to HF dataset without writing to disk.
|
33 |
+
"""
|
34 |
buffer = io.StringIO()
|
35 |
df.to_csv(buffer, index=False)
|
36 |
buffer.seek(0)
|
|
|
41 |
repo_id=HF_REPO_ID,
|
42 |
repo_type="dataset",
|
43 |
token=HF_TOKEN,
|
44 |
+
create_pr=False,
|
45 |
)
|
46 |
print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
|
47 |
|
48 |
+
|
49 |
def safe_get_text(driver, xpath, retries=1, delay=0.5):
|
50 |
for _ in range(retries):
|
51 |
try:
|
|
|
54 |
time.sleep(delay)
|
55 |
return ""
|
56 |
|
57 |
+
|
58 |
def initialize_driver():
    """Create a headless Firefox WebDriver backed by the local geckodriver.

    Returns:
        A ``webdriver.Firefox`` instance with a 1920x1080 window. The caller
        owns the driver and must call ``quit()`` on it when finished.

    Raises:
        Whatever ``webdriver.Firefox(...)`` or ``set_window_size`` raises; if
        resizing fails the freshly started browser is shut down first so no
        orphaned Firefox process is left behind.
    """
    # point to the geckodriver binary you installed
    service = Service("/usr/local/bin/geckodriver")

    opts = Options()
    # Request headless mode through a command-line flag. The previous
    # ``opts.headless = True`` setter was deprecated in Selenium 4.8 and
    # removed in later 4.x releases, where the assignment has no effect and
    # Firefox tries to open a real window (fatal without a display).
    opts.add_argument("-headless")
    # NOTE(review): the three switches below are Chromium-style flags; they
    # are kept unchanged from the original, but Firefox presumably ignores
    # them -- confirm before relying on them.
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")

    driver = webdriver.Firefox(service=service, options=opts)
    try:
        # ensure a full window so responsive pages load properly; done via
        # the WebDriver API so it does not depend on browser-specific flags
        driver.set_window_size(1920, 1080)
    except Exception:
        driver.quit()
        raise
    return driver
|
72 |
|
73 |
|
74 |
|
75 |
+
|
76 |
def process_batch(start_id, end_id, worker_id):
|
77 |
print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
|
78 |
try:
|