Spaces:
Build error
Build error
# tracklight_server/hf/pull.py
from datasets import load_dataset | |
from huggingface_hub import HfFolder | |
from ..db import duckdb | |
def pull_from_hub(repo_id: str, hf_token: str):
    """
    Pull a Hugging Face Dataset repo and merge its rows into the local DuckDB.

    The dataset's 'train' split is converted to a pandas DataFrame, staged in
    a scratch table (``temp_table``) shaped like ``duckdb.TABLE_NAME``, and
    then inserted into ``duckdb.TABLE_NAME`` with ``ON CONFLICT DO NOTHING``.

    Args:
        repo_id: Dataset repository identifier, passed to ``load_dataset``.
        hf_token: Hugging Face access token. When empty or ``None`` the
            token-saving step is skipped instead of storing an empty token.

    Returns:
        None. A failed load, a missing 'train' split, and an empty dataset
        are reported with ``print`` and end the call early. Errors raised
        while merging propagate to the caller after the scratch table has
        been dropped.
    """
    # Authenticate with Hugging Face.
    # NOTE(review): save_token stores the token through HfFolder rather than
    # scoping it to this call -- confirm that persisting it is intended.
    if hf_token:
        HfFolder.save_token(hf_token)

    try:
        # Load the dataset from the Hub
        dataset = load_dataset(repo_id)
    except Exception as e:
        print(f"Failed to load dataset from {repo_id}: {e}")
        return

    if 'train' not in dataset:
        print(f"No 'train' split found in the dataset {repo_id}.")
        return

    # Keep this local named `df`: the staging INSERT below refers to it by
    # name in SQL (presumably resolved by DuckDB's replacement scan of Python
    # variables -- verify `con` is a DuckDB connection before renaming).
    df = dataset['train'].to_pandas()
    if df.empty:
        print("No data to pull.")
        return

    # OR REPLACE (instead of IF NOT EXISTS) guarantees an empty scratch table
    # with the current schema even if an earlier run left one behind.
    create_staging = (
        "CREATE OR REPLACE TABLE temp_table AS "
        f"SELECT * FROM {duckdb.TABLE_NAME} WHERE 1=0"
    )
    # Rows whose keys already exist in the target are skipped, not updated.
    merge_staging = (
        f"INSERT INTO {duckdb.TABLE_NAME} "
        "SELECT * FROM temp_table ON CONFLICT DO NOTHING"
    )

    # Merge the data into the local DuckDB
    with duckdb.get_connection() as con:
        try:
            con.execute(create_staging)
            con.execute("INSERT INTO temp_table SELECT * FROM df")
            con.execute(merge_staging)
        finally:
            # Always clean up so a failed merge cannot leak the scratch table
            # (and its rows) into the next pull.
            con.execute("DROP TABLE IF EXISTS temp_table")

    print(f"Successfully pulled and merged data from {repo_id}")