# tracklight_server/hf/pull.py
from datasets import load_dataset
from huggingface_hub import HfFolder

from ..db import duckdb


def pull_from_hub(repo_id: str, hf_token: str) -> None:
    """Pull a Hugging Face dataset repo and merge it into the local DuckDB.

    The 'train' split of the dataset is converted to a pandas DataFrame,
    staged in a scratch table named ``temp_table``, and then inserted into
    ``duckdb.TABLE_NAME`` with ``ON CONFLICT DO NOTHING``.

    Failures to load the dataset, a missing 'train' split, and an empty
    split are reported with ``print`` and the function returns without
    touching the database. Errors raised while merging propagate to the
    caller after the staging table has been dropped.

    Args:
        repo_id: Identifier of the dataset repo passed to ``load_dataset``.
        hf_token: Hugging Face access token; it is saved via
            ``HfFolder.save_token`` before the dataset is loaded.
    """
    # Authenticate with Hugging Face.
    # NOTE(review): save_token stores the token for later use by the
    # Hugging Face libraries rather than passing it per call -- confirm
    # that persisting it is intended.
    HfFolder.save_token(hf_token)

    try:
        # Load the dataset from the Hub
        dataset = load_dataset(repo_id)
    except Exception as e:
        # Best-effort pull: report the failure and leave the local DB as is.
        print(f"Failed to load dataset from {repo_id}: {e}")
        return

    if 'train' not in dataset:
        print(f"No 'train' split found in the dataset {repo_id}.")
        return

    # Keep this local named `df`: the INSERT below refers to it by name.
    # Presumably DuckDB resolves it from the Python scope (assumes `con` is
    # a DuckDB connection -- TODO confirm against ..db.duckdb).
    df = dataset['train'].to_pandas()

    if df.empty:
        print("No data to pull.")
        return

    # Merge the data into the local DuckDB
    with duckdb.get_connection() as con:
        # A staging table left behind by an earlier failed run must not be
        # reused, otherwise its stale rows would be merged again.
        con.execute("DROP TABLE IF EXISTS temp_table")
        try:
            # Empty table with the same columns as the target table.
            con.execute(
                f"CREATE TABLE temp_table AS "
                f"SELECT * FROM {duckdb.TABLE_NAME} WHERE 1=0"
            )
            con.execute("INSERT INTO temp_table SELECT * FROM df")
            # Rows that conflict with existing rows are skipped.
            con.execute(
                f"INSERT INTO {duckdb.TABLE_NAME} "
                f"SELECT * FROM temp_table ON CONFLICT DO NOTHING"
            )
        finally:
            # Always remove the staging table, even when the merge failed.
            con.execute("DROP TABLE IF EXISTS temp_table")

    print(f"Successfully pulled and merged data from {repo_id}")