# mys's picture
# Upload folder using huggingface_hub
# 1c75c98 verified
# tracklight_server/hf/pull.py
from datasets import load_dataset
from huggingface_hub import HfFolder
from ..db import duckdb
def pull_from_hub(repo_id: str, hf_token: str) -> None:
    """
    Pulls data from a Hugging Face Dataset repo and merges it into the local DuckDB.

    The 'train' split of the dataset is converted to a pandas DataFrame,
    copied into a staging table shaped like ``duckdb.TABLE_NAME``, and then
    inserted into ``duckdb.TABLE_NAME`` with ``ON CONFLICT DO NOTHING``.

    Args:
        repo_id: Dataset repo identifier passed to ``load_dataset``.
        hf_token: Hugging Face access token, stored with
            ``HfFolder.save_token`` before the dataset is loaded.

    Returns:
        None. If the dataset cannot be loaded, has no 'train' split, or the
        split is empty, a message is printed and the function returns without
        touching the database. Errors raised by the database statements are
        not caught here and propagate to the caller.
    """
    # Authenticate with Hugging Face.
    # NOTE(review): save_token stores the token for subsequent Hub calls
    # rather than scoping it to this one download -- confirm that is intended.
    HfFolder.save_token(hf_token)

    try:
        # Load the dataset from the Hub
        dataset = load_dataset(repo_id)
    except Exception as e:
        # Best-effort pull: report the failure and leave the local DB as-is.
        print(f"Failed to load dataset from {repo_id}: {e}")
        return

    if 'train' not in dataset:
        print(f"No 'train' split found in the dataset {repo_id}.")
        return

    # Keep this local named `df`: the INSERT below refers to it by name in the
    # SQL text, so renaming the variable would break that statement.
    df = dataset['train'].to_pandas()
    if df.empty:
        print("No data to pull.")
        return

    # Merge the data into the local DuckDB
    with duckdb.get_connection() as con:
        try:
            # CREATE OR REPLACE (instead of IF NOT EXISTS) guarantees an empty
            # staging table even when an earlier run failed before its DROP;
            # otherwise leftover rows would be merged again.
            # WHERE 1=0 copies the column layout of the target without rows.
            con.execute(f"CREATE OR REPLACE TABLE temp_table AS SELECT * FROM {duckdb.TABLE_NAME} WHERE 1=0")
            # NOTE(review): SELECT * is positional, so this presumably relies
            # on the DataFrame columns matching the table's column order --
            # verify against the push side.
            con.execute("INSERT INTO temp_table SELECT * FROM df")
            # NOTE(review): duplicate handling assumes the target table
            # declares a primary key / unique constraint -- confirm in the
            # schema definition.
            con.execute(f"INSERT INTO {duckdb.TABLE_NAME} SELECT * FROM temp_table ON CONFLICT DO NOTHING")
        finally:
            # Always remove the staging table, including on failure, so it
            # never lingers in the database between runs.
            con.execute("DROP TABLE IF EXISTS temp_table")

    print(f"Successfully pulled and merged data from {repo_id}")