# Dataset-uploader script for a Hugging Face Space
# (scraped "Spaces: Sleeping" page-status header removed).
import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import whoami
from huggingface_hub.hf_api import HfApi
# Current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info).
# Maps each parquet column to its `datasets` feature spec; embedded as table
# metadata below so the datasets library restores typed features on load.
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}
def to_parquet(api, repo, username="", unit1=0., unit2=0., unit3=0., unit4=0., certified=0):
    """Serialize one user's unit scores to parquet and upload to a HF dataset repo.

    Args:
        api: authenticated ``HfApi`` client used for the upload.
        repo: id of the (manually created) dataset repo to upload into.
        username: HF username; also used as the parquet file name in the repo,
            so each user gets their own file.
        unit1, unit2, unit3, unit4: per-unit scores, coerced to float.
        certified: certification flag, stored as an int.
    """
    data = {
        "username": username,
        "unit1": float(unit1),
        "unit2": float(unit2),
        "unit3": float(unit3),
        "unit4": float(unit4),
        "certified": certified,
    }
    # Export the single row to Arrow format.
    table = pa.Table.from_pylist([data])
    # Attach metadata (used by the `datasets` library to restore typed features).
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Reserve a temp path and close the handle before pyarrow writes to it
    # (an open handle on the same path breaks on Windows). delete=False keeps
    # the file alive for the upload; the finally block removes it so repeated
    # calls do not leak temp files.
    archive_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
    archive_file.close()
    try:
        pq.write_table(table, archive_file.name)
        api.upload_file(
            repo_id=repo,  # manually created repo
            repo_type="dataset",
            path_in_repo=f"{username}.parquet",  # each user will have their own parquet
            path_or_fileobj=archive_file.name,
        )
    finally:
        os.unlink(archive_file.name)