import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub.hf_api import HfApi
from huggingface_hub import whoami
import json
import tempfile
# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    'username': {'_type': 'Value', 'dtype': 'string'},
    'unit1': {'_type': 'Value', 'dtype': 'float64'},
    'unit2': {'_type': 'Value', 'dtype': 'float64'},
    'unit3': {'_type': 'Value', 'dtype': 'float64'},
    'unit4': {'_type': 'Value', 'dtype': 'float64'},
    'certified': {'_type': 'Value', 'dtype': 'int64'},
}

def to_parquet(api, repo, username="", unit1=0., unit2=0., unit3=0., unit4=0., certified=0):
    """Build a single-row parquet file with one user's scores and upload it to the dataset repo."""
    data = {
        "username": username,
        "unit1": float(unit1),
        "unit2": float(unit2),
        "unit3": float(unit3),
        "unit4": float(unit4),
        "certified": certified,
    }
    # Export data to Arrow format
    table = pa.Table.from_pylist([data])
    # Add metadata (used by datasets library)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to parquet file
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()
    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
        path_or_fileobj=archive_file.name,
    )
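

# --- Usage sketch (illustrative, not part of the upstream file) ---
# A minimal example of how to_parquet might be called: it assumes a write-scoped
# Hugging Face token and an already-created dataset repo. The HF_TOKEN environment
# variable and the "your-username/unit-scores" repo id below are placeholders, not
# values taken from this Space.
if __name__ == "__main__":
    import os

    token = os.environ.get("HF_TOKEN")   # placeholder: any token with write access
    api = HfApi(token=token)
    user = whoami(token=token)["name"]   # record the uploader's own username
    to_parquet(
        api,
        repo="your-username/unit-scores",  # hypothetical, manually created dataset repo
        username=user,
        unit1=85.0,
        unit2=90.5,
        unit3=0.0,
        unit4=0.0,
        certified=0,
    )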