# unit_1_quiz / data_to_parquet.py
import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import HfApi

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}
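
# Sketch (not part of the original file): the "huggingface" metadata embedded below is what
# the datasets library reads to reconstruct typed features when loading these parquet files.
# Assuming the schema dict above, an equivalent Features object could be built like this:
#
#     from datasets import Features
#     features = Features.from_dict(schema)
#     # Features({'username': Value(dtype='string'), 'unit1': Value(dtype='float64'), ...})
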
def to_parquet(api: HfApi, repo: str, username: str = "", unit1=0.0, unit2=0.0, unit3=0.0, unit4=0.0, certified=0):
    """Serialize one user's quiz scores into a single-row parquet file and upload it to a dataset repo."""
    data = {
        "username": username,
        "unit1": float(unit1),
        "unit2": float(unit2),
        "unit3": float(unit3),
        "unit4": float(unit4),
        "certified": certified,
    }
    # Export data to Arrow format (single-row table)
    table = pa.Table.from_pylist([data])
    # Add metadata (used by the datasets library to reconstruct typed features)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to a temporary parquet file (delete=False keeps the path valid for the upload)
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()
    api.upload_file(
        repo_id=repo,  # manually created dataset repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user gets their own parquet file
        path_or_fileobj=archive_file.name,
    )
    # Clean up the local temporary file once the upload has completed
    os.unlink(archive_file.name)
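

# Usage sketch (illustrative, not part of the original file): the repo id and username below
# are placeholders; any existing dataset repo you have write access to would work. Assumes a
# valid token is available (HF_TOKEN env var or `huggingface-cli login`).
if __name__ == "__main__":
    api = HfApi()
    to_parquet(
        api,
        repo="your-username/agents-course-unit1-scores",  # hypothetical dataset repo
        username="your-username",
        unit1=85.0,
        certified=0,
    )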