# unit_1_quiz / data_to_parquet.py
import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import HfApi

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}
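
# Sketch (not part of the original file): the "huggingface" metadata embedded below is what
# the datasets library reads to reconstruct typed features when loading these parquet files.
# Assuming the schema dict above, an equivalent Features object could be built like this:
#
#     from datasets import Features
#     features = Features.from_dict(schema)
#     # Features({'username': Value(dtype='string'), 'unit1': Value(dtype='float64'), ...})
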
def to_parquet(api: HfApi, repo: str, username: str = "", unit1=0.0, unit2=0.0, unit3=0.0, unit4=0.0, certified=0):
    """Serialize one user's quiz scores into a single-row parquet file and upload it to a dataset repo."""
    data = {
        "username": username,
        "unit1": float(unit1),
        "unit2": float(unit2),
        "unit3": float(unit3),
        "unit4": float(unit4),
        "certified": certified,
    }
    # Export data to Arrow format (single-row table)
    table = pa.Table.from_pylist([data])
    # Add metadata (used by the datasets library to reconstruct typed features)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to a temporary parquet file (delete=False keeps the path valid for the upload)
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()
    api.upload_file(
        repo_id=repo,  # manually created dataset repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user gets their own parquet file
        path_or_fileobj=archive_file.name,
    )
    # Clean up the local temporary file once the upload has completed
    os.unlink(archive_file.name)
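

# Usage sketch (illustrative, not part of the original file): the repo id and username below
# are placeholders; any existing dataset repo you have write access to would work. Assumes a
# valid token is available (HF_TOKEN env var or `huggingface-cli login`).
if __name__ == "__main__":
    api = HfApi()
    to_parquet(
        api,
        repo="your-username/agents-course-unit1-scores",  # hypothetical dataset repo
        username="your-username",
        unit1=85.0,
        certified=0,
    )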