import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub.hf_api import HfApi
from huggingface_hub import whoami
import json
import tempfile


# Current schema (see https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}



def to_parquet(api, repo, username="", unit1=0., unit2=0., unit3=0., unit4=0., certified=0):
    # Build a single record; cast numeric fields so they match the declared schema
    data = {
        "username": username,
        "unit1": float(unit1),
        "unit2": float(unit2),
        "unit3": float(unit3),
        "unit4": float(unit4),
        "certified": int(certified),
    }
    # Export data to Arrow format (one-row table)
    table = pa.Table.from_pylist([data])
    # Add metadata (used by the datasets library to recover the features schema)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to a temporary parquet file
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()

    api.upload_file(
        repo_id=repo,  # manually created dataset repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user gets their own parquet file
        path_or_fileobj=archive_file.name,
    )
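

# Usage sketch (illustrative only): assumes a dataset repo such as
# "your-org/your-results" was already created on the Hub and that a valid
# token is available (e.g. via `huggingface-cli login` or the HF_TOKEN env var).
if __name__ == "__main__":
    api = HfApi()
    user = whoami()["name"]  # resolve the username from the authenticated token
    to_parquet(
        api,
        repo="your-org/your-results",  # hypothetical repo id, replace with your own
        username=user,
        unit1=95.0,
        unit2=88.5,
        unit3=0.0,
        unit4=0.0,
        certified=0,
    )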