not-lain committed on
Commit b20c417 · verified · 1 Parent(s): c6d4059

Update data_to_parquet.py

Files changed (1)
  1. data_to_parquet.py +28 -21
data_to_parquet.py CHANGED
@@ -1,45 +1,52 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
-from huggingface_hub.hf_api import HfApi
-from huggingface_hub import whoami
 import json
 import tempfile


 # current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
-schema = {'username': {'_type': 'Value', 'dtype': 'string'},
-'unit1': {'_type': 'Value', 'dtype': 'float64'},
-'unit2': {'_type': 'Value', 'dtype': 'float64'},
-'unit3': {'_type': 'Value', 'dtype': 'float64'},
-'unit4': {'_type': 'Value', 'dtype': 'float64'},
-'certified': {'_type': 'Value', 'dtype': 'int64'},
-}
+schema = {
+    "username": {"_type": "Value", "dtype": "string"},
+    "unit1": {"_type": "Value", "dtype": "float64"},
+    "unit2": {"_type": "Value", "dtype": "float64"},
+    "unit3": {"_type": "Value", "dtype": "float64"},
+    "unit4": {"_type": "Value", "dtype": "float64"},
+    "certified": {"_type": "Value", "dtype": "int64"},
+}


-
-def to_parquet(api,repo,username="",unit1=0.,unit2=0.,unit3=0.,unit4=0.,certified=0):
+def to_parquet(
+    api,
+    repo: str,
+    username: str = "",
+    unit1: float = 0.0,
+    unit2: float = 0.0,
+    unit3: float = 0.0,
+    unit4: float = 0.0,
+    certified: int = 0,
+):
     data = {
         "username": username,
-        "unit1": float(unit1),
-        "unit2" : float(unit2),
-        "unit3" : float(unit3),
-        "unit4" : float(unit4),
-        "certified" : certified,
+        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
+        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
+        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
+        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
+        "certified": certified,
     }
     # Export data to Arrow format
     table = pa.Table.from_pylist([data])
     # Add metadata (used by datasets library)
     table = table.replace_schema_metadata(
-        {"huggingface": json.dumps({"info": {"features": schema}})}
-    )
+        {"huggingface": json.dumps({"info": {"features": schema}})}
+    )
     # Write to parquet file
     archive_file = tempfile.NamedTemporaryFile(delete=False)
     pq.write_table(table, archive_file.name)
     archive_file.close()

     api.upload_file(
-        repo_id=repo, # manually created repo
+        repo_id=repo,  # manually created repo
         repo_type="dataset",
-        path_in_repo=f"{username}.parquet", # each user will have their own parquet
+        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
         path_or_fileobj=archive_file.name,
-    )
+    )
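
For reference, a minimal sketch of how the updated to_parquet might be called. The token, repo id, username, and score values below are placeholders for illustration and are not part of this commit; note that with this change, non-zero unit scores are multiplied by 100 before being written.

from huggingface_hub import HfApi

# Placeholder token and repo id, shown only to illustrate the call signature.
api = HfApi(token="hf_xxx")  # token needs write access to the dataset repo

to_parquet(
    api,
    repo="your-org/course-scores",  # manually created dataset repo (placeholder name)
    username="alice",
    unit1=0.83,   # stored as 83.0 after the * 100 scaling introduced in this commit
    unit2=0.0,    # zero scores stay 0.0
    unit3=0.91,
    unit4=0.76,
    certified=1,
)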
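The schema metadata attached via replace_schema_metadata is what lets the datasets library recover typed features from the uploaded parquet file. A quick way to sanity-check it locally, assuming a file produced by to_parquet (the file name below is a placeholder):

import json
import pyarrow.parquet as pq

# "alice.parquet" is a hypothetical local copy of a file written by to_parquet().
table = pq.read_table("alice.parquet")
features = json.loads(table.schema.metadata[b"huggingface"])["info"]["features"]
print(features["unit1"])  # expected: {'_type': 'Value', 'dtype': 'float64'}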