Ben Burtenshaw committed
Commit 1fdaf11 · 0 Parent(s):

first commit
app.py ADDED
@@ -0,0 +1,274 @@
+ import gradio as gr
+
+ from src import argilla_utils
+ from src import dataset
+ from src import spaces
+
+
+
+ def refresh_dataset_settings_view(
+     columns,
+     question_columns,
+     field_columns,
+     split,
+     settings,
+     dataset_name,
+     argilla_dataset_name,
+     mapping,
+ ):
+     """Refresh the Gradio application's state variables when a new dataset is loaded.
+
+     The incoming values are ignored; every state variable is reloaded from the
+     local dataset config on disk.
+     """
+     columns = dataset.load_columns()
+     question_columns = dataset.get_question_columns()
+     field_columns = dataset.get_field_columns()
+     split = dataset.load_split()
+     settings = None
+     dataset_name = dataset.load_dataset_name()
+     argilla_dataset_name = dataset.load_argilla_dataset_name()
+     mapping = None
+     return (
+         columns,
+         question_columns,
+         field_columns,
+         split,
+         settings,
+         dataset_name,
+         argilla_dataset_name,
+         mapping,
+     )
+
+
+ with gr.Blocks() as app:
+     ##############################################
+     # Define the app state
+     ##############################################
+
+     columns = gr.State(dataset.load_columns)
+     question_columns = gr.State(dataset.get_question_columns)
+     field_columns = gr.State(dataset.get_field_columns)
+     split = gr.State(dataset.load_split)
+     settings = gr.State(None)
+     dataset_name = gr.State(dataset.load_dataset_name)
+     argilla_dataset_name = gr.State(dataset.load_argilla_dataset_name)
+     mapping = gr.State(None)
+
+     state_variables = [
+         columns,
+         question_columns,
+         field_columns,
+         split,
+         settings,
+         dataset_name,
+         argilla_dataset_name,
+         mapping,
+     ]
+
+     ##############################################
+     # Define the app dataset and argilla space
+     ##############################################
+
+     gr.Markdown(
+         """# 🚂 Argilla Direct
+ A direct connection from a Hub dataset to an Argilla dataset.
+ This app allows you to create an Argilla dataset from a Hugging Face dataset.
+ You will need to load a dataset from the Hugging Face Hub, create an Argilla space,
+ define the dataset's settings, and add records to the dataset.
+ """
+     )
+
+     with gr.Group():
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     with gr.Column():
+                         dataset_name_input = gr.Textbox(
+                             label="Dataset Repo ID", value=dataset.load_dataset_name()
+                         )
+                     with gr.Column():
+                         split_input = gr.Dropdown(
+                             label="Dataset Split",
+                             choices=dataset.load_split_choices(),
+                             allow_custom_value=True,
+                             value=dataset.load_split(),
+                         )
+                 load_dataset_btn = gr.Button(value="1️⃣ Load Dataset")
+             with gr.Column():
+                 argilla_space_name = gr.Textbox(
+                     label="Argilla Space Name", value=f"{dataset_name.value}_argilla"
+                 )
+
+                 create_argilla_space_btn = gr.Button(value="2️⃣ Create Argilla Space")
+
+     ##############################################
+     # Define the Argilla dataset configuration
+     ##############################################
+
+     gr.Markdown(
+         """## 3️⃣ Define Argilla Dataset
+ Define the settings for the Argilla dataset including fields, questions, metadata, and vectors.
+ Select the columns from the Hugging Face dataset to be used as Argilla dataset attributes.
+ """
+     )
+
+     with gr.Row():
+         with gr.Group():
+             with gr.Column():
+                 # DATASET SETTINGS
+
+                 # Argilla dataset name
+                 argilla_dataset_name_view = gr.Textbox(
+                     label="Dataset Name",
+                     info="The name of the dataset in Argilla to be created or used",
+                     value=dataset.load_argilla_dataset_name(),
+                 )
+                 argilla_dataset_name_view.change(
+                     fn=lambda value: gr.update(
+                         value=dataset.load_argilla_dataset_name()
+                     ),
+                     inputs=[argilla_dataset_name_view],
+                     outputs=[argilla_dataset_name_view],
+                 )
+
+                 # Field columns
+                 field_columns_view = gr.Dropdown(
+                     label="Field Columns",
+                     info="Columns to be used as fields in the Argilla dataset",
+                     choices=dataset.load_columns(),
+                     multiselect=True,
+                     value=dataset.get_field_columns(),
+                     allow_custom_value=True,
+                 )
+                 field_columns_view.change(
+                     fn=lambda value: gr.update(choices=dataset.load_columns()),
+                     inputs=[field_columns_view],
+                     outputs=[field_columns_view],
+                 )
+
+                 # Question columns
+                 question_columns_view = gr.Dropdown(
+                     label="Question Columns",
+                     info="Columns to be used as question suggestions in the Argilla dataset",
+                     choices=dataset.load_columns(),
+                     multiselect=True,
+                     value=dataset.get_question_columns(),
+                     allow_custom_value=True,
+                 )
+
+                 question_columns_view.change(
+                     fn=lambda value: gr.update(choices=dataset.load_columns()),
+                     inputs=[question_columns_view],
+                     outputs=[question_columns_view],
+                 )
+
+                 with gr.Accordion(label="Define New Questions", open=False):
+                     with gr.Group():
+                         with gr.Column():
+                             question_type = gr.Dropdown(
+                                 label="Question Type",
+                                 info="The type of question to be added to the Argilla dataset",
+                                 choices=["Text", "Label", "Rating"],
+                             )
+                         with gr.Column():
+                             question_name = gr.Textbox(
+                                 label="Question Name",
+                                 info="The name of the question to be added to the Argilla dataset",
+                             )
+                         with gr.Column():
+                             gr.Button(value="Add Question").click(
+                                 fn=lambda qtype, qname, questions: questions
+                                 + [(qtype, qname)],
+                                 inputs=[
+                                     question_type,
+                                     question_name,
+                                     question_columns_view,
+                                 ],
+                                 outputs=[question_columns_view],
+                             )
+
+                 with gr.Accordion(label="Define Metadata and Vectors", open=False):
+                     metadata_columns_view = gr.Dropdown(
+                         label="Metadata Columns",
+                         info="Columns to be used as metadata in the Argilla dataset",
+                         choices=dataset.load_columns(),
+                         multiselect=True,
+                     )
+                     vector_columns_view = gr.Dropdown(
+                         label="Vector Columns",
+                         info="Columns to be used as vectors in the Argilla dataset",
+                         choices=dataset.load_columns(),
+                         multiselect=True,
+                     )
+
+                 n_records = gr.Slider(1, 10000, 100, label="Number of Records")
+                 create_argilla_dataset_btn = gr.Button(value="Create Argilla Dataset")
+                 add_records_btn = gr.Button(value="Add Records to Argilla")
+                 delete_dataset_btn = gr.Button(value="Delete Argilla Dataset")
+
+         with gr.Column():
+             dataset_view = gr.Dataframe(
+                 label="Dataset Viewer",
+                 column_widths="20%",
+                 headers=columns.value,
+                 wrap=True,
+             )
+             records_view = gr.Text(label="Status", value="")
+
+     ##############################################
+     # Define the app logic
+     ##############################################
+
+     load_dataset_btn.click(
+         fn=dataset.load_dataset_from_hub,
+         inputs=[dataset_name_input],
+         outputs=[dataset_view],
+     ).then(
+         fn=refresh_dataset_settings_view,
+         inputs=state_variables,
+         outputs=[
+             columns,
+             question_columns_view,
+             field_columns_view,
+             split_input,
+             settings,
+             dataset_name,
+             argilla_dataset_name_view,
+             mapping,
+         ],
+     )
+
+     create_argilla_space_btn.click(
+         fn=spaces.create_argilla_space,
+         inputs=[argilla_space_name],
+         outputs=[records_view],
+     )
+
+     delete_dataset_btn.click(
+         fn=argilla_utils.delete_dataset,
+         inputs=[argilla_dataset_name_view],
+         outputs=[records_view],
+     )
+
+     create_argilla_dataset_btn.click(
+         fn=argilla_utils.define_dataset_setting,
+         inputs=[
+             argilla_dataset_name_view,
+             field_columns_view,
+             question_columns_view,
+             metadata_columns_view,
+             vector_columns_view,
+         ],
+         outputs=[records_view, mapping],
+     )
+
+     add_records_btn.click(
+         fn=argilla_utils.add_records,
+         inputs=[argilla_dataset_name_view, mapping, n_records],
+         outputs=[records_view],
+     )
+
+
+ if __name__ == "__main__":
+     app.launch()
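
The UI above is a thin layer over the `src` modules, so the same flow can be exercised without Gradio. A minimal sketch, assuming the `src` package is importable and an Argilla instance is reachable from `argilla_utils.client`; the repo id and Space name are illustrative:

    from src import argilla_utils, dataset, spaces

    # 1. Pull a dataset from the Hub and cache it under dataset_dir/.
    dataset.load_dataset_from_hub("DIBT/10k_prompts_ranked")

    # 2. Duplicate the Argilla template Space (name is illustrative).
    spaces.create_argilla_space("my-user/my_argilla_space")

    # 3. Derive Argilla settings from the column types; keep the timestamped
    #    name so the same dataset is targeted in the next step.
    name = dataset.load_argilla_dataset_name()
    settings_str, mapping = argilla_utils.define_dataset_setting(
        name,
        field_columns=dataset.get_field_columns(),
        question_columns=dataset.get_question_columns(),
        metadata_columns=[],
        vector_columns=[],
    )

    # 4. Log the first 100 records using the column-to-attribute mapping.
    print(argilla_utils.add_records(name, mapping, 100))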
dataset_dir/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train"]}
dataset_dir/train/dataset_info.json ADDED
@@ -0,0 +1,94 @@
+ {
+   "builder_name": "parquet",
+   "citation": "",
+   "config_name": "default",
+   "dataset_name": "10k_prompts_ranked",
+   "dataset_size": 8711680,
+   "description": "",
+   "download_checksums": {
+     "hf://datasets/DIBT/10k_prompts_ranked@3a9e44c398d92681e58b5c8ad39502203a002bac/data/train-00000-of-00001.parquet": {
+       "num_bytes": 3579688,
+       "checksum": null
+     }
+   },
+   "download_size": 3579688,
+   "features": {
+     "prompt": {
+       "dtype": "string",
+       "id": "field",
+       "_type": "Value"
+     },
+     "quality": [
+       {
+         "user_id": {
+           "dtype": "string",
+           "id": "question",
+           "_type": "Value"
+         },
+         "value": {
+           "dtype": "string",
+           "id": "suggestion",
+           "_type": "Value"
+         },
+         "status": {
+           "dtype": "string",
+           "id": "question",
+           "_type": "Value"
+         }
+       }
+     ],
+     "metadata": {
+       "dtype": "string",
+       "id": "metadata",
+       "_type": "Value"
+     },
+     "avg_rating": {
+       "dtype": "float64",
+       "_type": "Value"
+     },
+     "num_responses": {
+       "dtype": "int64",
+       "_type": "Value"
+     },
+     "agreement_ratio": {
+       "dtype": "float64",
+       "_type": "Value"
+     },
+     "raw_responses": {
+       "feature": {
+         "dtype": "int64",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     },
+     "kind": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "cluster_description": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "topic": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": "",
+   "size_in_bytes": 12291368,
+   "splits": {
+     "train": {
+       "name": "train",
+       "num_bytes": 8711680,
+       "num_examples": 10331,
+       "dataset_name": "10k_prompts_ranked"
+     }
+   },
+   "version": {
+     "version_str": "0.0.0",
+     "major": 0,
+     "minor": 0,
+     "patch": 0
+   }
+ }
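
This `dataset_info.json` is what the column heuristics in `src/dataset.py` (further down) operate on: string `Value` features such as `prompt`, `metadata`, `kind`, `cluster_description`, and `topic` are classified as Argilla fields, and every other column becomes a question candidate. A quick check, assuming the dataset has already been saved under `dataset_dir/`:

    from src import dataset

    split = dataset.load_split()  # "train"
    for column in dataset.load_columns():
        role = "field" if dataset.is_field(split, column) else "question"
        print(f"{column}: {role}")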
src/__init__.py ADDED
File without changes
src/argilla_utils.py ADDED
@@ -0,0 +1,123 @@
+ import argilla_sdk as rg
+ from datasets import load_dataset
+
+ from src.dataset import (
+     load_split,
+     load_repo_id,
+     is_label,
+     is_rating,
+     is_int,
+     is_float,
+     get_feature_values,
+     get_feature_labels,
+ )
+
+ # Assumes a local Argilla instance running with the default owner credentials.
+ client = rg.Argilla(api_url="http://localhost:6900", api_key="owner.apikey")
+
+
+ def define_dataset_setting(
+     dataset_name, field_columns, question_columns, metadata_columns, vector_columns
+ ):
+     split = load_split()
+
+     fields, questions, metadata, vectors = [], [], [], []
+     mapping = {}
+
+     # Add field columns
+     for column_name in field_columns:
+         field_column_name = f"{column_name}_field"
+         fields.append(rg.TextField(name=field_column_name))
+         mapping[column_name] = field_column_name
+
+     # Add question columns
+     for column_name in question_columns:
+         if isinstance(column_name, (list, tuple)):
+             question_type, column_name = column_name
+         elif is_label(split, column_name):
+             question_type = "Label"
+         elif is_rating(split, column_name):
+             question_type = "Rating"
+         else:
+             question_type = "Text"
+
+         question_column_name = f"{column_name}_question"
+         if question_type == "Label":
+             values = get_feature_values(split, column_name)
+             titles = get_feature_labels(split, column_name)
+             labels = {str(value): title for value, title in zip(values, titles)}
+             questions.append(rg.LabelQuestion(name=question_column_name, labels=labels))
+         elif question_type == "Rating":
+             values = get_feature_values(split, column_name)
+             questions.append(
+                 rg.RatingQuestion(name=question_column_name, values=values)
+             )
+         else:
+             questions.append(rg.TextQuestion(name=question_column_name))
+
+         # Columns used as both field and question are mapped under a "__" suffix.
+         if column_name in mapping:
+             column_name = f"{column_name}__"
+         mapping[column_name] = question_column_name
+
+     # Add metadata columns
+     if not metadata_columns:
+         metadata_columns = []
+
+     for column_name in metadata_columns:
+         metadata_column_name = f"{column_name}_metadata"
+         if is_int(split, column_name):
+             metadata.append(rg.IntegerMetadataProperty(name=metadata_column_name))
+         elif is_float(split, column_name):
+             metadata.append(rg.FloatMetadataProperty(name=metadata_column_name))
+         else:
+             # Fall back to a terms property built from the column's unique values.
+             values = list(map(str, get_feature_values(split, column_name)))
+             metadata.append(
+                 rg.TermsMetadataProperty(name=metadata_column_name, options=values)
+             )
+         mapping[column_name] = metadata_column_name
+
+     # Add vector columns
+     if not vector_columns:
+         vector_columns = []
+
+     for column_name in vector_columns:
+         vectors.append(rg.VectorField(name=column_name))
+
+     settings = rg.Settings(
+         fields=fields, questions=questions, metadata=metadata, vectors=vectors
+     )
+
+     dataset = rg.Dataset(name=dataset_name, settings=settings, client=client)
+
+     if not dataset.exists():
+         dataset.create()
+
+     return str(settings.serialize()), mapping
+
+
+ def add_records(argilla_dataset_name, mapping, n_records):
+     split = load_split()
+     df = load_dataset(load_repo_id())[split].take(n_records).to_pandas()
+     dataset = client.datasets(argilla_dataset_name)
+     questions = dataset.settings.questions
+     for question in questions:
+         if question.name in mapping.values():
+             column_name = [k for k, v in mapping.items() if v == question.name][0]
+             column_name = column_name.removesuffix("__")
+             if is_label(split, column_name):
+                 df[column_name] = df[column_name].apply(str)
+     # Materialise the "__" duplicate columns created in define_dataset_setting.
+     for source, target in mapping.items():
+         if source.endswith("__"):
+             df[source] = df[source.removesuffix("__")]
+     records = df.to_dict(orient="records")
+     dataset.records.log(records, mapping=mapping)
+     return f"{len(df)} records added with mapping {mapping}"
+
+
+ def delete_dataset(argilla_dataset_name):
+     dataset = client.datasets(argilla_dataset_name)
+     dataset.delete()
+     return f"Dataset {argilla_dataset_name} deleted"
src/constants.py ADDED
@@ -0,0 +1,2 @@
+ LOCAL_DATASET_PATH = "dataset_dir"
+ LOCAL_CONFIG_PATH = "config_dir"
src/dataset.py ADDED
@@ -0,0 +1,163 @@
+ import os
+ import json
+ import shutil
+
+ from datetime import datetime
+
+ from datasets import load_dataset
+
+ from src.constants import LOCAL_CONFIG_PATH, LOCAL_DATASET_PATH
+
+
+ ##############################################
+ # Load the dataset from the Hub
+ ##############################################
+
+
+ def load_dataset_from_hub(dataset_name):
+     # Remove any previously cached copy of the dataset
+     if os.path.exists(LOCAL_DATASET_PATH):
+         shutil.rmtree(LOCAL_DATASET_PATH)
+     ds = load_dataset(dataset_name)
+     ds.save_to_disk(LOCAL_DATASET_PATH)
+     split = load_split()
+     columns = list(ds[split].features.keys())
+     df = ds[split].to_pandas()
+     with open(LOCAL_CONFIG_PATH, "w") as f:
+         json.dump({"columns": columns, "split": split, "name": dataset_name}, f)
+     return df
+
+
+ ##############################################
+ # Read the locally cached dataset config
+ ##############################################
+
+
+ def load_repo_id():
+     with open(LOCAL_CONFIG_PATH, "r") as f:
+         config = json.load(f)
+     return config["name"]
+
+
+ def load_dataset_dict_json(split):
+     dataset_info_fn = "dataset_info.json"
+     path = os.path.join(LOCAL_DATASET_PATH, split, dataset_info_fn)
+     with open(path, "r") as f:
+         return json.load(f)
+
+
+ def load_dataset_name():
+     dataset_dict = load_dataset_dict_json("train")
+     return dataset_dict["dataset_name"]
+
+
+ def load_argilla_dataset_name():
+     name = load_dataset_name()
+     now = datetime.now()
+     name = f"{name}_{now.strftime('%Y%m%d%H%M%S')}"
+     return name
+
+
+ def load_split_choices():
+     dataset_dict = load_dataset_dict_json("train")  # split metadata is stored with the train split
+     return list(dataset_dict["splits"].keys())
+
+
+ def load_split():
+     return load_split_choices()[0]
+
+
+ def load_columns():
+     dataset_dict = load_dataset_dict_json("train")
+     return list(dataset_dict["features"].keys())
+
+
+ def get_split_features(split):
+     dataset_dict = load_dataset_dict_json(split)
+     return dataset_dict["features"]
+
+
+ def get_feature_type(split, column_name):
+     features = get_split_features(split)
+     try:
+         return features[column_name]["_type"]
+     except TypeError:  # nested features (e.g. lists of dicts) have no top-level _type
+         return None
+
+
+ def get_feature_dtype(split, column_name):
+     features = get_split_features(split)
+     try:
+         return features[column_name]["dtype"]
+     except TypeError:  # nested features have no top-level dtype
+         return None
+
+
+ def is_field(split, column_name):
+     try:
+         return (
+             get_feature_dtype(split, column_name) == "string"
+             and get_feature_type(split, column_name) == "Value"
+         )
+     except KeyError:
+         return False
+
+
+ def is_label(split, column_name):
+     feature_type = get_feature_type(split, column_name)
+     return feature_type == "ClassLabel"
+
+
+ def is_float(split, column_name):
+     try:
+         feature_type = get_feature_type(split, column_name)
+         feature_dtype = get_feature_dtype(split, column_name)
+         return feature_type == "Value" and feature_dtype.startswith("float")
+     except KeyError:
+         return False
+
+
+ def is_int(split, column_name):
+     try:
+         feature_type = get_feature_type(split, column_name)
+         feature_dtype = get_feature_dtype(split, column_name)
+         return feature_type == "Value" and feature_dtype.startswith("int")
+     except KeyError:
+         return False
+
+
+ def get_feature_labels(split, column_name):
+     features = get_split_features(split)
+     return features[column_name]["names"]
+
+
+ def get_feature_values(split, column_name):
+     ds = load_dataset(load_repo_id())
+     return list(set(ds[split][column_name]))
+
+
+ def is_rating(split, column_name):
+     feature_values = get_feature_values(split, column_name)
+     if not is_int(split, column_name):
+         return False
+     if len(feature_values) > 10:  # ratings are small, discrete scales
+         return False
+     return True
+
+
+ def get_field_columns():
+     split = load_split()
+     columns = load_columns()
+     return [column for column in columns if is_field(split, column)]
+
+
+ def get_question_columns():
+     split = load_split()
+     columns = load_columns()
+     return [column for column in columns if not is_field(split, column)]
+
+
+ def load_dataset_df():
+     split = load_split()
+     ds = load_dataset(load_repo_id())
+     return ds[split].to_pandas()
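
These helpers read column types from the cached `dataset_info.json` rather than from the data itself, except `get_feature_values`, which reloads the Hub dataset to enumerate distinct values. Illustrative behaviour against the sample `dataset_info.json` above:

    from src import dataset

    split = dataset.load_split()
    dataset.get_feature_dtype(split, "avg_rating")    # "float64"
    dataset.get_feature_type(split, "raw_responses")  # "Sequence"
    dataset.get_feature_dtype(split, "quality")       # None: nested feature, no top-level dtype
    dataset.is_rating(split, "num_responses")         # True only if the column has <= 10 distinct values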
src/spaces.py ADDED
@@ -0,0 +1,11 @@
+ from huggingface_hub import duplicate_space
+
+
+ def create_argilla_space(target_argilla_space):
+     duplicate_space(
+         from_id="argilla/argilla-template-space",
+         to_id=target_argilla_space,
+         private=False,
+         exist_ok=True,
+     )
+     return target_argilla_space
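
`duplicate_space` copies the template Space into the caller's account, so it assumes a logged-in `huggingface_hub` session (for example via `huggingface-cli login`). Note that `argilla_utils.client` points at `http://localhost:6900`; to talk to the duplicated Space instead, the client would need to target the Space's URL. A hedged sketch, with an illustrative Space id and URL, and assuming the template's default API key has not been changed:

    import argilla_sdk as rg
    from src.spaces import create_argilla_space

    space_id = create_argilla_space("my-user/my_argilla_space")
    client = rg.Argilla(
        api_url="https://my-user-my-argilla-space.hf.space",  # illustrative Space URL
        api_key="owner.apikey",  # same default key used in argilla_utils
    )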