ksatzke committed (verified)
Commit 3485f9a · 1 Parent(s): e6a0876

Delete app.py

Files changed (1)
app.py +0 -299
app.py DELETED
@@ -1,299 +0,0 @@
- from pathlib import Path
- import json
- import math
- import statistics
- import sys
- import time
-
- from datasets import concatenate_datasets, Dataset, load_dataset
-
- import pandas as pd
- import numpy as np
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
- from evaluate import load
-
-
- # 1. record each file name included
- # 1.1 read different file formats depending on parameters (i.e., filetype)
- # 2. determine column types and report how many rows hold each type (format check)
- #    (in a well-formatted dataset, each column should only have one type)
- # 3. report on the null values
- # 4. for certain column types, report statistics
- # 4.1 uniqueness: if all rows are drawn from a small set (< 10) of <string> values, treat the column as 'categorical'
- # 4.2 strings: length ranges
- # 4.3 lists: length ranges
- # 4.4 int/float/double: percentiles, min, max, mean
-
- CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
- CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
-
- # permille cut points: 1 -> 0.1th percentile, 500 -> median, 999 -> 99.9th percentile
- PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
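-
- # For illustration (hypothetical values, not produced by this commit), the
- # per-column report assembled below looks like
- # {"cell_types": {"<class 'str'>": 3668},
- #  "cell_length_ranges": {"<class 'str'>": {"38-235": 1205, ...}},
- #  "cell_categories": {}, "cell_stats": {}, "cell_missing": 0}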
-
- def read_data(all_files, filetype):
-     df = None
-
-     if filetype in ["parquet", "csv", "json"]:
-         if filetype == "parquet":
-             read_func = pd.read_parquet
-         elif filetype == "csv":
-             read_func = pd.read_csv
-         elif filetype == "json":
-             read_func = pd.read_json
-
-         df = pd.concat(read_func(f) for f in all_files)
-
-     elif filetype == "arrow":
-         ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
-         df = pd.DataFrame(data=ds)
-
-     elif filetype == "jsonl":
-         all_lines = []
-         for fname in all_files:
-             with open(fname, "r") as f:
-                 all_lines.extend(f.readlines())
-
-         df = pd.DataFrame([json.loads(line) for line in all_lines])
-
-     return df
-
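- # Example (hypothetical paths): read every JSONL shard under data/ with
- # read_data(sorted(Path("data").glob("*.jsonl")), "jsonl")
-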
- def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
-     cell_length_ranges = {}
-     string_categorical = ""
-     if 0 < len(cell_unique_string_values) <= 10:
-         # with at most 10 unique values this is probably a 'categorical'
-         # (i.e., 'classes' in HuggingFace) column, so no need to treat it
-         # as a normal string
-         string_categorical = str(len(cell_unique_string_values)) + " class(es)"
-
-     elif cell_lengths:
-         cell_lengths = sorted(cell_lengths)
-         min_val = cell_lengths[0]
-         max_val = cell_lengths[-1]
-         if min_val != max_val:
-             # split [min_val, max_val] into at most ten equal-width buckets
-             distance = math.ceil((max_val - min_val) / 10.0)
-             ranges = list(range(min_val, max_val, distance))
-             for j in range(len(ranges) - 1):
-                 cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
-             ranges.append(max_val)
-
-             # walk the sorted lengths once, flushing each bucket's count as we
-             # pass its upper bound
-             j = 1
-             c = 0
-             for k in cell_lengths:
-                 while j < len(ranges) - 1 and k >= ranges[j]:
-                     cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
-                     j += 1
-                     c = 0
-                 c += 1
-
-             cell_length_ranges[str(ranges[j - 1]) + "-" + str(max_val)] = c
-
-         else:
-             # all cells have the same length, so report a single bucket
-             cell_length_ranges[str(min_val)] = len(cell_lengths)
-
-     return cell_length_ranges, string_categorical
-
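- # Example (hypothetical): lengths 1..100 bucket into ten equal-width ranges,
- # {"1-11": 10, "11-21": 10, ..., "91-100": 10}; a string column with only three
- # unique values instead yields the categorical label "3 class(es)".
-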
- def _compute_percentiles(values, percentiles=PERCENTILES):
-     # 'percentiles' holds permille cut points (500 -> median); result keys are
-     # percent values (500 is reported as 50.0)
-     result = {}
-     quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method='inclusive')
-     for p in percentiles:
-         result[p / 10] = quantiles[p - 1]
-     return result
-
- def compute_cell_value_statistics(cell_values):
-     stats = {}
-     if cell_values:
-         cell_values = sorted(cell_values)
-
-         stats["min"] = cell_values[0]
-         stats["max"] = cell_values[-1]
-         stats["mean"] = statistics.mean(cell_values)
-         if len(cell_values) > 1:
-             # stdev, variance and quantiles all need at least two data points
-             stats["stdev"] = statistics.stdev(cell_values)
-             stats["variance"] = statistics.variance(cell_values)
-             stats["percentiles"] = _compute_percentiles(cell_values)
-
-     return stats
-
- def check_null(cell, cell_type):
-     if cell_type == "<class 'float'>":
-         # missing values in numeric columns surface as NaN
-         return math.isnan(cell)
-     return cell is None
-
- def compute_property(data_path, glob, filetype):
-     output = {}
-
-     data_dir = Path(data_path)
-
-     # record each matched file, with its path relative to data_path
-     filenames = []
-     all_files = list(data_dir.glob(glob))
-     for f in all_files:
-         print(str(f))
-         base_fname = str(f)[len(str(data_path)):]
-         if not data_path.endswith("/"):
-             base_fname = base_fname[1:]
-         filenames.append(base_fname)
-
-     output["filenames"] = filenames
-
-     df = read_data(all_files, filetype)
-
-     column_info = {}
-
-     for col_name in df.columns:
-         column_info[col_name] = {}
-
-         cell_types = {}
-         cell_lengths = {}
-         cell_unique_string_values = {}
-         cell_values = {}
-         null_count = 0
-         for cell in df[col_name].to_list():
-             cell_type = str(type(cell))
-             if check_null(cell, cell_type):
-                 null_count += 1
-                 continue
-
-             cell_types[cell_type] = cell_types.get(cell_type, 0) + 1
-
-             if cell_type in CELL_TYPES_LENGTH:
-                 cell_lengths.setdefault(cell_type, []).append(len(cell))
-                 if cell_type == "<class 'str'>":
-                     cell_unique_string_values[cell] = True
-
-             elif cell_type in CELL_TYPES_NUMERIC:
-                 cell_values.setdefault(cell_type, []).append(cell)
-
-             else:
-                 print("unexpected cell type:", cell_type)
-
-         clrs = {}
-         ccs = {}
-         for cell_type in CELL_TYPES_LENGTH:
-             if cell_type in cell_lengths:
-                 # only string columns can be categorical; pass an empty dict for lists
-                 uniques = cell_unique_string_values if cell_type == "<class 'str'>" else {}
-                 clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], uniques)
-                 clrs[cell_type] = clr
-                 ccs[cell_type] = cc
-
-         css = {}
-         for cell_type in CELL_TYPES_NUMERIC:
-             if cell_type in cell_values:
-                 css[cell_type] = compute_cell_value_statistics(cell_values[cell_type])
-
-         column_info[col_name]["cell_types"] = cell_types
-         column_info[col_name]["cell_length_ranges"] = clrs
-         column_info[col_name]["cell_categories"] = ccs
-         column_info[col_name]["cell_stats"] = css
-         column_info[col_name]["cell_missing"] = null_count
-
-     output["column_info"] = column_info
-     output["number_of_items"] = len(df)
-     output["timestamp"] = time.time()
-
-     return output
-
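- # Example (hypothetical layout): compute_property("data/", "**/*.parquet", "parquet")
- # returns a JSON-serializable dict with "filenames", "column_info",
- # "number_of_items" and "timestamp" keys.
-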
- # note: these two callbacks rely on the module-level `tokenizer` and `metric`
- # globals set in __main__ below
- def preprocess_function(examples):
-     return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
-
- def compute_metrics(eval_pred):
-     predictions, labels = eval_pred
-     predictions = np.argmax(predictions, axis=1)
-     return metric.compute(predictions=predictions, references=labels)
-
- def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
-     tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
-     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
-     batch_size = 16
-     args = TrainingArguments(
-         "test-glue",
-         evaluation_strategy="epoch",
-         learning_rate=5e-5,
-         seed=42,
-         lr_scheduler_type="linear",
-         per_device_train_batch_size=batch_size,
-         per_device_eval_batch_size=batch_size,
-         num_train_epochs=3,
-         weight_decay=0.01,
-         load_best_model_at_end=False,
-         metric_for_best_model="accuracy",
-         report_to="none"
-     )
-
-     trainer = Trainer(
-         model,
-         args,
-         train_dataset=tokenized_datasets["train"],
-         eval_dataset=tokenized_datasets["validation"],
-         tokenizer=tokenizer,
-         compute_metrics=compute_metrics
-     )
-     # no training here: evaluate the checkpoint as-is
-     return trainer.evaluate()
-
-
- if __name__ == "__main__":
-
-     # both branches currently force in_container to False; the container path
-     # below is kept for when this script runs inside the evaluation image
-     if len(sys.argv) > 1:
-         model_checkpoint = sys.argv[1]
-         dataset_name = sys.argv[2]
-         metric = sys.argv[3]
-         in_container = False
-     else:
-         model_checkpoint = "sgugger/glue-mrpc"
-         dataset_name = "nyu-mll/glue"
-         metric = ["glue", "mrpc"]
-         in_container = False
-
-     print(model_checkpoint, dataset_name, metric)
-
-     # note: the metric chosen above is overridden by the hard-coded GLUE/MRPC pair
-     raw_datasets = load_dataset(dataset_name, "mrpc")
-     metric = load("glue", "mrpc")
-     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-     output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
-     print(json.dumps(output))
-
-     if in_container:
-         with open("/tmp/outputs/computation_result.json", "w") as f:
-             json.dump(output, f, indent=4, sort_keys=True)
-     else:
-         print(json.dumps(output, indent=4, sort_keys=True))
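
For reference, a minimal sketch of how the deleted script was invoked (the positional arguments follow the sys.argv handling above; "accuracy" is a placeholder metric name, not confirmed by the source):

    python app.py sgugger/glue-mrpc nyu-mll/glue accuracy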