ksatzke commited on
Commit
0010524
·
verified ·
1 Parent(s): d382535

Update compute_model_property.py

Browse files
Files changed (1) hide show
  1. compute_model_property.py +2 -253
compute_model_property.py CHANGED
@@ -24,217 +24,6 @@ metric = load("glue", "mrpc")
24
 
25
  app = FastAPI()
26
 
27
- # 1. record each file name included
28
- # 1.1 read different file formats depending on parameters (i.e., filetype)
29
- # 2. determine column types and report how many rows for each type (format check)
30
- # (in a well-formatted dataset, each column should only have one type)
31
- # 3. report on the null values
32
- # 4. for certain column types, report statistics
33
- # 4.1 uniqueness: if all rows are of a small number of <string> values, treat the column as 'categorical' < 10.
34
- # 4.2 strings: length ranges
35
- # 4.3 lists: length ranges
36
- # 4.4 int/float/double: their percentiles, min, max, mean
37
-
38
# repr() strings of Python types, used as dict keys by the profiling code
# below (cells are classified by str(type(cell))).
# Types whose cells have a meaningful length (strings and lists).
CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
# Types whose cells are numeric and get min/max/mean/percentile stats.
CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]

# Permille ranks to report (1 -> 0.1th percentile, ..., 999 -> 99.9th).
PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
42
-
43
def read_data(all_files, filetype):
    """Load *all_files* into a single pandas DataFrame.

    Parameters
    ----------
    all_files : iterable of path-like / file-like
        Files to read; all must share the format given by *filetype*.
    filetype : str
        One of "parquet", "csv", "json", "arrow" or "jsonl".

    Returns
    -------
    pandas.DataFrame or None
        Concatenated contents of the files, or None when *filetype* is
        not recognized.
    """
    df = None

    if filetype in ("parquet", "csv", "json"):
        # Dispatch table instead of the previous if/elif chain; also
        # removes the func_name = "" string placeholder that was later
        # rebound to a function object.
        reader = {
            "parquet": pd.read_parquet,
            "csv": pd.read_csv,
            "json": pd.read_json,
        }[filetype]
        df = pd.concat(reader(f) for f in all_files)

    elif filetype == "arrow":
        # HuggingFace datasets arrow files; concatenate, then convert.
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        df = pd.DataFrame(data=ds)

    elif filetype == "jsonl":
        # One JSON object per line; gather lines from every file first.
        # (The previous dead assignment func_name = pd.read_json was
        # never used on this path and has been dropped.)
        all_lines = []
        for fname in all_files:
            with open(fname, "r") as f:
                all_lines.extend(f.readlines())

        df = pd.concat([pd.DataFrame.from_dict([json.loads(line)]) for line in all_lines])

    return df
72
-
73
def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    """Bucket cell lengths into ~10 ranges, or classify as categorical.

    Parameters
    ----------
    cell_lengths : list[int]
        Observed lengths of the string/list cells in a column.
    cell_unique_string_values : dict
        Distinct string values seen in the column (used as a set).

    Returns
    -------
    tuple
        (cell_length_ranges, string_categorical).  When the column has
        between 1 and 10 distinct string values it is treated as
        categorical: the first item is an empty dict and the second is a
        "<n> class(es)" string.  Otherwise the second item is an empty
        dict and the first maps "lo-hi" labels to cell counts.
    """
    cell_length_ranges = {}  # previously initialized twice
    string_categorical = {}

    # A column with few (<= 10) distinct string values is probably
    # 'categorical' (i.e., 'classes' in HuggingFace), so there is no
    # need to treat it as a normal string.
    if 0 < len(cell_unique_string_values) <= 10:
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"

    elif cell_lengths:
        cell_lengths = sorted(cell_lengths)
        min_val = cell_lengths[0]
        max_val = cell_lengths[-1]

        if min_val != max_val:
            # Split [min_val, max_val] into ~10 equal-width buckets.
            # (distance is only needed on this branch.)
            distance = math.ceil((max_val - min_val) / 10.0)
            ranges = []
            for j in range(min_val, max_val, distance):
                ranges.append(j)
            for j in range(len(ranges) - 1):
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
            ranges.append(max_val)

            # Single pass over the sorted lengths, counting how many
            # fall below each successive bucket boundary; j tracks the
            # current boundary, c the running count.
            j = 1
            c = 0
            for k in cell_lengths:
                if j == len(ranges):
                    c += 1
                elif k < ranges[j]:
                    c += 1
                else:
                    cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
                    j += 1
                    c = 1

            cell_length_ranges[str(ranges[j - 1]) + "-" + str(max_val)] = c
        else:
            # All cells share one length: a single degenerate bucket.
            # (Replaces an O(n) counting loop and an unused `ranges`.)
            cell_length_ranges[str(min_val)] = len(cell_lengths)

    return cell_length_ranges, string_categorical
118
-
119
def _compute_percentiles(values, percentiles=None):
    """Return {percentile: value} for the requested permille ranks.

    Parameters
    ----------
    values : list of numbers
        Sample to summarize; statistics.quantiles needs >= 2 points.
    percentiles : list[int] | None
        Permille ranks (1 -> 0.1th percentile, 999 -> 99.9th).
        Defaults to the module-level PERCENTILES; the None sentinel
        replaces the previous mutable default argument.

    Returns
    -------
    dict
        Maps the percentile as a float (rank / 10) to the quantile value.
    """
    if percentiles is None:
        percentiles = PERCENTILES
    result = {}
    # The grid is a fixed permille scale: 999 cut points, so that
    # quantiles[p - 1] is exactly the p/1000 quantile.  Previously the
    # grid size was derived from max(PERCENTILES), which would silently
    # misalign every result if that constant were ever edited.
    quantiles = statistics.quantiles(values, n=1000, method='inclusive')
    for p in percentiles:
        result[p / 10] = quantiles[p - 1]
    return result
125
-
126
def compute_cell_value_statistics(cell_values):
    """Compute summary statistics for a column's numeric cells.

    Parameters
    ----------
    cell_values : list of int/float
        Non-null numeric cell values from one column.

    Returns
    -------
    dict
        Empty for empty input.  Otherwise min/max/mean, plus
        stdev/variance/percentiles when at least two samples exist.
    """
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)

        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)

        # stdev/variance/quantiles are undefined for a single sample;
        # a single-value column previously crashed with StatisticsError.
        if len(cell_values) > 1:
            stats["stdev"] = statistics.stdev(cell_values)
            stats["variance"] = statistics.variance(cell_values)
            stats["percentiles"] = _compute_percentiles(cell_values)

    return stats
140
-
141
def check_null(cell, cell_type):
    """Return True when *cell* is a missing value (NaN float or None).

    *cell_type* is the str(type(cell)) tag computed by the caller;
    floats are tested with math.isnan, everything else against None.
    """
    if cell_type == "<class 'float'>":
        return math.isnan(cell)
    return cell is None
148
-
149
def _profile_column(col_values):
    """Profile one column's cells: type counts, length/value stats, nulls."""
    cell_types = {}
    cell_lengths = {}
    cell_unique_string_values = {}
    cell_values = {}
    null_count = 0

    for cell in col_values:
        # Cells are classified by their repr'd Python type.  (The
        # previous version computed this twice and carried dead
        # commented-out iterrows code.)
        cell_type = str(type(cell))
        if check_null(cell, cell_type):
            null_count += 1
            continue

        if cell_type not in cell_types:
            cell_types[cell_type] = 1
        else:
            cell_types[cell_type] += 1

        if cell_type in CELL_TYPES_LENGTH:
            # Strings/lists: record lengths; strings also feed the
            # unique-value set used for categorical detection.
            if cell_type not in cell_lengths:
                cell_lengths[cell_type] = []
            cell_lengths[cell_type].append(len(cell))
            if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
                cell_unique_string_values[cell] = True

        elif cell_type in CELL_TYPES_NUMERIC:
            if cell_type not in cell_values:
                cell_values[cell_type] = []
            cell_values[cell_type].append(cell)

        else:
            # Unrecognized cell type: surface it, as before.
            print(cell_type)

    clrs = {}
    ccs = {}
    for cell_type in CELL_TYPES_LENGTH:
        if cell_type in cell_lengths:
            clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
            clrs[cell_type] = clr
            ccs[cell_type] = cc

    css = {}
    for cell_type in CELL_TYPES_NUMERIC:
        if cell_type in cell_values:
            css[cell_type] = compute_cell_value_statistics(cell_values[cell_type])

    return {
        "cell_types": cell_types,
        "cell_length_ranges": clrs,
        "cell_categories": ccs,
        "cell_stats": css,
        "cell_missing": null_count,
    }


def compute_property(data_path, glob, filetype):
    """Profile a dataset on disk and return a JSON-serializable report.

    Parameters
    ----------
    data_path : str
        Root directory of the dataset.
    glob : str
        Glob pattern (relative to *data_path*) selecting the data files.
    filetype : str
        Format tag understood by read_data ("parquet", "csv", "json",
        "arrow", "jsonl").

    Returns
    -------
    dict
        Keys: "filenames", "column_info" (per-column type counts,
        length ranges, categories, numeric stats, missing counts),
        "number_of_items", "timestamp".
    """
    output = {}

    data_dir = Path(data_path)

    filenames = []
    all_files = list(data_dir.glob(glob))
    for f in all_files:
        print(str(f))
        # Report paths relative to data_path; also drop the leading
        # separator when data_path had no trailing "/".
        base_fname = str(f)[len(str(data_path)):]
        if not data_path.endswith("/"):
            base_fname = base_fname[1:]
        filenames.append(base_fname)

    output["filenames"] = filenames

    df = read_data(all_files, filetype)

    # Per-column profiling is delegated to a helper so this entry point
    # stays readable.
    column_info = {}
    for col_name in df.columns:
        column_info[col_name] = _profile_column(df[col_name].to_list())

    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()

    return output
238
 
239
  def preprocess_function(examples):
240
  tokenizer = AutoTokenizer.from_pretrained("sgugger/glue-mrpc")
@@ -289,7 +78,7 @@ def api_home():
289
  return {'detail': 'Welcome to Bastions Model evaluation!'}
290
 
291
  @app.post("/api/evaluate", summary = "Input dataset and model identifiers", tags = ["Test API"])
292
- def return_output():
293
 
294
  model_checkpoint = "sgugger/glue-mrpc"
295
  dataset_name = "nyu-mll/glue"
@@ -302,44 +91,4 @@ def return_output():
302
  #tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
303
  output = compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric)
304
 
305
- return output
306
-
307
- #if __name__ == "__main__":
308
- # uvicorn.run(app, host="0.0.0.0", port=8080, log_level="debug")
309
-
310
- """
311
- in_container = True
312
-
313
-
314
- if len(sys.argv) > 1:
315
- model_checkpoint = sys.argv[1]
316
- dataset_name = sys.argv[2]
317
- metric = sys.argv[3]
318
- in_container = False
319
- else:
320
- model_checkpoint = "sgugger/glue-mrpc"
321
- dataset_name = "nyu-mll/glue"
322
- metric = ["glue", "mrpc"]
323
- in_container = False
324
-
325
-
326
- model_checkpoint = "sgugger/glue-mrpc"
327
- dataset_name = "nyu-mll/glue"
328
- metric = ["glue", "mrpc"]
329
-
330
- print(model_checkpoint, dataset_name, metric)
331
-
332
- model_checkpoint = model_checkpoint
333
- raw_datasets = load_dataset(dataset_name, "mrpc")
334
- metric = load("glue", "mrpc")
335
- #tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
336
- output = compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric)
337
-
338
- print(json.dumps(output))
339
-
340
- if in_container:
341
- with open("/tmp/outputs/computation_result.json", "w") as f:
342
- json.dump(output, f, indent=4, sort_keys=True)
343
- else:
344
- print(json.dumps(output, indent=4, sort_keys=True))
345
- """
 
24
 
25
  app = FastAPI()
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def preprocess_function(examples):
29
  tokenizer = AutoTokenizer.from_pretrained("sgugger/glue-mrpc")
 
78
  return {'detail': 'Welcome to Bastions Model evaluation!'}
79
 
80
  @app.post("/api/evaluate", summary = "Input dataset and model identifiers", tags = ["Test API"])
81
+ def return_output(model_checkpoint, dataset_name):
82
 
83
  model_checkpoint = "sgugger/glue-mrpc"
84
  dataset_name = "nyu-mll/glue"
 
91
  #tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
92
  output = compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric)
93
 
94
+ return output