ksatzke committed on
Commit
e711649
·
verified ·
1 Parent(s): dc2bd70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -228
app.py CHANGED
@@ -1,243 +1,30 @@
1
  import streamlit as st
2
  import gradio as gr
3
- import torch
4
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
5
  import logging
6
  from typing import List, Dict
7
  import gc
8
  import os
 
 
9
 
10
- from pathlib import Path
11
- import io
12
  import json
13
- import math
14
- import statistics
15
- import sys
16
- import time
17
 
18
  from datasets import concatenate_datasets, Dataset
19
  from datasets import load_dataset
20
 
21
  from huggingface_hub import hf_hub_url
22
 
23
- import pandas as pd
24
- import numpy as np
25
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
26
  from evaluate import load
27
 
28
-
29
- # 1. record each file name included
30
- # 1.1 read different file formats depending on parameters (i.e., filetype)
31
- # 2. determine column types and report how many rows for each type (format check)
32
- # (in a well-formatted dataset, each column should only have one type)
33
- # 3. report on the null values
34
- # 4. for certain column types, report statistics
35
- # 4.1 uniqueness: if all rows are of a small number of <string> values, treat the column as 'categorical' < 10.
36
- # 4.2 strings: length ranges
37
- # 4.3 lists: length ranges
38
- # 4.3 int/float/double: their percentiles, min, max, mean
39
-
40
- CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
41
- CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
42
-
43
- PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
44
-
45
- def read_data(all_files, filetype):
46
- df = None
47
-
48
- func_name = ""
49
-
50
- if filetype in ["parquet", "csv", "json"]:
51
- if filetype == "parquet":
52
- func_name = pd.read_parquet
53
- elif filetype == "csv":
54
- func_name = pd.read_csv
55
- elif filetype == "json":
56
- func_name = pd.read_json
57
-
58
- df = pd.concat(func_name(f) for f in all_files)
59
-
60
- elif filetype == "arrow":
61
- ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
62
- df = pd.DataFrame(data=ds)
63
-
64
- elif filetype == "jsonl":
65
- func_name = pd.read_json
66
- all_lines = []
67
- for fname in all_files:
68
- with open(fname, "r") as f:
69
- all_lines.extend(f.readlines())
70
-
71
- df = pd.concat([pd.DataFrame.from_dict([json.loads(line)]) for line in all_lines])
72
-
73
- return df
74
-
75
- def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
76
- cell_length_ranges = {}
77
- cell_length_ranges = {}
78
- string_categorical = {}
79
- # this is probably a 'categorical' (i.e., 'classes' in HuggingFace) value
80
- # with few unique items (need to check that while reading the cell),
81
- # so no need to treat it as a normal string
82
- if len(cell_unique_string_values) > 0 and len(cell_unique_string_values) <= 10:
83
- string_categorical = str(len(cell_unique_string_values)) + " class(es)"
84
-
85
- elif cell_lengths:
86
- cell_lengths = sorted(cell_lengths)
87
- min_val = cell_lengths[0]
88
- max_val = cell_lengths[-1]
89
- distance = math.ceil((max_val - min_val) / 10.0)
90
- ranges = []
91
- if min_val != max_val:
92
- for j in range(min_val, max_val, distance):
93
- ranges.append(j)
94
- for j in range(len(ranges)-1):
95
- cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j+1])] = 0
96
- ranges.append(max_val)
97
-
98
- j = 1
99
- c = 0
100
- for k in cell_lengths:
101
- if j == len(ranges):
102
- c += 1
103
- elif k < ranges[j]:
104
- c += 1
105
- else:
106
- cell_length_ranges[str(ranges[j-1]) + "-" + str(ranges[j])] = c
107
- j += 1
108
- c = 1
109
-
110
- cell_length_ranges[str(ranges[j-1]) + "-" + str(max_val)] = c
111
-
112
- else:
113
- ranges = [min_val]
114
- c = 0
115
- for k in cell_lengths:
116
- c += 1
117
- cell_length_ranges[str(min_val)] = c
118
-
119
- return cell_length_ranges, string_categorical
120
-
121
- def _compute_percentiles(values, percentiles=PERCENTILES):
122
- result = {}
123
- quantiles = statistics.quantiles(values, n=max(PERCENTILES)+1, method='inclusive')
124
- for p in percentiles:
125
- result[p/10] = quantiles[p-1]
126
- return result
127
-
128
- def compute_cell_value_statistics(cell_values):
129
- stats = {}
130
- if cell_values:
131
- cell_values = sorted(cell_values)
132
-
133
- stats["min"] = cell_values[0]
134
- stats["max"] = cell_values[-1]
135
- stats["mean"] = statistics.mean(cell_values)
136
- stats["stdev"] = statistics.stdev(cell_values)
137
- stats["variance"] = statistics.variance(cell_values)
138
-
139
- stats["percentiles"] = _compute_percentiles(cell_values)
140
-
141
- return stats
142
-
143
- def check_null(cell, cell_type):
144
- if cell_type == "<class 'float'>":
145
- if math.isnan(cell):
146
- return True
147
- elif cell is None:
148
- return True
149
- return False
150
-
151
- def compute_property(data_path, glob, filetype):
152
- output = {}
153
-
154
- data_dir = Path(data_path)
155
-
156
- filenames = []
157
- all_files = list(data_dir.glob(glob))
158
- for f in all_files:
159
- print(str(f))
160
- base_fname = str(f)[len(str(data_path)):]
161
- if not data_path.endswith("/"):
162
- base_fname = base_fname[1:]
163
- filenames.append(base_fname)
164
-
165
- output["filenames"] = filenames
166
-
167
- df = read_data(all_files, filetype)
168
-
169
- column_info = {}
170
-
171
- for col_name in df.columns:
172
- if col_name not in column_info:
173
- column_info[col_name] = {}
174
-
175
- cell_types = {}
176
-
177
- cell_lengths = {}
178
- cell_unique_string_values = {}
179
- cell_values = {}
180
- null_count = 0
181
- col_values = df[col_name].to_list()
182
- for cell in col_values:
183
- # for index, row in df.iterrows():
184
- # cell = row[col_name]
185
- cell_type = str(type(cell))
186
- cell_type = str(type(cell))
187
- # print(cell, cell_type)
188
- if check_null(cell, cell_type):
189
- null_count += 1
190
- continue
191
-
192
- if cell_type not in cell_types:
193
- cell_types[cell_type] = 1
194
- else:
195
- cell_types[cell_type] += 1
196
-
197
- if cell_type in CELL_TYPES_LENGTH:
198
- cell_length = len(cell)
199
- if cell_type not in cell_lengths:
200
- cell_lengths[cell_type] = []
201
-
202
- cell_lengths[cell_type].append(cell_length)
203
- if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
204
- cell_unique_string_values[cell] = True
205
-
206
- elif cell_type in CELL_TYPES_NUMERIC:
207
- if cell_type not in cell_values:
208
- cell_values[cell_type] = []
209
-
210
- cell_values[cell_type].append(cell)
211
-
212
- else:
213
- print(cell_type)
214
-
215
- clrs = {}
216
- ccs = {}
217
- for cell_type in CELL_TYPES_LENGTH:
218
- if cell_type in cell_lengths:
219
- clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
220
- clrs[cell_type] = clr
221
- ccs[cell_type] = cc
222
-
223
- css = {}
224
- for cell_type in CELL_TYPES_NUMERIC:
225
- if cell_type in cell_values:
226
- cell_stats = compute_cell_value_statistics(cell_values[cell_type])
227
- css[cell_type] = cell_stats
228
-
229
- column_info[col_name]["cell_types"] = cell_types
230
- column_info[col_name]["cell_length_ranges"] = clrs
231
- column_info[col_name]["cell_categories"] = ccs
232
- column_info[col_name]["cell_stats"] = css
233
- column_info[col_name]["cell_missing"] = null_count
234
-
235
- output["column_info"] = column_info
236
- output["number_of_items"] = len(df)
237
- output["timestamp"] = time.time()
238
-
239
- return output
240
-
241
  def preprocess_function(examples):
242
  return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
243
 
@@ -280,10 +67,9 @@ if __name__ == "__main__":
280
 
281
  st.title("Hugging Face Model Evaluation Demo")
282
 
283
- with st.form("my_form"):
284
 
285
  # Create an input text box
286
- #input_text = st.text_area("Enter model and dataset identifiers", "")
287
  dataset_name = st.text_input("Enter dataset identifier", "")
288
  model_checkpoint = st.text_input("Enter model identifier", "")
289
  # Every form must have a submit button.
@@ -292,12 +78,12 @@ if __name__ == "__main__":
292
  if submitted:
293
  print(dataset_name, model_checkpoint)
294
  # hardcode input data
295
- model_checkpoint = "sgugger/glue-mrpc"
296
- dataset_name = "nyu-mll/glue"
297
 
298
  metric = load("glue", "mrpc")
299
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
300
- in_container = False
301
  model_checkpoint = model_checkpoint
302
  raw_datasets = load_dataset(dataset_name, "mrpc")
303
  metric = load("glue", "mrpc")
 
1
  import streamlit as st
2
  import gradio as gr
3
+ import torch
 
4
  import logging
5
  from typing import List, Dict
6
  import gc
7
  import os
8
+ import pandas as pd
9
+ import numpy as np
10
 
11
+ #from pathlib import Path
12
+ #import io
13
  import json
14
+ #import math
15
+ #import statistics
16
+ #import sys
17
+ #import time
18
 
19
  from datasets import concatenate_datasets, Dataset
20
  from datasets import load_dataset
21
 
22
  from huggingface_hub import hf_hub_url
23
 
24
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
25
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
26
  from evaluate import load
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def preprocess_function(examples):
29
  return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
30
 
 
67
 
68
  st.title("Hugging Face Model Evaluation Demo")
69
 
70
+ with st.form("my_st_form"):
71
 
72
  # Create an input text box
 
73
  dataset_name = st.text_input("Enter dataset identifier", "")
74
  model_checkpoint = st.text_input("Enter model identifier", "")
75
  # Every form must have a submit button.
 
78
  if submitted:
79
  print(dataset_name, model_checkpoint)
80
  # hardcode input data
81
+ #model_checkpoint = "sgugger/glue-mrpc"
82
+ #dataset_name = "nyu-mll/glue"
83
 
84
  metric = load("glue", "mrpc")
85
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
86
+ #in_container = False
87
  model_checkpoint = model_checkpoint
88
  raw_datasets = load_dataset(dataset_name, "mrpc")
89
  metric = load("glue", "mrpc")