bndl committed
Commit edf1058 · 1 Parent(s): 155354b

Upload utils.py

Files changed (1)
  1. utils.py +234 -0
utils.py ADDED
@@ -0,0 +1,234 @@
+ """Utility functions for preprocessing."""
+ import pandas as pd
+ from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
+ import pickle
+ import tensorflow as tf
+ import numpy as np
+
+
+ def aggregate_transform_df(original_df, transformed_df, transformed_cols):
+     """
+     Helper function to merge the transformed columns back into the original dataframe
+     """
+     print(original_df.shape)
+     print(transformed_df.shape)
+     df_final = original_df.drop(columns=transformed_cols)
+     df_final = df_final.merge(transformed_df, left_index=True, right_index=True)
+     print(df_final.shape)
+     return df_final
+
+
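+ # Illustrative sketch of what the helper above does (toy dataframe and column names,
+ # not part of the original file): the transformed columns replace the originals,
+ # joined back on the index.
+ #
+ #     raw = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+ #     scaled_a = pd.DataFrame({"a_scaled": [0.0, 1.0]}, index=raw.index)
+ #     merged = aggregate_transform_df(raw, scaled_a, ["a"])  # columns: b, a_scaled
+
+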
+ def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
+     """
+     Returns the dataframe where the categorical columns have been replaced
+     according to the method selected
+
+     Right now only OneHot is supported
+     """
+     print(f"Running {method} encoding")
+     if fit:
+         encoder = OneHotEncoder()
+         encoder.fit(df[categorical_cols])
+     array_transformed = encoder.transform(df[categorical_cols]).toarray()
+     df_encoded = pd.DataFrame(array_transformed, columns=encoder.get_feature_names_out(), index=df.index)
+     df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
+     if fit:
+         return df_final, encoder
+     else:
+         return df_final
+
+
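+ # Illustrative usage sketch (not part of the original upload; the dataframes and the
+ # "steel_grade" column are hypothetical): fit the encoder on training data, then
+ # reuse it on new data with fit=False.
+ #
+ #     train_encoded, ohe = encode_categorical(train_df, ["steel_grade"], fit=True)
+ #     new_encoded = encode_categorical(new_df, ["steel_grade"], encoder=ohe, fit=False)
+
+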
+ def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
+     """
+     Returns the dataframe where the numerical columns have been scaled
+     according to the method selected
+
+     Right now only MinMax is supported
+     """
+     print(f"Running {method} scaling")
+     if fit:
+         scaler = MinMaxScaler()
+         scaler.fit(df[numerical_cols])
+     array_transformed = scaler.transform(df[numerical_cols])
+     df_transformed = pd.DataFrame(array_transformed, columns=numerical_cols, index=df.index)
+     df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
+     if fit:
+         return df_final, scaler
+     else:
+         return df_final
+
+
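+ # Illustrative usage sketch (assumed dataframes; the column names are taken from the
+ # training averages used further below): the same fit/transform split applies to the scaler.
+ #
+ #     train_scaled, mm = scale_numerical(train_df, ["%C", "Temperature_C"], fit=True)
+ #     new_scaled = scale_numerical(new_df, ["%C", "Temperature_C"], scaler=mm, fit=False)
+
+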
+ def scale_numerical_w_missing(df, numerical_cols, scaler):
+     """
+     Scale the dataframe when some of the columns used to fit the scaler are missing from the dataframe
+     """
+     additional_cols = [c for c in numerical_cols if c not in df.columns]
+     df_w_cols = df.copy()
+     df_w_cols[additional_cols] = 0
+     df_w_cols_scaled = scale_numerical(df_w_cols, numerical_cols, scaler=scaler, fit=False)
+     df_scaled = df_w_cols_scaled.drop(columns=additional_cols)
+     return df_scaled
+
+
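+ # Illustrative sketch (assumed inputs): if "mm" was fitted on ["%C", "%Co", "Temperature_C"]
+ # but the incoming dataframe only contains ["%C", "Temperature_C"], the missing column is
+ # temporarily filled with 0, scaled together with the rest, then dropped again.
+ #
+ #     scaled = scale_numerical_w_missing(partial_df, ["%C", "%Co", "Temperature_C"], mm)
+
+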
+ def fill_nans(df, cols, method="mean"):
+     """
+     Fill missing values in the given columns with either the column mean or the column mode
+     """
+     df_filled = df.copy()
+     print(f"Fill nans in {cols} with the {method} method")
+     for col in cols:
+         if method == "mean":
+             df_filled[col] = df_filled[col].fillna(df[col].mean())
+         elif method == "mode":
+             # mode() returns a Series; take its first value so we fill with a scalar
+             df_filled[col] = df_filled[col].fillna(df[col].mode().iloc[0])
+     return df_filled
+
+
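+ # Illustrative usage sketch (hypothetical dataframe and columns):
+ #
+ #     df_clean = fill_nans(df, ["%C", "Temperature_C"], method="mean")
+
+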
+ def encode_and_predict(
+     model_path,
+     data,
+     one_hot_scaler,
+     minmax_scaler_inputs,
+     minmax_scaler_targets,
+     categorical_columns,
+     numerical_columns,
+     target_columns,
+     explainer=None,
+ ):
+     """
+     Loads the model, applies the fitted encoder and input scaler to the data and runs the prediction.
+     When an explainer is provided, also returns the column names and the SHAP values of the last 10 rows.
+     """
+     model = tf.keras.models.load_model(model_path)
+     data = encode_categorical(data, categorical_columns, encoder=one_hot_scaler, fit=False)
+     data = scale_numerical(data, numerical_columns, scaler=minmax_scaler_inputs, fit=False)
+     if explainer:
+         return model.predict(data), data.columns, explainer.shap_values(data[-10:])
+     else:
+         return model.predict(data)
+
+
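+ # Illustrative usage sketch (the paths, variable names and column names are assumptions,
+ # not from the original file); the fitted encoder/scalers would typically be restored
+ # with unpickle_file / read_data defined below.
+ #
+ #     preds = encode_and_predict(
+ #         "models/best_model",
+ #         new_df,
+ #         one_hot_scaler=ohe,
+ #         minmax_scaler_inputs=mm_in,
+ #         minmax_scaler_targets=mm_out,
+ #         categorical_columns=["steel_grade"],
+ #         numerical_columns=["%C", "Temperature_C"],
+ #         target_columns=["hardness"],
+ #     )
+
+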
+ class EnsembleModel:
+     """
+     Class to store a list of models and to run predictions as the mean of those models
+     """
+
+     def __init__(self, models_list, history_list, loss_threshold=0, scaler_targets=None) -> None:
+         """
+         Initializes the ensemble model and discards the models whose loss got stuck, or that did not reach a sufficient performance (if the loss_threshold parameter is set).
+         By assumption the elements of models_list are models that expose a predict method.
+         """
+         self.models = []
+         self.models_history = []
+         self.loss_threshold = loss_threshold
+         for i, model in enumerate(models_list):
+             model_history = history_list[i]
+             if np.abs(min(model_history.history["loss"]) - max(model_history.history["loss"])) < 0.001:
+                 print(f"Model {i} skipped due to loss getting stuck")
+                 continue
+             if (self.loss_threshold > 0) and (model_history.history["loss"][-1] > self.loss_threshold):
+                 print(f"Model {i} skipped due to performance")
+                 continue
+             self.models.append(model)
+             self.models_history.append(model_history)
+         self.scaler_targets = scaler_targets
+         print(f"Ensemble model initialized with {len(self.models)} models")
+
+     def predict_list(self, data):
+         """
+         Returns the list of per-model predictions, inverse-transformed if a target scaler is set
+         """
+         pred_list = [model.predict(data) for model in self.models]
+         if self.scaler_targets is not None:
+             pred_list = [self.scaler_targets.inverse_transform(pred) for pred in pred_list]
+
+         return pred_list
+
+     def predict_w_uncertainty(self, data, uncertainty_type="confidence_interval", model_bias=0.03):
+         """
+         Returns the prediction and the confidence interval on the data
+         """
+         # The prediction is the average of all predictions and the uncertainty is derived from the standard deviation of those predictions
+         # LB: not sure this works if multiple targets are predicted with the same model
+         n_models = len(self.models)
+         pred_mean, pred_list = self.predict(data, return_list=True)
+
+         pred_std = np.std(pred_list, axis=0)
+
+         training_average_dict = {
+             "%C": 0.587936,
+             "%Co": 0.306122,
+             "%Cr": 0,
+             "%V": 0,
+             "%Mo": 0,
+             "%W": 0.363942,
+             "Temperature_C": 0.387755,
+         }
+         eps = 0.1
+         if uncertainty_type == "confidence_interval":
+             print("Confidence interval")
+             # Confidence interval = mean +- z * std / sqrt(n)
+             z = 1.96  # 95%: 1.96, 90%: 1.645
+             model_bias_vector = np.ones(pred_mean.shape) * model_bias * pred_mean
+             pred_uncertainty = z * (pred_std + model_bias_vector) / np.sqrt(n_models)
+         elif uncertainty_type == "std":
+             print("Standard deviation")
+             pred_uncertainty = pred_std.copy()
+         else:
+             print("Weighted uncertainty")
+             pred_uncertainty = pred_std.copy()
+             uncertainty_weights = np.ones((pred_std.shape[0],))
+             dist_df = pd.DataFrame()
+             for col in training_average_dict.keys():
+                 print(training_average_dict[col])
+                 dist_vector = (data[col] - training_average_dict[col]) ** 2
+                 # dist_vector = np.abs(data[col] - training_average_dict[col])
+                 # Quick fix for the constant elements that are not properly scaled
+                 if col in ["%Cr", "%V", "%Mo"]:
+                     dist_vector = dist_vector / 10
+                 dist_df[col] = dist_vector
+                 print(dist_vector)
+             uncertainty_weights = np.sqrt(dist_df.sum(axis=1)) + eps
+             pred_uncertainty = np.multiply(uncertainty_weights, pred_uncertainty[:, 0])
+
+         return pred_mean, pred_uncertainty
+
+     def predict(self, data, return_list=False):
+         """
+         Returns only the prediction of the Ensemble models on the data
+         """
+         pred_list = self.predict_list(data)
+         preds = np.mean(pred_list, axis=0)
+         if return_list:
+             return preds, pred_list
+         return preds
+
+
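+ # Illustrative usage sketch (assumed setup, not from the original file): the ensemble is
+ # built from Keras models and the History objects returned by fit(); with the default
+ # uncertainty_type="confidence_interval" the half-width is z * (std + bias) / sqrt(n),
+ # with z = 1.96 for a 95% interval.
+ #
+ #     histories = [m.fit(X_train, y_train, epochs=50, verbose=0) for m in models]
+ #     ensemble = EnsembleModel(models, histories, loss_threshold=0.05, scaler_targets=mm_out)
+ #     mean_pred, uncertainty = ensemble.predict_w_uncertainty(X_new)
+
+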
+ def unpickle_file(path):
+     """
+     Loads and returns the object stored in a pickle file
+     """
+     with open(path, "rb") as file:
+         unpickler = pickle.Unpickler(file)
+         unpickled_file = unpickler.load()
+     return unpickled_file
+
+
+ def read_data(data_path, sep=","):
+     """
+     Opens the file based on the extension
+     """
+     file_extension = data_path.split(".")[-1]
+     if file_extension == "csv":
+         return pd.read_csv(data_path, sep=sep)
+     elif file_extension in ["xls", "xlsx"]:
+         return pd.read_excel(data_path)
+     else:
+         return unpickle_file(data_path)
+
+
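+ # Illustrative usage sketch (file paths are assumptions): read_data dispatches on the
+ # extension and falls back to unpickle_file for anything that is not csv/xls/xlsx.
+ #
+ #     df = read_data("data/measurements.csv", sep=";")
+ #     mm_in = read_data("scalers/minmax_inputs.pkl")
+
+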
+ class NoPhysicsModels:
+     """
+     Wrapper that hides the physics-informed features so that the SHAP interpreter can be run on the model
+     """
+
+     def __init__(self, model, scaler_inputs=None, preprocessing_physics_fn=None):
+         self.model = model
+         self.scaler_inputs = scaler_inputs
+         self.physics_fn = preprocessing_physics_fn
+
+     def predict(self, x):
+         # Add the physics-informed features, reorder the columns to match the scaler, scale, then predict
+         x_w_p = self.physics_fn(x)
+         x_w_p_for_scaling = x_w_p[self.scaler_inputs.feature_names_in_]
+         x_w_p_scaled = scale_numerical(
+             x_w_p_for_scaling, self.scaler_inputs.feature_names_in_, scaler=self.scaler_inputs, fit=False
+         )
+
+         return self.model.predict(x_w_p_scaled)
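+
+
+ # Illustrative usage sketch (the shap import, the physics feature function and the
+ # background data are assumptions, not part of this file): the wrapper lets a SHAP
+ # KernelExplainer be built on the raw, un-engineered inputs.
+ #
+ #     import shap
+ #     wrapped = NoPhysicsModels(model, scaler_inputs=mm_in, preprocessing_physics_fn=add_physics_features)
+ #     explainer = shap.KernelExplainer(wrapped.predict, background_df)
+ #     shap_values = explainer.shap_values(raw_df)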