Upload utils.py
utils.py
ADDED
@@ -0,0 +1,234 @@
"""Utility functions for preprocessing"""
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pickle
import tensorflow as tf
import numpy as np


def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Helper function that merges the transformed columns back into the original
    dataframe, replacing the columns they were derived from
    """
    print(original_df.shape)
    print(transformed_df.shape)
    df_final = original_df.drop(columns=transformed_cols)
    df_final = df_final.merge(transformed_df, left_index=True, right_index=True)
    print(df_final.shape)
    return df_final


def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Returns the dataframe with the categorical columns replaced according to
    the selected method

    Right now only OneHot is supported
    """
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    array_transformed = encoder.transform(df[categorical_cols]).toarray()
    df_encoded = pd.DataFrame(array_transformed, columns=encoder.get_feature_names_out(), index=df.index)
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    return df_final

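# A minimal usage sketch (the dataframes and column name are hypothetical):
# fit the encoder on the training split, then pass it back with fit=False so
# inference data is encoded into the exact same dummy columns.
#
#   df_train_enc, encoder = encode_categorical(df_train, ["steel_grade"], fit=True)
#   df_new_enc = encode_categorical(df_new, ["steel_grade"], encoder=encoder, fit=False)
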

def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Returns the dataframe with the numerical columns scaled according to the
    selected method

    Right now only MinMax is supported
    """
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    array_transformed = scaler.transform(df[numerical_cols])
    df_transformed = pd.DataFrame(array_transformed, columns=numerical_cols, index=df.index)
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    return df_final

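# scale_numerical follows the same fit-then-reuse contract (a sketch with
# hypothetical names; "%C" and "Temperature_C" are columns used elsewhere in
# this file):
#
#   df_train_scaled, scaler = scale_numerical(df_train, ["%C", "Temperature_C"], fit=True)
#   df_new_scaled = scale_numerical(df_new, ["%C", "Temperature_C"], scaler=scaler, fit=False)
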

def scale_numerical_w_missing(df, numerical_cols, scaler):
    """
    Scales the dataframe when some of the columns used to fit the scaler are
    missing from it
    """
    additional_cols = [c for c in numerical_cols if c not in df.columns]
    df_w_cols = df.copy()
    df_w_cols[additional_cols] = 0
    df_w_cols_scaled = scale_numerical(df_w_cols, numerical_cols, scaler=scaler, fit=False)
    df_scaled = df_w_cols_scaled.drop(columns=additional_cols)
    return df_scaled

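# The zero-padding above exists purely so the fitted scaler sees the full set
# of columns it was fitted on; the padded columns are dropped again afterwards,
# so their scaled values never reach the caller. A sketch with hypothetical
# inputs:
#
#   scaled = scale_numerical_w_missing(df[["%C", "%Co"]], ["%C", "%Co", "%Cr"], scaler)
#   # "%Cr" is zero-padded for the transform, then dropped from the result
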

def fill_nans(df, cols, method="mean"):
    """
    Returns a copy of the dataframe where the NaNs in the given columns are
    filled with the selected method
    """
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            df_filled[col] = df_filled[col].fillna(df[col].mean())
        elif method == "mode":
            # mode() returns a Series, so take the first modal value
            df_filled[col] = df_filled[col].fillna(df[col].mode().iloc[0])
    return df_filled

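# A quick sketch (hypothetical columns): the mean suits numerical columns,
# the mode categorical ones.
#
#   df = fill_nans(df, ["Temperature_C"], method="mean")
#   df = fill_nans(df, ["steel_grade"], method="mode")
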

def encode_and_predict(
    model_path,
    data,
    one_hot_scaler,
    minmax_scaler_inputs,
    minmax_scaler_targets,
    categorical_columns,
    numerical_columns,
    target_columns,
    explainer=None,
):
    """
    Loads the model, applies the fitted encoder and input scaler to the data,
    and returns the predictions; when an explainer is passed, also returns the
    column names and the SHAP values of the last 10 rows
    (minmax_scaler_targets and target_columns are currently unused)
    """
    model = tf.keras.models.load_model(model_path)
    data = encode_categorical(data, categorical_columns, encoder=one_hot_scaler, fit=False)
    data = scale_numerical(data, numerical_columns, scaler=minmax_scaler_inputs, fit=False)
    if explainer:
        return model.predict(data), data.columns, explainer.shap_values(data[-10:])
    return model.predict(data)

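# A hedged end-to-end sketch, assuming the fitted preprocessors were pickled
# next to the model (all paths and names below are illustrative, not fixed by
# this file):
#
#   encoder = unpickle_file("one_hot_encoder.pkl")
#   scaler_in = unpickle_file("minmax_scaler_inputs.pkl")
#   scaler_out = unpickle_file("minmax_scaler_targets.pkl")
#   preds = encode_and_predict(
#       "model.h5", df, encoder, scaler_in, scaler_out,
#       ["steel_grade"], ["%C", "Temperature_C"], ["hardness"],
#   )
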

class EnsembleModel:
    """
    Class to store a list of models and to run predictions as the mean of those models
    """

    def __init__(self, models_list, history_list, loss_threshold=0, scaler_targets=None) -> None:
        """
        Initializes the ensemble and discards the models whose loss got stuck,
        or that did not reach a sufficient performance (if the loss_threshold
        parameter is set)
        By assumption the elements of models_list are models that expose a predict method
        """
        self.models = []
        self.models_history = []
        self.loss_threshold = loss_threshold
        for i, model in enumerate(models_list):
            model_history = history_list[i]
            if np.abs(min(model_history.history["loss"]) - max(model_history.history["loss"])) < 0.001:
                print(f"Model {i} skipped due to loss getting stuck")
                continue
            if (self.loss_threshold > 0) and (model_history.history["loss"][-1] > self.loss_threshold):
                print(f"Model {i} skipped due to performance")
                continue
            self.models.append(model)
            self.models_history.append(model_history)
        self.scaler_targets = scaler_targets
        print(f"Ensemble model initialized with {len(self.models)} models")

    def predict_list(self, data):
        pred_list = [model.predict(data) for model in self.models]
        if self.scaler_targets is not None:
            pred_list = [self.scaler_targets.inverse_transform(pred) for pred in pred_list]

        return pred_list

    def predict_w_uncertainty(self, data, uncertainty_type="confidence_interval", model_bias=0.03):
        """
        Returns the mean prediction and an uncertainty estimate on the data
        """
        # The prediction is the average of all predictions and the uncertainty
        # is derived from the standard deviation of the individual predictions
        # LB: not sure this works if multiple targets are predicted with the same model
        n_models = len(self.models)
        pred_mean, pred_list = self.predict(data, return_list=True)

        pred_std = np.std(pred_list, axis=0)

        training_average_dict = {
            "%C": 0.587936,
            "%Co": 0.306122,
            "%Cr": 0,
            "%V": 0,
            "%Mo": 0,
            "%W": 0.363942,
            "Temperature_C": 0.387755,
        }
        eps = 0.1
        if uncertainty_type == "confidence_interval":
            print("Confidence interval")
            # Confidence interval = mean +- z * std / sqrt(n)
            z = 1.96  # 95%: 1.96, 90%: 1.645
            model_bias_vector = np.ones(pred_mean.shape) * model_bias * pred_mean
            pred_uncertainty = z * (pred_std + model_bias_vector) / np.sqrt(n_models)
        elif uncertainty_type == "std":
            print("Standard deviation")
            pred_uncertainty = pred_std.copy()
        else:
            print("Weighted uncertainty")
            pred_uncertainty = pred_std.copy()
            uncertainty_weights = np.ones((pred_std.shape[0],))
            dist_df = pd.DataFrame()
            for col in training_average_dict.keys():
                print(training_average_dict[col])
                dist_vector = (data[col] - training_average_dict[col]) ** 2
                # dist_vector = np.abs(data[col] - training_average_dict[col])
                # Quick fix for the constant elements that are not properly scaled
                if col in ["%Cr", "%V", "%Mo"]:
                    dist_vector = dist_vector / 10
                dist_df[col] = dist_vector
                print(dist_vector)
            uncertainty_weights = np.sqrt(dist_df.sum(axis=1)) + eps
            pred_uncertainty = np.multiply(uncertainty_weights, pred_uncertainty[:, 0])

        return pred_mean, pred_uncertainty

    def predict(self, data, return_list=False):
        """
        Returns only the mean prediction of the ensemble on the data
        """
        pred_list = self.predict_list(data)
        preds = np.mean(pred_list, axis=0)
        if return_list:
            return preds, pred_list
        return preds

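# A minimal usage sketch, assuming `models` and the matching Keras History
# objects were collected during training (names are illustrative):
#
#   ensemble = EnsembleModel(models, histories, loss_threshold=0.05, scaler_targets=scaler_out)
#   pred_mean, pred_ci = ensemble.predict_w_uncertainty(x_scaled)  # 95% half-widths
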

def unpickle_file(path):
    """
    Loads a pickled object from disk
    """
    with open(path, "rb") as file:
        unpickler = pickle.Unpickler(file)
        unpickled_file = unpickler.load()
    return unpickled_file


def read_data(data_path, sep=","):
    """
    Opens the file based on its extension
    """
    file_extension = data_path.split(".")[-1]
    if file_extension == "csv":
        return pd.read_csv(data_path, sep=sep)
    elif file_extension in ["xls", "xlsx"]:
        return pd.read_excel(data_path)
    else:
        return unpickle_file(data_path)


class NoPhysicsModels:
    """
    Class that hides the physics-informed features so that the shap interpreter
    can be run on the raw inputs
    """

    def __init__(self, model, scaler_inputs=None, preprocessing_physics_fn=None):
        self.model = model
        self.scaler_inputs = scaler_inputs
        self.physics_fn = preprocessing_physics_fn

    def predict(self, x):
        # Derive the physics-informed features, align the columns with the
        # fitted scaler, scale, then predict with the wrapped model
        x_w_p = self.physics_fn(x)
        x_w_p_for_scaling = x_w_p[self.scaler_inputs.feature_names_in_]
        x_w_p_scaled = scale_numerical(
            x_w_p_for_scaling, self.scaler_inputs.feature_names_in_, scaler=self.scaler_inputs, fit=False
        )

        return self.model.predict(x_w_p_scaled)

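# A hedged sketch of the intended shap workflow (the KernelExplainer call and
# the helper names are assumptions, not fixed by this file): wrap the model so
# the explainer only sees the raw, pre-physics inputs.
#
#   wrapped = NoPhysicsModels(model, scaler_inputs=scaler_in, preprocessing_physics_fn=add_physics_features)
#   explainer = shap.KernelExplainer(wrapped.predict, x_background)
#   shap_values = explainer.shap_values(x_raw)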