""" Utils functions for preprocessing"""
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pickle
import tensorflow as tf
import numpy as np


def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Helper that merges the transformed columns back into the original dataframe, replacing the untransformed originals
    """
    print(original_df.shape)
    print(transformed_df.shape)
    df_final = original_df.drop(columns=transformed_cols)
    df_final = df_final.merge(transformed_df, left_index=True, right_index=True)
    print(df_final.shape)
    return df_final


def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Returns the dataframe where the categorical columns have been replaced
    according to the method selected

    Right now only OneHot is supported
    """
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    array_transformed = encoder.transform(df[categorical_cols]).toarray()
    df_encoded = pd.DataFrame(array_transformed, columns=encoder.get_feature_names_out(), index=df.index)
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    else:
        return df_final
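
# Illustrative usage sketch for encode_categorical (the dataframe and column names
# below are hypothetical, not part of this module):
#
#   df_train, encoder = encode_categorical(df_train, ["grade", "supplier"])
#   df_test = encode_categorical(df_test, ["grade", "supplier"], encoder=encoder, fit=False)
#
# The encoder fitted on the training data is reused on the test data so that both
# frames end up with the same one-hot columns.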


def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Returns the dataframe where the numerical columns have been scaled
    according to the method selected

    Right now only MinMax is supported
    """
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    array_transformed = scaler.transform(df[numerical_cols])
    df_transformed = pd.DataFrame(array_transformed, columns=numerical_cols, index=df.index)
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    else:
        return df_final
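
# Illustrative usage sketch for scale_numerical (column names are hypothetical):
#
#   df_train, scaler = scale_numerical(df_train, ["Temperature_C", "%C"])
#   df_test = scale_numerical(df_test, ["Temperature_C", "%C"], scaler=scaler, fit=False)
#
# As with the encoder, the scaler fitted on the training data is reused on new data.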


def scale_numerical_w_missing(df, numerical_cols, scaler):
    """
    Scale the dataframe when some of the columns the scaler was fitted on are missing:
    the missing columns are temporarily filled with zeros, scaled, and dropped before returning
    """
    additional_cols = [c for c in numerical_cols if c not in df.columns]
    df_w_cols = df.copy()
    df_w_cols[additional_cols] = 0
    df_w_cols_scaled = scale_numerical(df_w_cols, numerical_cols, scaler=scaler, fit=False)
    df_scaled = df_w_cols_scaled.drop(columns=additional_cols)
    return df_scaled
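
# Illustrative sketch for scale_numerical_w_missing (columns are hypothetical): if the
# scaler was fitted on ["%C", "%Co", "Temperature_C"] but the incoming frame only has
# ["%C", "Temperature_C"], the missing "%Co" column is added as zeros, scaled along
# with the rest, and dropped again before returning:
#
#   df_scaled = scale_numerical_w_missing(df, ["%C", "%Co", "Temperature_C"], scaler=scaler)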


def fill_nans(df, cols, method="mean"):
    """
    Fill the missing values of the given columns with the column mean or mode
    """
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            df_filled[col] = df_filled[col].fillna(df[col].mean())
        elif method == "mode":
            # mode() returns a Series; use its first entry (the most frequent value)
            df_filled[col] = df_filled[col].fillna(df[col].mode().iloc[0])
    return df_filled
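
# Illustrative usage sketch for fill_nans (columns are hypothetical):
#
#   df = fill_nans(df, ["Temperature_C"], method="mean")  # numerical columns
#   df = fill_nans(df, ["grade"], method="mode")          # categorical columns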


def encode_and_predict(
    model_path,
    data,
    one_hot_scaler,
    minmax_scaler_inputs,
    minmax_scaler_targets,
    categorical_columns,
    numerical_columns,
    target_columns,
    explainer=None,
):
    model = tf.keras.models.load_model(model_path)
    data = encode_categorical(data, categorical_columns, encoder=one_hot_scaler, fit=False)
    data = scale_numerical(data, numerical_columns, scaler=minmax_scaler_inputs, fit=False)
    if explainer:
        return model.predict(data), data.columns, explainer.shap_values(data[-10:])
    else:
        return model.predict(data)
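
# Illustrative usage sketch for encode_and_predict; the model path and fitted
# transformers below are hypothetical and would normally come from the training pipeline:
#
#   preds = encode_and_predict(
#       "models/model.h5",
#       df,
#       one_hot_scaler=encoder,
#       minmax_scaler_inputs=scaler_in,
#       minmax_scaler_targets=scaler_out,
#       categorical_columns=["grade"],
#       numerical_columns=["%C", "Temperature_C"],
#       target_columns=["hardness"],
#   )
#
# Note that minmax_scaler_targets and target_columns are accepted but not used inside
# the function, so predictions come back in the model's (scaled) output space.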


class EnsembleModel:
    """
    Class to store a list of models and to run predictions as the mean of those models
    """

    def __init__(self, models_list, history_list, loss_threshold=0, scaler_targets=None) -> None:
        """
        Initializes the ensemble and discards models whose loss got stuck during training,
        or that did not reach a sufficient performance (if the loss_threshold parameter is set).
        By assumption, the entries of models_list are models that expose a predict method.
        """
        self.models = []
        self.models_history = []
        self.loss_threshold = loss_threshold
        for i, model in enumerate(models_list):
            model_history = history_list[i]
            if np.abs(min(model_history.history["loss"]) - max(model_history.history["loss"])) < 0.001:
                print(f"Model {i} skipped due to loss getting stuck")
                continue
            if (self.loss_threshold > 0) and (model_history.history["loss"][-1] > self.loss_threshold):
                print(f"Model {i} skipped due to performance")
                continue
            self.models.append(model)
            self.models_history.append(model_history)
        self.scaler_targets = scaler_targets
        print(f"Ensemble model initialized with {len(self.models)} models")

    def predict_list(self, data):
        pred_list = [model.predict(data) for model in self.models]
        if self.scaler_targets is not None:
            pred_list = [self.scaler_targets.inverse_transform(pred) for pred in pred_list]

        return pred_list

    def predict_w_uncertainty(self, data, uncertainty_type="confidence_interval", model_bias=0.03):
        """
        Returns the prediction and an uncertainty estimate (confidence interval,
        standard deviation, or distance-weighted standard deviation) on the data
        """
        # The prediction is the average of all predictions and the uncertainty is the variance of all predictions
        # LB: not sure this works if multiple targets are predicted with the same model
        n_models = len(self.models)
        pred_mean, pred_list = self.predict(data, return_list=True)

        pred_std = np.std(pred_list, axis=0)

        training_average_dict = {
            "%C": 0.587936,
            "%Co": 0.306122,
            "%Cr": 0,
            "%V": 0,
            "%Mo": 0,
            "%W": 0.363942,
            "Temperature_C": 0.387755,
        }
        eps = 0.1
        if uncertainty_type == "confidence_interval":
            print("Confidence interval")
            # Confidence interval = mean +- z * std/sqrt(n)
            z = 1.96  # 95%: 1.96, 90% 1.645
            model_bias_vector = np.ones(pred_mean.shape) * model_bias * pred_mean
            pred_uncertainty = z * (pred_std + model_bias_vector) / np.sqrt(n_models)
        elif uncertainty_type == "std":
            print("Standard deviation")
            pred_uncertainty = pred_std.copy()
        else:
            print("Weighted uncertainty")
            pred_uncertainty = pred_std.copy()
            uncertainty_weights = np.ones((pred_std.shape[0],))
            dist_df = pd.DataFrame()
            for col in training_average_dict.keys():
                print(training_average_dict[col])
                dist_vector = (data[col] - training_average_dict[col]) ** 2
                # dist_vector = np.abs(data[col] - training_average_dict[col])
                # Quick fix for the constant elements that are not properly scaled
                if col in ["%Cr", "%V", "%Mo"]:
                    dist_vector = dist_vector / 10
                dist_df[col] = dist_vector
                print(dist_vector)
            uncertainty_weights = np.sqrt(dist_df.sum(axis=1)) + eps
            pred_uncertainty = np.multiply(uncertainty_weights, pred_uncertainty[:, 0])

        return pred_mean, pred_uncertainty

    def predict(self, data, return_list=False):
        """
        Returns only the prediction of the Ensemble models on the data
        """
        pred_list = self.predict_list(data)
        preds = np.mean(pred_list, axis=0)
        if return_list:
            return preds, pred_list
        return preds
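
# Illustrative usage sketch for EnsembleModel (models_list and history_list are assumed
# to come from repeated Keras training runs; variable names are hypothetical):
#
#   ensemble = EnsembleModel(models_list, history_list, loss_threshold=0.05, scaler_targets=scaler_out)
#   preds, uncertainty = ensemble.predict_w_uncertainty(x_test, uncertainty_type="confidence_interval")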


def unpickle_file(path):
    with open(path, "rb") as file:
        unpickler = pickle.Unpickler(file)
        unpickled_file = unpickler.load()
    return unpickled_file


def read_data(data_path, sep=","):
    """
    Opens the data file based on its extension (csv, xls/xlsx, or pickle otherwise)
    """
    file_extension = data_path.split(".")[-1]
    if file_extension == "csv":
        return pd.read_csv(data_path, sep=sep)
    elif file_extension in ["xls", "xlsx"]:
        return pd.read_excel(data_path)
    else:
        return unpickle_file(data_path)
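
# Illustrative usage sketch for read_data (paths are hypothetical):
#
#   df = read_data("data/measurements.csv", sep=";")
#   df = read_data("data/measurements.xlsx")
#   scaler = read_data("models/scaler.pkl")  # any other extension is treated as a pickle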


class NoPhysicsModels:
    """
    Wrapper that computes the physics-informed features internally so that the SHAP
    interpreter can be run on the raw inputs only
    """

    def __init__(self, model, scaler_inputs=None, preprocessing_physics_fn=None):
        self.model = model
        self.scaler_inputs = scaler_inputs
        self.physics_fn = preprocessing_physics_fn

    def predict(self, x):
        x_w_p = self.physics_fn(x)
        x_w_p_for_scaling = x_w_p[self.scaler_inputs.feature_names_in_]
        x_w_p_scaled = scale_numerical(
            x_w_p_for_scaling, self.scaler_inputs.feature_names_in_, scaler=self.scaler_inputs, fit=False
        )

        return self.model.predict(x_w_p_scaled)
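
# Illustrative usage sketch for NoPhysicsModels with a SHAP explainer; shap is assumed to
# be installed and add_physics_features is a hypothetical feature-engineering function:
#
#   import shap
#   wrapped = NoPhysicsModels(model, scaler_inputs=scaler_in, preprocessing_physics_fn=add_physics_features)
#   explainer = shap.KernelExplainer(wrapped.predict, x_background)
#   shap_values = explainer.shap_values(x_sample)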