""" Utils functions for preprocessing""" import pandas as pd from sklearn.preprocessing import OneHotEncoder, MinMaxScaler import pickle import tensorflow as tf import numpy as np def aggregate_transform_df(original_df, transformed_df, transformed_cols): """ Helper function to aggregate the columns transformed with the original dataset """ print(original_df.shape) print(transformed_df.shape) df_final = original_df.drop(columns=transformed_cols) df_final = df_final.merge(transformed_df, left_index=True, right_index=True) print(df_final.shape) return df_final def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True): """ Returns the dataframe where the categorical columns have been replaced according to the method selected Right now only OneHot is supported """ print(f"Running {method} encoding") if fit: encoder = OneHotEncoder() encoder.fit(df[categorical_cols]) array_transformed = encoder.transform(df[categorical_cols]).toarray() df_encoded = pd.DataFrame(array_transformed, columns=encoder.get_feature_names_out(), index=df.index) df_final = aggregate_transform_df(df, df_encoded, categorical_cols) if fit: return df_final, encoder else: return df_final def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True): """ Returns the dataframe where the numerical columns have been scaled according to the method selected Right now only MinMax is supported """ print(f"Running {method} scaling") if fit: scaler = MinMaxScaler() scaler.fit(df[numerical_cols]) array_transformed = scaler.transform(df[numerical_cols]) df_transformed = pd.DataFrame(array_transformed, columns=numerical_cols, index=df.index) df_final = aggregate_transform_df(df, df_transformed, numerical_cols) if fit: return df_final, scaler else: return df_final def scale_numerical_w_missing(df, numerical_cols, scaler): """ Scale the dataframe when there are missing columns from the columns used to fit the scaler """ additional_cols = [c for c in numerical_cols if c not in df.columns] df_w_cols = df.copy() df_w_cols[additional_cols] = 0 df_w_cols_scaled = scale_numerical(df_w_cols, numerical_cols, scaler=scaler, fit=False) df_scaled = df_w_cols_scaled.drop(columns=additional_cols) return df_scaled def fill_nans(df, cols, method="mean"): df_filled = df.copy() print(f"Fill nans in {cols} with the {method} method") for col in cols: if method == "mean": df_filled[col] = df_filled[col].fillna(df[col].mean()) elif method == "mode": df_filled[col] = df_filled[col].fillna(df[col].mode()) return df_filled def encode_and_predict( model_path, data, one_hot_scaler, minmax_scaler_inputs, minmax_scaler_targets, categorical_columns, numerical_columns, target_columns, explainer=None, ): model = tf.keras.models.load_model(model_path) data = encode_categorical(data, categorical_columns, encoder=one_hot_scaler, fit=False) data = scale_numerical(data, numerical_columns, scaler=minmax_scaler_inputs, fit=False) if explainer: return model.predict(data), data.columns, explainer.shap_values(data[-10:]) else: return model.predict(data) class EnsembleModel: """ Class to store a list of models and to run predictions as the mean of those models """ def __init__(self, models_list, history_list, loss_threshold=0, scaler_targets=None) -> None: """ Initialized the Ensemble model and cleans the models that stayed stuck, or that didn't achieve a sufficient performance (if loss_threshold parameter is set) By assumption the content of models_list are AI models that have a predict method """ self.models = [] self.models_history = 
class EnsembleModel:
    """
    Class to store a list of models and to run predictions as the mean of those models
    """

    def __init__(self, models_list, history_list, loss_threshold=0, scaler_targets=None) -> None:
        """
        Initializes the Ensemble model and removes the models that stayed stuck, or that didn't achieve
        a sufficient performance (if the loss_threshold parameter is set)
        By assumption the elements of models_list are AI models that have a predict method
        """
        self.models = []
        self.models_history = []
        self.loss_threshold = loss_threshold
        for i, model in enumerate(models_list):
            model_history = history_list[i]
            # Skip models whose loss barely moved during training (stuck optimization)
            if np.abs(min(model_history.history["loss"]) - max(model_history.history["loss"])) < 0.001:
                print(f"Model {i} skipped due to loss getting stuck")
                continue
            # Skip models whose final loss is above the configured threshold
            if (self.loss_threshold > 0) and (model_history.history["loss"][-1] > self.loss_threshold):
                print(f"Model {i} skipped due to performance")
                continue
            self.models.append(model)
            self.models_history.append(model_history)
        self.scaler_targets = scaler_targets
        print(f"Ensemble model initialized with {len(self.models)} models")

    def predict_list(self, data):
        pred_list = [model.predict(data) for model in self.models]
        if self.scaler_targets is not None:
            pred_list = [self.scaler_targets.inverse_transform(pred) for pred in pred_list]
        return pred_list

    def predict_w_uncertainty(self, data, uncertainty_type="confidence_interval", model_bias=0.03):
        """
        Returns the prediction and the confidence interval on the data
        """
        # The prediction is the average of all predictions and the uncertainty is the standard deviation of all predictions
        # LB: not sure this works if multiple targets are predicted with the same model
        n_models = len(self.models)
        pred_mean, pred_list = self.predict(data, return_list=True)
        pred_std = np.std(pred_list, axis=0)
        training_average_dict = {
            "%C": 0.587936,
            "%Co": 0.306122,
            "%Cr": 0,
            "%V": 0,
            "%Mo": 0,
            "%W": 0.363942,
            "Temperature_C": 0.387755,
        }
        eps = 0.1
        if uncertainty_type == "confidence_interval":
            print("Confidence interval")
            # Confidence interval = mean +- z * std / sqrt(n)
            z = 1.96  # 95%: 1.96, 90%: 1.645
            model_bias_vector = np.ones(pred_mean.shape) * model_bias * pred_mean
            pred_uncertainty = z * (pred_std + model_bias_vector) / np.sqrt(n_models)
        elif uncertainty_type == "std":
            print("Standard deviation")
            pred_uncertainty = pred_std.copy()
        else:
            print("Weighted uncertainty")
            pred_uncertainty = pred_std.copy()
            uncertainty_weights = np.ones((pred_std.shape[0],))
            dist_df = pd.DataFrame()
            for col in training_average_dict.keys():
                print(training_average_dict[col])
                dist_vector = (data[col] - training_average_dict[col]) ** 2
                # dist_vector = np.abs(data[col] - training_average_dict[col])
                # Quick fix for the constant elements that are not properly scaled
                if col in ["%Cr", "%V", "%Mo"]:
                    dist_vector = dist_vector / 10
                dist_df[col] = dist_vector
                print(dist_vector)
            uncertainty_weights = np.sqrt(dist_df.sum(axis=1)) + eps
            pred_uncertainty = np.multiply(uncertainty_weights, pred_uncertainty[:, 0])
        return pred_mean, pred_uncertainty

    def predict(self, data, return_list=False):
        """
        Returns only the prediction of the Ensemble models on the data
        """
        pred_list = self.predict_list(data)
        preds = np.mean(pred_list, axis=0)
        if return_list:
            return preds, pred_list
        return preds
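
# Usage sketch (illustrative only): EnsembleModel only needs objects exposing a predict
# method plus Keras-style History objects exposing history["loss"], so the stand-ins
# below are enough to show the averaging and the uncertainty output.
def _example_ensemble_usage():
    class _StubModel:
        def __init__(self, offset):
            self.offset = offset

        def predict(self, data):
            return np.asarray(data, dtype=float) + self.offset

    class _StubHistory:
        def __init__(self, losses):
            self.history = {"loss": losses}

    models = [_StubModel(0.0), _StubModel(0.2)]
    histories = [_StubHistory([1.0, 0.4]), _StubHistory([1.1, 0.5])]
    ensemble = EnsembleModel(models, histories)
    x = np.array([[0.1], [0.5]])
    # Mean prediction across the kept models plus a per-sample spread
    mean_pred, uncertainty = ensemble.predict_w_uncertainty(x, uncertainty_type="std")
    return mean_pred, uncertainty
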
def unpickle_file(path):
    with open(path, "rb") as file:
        unpickler = pickle.Unpickler(file)
        unpickled_file = unpickler.load()
    return unpickled_file


def read_data(data_path, sep=","):
    """
    Opens the file based on its extension
    """
    file_extension = data_path.split(".")[-1]
    if file_extension == "csv":
        return pd.read_csv(data_path, sep=sep)
    elif file_extension in ["xls", "xlsx"]:
        return pd.read_excel(data_path)
    else:
        return unpickle_file(data_path)


class NoPhysicsModels:
    """
    Class that hides the physics-informed features so that the shap interpreter can be run on it
    """

    def __init__(self, model, scaler_inputs=None, preprocessing_physics_fn=None):
        self.model = model
        self.scaler_inputs = scaler_inputs
        self.physics_fn = preprocessing_physics_fn

    def predict(self, x):
        # Recreate the physics-informed features, scale them with the fitted scaler and predict
        x_w_p = self.physics_fn(x)
        x_w_p_for_scaling = x_w_p[self.scaler_inputs.feature_names_in_]
        x_w_p_scaled = scale_numerical(
            x_w_p_for_scaling, self.scaler_inputs.feature_names_in_, scaler=self.scaler_inputs, fit=False
        )
        return self.model.predict(x_w_p_scaled)
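
# Usage sketch (illustrative only): wraps a model so an explainer such as shap can work
# on the raw inputs while the physics-derived feature is recreated and scaled inside
# predict(). The physics function, the "hardness_proxy" feature and the toy data are
# invented for the example; `model` is any object with a predict method.
def _example_no_physics_wrapper(model):
    def add_physics_features(x):
        x = x.copy()
        x["hardness_proxy"] = x["%C"] * x["Temperature_C"]  # made-up derived feature
        return x

    train = pd.DataFrame({"%C": [0.2, 0.6], "Temperature_C": [20.0, 600.0]})
    scaler = MinMaxScaler().fit(add_physics_features(train))
    wrapped = NoPhysicsModels(model, scaler_inputs=scaler, preprocessing_physics_fn=add_physics_features)
    # The caller (or the explainer) only sees the raw columns; the wrapper handles the rest
    return wrapped.predict(train)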