bndl committed
Commit 19b61e8 · 1 Parent(s): d7010e9

Upload 3 files

inference_model_main.py ADDED
@@ -0,0 +1,269 @@
+ import argparse
+
+ # import shap
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import os
+ import tensorflow as tf
+
+ from utils import encode_categorical, scale_numerical, fill_nans, unpickle_file, EnsembleModel, read_data
+ from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
+ import pickle
+
+
+ def predict(model_path, data, explainer=None, scaler_targets=None):
+     model_extension = model_path.split(".")[-1]
+     if model_extension == "h5":
+         model = tf.keras.models.load_model(model_path)
+     else:
+         model = unpickle_file(model_path)
+     pred = model.predict(data)
+     if model_extension != "h5":
+         # Fix for the RF model in the case where there is only one feature (other cases are not supported so far)
+         pred = pred.reshape(-1, 1)
+     if scaler_targets is not None:
+         pred = scaler_targets.inverse_transform(pred)
+     if explainer:
+         return pred, data.columns, explainer.shap_values(data[-10:])
+     return pred
+
+
+ def predict_from_multiple_models(
+     models_order, model_path_dict, data, explainer_path_dict=None, scaler_targets_path_dict=None
+ ):
+     """
+     Used in the Gradio app to predict different targets from different models.
+     """
+     explainer_path_dict = explainer_path_dict or {}
+     scaler_targets_path_dict = scaler_targets_path_dict or {}
+     y_pred_list = []
+     shap_values_list = []
+
+     for predict_name in models_order:
+         if predict_name in scaler_targets_path_dict:
+             scaler_targets = unpickle_file(scaler_targets_path_dict[predict_name])
+         else:
+             scaler_targets = None
+         if predict_name in explainer_path_dict:
+             explainer = unpickle_file(explainer_path_dict[predict_name])
+             y_pred, _, shap_values = predict(
+                 model_path_dict[predict_name], data, explainer=explainer, scaler_targets=scaler_targets
+             )
+             shap_values_list += [shap_values]
+         else:
+             explainer = None
+             y_pred = predict(model_path_dict[predict_name], data, explainer=explainer, scaler_targets=scaler_targets)
+
+         df_pred_task = pd.DataFrame(y_pred, columns=[predict_name])
+         y_pred_list.append(df_pred_task)
+     df_pred = pd.concat(y_pred_list, axis=1)
+     if len(shap_values_list) > 0:
+         return df_pred, shap_values_list
+     return df_pred
+
+
+ def predict_from_ensemble_model(ensemble_model_path, data, explainer=None, uncertainty_type="confidence_interval"):
+     """
+     Returns the prediction of a model defined using the EnsembleModel class.
+     """
+     ensemble_model = unpickle_file(ensemble_model_path)
+     pred_mean, pred_uncertainty = ensemble_model.predict_w_uncertainty(data, uncertainty_type=uncertainty_type)
+     if explainer is not None:
+         shap_values = explainer.shap_values(data[-10:])
+         return pred_mean, pred_uncertainty, shap_values
+     return pred_mean, pred_uncertainty
+
+
+ def predict_all_results(
+     df,
+     main_model_path,
+     main_input_cols_order,
+     scaler_targets_main=None,
+     intermediate_model_path=None,
+     intermediate_results_columns=None,
+     return_uncertainty=False,
+     uncertainty_type="confidence_interval",
+ ):
+     """
+     The initial df must already be scaled.
+
+     Args:
+     -----
+     df: pd.DataFrame
+         Initial inputs
+     main_model_path: str
+         Path to the model used to compute the main results
+     scaler_targets_main: scaler for the main results
+     intermediate_model_path: None, str or dict; a path to a model or a dict of models
+     intermediate_results_columns: List[str]
+     """
+     intermediate_results_columns = intermediate_results_columns or []
+     if isinstance(intermediate_model_path, str):
+         # This section has not been checked (LB)
+         predictions_constraint = predict(intermediate_model_path, df)
+         input_data_main = np.concatenate([df.values[:, :-1], predictions_constraint, df.values[:, -1:]], axis=1)
+     elif isinstance(intermediate_model_path, dict):
+         ### Predict the intermediary results from a dictionary of models (non-rescaled version of the intermediary outputs)
+         outputs_df = predict_from_multiple_models(
+             intermediate_results_columns,
+             intermediate_model_path,
+             df,
+             explainer_path_dict={},
+             scaler_targets_path_dict={},
+         )
+         input_data_main = pd.concat([df, outputs_df], axis=1)  # Concatenate the scaled version of the data
+     else:
+         input_data_main = df.copy()
+
+     # Put the data in the right order for the main model
+     input_data_main = input_data_main[main_input_cols_order]
+     # Run the main prediction
+     model_extension = main_model_path.split(".")[-1]
+     if model_extension == "h5":
+         predictions = predict(main_model_path, input_data_main, scaler_targets=scaler_targets_main)
+         uncertainty = None
+     else:
+         predictions, uncertainty = predict_from_ensemble_model(
+             main_model_path, input_data_main, uncertainty_type=uncertainty_type
+         )
+
+     if return_uncertainty:
+         return predictions, uncertainty
+     return predictions
+
+
+ def get_test_inference(
+     main_folder,
+     columns_numerical,
+     columns_target,
+     model_name,
+     test_data_path,
+     x_data_scaled=True,
+     y_data_rescaled=False,
+ ):
+     X_test_data = read_data(os.path.join(main_folder, test_data_path))
+     columns_categorical = [column for column in X_test_data.columns if column not in columns_numerical]
+
+     y_test_data = unpickle_file(os.path.join(main_folder, "y_test_data.pickle"))
+     one_hot_scaler = unpickle_file(os.path.join(main_folder, "one_hot_scaler.pickle"))
+     minmax_scaler_targets = unpickle_file(os.path.join(main_folder, "minmax_scaler_targets.pickle"))
+     minmax_scaler_inputs = unpickle_file(os.path.join(main_folder, "minmax_scaler_inputs.pickle"))
+
+     for col in columns_target:
+         if col in columns_numerical:
+             columns_numerical.remove(col)
+     # If the data has not already been scaled
+     if not x_data_scaled:
+         df_with_results = X_test_data.copy()
+         X_test_data = scale_numerical(
+             X_test_data, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
+         )
+     else:
+         df_with_results = pd.DataFrame(minmax_scaler_inputs.inverse_transform(X_test_data), columns=X_test_data.columns)
+
+     ### Run the model in inference mode
+     predictions = predict(os.path.join(main_folder, model_name), X_test_data)
+     y_test_data = minmax_scaler_targets.inverse_transform(y_test_data)
+
+     # Depending on the model used, the targets may already be rescaled (case of ensemble models used to compute the uncertainty)
+     if not y_data_rescaled:
+         predictions = minmax_scaler_targets.inverse_transform(predictions)
+
+     print("***************************************************")
+     print(predictions)
+     print(predictions.shape, y_test_data.shape)
+     results = pd.DataFrame(
+         {
+             "predictions": np.squeeze(predictions[:, 0]),
+             "ground truth": np.squeeze(y_test_data),
+             "mae": np.abs(np.squeeze(predictions[:, 0]) - np.squeeze(y_test_data)),
+             # Per-sample root of the squared error (element-wise this equals the absolute error)
+             "mse": np.sqrt(np.square(np.squeeze(predictions[:, 0]) - np.squeeze(y_test_data))),
+             "percentage error": np.abs(
+                 (np.squeeze(predictions[:, 0]) - np.squeeze(y_test_data)) / np.squeeze(predictions[:, 0])
+             )
+             * 100,
+         }
+     )
+
+     mean_results = pd.DataFrame(
+         {
+             "mean mae": [np.mean(results["mae"])],
+             "mean mse": [np.mean(results["mse"])],
+             "mean percentage error": [np.mean(results["percentage error"])],
+         }
+     )
+     print(mean_results)
+
+     metrics = {
+         "mae": mean_absolute_error(y_test_data, predictions),
+         "mape": mean_absolute_percentage_error(y_test_data, predictions),
+         "r2": r2_score(y_test_data, predictions),
+     }
+
+     with open(os.path.join(main_folder, "metrics.pkl"), "wb+") as file:
+         pickle.dump(metrics, file)
+
+     ### Plot predictions vs ground truth
+     plt.clf()
+     plt.scatter(results["ground truth"], results["predictions"], c="r")
+     plt.plot(results["ground truth"], results["ground truth"])
+     plt.xlabel("Ground truth")
+     plt.ylabel("Predictions")
+     fig = plt.gcf()
+     fig.savefig(os.path.join(main_folder, "plot_performance_test.png"))
+     plt.show()
+
+     df_with_results["ground_truth"] = y_test_data
+     df_with_results["predictions"] = predictions
+     return metrics
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process parameters")
+     parser.add_argument(
+         "--model_path", type=str, help="The path to your model file", default="model_hardness.h5", required=False
+     )
+     parser.add_argument(
+         "--model_folder", type=str, help="The path to your model folder", default="./models/phases", required=False
+     )
+     parser.add_argument(
+         "--df_columns",
+         type=str,
+         help="List of data columns of the dataset",
+         default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature",
+         required=False,
+     )
+     parser.add_argument("--columns_target", type=str, help="List of target columns", default="H", required=False)
+     parser.add_argument(
+         "--columns_numerical",
+         type=str,
+         help="List of data columns with numeric values",
+         default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
+         required=False,
+     )
+     parser.add_argument(
+         "--data_path",
+         type=str,
+         help="The path to your input data for inference",
+         default="X_test_data.pickle",
+         required=False,
+     )
+
+     args = parser.parse_args()
+
+     ### Get categorical and numerical columns
+     columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
+     df_columns = args.df_columns.split(",")
+     columns_target = args.columns_target.split(",")
+
+     get_test_inference(
+         args.model_folder,
+         columns_numerical,
+         columns_target,
+         args.model_path,
+         args.data_path,
+         x_data_scaled=True,
+         y_data_rescaled=False,
+     )
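
As a reference, here is a minimal usage sketch for the entry points above, assuming the pickled ensemble model and scalers already exist under the model folder; the input file name and ensemble file name are illustrative placeholders, not files from this commit:

    # Hypothetical driver for inference_model_main; paths are placeholders.
    import pandas as pd
    from inference_model_main import predict_all_results

    df_scaled = pd.read_csv("X_scaled.csv", sep=";")  # inputs assumed already scaled
    predictions, uncertainty = predict_all_results(
        df_scaled,
        main_model_path="./models/hardness/ensemble_model_hardness.pkl",  # non-h5 extension -> ensemble branch
        main_input_cols_order=list(df_scaled.columns),
        return_uncertainty=True,
        uncertainty_type="confidence_interval",
    )
    print(predictions[:5])

With an .h5 path the same call would instead load a Keras model and return uncertainty as None.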
preprocess_data_main.py ADDED
@@ -0,0 +1,77 @@
+ import pandas as pd
+ import os
+ from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import numpy as np
+ import pickle
+ import argparse
+
+ from utils import encode_categorical, scale_numerical, fill_nans, read_data
+ from alloy_data_preprocessing import add_physics_features
+
+
+ def alloy_preprocessing(df):
+     return add_physics_features(df)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process parameters")
+     parser.add_argument(
+         "--data_path",
+         type=str,
+         help="The path to your input data file",
+         default="./data/Data_Osium.csv",
+         required=False,
+     )
+     parser.add_argument(
+         "--preprocessed_data_path",
+         type=str,
+         help="The path to your input data file preprocessed for training",
+         default="preprocessed_data.csv",
+         required=False,
+     )
+     parser.add_argument(
+         "--columns_not_training",
+         type=str,
+         help="List of data columns not used for training",
+         default="",
+         required=False,
+     )
+     parser.add_argument(
+         "--columns_numerical",
+         type=str,
+         help="List of data columns with numeric values",
+         default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
+         required=False,
+     )
+     parser.add_argument(
+         "--add_physics",
+         type=str,
+         help="Whether to add physics-based features",
+         default="y",
+         required=False,
+     )
+
+     args = parser.parse_args()
+
+     df = read_data(args.data_path)
+
+     columns_not_training = args.columns_not_training.split(",") if args.columns_not_training else []
+     df.drop(columns=columns_not_training, inplace=True)
+
+     columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
+
+     # Fill NaN values: mode for categorical columns, mean for numerical ones
+     for col in df.columns:
+         if col not in columns_numerical:
+             df[col] = df[col].fillna(df[col].mode()[0])
+         else:
+             df[col] = df[col].fillna(df[col].mean())
+
+     assert not df.isna().any().any()  # No NaNs should remain
+
+     if args.add_physics == "y":
+         df = alloy_preprocessing(df)
+
+     df.to_csv(args.preprocessed_data_path, sep=";", index=False)
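
The NaN-filling rule used above (column mode for categorical features, column mean for numerical ones) can be seen in isolation in the following self-contained sketch; the toy frame is made up for illustration:

    # Minimal sketch of the fill rule on placeholder data.
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"alloy": ["a", "b", None, "a"], "H": [1.0, np.nan, 3.0, 5.0]})
    columns_numerical = ["H"]
    for col in df.columns:
        if col not in columns_numerical:
            df[col] = df[col].fillna(df[col].mode()[0])  # most frequent value
        else:
            df[col] = df[col].fillna(df[col].mean())  # column mean
    assert not df.isna().any().any()  # "alloy" gets "a", "H" gets 3.0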
train_ensemble_models_main.py ADDED
@@ -0,0 +1,132 @@
+ import argparse
+ import pandas as pd
+ import os
+ import numpy as np
+ import pickle
+
+ from train_model_main import prepare_data, set_all_seeds, train_model, save_shap_explainer
+ from utils import EnsembleModel, unpickle_file
+
+
+ def run_ensemble_models_training(
+     data,
+     columns_numerical,
+     columns_target,
+     main_folder,
+     model_name,
+     data_type="path",
+     lr=0.01,
+     n_models=5,
+     save_explainer_single=False,
+     save_explainer_ensemble=True,
+     seed_train_test_split=None,
+ ):
+     """
+     Trains multiple models with different seeds for the same prediction task.
+     """
+     seeds = range(n_models)
+     model_list = []
+     model_path_list = []
+     history_list = []
+     for s in seeds:
+         print("-----------------------")
+         print(f"Training model {s + 1}/{n_models}")
+         main_seed_folder = os.path.join(main_folder, f"seed{s}")
+         seed_split = s
+         if seed_train_test_split is not None:
+             seed_split = seed_train_test_split
+         X_train, X_test, y_train, y_test = prepare_data(
+             data, columns_numerical, columns_target, main_seed_folder, data_type=data_type, seed=seed_split
+         )
+         model, history = train_model(
+             X_train,
+             X_test,
+             y_train,
+             y_test,
+             columns_target,
+             main_seed_folder,
+             model_name,
+             lr=lr,
+             seed=s,
+             get_history=True,
+         )
+         model_list.append(model)
+         history_list.append(history)
+         model_path_list.append(os.path.join(main_seed_folder, model_name))
+         if save_explainer_single:
+             save_shap_explainer(model.predict, X_train, X_test, main_seed_folder)
+
+     scaler_targets = unpickle_file(os.path.join(main_seed_folder, "minmax_scaler_targets.pickle"))
+     ensemble_model = EnsembleModel(model_list, history_list, scaler_targets=scaler_targets)
+     with open(os.path.join(main_folder, f"ensemble_{model_name.split('.')[0]}.pkl"), "wb+") as file:
+         pickle.dump(ensemble_model, file)
+     # For now this just uses the last X_train, X_test; this should be changed to a better solution
+     X_train_all = X_train.copy()
+     X_test_all = X_test.copy()
+
+     if save_explainer_ensemble:
+         save_shap_explainer(ensemble_model.predict, X_train_all, X_test_all, main_folder)
+
+     return model_list
+
+
+ def train_ensemble_models_from_split(
+     X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, n_models=5, save_explainer=True
+ ):
+     """
+     Assumes the train set is the same for all the models.
+     """
+     seeds = range(n_models)
+     model_ls = []
+     for s in seeds:
+         model_main_name, model_ext = model_path.rsplit(".", 1)
+         model_name = f"{model_main_name}_s{s}.{model_ext}"
+         model = train_model(X_train, X_test, y_train, y_test, columns_target, main_folder, model_name, lr=lr, seed=s)
+         model_ls.append(model)
+         if save_explainer:
+             save_shap_explainer(model, X_train, X_test, main_folder)
+
+     return model_ls
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process parameters")
+     parser.add_argument(
+         "--data_path",
+         type=str,
+         help="The path to your input data file",
+         default="preprocessed_data.csv",
+         required=False,
+     )
+     parser.add_argument(
+         "--main_folder", type=str, help="Folder to save model files", default="../models/hardness", required=False
+     )
+     parser.add_argument(
+         "--model_name", type=str, help="Path to save the model", default="model_hardness.h5", required=False
+     )
+     parser.add_argument("--columns_target", type=str, help="List of target columns", default="H", required=False)
+     parser.add_argument(
+         "--columns_numerical",
+         type=str,
+         help="List of data columns with numeric values",
+         default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
+         required=False,
+     )
+     parser.add_argument("--learning_rate", "-lr", type=float, help="Learning rate", default=0.01, required=False)
+     parser.add_argument("--n_models", "-n", type=int, help="Number of models to run", default=2, required=False)
+
+     args = parser.parse_args()
+
+     columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
+     columns_target = args.columns_target.split(",") if args.columns_target else []
+
+     run_ensemble_models_training(
+         args.data_path,
+         columns_numerical,
+         columns_target,
+         args.main_folder,
+         args.model_name,
+         lr=args.learning_rate,
+         n_models=args.n_models,
+     )
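
Finally, a sketch of driving the ensemble training directly from Python rather than through argparse; the numerical-column subset is illustrative, and preprocessed_data.csv is assumed to come from preprocess_data_main.py:

    # Hypothetical driver for run_ensemble_models_training; the column list is a placeholder.
    from train_ensemble_models_main import run_ensemble_models_training

    models = run_ensemble_models_training(
        data="preprocessed_data.csv",  # output of preprocess_data_main.py
        columns_numerical=["%A", "%B", "%C", "Temperature_C"],
        columns_target=["H"],
        main_folder="../models/hardness",
        model_name="model_hardness.h5",
        lr=0.01,
        n_models=5,  # trains one model per seed 0..4 and pickles the ensemble
    )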