Upload 3 files
- inference_model_main.py +269 -0
- preprocess_data_main.py +77 -0
- train_ensemble_models_main.py +132 -0
inference_model_main.py
ADDED
@@ -0,0 +1,269 @@
import argparse

# import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf

from utils import encode_categorical, scale_numerical, fill_nans, unpickle_file, EnsembleModel, read_data
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
import pickle


def predict(model_path, data, explainer=None, scaler_targets=None):
    model_extension = model_path.split(".")[-1]
    if model_extension == "h5":
        model = tf.keras.models.load_model(model_path)
    else:
        model = unpickle_file(model_path)
    pred = model.predict(data)
    if model_extension != "h5":
        # Fix for the RF model in the case where there is one feature only (other cases not supported so far)
        pred = pred.reshape(-1, 1)
    if scaler_targets is not None:
        pred = scaler_targets.inverse_transform(pred)
    if explainer:
        return pred, data.columns, explainer.shap_values(data[-10:])
    return pred


def predict_from_multiple_models(
    models_order, model_path_dict, data, explainer_path_dict={}, scaler_targets_path_dict={}
):
    """
    Used by the Gradio app to predict different targets from different models.
    """
    y_pred_list = []
    shap_values_list = []

    for predict_name in models_order:
        if predict_name in scaler_targets_path_dict:
            scaler_targets = unpickle_file(scaler_targets_path_dict[predict_name])
        else:
            scaler_targets = None
        if predict_name in explainer_path_dict:
            explainer = unpickle_file(explainer_path_dict[predict_name])
            y_pred, _, shap_values = predict(
                model_path_dict[predict_name], data, explainer=explainer, scaler_targets=scaler_targets
            )
            shap_values_list.append(shap_values)
        else:
            y_pred = predict(model_path_dict[predict_name], data, explainer=None, scaler_targets=scaler_targets)

        df_pred_task = pd.DataFrame(y_pred, columns=[predict_name])
        y_pred_list.append(df_pred_task)
    df_pred = pd.concat(y_pred_list, axis=1)
    if len(shap_values_list) > 0:
        return df_pred, shap_values_list
    return df_pred


def predict_from_ensemble_model(ensemble_model_path, data, explainer=None, uncertainty_type="confidence_interval"):
    """
    Returns the prediction of a model defined with the EnsembleModel class.
    """
    ensemble_model = unpickle_file(ensemble_model_path)
    pred_mean, pred_uncertainty = ensemble_model.predict_w_uncertainty(data, uncertainty_type=uncertainty_type)
    if explainer is not None:
        shap_values = explainer.shap_values(data[-10:])
        return pred_mean, pred_uncertainty, shap_values
    return pred_mean, pred_uncertainty


def predict_all_results(
    df,
    main_model_path,
    main_input_cols_order,
    scaler_targets_main=None,
    intermediate_model_path=None,
    intermediate_results_columns=[],
    return_uncertainty=False,
    uncertainty_type="confidence_interval",
):
    """
    The initial df must already be scaled.

    Args:
    -----
    df: pd.DataFrame
        Initial inputs
    main_model_path: str
        Path to the model that computes the main results
    scaler_targets_main: scaler for the main results
    intermediate_model_path: None, str or dict; a path to a single model or a dict of models
    intermediate_results_columns: List(str)
    """
    if isinstance(intermediate_model_path, str):
        # This section has not been checked (LB)
        predictions_constraint = predict(intermediate_model_path, df)
        # Keep the last input column 2-D so the shapes line up for concatenation
        input_data_main = np.concatenate([df.values[:, :-1], predictions_constraint, df.values[:, -1:]], axis=1)
    elif isinstance(intermediate_model_path, dict):
        ### Predict the intermediary results from a dictionary of models (not the rescaled version of the intermediary outputs)
        outputs_df = predict_from_multiple_models(
            intermediate_results_columns,
            intermediate_model_path,
            df,
            explainer_path_dict={},
            scaler_targets_path_dict={},
        )
        input_data_main = pd.concat([df, outputs_df], axis=1)  # Concatenate the scaled version of the data
    else:
        input_data_main = df.copy()

    # Put the data in the right order for the main model
    input_data_main = input_data_main[main_input_cols_order]
    # Run the main prediction
    model_extension = main_model_path.split(".")[-1]
    if model_extension == "h5":
        predictions = predict(main_model_path, input_data_main, scaler_targets=scaler_targets_main)
        uncertainty = None
    else:
        predictions, uncertainty = predict_from_ensemble_model(
            main_model_path, input_data_main, uncertainty_type=uncertainty_type
        )

    if return_uncertainty:
        return predictions, uncertainty
    return predictions


def get_test_inference(
    main_folder,
    columns_numerical,
    columns_target,
    model_name,
    test_data_path,
    x_data_scaled=True,
    y_data_rescaled=False,
):
    X_test_data = read_data(os.path.join(main_folder, test_data_path))
    columns_categorical = [column for column in X_test_data.columns if column not in columns_numerical]

    y_test_data = unpickle_file(os.path.join(main_folder, "y_test_data.pickle"))
    one_hot_scaler = unpickle_file(os.path.join(main_folder, "one_hot_scaler.pickle"))
    minmax_scaler_targets = unpickle_file(os.path.join(main_folder, "minmax_scaler_targets.pickle"))
    minmax_scaler_inputs = unpickle_file(os.path.join(main_folder, "minmax_scaler_inputs.pickle"))

    for col in columns_target:
        if col in columns_numerical:
            columns_numerical.remove(col)
    # If the data has not already been scaled
    if not x_data_scaled:
        df_with_results = X_test_data.copy()
        X_test_data = scale_numerical(
            X_test_data, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
        )
    else:
        df_with_results = pd.DataFrame(minmax_scaler_inputs.inverse_transform(X_test_data), columns=X_test_data.columns)

    ### Run the model in inference mode
    predictions = predict(os.path.join(main_folder, model_name), X_test_data)
    y_test_data = minmax_scaler_targets.inverse_transform(y_test_data)

    # Depending on the model used, the targets may already be rescaled (case of ensemble models, to compute the uncertainty)
    if not y_data_rescaled:
        predictions = minmax_scaler_targets.inverse_transform(predictions)

    print("***************************************************")
    print(predictions)
    print(predictions.shape, y_test_data.shape)
    pred_1d = np.squeeze(predictions[:, 0])
    truth_1d = np.squeeze(y_test_data)
    results = pd.DataFrame(
        {
            "predictions": pred_1d,
            "ground truth": truth_1d,
            "mae": np.abs(pred_1d - truth_1d),
            "mse": np.square(pred_1d - truth_1d),  # squared error per sample, so 'mean mse' below is the MSE
            "percentage error": np.abs((pred_1d - truth_1d) / pred_1d) * 100,  # relative to the prediction
        }
    )

    mean_results = pd.DataFrame(
        {
            "mean mae": [np.mean(results["mae"])],
            "mean mse": [np.mean(results["mse"])],
            "mean percentage error": [np.mean(results["percentage error"])],
        }
    )
    print(mean_results)

    metrics = {
        "mae": mean_absolute_error(y_test_data, predictions),
        "mape": mean_absolute_percentage_error(y_test_data, predictions),
        "r2": r2_score(y_test_data, predictions),
    }

    with open(os.path.join(main_folder, "metrics.pkl"), "wb+") as file:
        pickle.dump(metrics, file)

    ### Plot predictions vs ground truth
    plt.clf()
    plt.scatter(results["ground truth"], results["predictions"], c="r")
    plt.plot(results["ground truth"], results["ground truth"])
    plt.xlabel("Ground truth")
    plt.ylabel("Predictions")
    fig = plt.gcf()
    fig.savefig(os.path.join(main_folder, "plot_performance_test.png"))
    plt.show()

    df_with_results["ground_truth"] = truth_1d
    df_with_results["predictions"] = pred_1d
    return metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--model_path", type=str, help="The path to your model file", default="model_hardness.h5", required=False
    )
    parser.add_argument(
        "--model_folder", type=str, help="The path to your model folder", default="./models/phases", required=False
    )
    parser.add_argument(
        "--df_columns",
        type=str,
        help="List of data columns of the dataset",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature",
        required=False,
    )
    parser.add_argument("--columns_target", type=str, help="List of target columns", default="H", required=False)
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="List of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )
    parser.add_argument(
        "--data_path",
        type=str,
        help="The path to your input data for inference",
        default="X_test_data.pickle",
        required=False,
    )

    args = parser.parse_args()

    ### Get categorical and numerical columns
    columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
    df_columns = args.df_columns.split(",")
    columns_target = args.columns_target.split(",")

    get_test_inference(
        args.model_folder,
        columns_numerical,
        columns_target,
        args.model_path,
        args.data_path,
        x_data_scaled=True,
        y_data_rescaled=False,
    )
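For reference, a minimal usage sketch of the helpers above; the input CSV, target names, and model paths are placeholders, not files from this repo:

import pandas as pd
from inference_model_main import predict_from_multiple_models

# Inputs are assumed to be already scaled, as the functions above expect
df = pd.read_csv("scaled_inputs.csv", sep=";")
models_order = ["%Phase_A", "%Phase_B"]  # one model per intermediate target
model_path_dict = {
    "%Phase_A": "./models/phases/model_phase_a.h5",  # hypothetical paths
    "%Phase_B": "./models/phases/model_phase_b.h5",
}
preds = predict_from_multiple_models(models_order, model_path_dict, df)
print(preds.head())  # one column per predicted target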
preprocess_data_main.py
ADDED
@@ -0,0 +1,77 @@
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import argparse

from utils import encode_categorical, scale_numerical, fill_nans, read_data
from alloy_data_preprocessing import add_physics_features


def alloy_preprocessing(df):
    return add_physics_features(df)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        help="The path to your input data file",
        default="./data/Data_Osium.csv",
        required=False,
    )
    parser.add_argument(
        "--preprocessed_data_path",
        type=str,
        help="The path where the data preprocessed for training is written",
        default="preprocessed_data.csv",
        required=False,
    )
    parser.add_argument(
        "--columns_not_training",
        type=str,
        help="List of data columns not used for training",
        default="",
        required=False,
    )
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="List of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )
    parser.add_argument(
        "--add_physics",
        type=str,
        help="Whether to add physics-based features",
        default="y",
        required=False,
    )

    args = parser.parse_args()

    df = read_data(args.data_path)

    columns_not_training = args.columns_not_training.split(",") if args.columns_not_training else []
    df.drop(columns=columns_not_training, inplace=True)

    columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []

    # Fill NaN values: mode for categorical columns, mean for numerical ones
    for col in df.columns:
        if col not in columns_numerical:
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].mean())

    assert not df.isna().any().any()

    if args.add_physics == "y":
        df = alloy_preprocessing(df)

    df.to_csv(args.preprocessed_data_path, sep=";", index=False)
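As a quick illustration of the NaN-filling convention used above (mode for categorical columns, mean for numerical ones), a self-contained toy example:

import numpy as np
import pandas as pd

df = pd.DataFrame({"alloy": ["A", None, "A"], "H": [1.0, np.nan, 3.0]})
columns_numerical = ["H"]
for col in df.columns:
    if col not in columns_numerical:
        df[col] = df[col].fillna(df[col].mode()[0])  # most frequent category
    else:
        df[col] = df[col].fillna(df[col].mean())  # column mean
assert not df.isna().any().any()
print(df)  # the missing entries become "A" and 2.0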
train_ensemble_models_main.py
ADDED
@@ -0,0 +1,132 @@
import argparse
import pandas as pd
import os
import numpy as np
import pickle

from train_model_main import prepare_data, set_all_seeds, train_model, save_shap_explainer
from utils import EnsembleModel, unpickle_file


def run_ensemble_models_training(
    data,
    columns_numerical,
    columns_target,
    main_folder,
    model_name,
    data_type="path",
    lr=0.01,
    n_models=5,
    save_explainer_single=False,
    save_explainer_ensemble=True,
    seed_train_test_split=None,
):
    """
    Trains multiple models with different seeds for the same prediction task.
    """
    seeds = range(n_models)
    model_list = []
    model_path_list = []
    history_list = []
    for s in seeds:
        print("-----------------------")
        print(f"Training model {s + 1}/{n_models}")
        main_seed_folder = os.path.join(main_folder, f"seed{s}")
        seed_split = s
        if seed_train_test_split is not None:
            seed_split = seed_train_test_split
        X_train, X_test, y_train, y_test = prepare_data(
            data, columns_numerical, columns_target, main_seed_folder, data_type=data_type, seed=seed_split
        )
        model, history = train_model(
            X_train,
            X_test,
            y_train,
            y_test,
            columns_target,
            main_seed_folder,
            model_name,
            lr=lr,
            seed=s,
            get_history=True,
        )
        model_list.append(model)
        history_list.append(history)
        model_path_list.append(os.path.join(main_seed_folder, model_name))
        if save_explainer_single:
            save_shap_explainer(model.predict, X_train, X_test, main_seed_folder)

    scaler_targets = unpickle_file(os.path.join(main_seed_folder, "minmax_scaler_targets.pickle"))
    ensemble_model = EnsembleModel(model_list, history_list, scaler_targets=scaler_targets)
    with open(os.path.join(main_folder, f"ensemble_{model_name.split('.')[0]}.pkl"), "wb+") as file:
        pickle.dump(ensemble_model, file)
    # For now this just keeps the last X_train, X_test; should be changed to a better solution
    X_train_all = X_train.copy()
    X_test_all = X_test.copy()

    if save_explainer_ensemble:
        save_shap_explainer(ensemble_model.predict, X_train_all, X_test_all, main_folder)

    return model_list


def train_ensemble_models_from_split(
    X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, n_models=5, save_explainer=True
):
    """
    Assumes the train set is the same for all the models.
    """
    seeds = range(n_models)
    model_ls = []
    for s in seeds:
        model_main_name, model_ext = os.path.splitext(model_path)
        model_name = f"{model_main_name}_s{s}{model_ext}"
        model = train_model(X_train, X_test, y_train, y_test, columns_target, main_folder, model_name, lr=lr, seed=s)
        model_ls.append(model)
        if save_explainer:
            save_shap_explainer(model, X_train, X_test, main_folder)

    return model_ls


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        help="The path to your input data file",
        default="preprocessed_data.csv",
        required=False,
    )
    parser.add_argument(
        "--main_folder", type=str, help="Folder to save model files", default="../models/hardness", required=False
    )
    parser.add_argument(
        "--model_name", type=str, help="Path to save the model", default="model_hardness.h5", required=False
    )
    parser.add_argument("--columns_target", type=str, help="List of target columns", default="H", required=False)
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="List of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )
    parser.add_argument("--learning_rate", "-lr", type=float, help="Learning rate", default=0.01, required=False)
    parser.add_argument("--n_models", "-n", type=int, help="Number of models to run", default=2, required=False)

    args = parser.parse_args()

    columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
    columns_target = args.columns_target.split(",") if args.columns_target else []

    run_ensemble_models_training(
        args.data_path,
        columns_numerical,
        columns_target,
        args.main_folder,
        args.model_name,
        lr=args.learning_rate,
        n_models=args.n_models,
    )
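With the defaults above, the ensemble is pickled to ../models/hardness/ensemble_model_hardness.pkl. A minimal sketch of how it could then be consumed with predict_from_ensemble_model from inference_model_main.py (the input CSV is a placeholder, and the inputs are assumed to be already scaled):

import pandas as pd
from inference_model_main import predict_from_ensemble_model

X = pd.read_csv("preprocessed_data.csv", sep=";")  # placeholder input data
pred_mean, pred_uncertainty = predict_from_ensemble_model(
    "../models/hardness/ensemble_model_hardness.pkl",
    X,
    uncertainty_type="confidence_interval",
)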