bndl committed
Commit 155354b · 1 Parent(s): b070101

Upload train_model_main.py

Files changed (1)
  1. train_model_main.py +254 -0
train_model_main.py ADDED
@@ -0,0 +1,254 @@
+ import argparse
+ import os
+ import pickle
+ import random
+
+ import dill
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import shap
+ import tensorflow as tf
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
+ from sklearn.model_selection import train_test_split
+ from tensorflow.keras import initializers
+
+ from alloy_data_preprocessing import add_physics_features
+ from utils import encode_categorical, scale_numerical, NoPhysicsModels, unpickle_file
+
+ SEED = 42
+
+
+ def set_all_seeds(seed=SEED):
+     """Seed Python, NumPy, and TensorFlow RNGs for reproducible runs."""
+     os.environ["PYTHONHASHSEED"] = str(seed)
+     tf.keras.utils.set_random_seed(seed)  # also seeds random and numpy in TF >= 2.7
+     np.random.seed(seed)
+     random.seed(seed)
+
+
+ def setup_model(num_outputs):
+     """Build a small fully connected regression network (8 -> 4 -> num_outputs)."""
+     model = tf.keras.models.Sequential(
+         [
+             tf.keras.layers.Flatten(),
+             tf.keras.layers.Dense(
+                 8,
+                 kernel_initializer=initializers.RandomNormal(stddev=0.00001),  # Initially was at 0.01
+                 bias_initializer=initializers.Zeros(),
+                 activation="relu",
+             ),
+             tf.keras.layers.Dense(
+                 4,
+                 activation="relu",
+                 kernel_initializer=initializers.RandomNormal(stddev=0.00001),  # Initially was at 0.01
+                 bias_initializer=initializers.Zeros(),
+             ),
+             tf.keras.layers.Dense(
+                 num_outputs,
+                 activation="relu",  # ReLU output: targets are min-max scaled to [0, 1]
+                 kernel_initializer=initializers.RandomNormal(stddev=0.00001),  # Initially was at 0.01
+                 bias_initializer=initializers.Zeros(),
+             ),
+         ]
+     )
+     return model
+
+
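+ # Usage sketch (illustrative; the input width below is hypothetical): the
+ # Flatten layer defers the input shape, so the network can be built explicitly
+ # for inspection before training.
+ # model = setup_model(num_outputs=1)
+ # model.build(input_shape=(None, 20))  # e.g. 20 encoded feature columns
+ # model.summary()
+
+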
+ def prepare_data(data, columns_num, columns_target, main_folder, data_type="path", seed=SEED):
+     """Read, encode, scale, and split the data; pickle the test split and fitted scalers."""
+     # Create folder if it doesn't exist
+     os.makedirs(main_folder, exist_ok=True)
+
+     columns_numerical = columns_num.copy()
+
+     ### Read data
+     print("data_type:", data_type)
+     if data_type == "path":
+         df = pd.read_csv(data, sep=";")
+     else:
+         df = data.copy()
+
+     ### Separate features from targets
+     X = df.drop(columns=columns_target)
+     y = df[columns_target]
+
+     # Remove the index column (if coming from the sampling pipeline)
+     if "Index" in X.columns:
+         X.drop(columns=["Index"], inplace=True)
+
+     ### Any column not declared numerical is treated as categorical
+     columns_categorical = [column for column in X.columns if column not in columns_numerical]
+     print("Feature columns:", list(X.columns))
+
+     ### One-hot encode categoricals and min-max scale numericals
+     X, one_hot_scaler = encode_categorical(X, columns_categorical)
+     X, minmax_scaler_inputs = scale_numerical(
+         X, [column for column in columns_numerical if column not in columns_target]
+     )
+     y, minmax_scaler_targets = scale_numerical(y, columns_target)
+
+     ### Split data
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
+
+     ### Pickle the test split and scalers for later inference and explainability
+     with open(os.path.join(main_folder, "X_test_data.pickle"), "wb+") as file:
+         pickle.dump(X_test, file)
+
+     with open(os.path.join(main_folder, "y_test_data.pickle"), "wb+") as file:
+         pickle.dump(y_test, file)
+
+     with open(os.path.join(main_folder, "one_hot_scaler.pickle"), "wb+") as file:
+         pickle.dump(one_hot_scaler, file)
+
+     with open(os.path.join(main_folder, "minmax_scaler_inputs.pickle"), "wb+") as file:
+         pickle.dump(minmax_scaler_inputs, file)
+
+     with open(os.path.join(main_folder, "minmax_scaler_targets.pickle"), "wb+") as file:
+         pickle.dump(minmax_scaler_targets, file)
+
+     return X_train, X_test, y_train, y_test
+
+
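+ # Usage sketch (illustrative; argument values mirror the CLI defaults below):
+ # X_train, X_test, y_train, y_test = prepare_data(
+ #     "preprocessed_data.csv", ["H", "Temperature_C"], ["H"], "../models/hardness"
+ # )
+
+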
+ def train_model_ml(X_train, X_test, y_train, y_test, main_folder, model_path, seed=SEED):
+     """Train a random-forest baseline, report test metrics, and pickle the model."""
+     set_all_seeds(seed)
+
+     model = RandomForestRegressor(random_state=seed)
+     model.fit(X_train, y_train)
+     y_hat = model.predict(X_test)
+     print("----------------")
+     print("Model performance")
+     print("MAE", mean_absolute_error(y_test, y_hat))
+     print("MAPE", mean_absolute_percentage_error(y_test, y_hat))
+     print("R2", r2_score(y_test, y_hat))
+
+     with open(os.path.join(main_folder, model_path), "wb+") as file:
+         pickle.dump(model, file)
+
+     return model
+
+
+ def train_model(
+     X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, seed=SEED, get_history=False
+ ):
+     """Train the Keras network, save it, pickle the loss history, and plot the loss curves."""
+     # Set all seeds for reproducibility
+     set_all_seeds(seed)
+
+     # Create folder if it doesn't exist
+     os.makedirs(main_folder, exist_ok=True)
+
+     ## Set up and train the model
+     model = setup_model(len(columns_target))
+     opt = tf.keras.optimizers.Adam(learning_rate=lr)  # 0.01 for the hardness model
+     print("learning rate", lr)
+     model.compile(optimizer=opt, loss="mean_squared_error")
+
+     history = model.fit(
+         X_train, y_train, batch_size=1, epochs=200, verbose=1, validation_data=(X_test, y_test), shuffle=True
+     )
+     model.save(os.path.join(main_folder, model_path))
+
+     # Pickle history.history (the metrics dict) rather than the History object,
+     # which holds a reference to the model and may not pickle cleanly
+     model_core_name = os.path.splitext(model_path)[0]
+     with open(os.path.join(main_folder, f"{model_core_name}_fit_history.pickle"), "wb+") as file:
+         pickle.dump(history.history, file)
+
+     ### Plot train/validation loss
+     plt.clf()
+     plt.plot(history.history["loss"])
+     plt.plot(history.history["val_loss"])
+     plt.title("model loss")
+     plt.ylabel("loss")
+     plt.xlabel("epoch")
+     plt.legend(["train", "test"], loc="upper left")
+     fig = plt.gcf()
+     plt.show()
+     fig.savefig(os.path.join(main_folder, "plot_loss_function.png"))
+
+     if get_history:
+         return model, history
+     return model
+
+
+ def save_shap_explainer(predict_fn, X_train, X_test, main_folder, explainer_name="explainer"):
+     """Fit a SHAP KernelExplainer, save a summary plot, and serialize the explainer with dill."""
+     # Create folder if it doesn't exist
+     os.makedirs(main_folder, exist_ok=True)
+
+     ## Fit the explainer on a background sample and explain the last test rows
+     ex = shap.KernelExplainer(predict_fn, X_train[:80])
+     shap_values = ex.shap_values(X_test[-20:])
+     # TODO: check that this works in all cases (especially if X_test has a single row)
+     # Single-output models may return a one-element list of arrays
+     if len(shap_values) == 1:
+         shap_values = shap_values[0]
+     plt.clf()
+     shap.summary_plot(shap_values, X_test[-20:], show=False)
+     fig = plt.gcf()
+     fig.savefig(os.path.join(main_folder, f"plot_shap_summary_{explainer_name}.png"))
+     plt.show()
+
+     # dill handles the wrapped predict function, which plain pickle may not
+     with open(os.path.join(main_folder, f"{explainer_name}.bz2"), "wb") as file:
+         dill.dump(ex, file)
+
+
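+ # Example (illustrative, hypothetical path): the dill-serialized explainer can
+ # be reloaded later for inference-time explanations.
+ # with open("../models/hardness/explainer.bz2", "rb") as file:
+ #     explainer = dill.load(file)
+ # shap_values = explainer.shap_values(X_new)  # X_new: encoded/scaled feature rows
+
+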
+ def compute_shap_explainer_no_physics(model_path, X_train, X_test, main_folder, scaler_inputs_path):
+     """
+     Create and save a SHAP explainer that does not include physics-informed features.
+     To be shared with customers and used in the Gradio app.
+     X_train and X_test must NOT be scaled.
+     """
+     scaler_inputs = unpickle_file(scaler_inputs_path)
+     if model_path.endswith(".h5"):
+         model = tf.keras.models.load_model(model_path)
+     else:
+         model = unpickle_file(model_path)
+
+     # Wrap the model so scaling and physics features are applied inside predict()
+     model_no_physics = NoPhysicsModels(model, scaler_inputs, add_physics_features)
+
+     save_shap_explainer(model_no_physics.predict, X_train, X_test, main_folder, explainer_name="exp_no_physics")
+
+
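+ # Usage sketch (illustrative; the paths and frames are hypothetical, and the
+ # frames must be unscaled as the docstring requires):
+ # compute_shap_explainer_no_physics(
+ #     "../models/hardness/model_hardness.h5",
+ #     X_train_raw,
+ #     X_test_raw,
+ #     "../models/hardness",
+ #     "../models/hardness/minmax_scaler_inputs.pickle",
+ # )
+
+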
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process parameters")
+     parser.add_argument(
+         "--data_path",
+         type=str,
+         help="Path to the input data file",
+         default="preprocessed_data.csv",
+         required=False,
+     )
+     parser.add_argument(
+         "--main_folder", type=str, help="Folder to save model files", default="../models/hardness", required=False
+     )
+     parser.add_argument(
+         "--model_path", type=str, help="Path to save the model", default="model_hardness.h5", required=False
+     )
+     parser.add_argument(
+         "--columns_target", type=str, help="Comma-separated list of target columns", default="H", required=False
+     )
+     parser.add_argument(
+         "--columns_numerical",
+         type=str,
+         help="Comma-separated list of data columns with numeric values",
+         default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
+         required=False,
+     )
+
+     args = parser.parse_args()
+
+     columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
+     columns_target = args.columns_target.split(",") if args.columns_target else []
+
+     X_train, X_test, y_train, y_test = prepare_data(args.data_path, columns_numerical, columns_target, args.main_folder)
+
+     model = train_model(X_train, X_test, y_train, y_test, columns_target, args.main_folder, args.model_path)
+
+     save_shap_explainer(model.predict, X_train, X_test, args.main_folder)
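+
+     # Example invocation (values shown are the argparse defaults):
+     # python train_model_main.py --data_path preprocessed_data.csv \
+     #     --main_folder ../models/hardness --model_path model_hardness.h5 \
+     #     --columns_target H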