Upload train_model_main.py
train_model_main.py (ADDED, +254 -0)

"""Training entry point: prepare the data, train a regression model on the
target columns (hardness "H" by default), and save a SHAP explainer together
with the fitted scalers and the test split."""

import argparse
import os
import pickle
import random

import dill
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import tensorflow as tf
from tensorflow.keras import initializers
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

from utils import encode_categorical, scale_numerical, NoPhysicsModels, unpickle_file
from alloy_data_preprocessing import add_physics_features

SEED = 42


def set_all_seeds(seed=SEED):
    """Seed Python, NumPy and TensorFlow for reproducible runs."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.keras.utils.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


def setup_model(num_outputs):
    """Build a small fully connected regression network (8 -> 4 -> num_outputs)."""
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(
                8,
                kernel_initializer=initializers.RandomNormal(stddev=0.00001),  # initially was at 0.01
                bias_initializer=initializers.Zeros(),
                activation="relu",
            ),
            tf.keras.layers.Dense(
                4,
                activation="relu",
                kernel_initializer=initializers.RandomNormal(stddev=0.00001),  # initially was at 0.01
                bias_initializer=initializers.Zeros(),
            ),
            tf.keras.layers.Dense(
                num_outputs,
                activation="relu",
                kernel_initializer=initializers.RandomNormal(stddev=0.00001),  # initially was at 0.01
                bias_initializer=initializers.Zeros(),
            ),
        ]
    )
    return model


def prepare_data(data, columns_num, columns_target, main_folder, data_type="path", seed=SEED):
    """Load the dataset, encode and scale the features, split off a test set,
    and pickle the test split and fitted scalers into main_folder."""
    # Create the output folder if it doesn't exist
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)

    columns_numerical = columns_num.copy()

    ### Read data (from a CSV path or an in-memory DataFrame)
    if data_type == "path":
        df = pd.read_csv(data, sep=";")
    else:
        df = data.copy()

    ### Separate features from targets
    X = df.drop(columns=columns_target)
    y = df[columns_target]

    # Remove the index column (if coming from the sampling pipeline)
    if "Index" in X.columns:
        X.drop(columns=["Index"], inplace=True)

    ### Get categorical columns (everything that is not numerical)
    columns_categorical = [column for column in X.columns if column not in columns_numerical]
    print("Training columns:")
    print(X.columns)

    ### One-hot encode categorical variables and min-max scale the numerical ones
    X, one_hot_scaler = encode_categorical(X, columns_categorical)
    X, minmax_scaler_inputs = scale_numerical(
        X, [column for column in columns_numerical if column not in columns_target]
    )
    y, minmax_scaler_targets = scale_numerical(y, columns_target)

    ### Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    ### Pickle the test split and the fitted scalers
    with open(os.path.join(main_folder, "X_test_data.pickle"), "wb+") as file:
        pickle.dump(X_test, file)

    with open(os.path.join(main_folder, "y_test_data.pickle"), "wb+") as file:
        pickle.dump(y_test, file)

    with open(os.path.join(main_folder, "one_hot_scaler.pickle"), "wb+") as file:
        pickle.dump(one_hot_scaler, file)

    with open(os.path.join(main_folder, "minmax_scaler_inputs.pickle"), "wb+") as file:
        pickle.dump(minmax_scaler_inputs, file)

    with open(os.path.join(main_folder, "minmax_scaler_targets.pickle"), "wb+") as file:
        pickle.dump(minmax_scaler_targets, file)

    return X_train, X_test, y_train, y_test


def train_model_ml(X_train, X_test, y_train, y_test, main_folder, model_path, seed=SEED):
    """Train a random-forest baseline, report test metrics, and pickle the model."""
    set_all_seeds(seed)

    model = RandomForestRegressor(random_state=seed)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print("----------------")
    print("Model performance")
    print("MAE", mean_absolute_error(y_test, y_hat))
    print("MAPE", mean_absolute_percentage_error(y_test, y_hat))
    print("R2", r2_score(y_test, y_hat))

    with open(os.path.join(main_folder, model_path), "wb+") as file:
        pickle.dump(model, file)

    return model


def train_model(
    X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, seed=SEED, get_history=False
):
    """Train the neural network, save it with its fit history, and plot the loss curves."""
    # Set all seeds for reproducibility
    set_all_seeds(seed)

    # Create the output folder if it doesn't exist
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)

    ## Set up and train the model
    model = setup_model(len(columns_target))
    opt = tf.keras.optimizers.Adam(learning_rate=lr)  # 0.01 for the hardness model
    print("learning rate", lr)
    model.compile(optimizer=opt, loss="mean_squared_error")

    # The held-out test set doubles as the validation set during training
    history = model.fit(
        X_train, y_train, batch_size=1, epochs=200, verbose=1, validation_data=(X_test, y_test), shuffle=True
    )
    model.save(os.path.join(main_folder, model_path))

    # Pickle history.history: the History object itself holds a reference to
    # the model and cannot be pickled directly
    model_core_name = model_path.split(".")[0]
    with open(os.path.join(main_folder, f"{model_core_name}_fit_history.pickle"), "wb+") as file:
        pickle.dump(history.history, file)

    ### Plot the training and validation loss
    plt.clf()
    plt.plot(history.history["loss"])
    plt.plot(history.history["val_loss"])
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["train", "test"], loc="upper left")
    fig = plt.gcf()
    plt.show()
    fig.savefig(os.path.join(main_folder, "plot_loss_function.png"))

    if get_history:
        return model, history
    return model


def save_shap_explainer(predict_fn, X_train, X_test, main_folder, explainer_name="explainer"):
    """Fit a SHAP KernelExplainer on a background sample, save a summary plot,
    and serialize the explainer with dill."""
    # Create the output folder if it doesn't exist
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)

    ## Fit the explainer on up to 80 background rows and explain the last 20 test rows
    ex = shap.KernelExplainer(predict_fn, X_train[:80])
    shap_values = ex.shap_values(X_test[-20:])
    # need to check that this works in all cases (especially if the size of X_test is 1)
    if len(shap_values) == 1:
        shap_values = shap_values[0]
    plt.clf()
    shap.summary_plot(shap_values, X_test[-20:], show=False)
    fig = plt.gcf()
    fig.savefig(os.path.join(main_folder, f"plot_shap_summary_{explainer_name}.png"))
    plt.show()

    # dill (rather than pickle) handles the closures inside the explainer;
    # note the file is written uncompressed despite the .bz2 extension
    with open(os.path.join(main_folder, f"{explainer_name}.bz2"), "wb") as file:
        dill.dump(ex, file)


def compute_shap_explainer_no_physics(model_path, X_train, X_test, main_folder, scaler_inputs_path):
    """
    Create and save a SHAP explainer that does not expose the physics-informed features.
    Intended to be shared with customers and embedded in the Gradio app.
    X_train and X_test must NOT be scaled.
    """
    scaler_inputs = unpickle_file(scaler_inputs_path)
    if model_path.split(".")[-1] == "h5":
        model = tf.keras.models.load_model(model_path)
    else:
        model = unpickle_file(model_path)

    model_no_physics = NoPhysicsModels(model, scaler_inputs, add_physics_features)

    save_shap_explainer(model_no_physics.predict, X_train, X_test, main_folder, explainer_name="exp_no_physics")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        help="The path to your input data file",
        default="preprocessed_data.csv",
        required=False,
    )
    parser.add_argument(
        "--main_folder", type=str, help="Folder to save model files", default="../models/hardness", required=False
    )
    parser.add_argument(
        "--model_path", type=str, help="Path to save model", default="model_hardness.h5", required=False
    )
    parser.add_argument(
        "--columns_target", type=str, help="Comma-separated list of target columns", default="H", required=False
    )
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="Comma-separated list of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )

    args = parser.parse_args()

    columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []
    columns_target = args.columns_target.split(",") if args.columns_target else []

    X_train, X_test, y_train, y_test = prepare_data(
        args.data_path, columns_numerical, columns_target, args.main_folder
    )

    model = train_model(X_train, X_test, y_train, y_test, columns_target, args.main_folder, args.model_path)

    save_shap_explainer(model.predict, X_train, X_test, args.main_folder)
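
Usage note (illustrative, not part of the commit): a minimal sketch of how the script is invoked and how the saved artifacts can be reloaded afterwards. The folder and file names are only the script's defaults, and X_new is a hypothetical feature frame encoded and scaled exactly like X_test above.

# python train_model_main.py --data_path preprocessed_data.csv --main_folder ../models/hardness

import os
import pickle

import dill
import tensorflow as tf

folder = "../models/hardness"  # assumed: the default --main_folder
model = tf.keras.models.load_model(os.path.join(folder, "model_hardness.h5"))
with open(os.path.join(folder, "minmax_scaler_targets.pickle"), "rb") as file:
    scaler_targets = pickle.load(file)
with open(os.path.join(folder, "explainer.bz2"), "rb") as file:  # uncompressed despite the extension
    explainer = dill.load(file)

# Predictions come back in scaled target space; invert with the saved scaler:
# y_scaled = model.predict(X_new)
# y = scaler_targets.inverse_transform(y_scaled)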