# Ensemble training script (scraped from a Hugging Face Space file viewer, commit 19b61e8).
import argparse
import pandas as pd
import os
import numpy as np
import pickle
from train_model_main import prepare_data, set_all_seeds, train_model, save_shap_explainer
from utils import EnsembleModel, unpickle_file
def run_ensemble_models_training(
    data,
    columns_numerical,
    columns_target,
    main_folder,
    model_name,
    data_type="path",
    lr=0.01,
    n_models=5,
    save_explainer_single=False,
    save_explainer_ensemble=True,
    seed_train_test_split=None,
):
    """
    Train ``n_models`` models with different seeds for the same prediction
    task and persist them together as a pickled :class:`EnsembleModel`.

    Parameters
    ----------
    data : object
        Input data, interpreted by ``prepare_data`` according to
        ``data_type`` (e.g. a CSV path when ``data_type == "path"``).
    columns_numerical : list[str]
        Names of the numeric feature columns.
    columns_target : list[str]
        Names of the target columns.
    main_folder : str
        Root output folder; per-seed artifacts are written to
        ``<main_folder>/seed<i>``.
    model_name : str
        File name each member model is saved under by ``train_model``.
    data_type : str, default "path"
        Forwarded to ``prepare_data``.
    lr : float, default 0.01
        Learning rate forwarded to ``train_model``.
    n_models : int, default 5
        Number of ensemble members; must be at least 1.
    save_explainer_single : bool, default False
        If True, save a SHAP explainer for every individual model.
    save_explainer_ensemble : bool, default True
        If True, save a SHAP explainer for the ensemble predictor.
    seed_train_test_split : int or None
        If given, every member uses this seed for the train/test split;
        otherwise each member splits with its own training seed.

    Returns
    -------
    list
        The trained member models.

    Raises
    ------
    ValueError
        If ``n_models`` is less than 1.
    """
    if n_models < 1:
        # Guard: with zero iterations the post-loop code below would hit a
        # NameError on X_train / main_seed_folder.
        raise ValueError("n_models must be >= 1")
    model_list = []
    model_path_list = []
    history_list = []
    for s in range(n_models):
        print("-----------------------")
        print(f"Training model {s + 1}/{n_models}")
        main_seed_folder = os.path.join(main_folder, f"seed{s}")
        # Share one split across members when an explicit split seed is given.
        seed_split = seed_train_test_split if seed_train_test_split is not None else s
        X_train, X_test, y_train, y_test = prepare_data(
            data, columns_numerical, columns_target, main_seed_folder, data_type=data_type, seed=seed_split
        )
        model, history = train_model(
            X_train,
            X_test,
            y_train,
            y_test,
            columns_target,
            main_seed_folder,
            model_name,
            lr=lr,
            seed=s,
            get_history=True,
        )
        model_list.append(model)
        history_list.append(history)
        model_path_list.append(os.path.join(main_seed_folder, model_name))
        if save_explainer_single:
            save_shap_explainer(model.predict, X_train, X_test, main_seed_folder)
    # NOTE(review): the target scaler comes from the LAST seed folder only;
    # this assumes every seed produces an equivalent scaler — confirm upstream.
    scaler_targets = unpickle_file(os.path.join(main_seed_folder, "minmax_scaler_targets.pickle"))
    ensemble_model = EnsembleModel(model_list, history_list, scaler_targets=scaler_targets)
    # os.path.splitext keeps dots inside the base name (e.g. "model.v2.h5"),
    # unlike split(".")[0] which truncated at the first dot.
    ensemble_path = os.path.join(main_folder, f"ensemble_{os.path.splitext(model_name)[0]}.pkl")
    with open(ensemble_path, "wb") as file:
        pickle.dump(ensemble_model, file)
    if save_explainer_ensemble:
        # For now the explainer reuses the last seed's train/test data; a
        # combined split across seeds would be a better long-term solution.
        save_shap_explainer(ensemble_model.predict, X_train, X_test, main_folder)
    return model_list
def train_ensemble_models_from_split(
    X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, n_models=5, save_explainer=True
):
    """
    Train ``n_models`` models with different training seeds on a single,
    shared train/test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Pre-split features and targets, forwarded unchanged to ``train_model``.
    columns_target : list[str]
        Names of the target columns.
    main_folder : str
        Folder where each model (and the optional explainer) is saved.
    model_path : str
        Base model file name; ``_s<seed>`` is inserted before the extension.
    lr : float, default 0.01
        Learning rate forwarded to ``train_model``.
    n_models : int, default 5
        Number of models to train.
    save_explainer : bool, default True
        If True, save a SHAP explainer per model.

    Returns
    -------
    list
        The trained models, one per seed.
    """
    # os.path.splitext keeps any dots inside the base name intact, unlike the
    # previous split(".") logic which broke names such as "model.v2.h5".
    model_main_name, model_ext = os.path.splitext(model_path)
    model_ls = []
    for s in range(n_models):
        model_name = f"{model_main_name}_s{s}{model_ext}"
        model = train_model(X_train, X_test, y_train, y_test, columns_target, main_folder, model_name, lr=lr, seed=s)
        model_ls.append(model)
        if save_explainer:
            # Pass the prediction callable, consistent with
            # run_ensemble_models_training (the original passed the model
            # object itself — presumably a bug; confirm save_shap_explainer's
            # expected argument).
            # NOTE(review): each iteration writes to the same main_folder, so
            # only the last seed's explainer survives — confirm intent.
            save_shap_explainer(model.predict, X_train, X_test, main_folder)
    return model_ls
if __name__ == "__main__":
    # CLI entry point: collect options, then launch ensemble training.
    arg_parser = argparse.ArgumentParser(description="Process parameters")
    arg_parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        default="preprocessed_data.csv",
        help="The path to your input data file",
    )
    arg_parser.add_argument(
        "--main_folder", type=str, required=False, default="../models/hardness", help="Folder to save model files"
    )
    arg_parser.add_argument(
        "--model_name", type=str, required=False, default="model_hardness.h5", help="Path to save model"
    )
    arg_parser.add_argument("--columns_target", type=str, required=False, default="H", help="List of target columns")
    arg_parser.add_argument(
        "--columns_numerical",
        type=str,
        required=False,
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        help="List of data columns with numeric values",
    )
    arg_parser.add_argument("--learning_rate", "-lr", type=float, required=False, default=0.01, help="Learning rate")
    arg_parser.add_argument("--n_models", "-n", type=int, required=False, default=2, help="Number of models to run")
    parsed = arg_parser.parse_args()

    # Comma-separated option strings become lists; empty strings become [].
    cols_numerical = parsed.columns_numerical.split(",") if parsed.columns_numerical else []
    cols_target = parsed.columns_target.split(",") if parsed.columns_target else []

    run_ensemble_models_training(
        parsed.data_path,
        cols_numerical,
        cols_target,
        parsed.main_folder,
        parsed.model_name,
        lr=parsed.learning_rate,
        n_models=parsed.n_models,
    )