Spaces:
Sleeping
Sleeping
import argparse | |
import pandas as pd | |
import os | |
import numpy as np | |
import pickle | |
from train_model_main import prepare_data, set_all_seeds, train_model, save_shap_explainer | |
from utils import EnsembleModel, unpickle_file | |
def run_ensemble_models_training( | |
data, | |
columns_numerical, | |
columns_target, | |
main_folder, | |
model_name, | |
data_type="path", | |
lr=0.01, | |
n_models=5, | |
save_explainer_single=False, | |
save_explainer_ensemble=True, | |
seed_train_test_split=None, | |
): | |
""" | |
Runs multiple models with different seed for the same prediction task | |
""" | |
seeds = range(n_models) | |
model_list = [] | |
model_path_list = [] | |
history_list = [] | |
for s in seeds: | |
print("-----------------------") | |
print(f"Training model {s + 1}/{n_models}") | |
main_seed_folder = os.path.join(main_folder, f"seed{s}") | |
seed_split = s | |
if seed_train_test_split is not None: | |
seed_split = seed_train_test_split | |
X_train, X_test, y_train, y_test = prepare_data( | |
data, columns_numerical, columns_target, main_seed_folder, data_type=data_type, seed=seed_split | |
) | |
model, history = train_model( | |
X_train, | |
X_test, | |
y_train, | |
y_test, | |
columns_target, | |
main_seed_folder, | |
model_name, | |
lr=lr, | |
seed=s, | |
get_history=True, | |
) | |
model_list.append(model) | |
history_list.append(history) | |
model_path_list.append(os.path.join(main_seed_folder, model_name)) | |
if save_explainer_single: | |
save_shap_explainer(model.predict, X_train, X_test, main_seed_folder) | |
scaler_targets = unpickle_file(os.path.join(main_seed_folder, "minmax_scaler_targets.pickle")) | |
ensemble_model = EnsembleModel(model_list, history_list, scaler_targets=scaler_targets) | |
with open(os.path.join(main_folder, f"ensemble_{model_name.split('.')[0]}.pkl"), "wb+") as file: | |
pickle.dump(ensemble_model, file) | |
# For now just gets the last X_train, X_test, but should be changed to a better solution | |
X_train_all = X_train.copy() | |
X_test_all = X_test.copy() | |
if save_explainer_ensemble: | |
save_shap_explainer(ensemble_model.predict, X_train_all, X_test_all, main_folder) | |
return model_list | |
def train_ensemble_models_from_split( | |
X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, n_models=5, save_explainer=True | |
): | |
""" | |
Assumes the train set is the same for all the models | |
""" | |
seeds = range(n_models) | |
model_ls = [] | |
for s in seeds: | |
model_main_name = model_path.split(".")[0] | |
model_ext = model_path.split(".")[1] | |
model_name = f"{model_main_name}_s{s}.{model_ext}" | |
model = train_model(X_train, X_test, y_train, y_test, columns_target, main_folder, model_name, lr=lr, seed=s) | |
model_ls.append(model) | |
if save_explainer: | |
save_shap_explainer(model, X_train, X_test, main_folder) | |
return model_ls | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser(description="Process parameters") | |
parser.add_argument( | |
"--data_path", | |
type=str, | |
help="The path to your input data file", | |
default="preprocessed_data.csv", | |
required=False, | |
) | |
parser.add_argument( | |
"--main_folder", type=str, help="Folder to save model files", default="../models/hardness", required=False | |
) | |
parser.add_argument( | |
"--model_name", type=str, help="Path to save model", default="model_hardness.h5", required=False | |
) | |
parser.add_argument("--columns_target", type=str, help="List of target columns", default="H", required=False) | |
parser.add_argument( | |
"--columns_numerical", | |
type=str, | |
help="List of data columns with numeric values", | |
default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C", | |
required=False, | |
) | |
parser.add_argument("--learning_rate", "-lr", type=float, help="Learning rate", default=0.01, required=False) | |
parser.add_argument("--n_models", "-n", type=int, help="Number of models to run", default=2, required=False) | |
args = parser.parse_args() | |
columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else [] | |
columns_target = args.columns_target.split(",") if args.columns_target else [] | |
run_ensemble_models_training( | |
args.data_path, | |
columns_numerical, | |
columns_target, | |
args.main_folder, | |
args.model_name, | |
lr=args.learning_rate, | |
n_models=args.n_models, | |
) | |