File size: 4,731 Bytes
19b61e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import argparse
import pandas as pd
import os
import numpy as np
import pickle

from train_model_main import prepare_data, set_all_seeds, train_model, save_shap_explainer
from utils import EnsembleModel, unpickle_file


def run_ensemble_models_training(
    data,
    columns_numerical,
    columns_target,
    main_folder,
    model_name,
    data_type="path",
    lr=0.01,
    n_models=5,
    save_explainer_single=False,
    save_explainer_ensemble=True,
    seed_train_test_split=None,
):
    """
    Train an ensemble of models for the same prediction task, one per seed.

    Each member is trained in its own sub-folder ``seed<k>`` under
    ``main_folder``; afterwards the members are wrapped into an
    ``EnsembleModel`` and pickled to ``main_folder``.

    Parameters
    ----------
    data : object
        Input data, interpreted according to ``data_type`` (e.g. a CSV path
        when ``data_type == "path"``).
    columns_numerical : list[str]
        Names of the numeric feature columns.
    columns_target : list[str]
        Names of the target columns.
    main_folder : str
        Root folder for per-seed artifacts and the ensemble pickle.
    model_name : str
        File name used for each individual model (e.g. ``model.h5``).
    data_type : str, optional
        Forwarded to ``prepare_data``.
    lr : float, optional
        Learning rate forwarded to ``train_model``.
    n_models : int, optional
        Number of ensemble members; seeds are ``0 .. n_models - 1``.
    save_explainer_single : bool, optional
        If True, save a SHAP explainer for each individual model.
    save_explainer_ensemble : bool, optional
        If True, save a SHAP explainer for the ensemble prediction function.
    seed_train_test_split : int or None, optional
        If given, use this fixed seed for every train/test split instead of
        varying the split with the model seed.

    Returns
    -------
    list
        The trained models, in seed order.
    """
    model_list = []
    history_list = []
    for s in range(n_models):
        print("-----------------------")
        print(f"Training model {s + 1}/{n_models}")
        main_seed_folder = os.path.join(main_folder, f"seed{s}")
        # Fixed split seed if requested, otherwise vary the split per model.
        seed_split = seed_train_test_split if seed_train_test_split is not None else s
        X_train, X_test, y_train, y_test = prepare_data(
            data, columns_numerical, columns_target, main_seed_folder, data_type=data_type, seed=seed_split
        )
        model, history = train_model(
            X_train,
            X_test,
            y_train,
            y_test,
            columns_target,
            main_seed_folder,
            model_name,
            lr=lr,
            seed=s,
            get_history=True,
        )
        model_list.append(model)
        history_list.append(history)
        if save_explainer_single:
            save_shap_explainer(model.predict, X_train, X_test, main_seed_folder)

    # NOTE(review): the target scaler is loaded from the LAST seed folder; this
    # assumes prepare_data produces the same target scaling for every seed — confirm.
    scaler_targets = unpickle_file(os.path.join(main_seed_folder, "minmax_scaler_targets.pickle"))
    ensemble_model = EnsembleModel(model_list, history_list, scaler_targets=scaler_targets)
    # os.path.splitext is robust to model names containing extra dots,
    # unlike model_name.split(".")[0].
    ensemble_file = os.path.join(main_folder, f"ensemble_{os.path.splitext(model_name)[0]}.pkl")
    with open(ensemble_file, "wb") as file:
        pickle.dump(ensemble_model, file)

    if save_explainer_ensemble:
        # TODO: currently explained against the last seed's split only; a pooled
        # background set across seeds would be more representative.
        save_shap_explainer(ensemble_model.predict, X_train, X_test, main_folder)

    return model_list


def train_ensemble_models_from_split(
    X_train, X_test, y_train, y_test, columns_target, main_folder, model_path, lr=0.01, n_models=5, save_explainer=True
):
    """
    Train an ensemble on a single, shared train/test split.

    Each member is trained with a different seed and saved under a name derived
    from ``model_path`` with a ``_s<seed>`` suffix (e.g. ``model_s0.h5``).

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        The shared split, forwarded unchanged to ``train_model``.
    columns_target : list[str]
        Names of the target columns.
    main_folder : str
        Folder where model files (and optional explainers) are saved.
    model_path : str
        Base model file name; the seed suffix is inserted before the extension.
    lr : float, optional
        Learning rate forwarded to ``train_model``.
    n_models : int, optional
        Number of ensemble members; seeds are ``0 .. n_models - 1``.
    save_explainer : bool, optional
        If True, save a SHAP explainer for each trained model.

    Returns
    -------
    list
        The trained models, in seed order.
    """
    # BUG FIX: model_path.split(".")[0]/[1] breaks for paths containing other
    # dots (e.g. "../models/model.h5" -> empty components) and raises
    # IndexError when there is no extension. os.path.splitext handles both;
    # note it keeps the leading dot in the extension. Hoisted out of the loop.
    model_main_name, model_ext = os.path.splitext(model_path)
    model_ls = []
    for s in range(n_models):
        model_name = f"{model_main_name}_s{s}{model_ext}"
        model = train_model(X_train, X_test, y_train, y_test, columns_target, main_folder, model_name, lr=lr, seed=s)
        model_ls.append(model)
        if save_explainer:
            # Pass the prediction callable, consistent with
            # run_ensemble_models_training (which uses model.predict).
            save_shap_explainer(model.predict, X_train, X_test, main_folder)

    return model_ls


if __name__ == "__main__":
    # Command-line entry point: parse options and launch ensemble training.
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        default="preprocessed_data.csv",
        required=False,
        help="The path to your input data file",
    )
    parser.add_argument(
        "--main_folder", type=str, default="../models/hardness", required=False, help="Folder to save model files"
    )
    parser.add_argument(
        "--model_name", type=str, default="model_hardness.h5", required=False, help="Path to save model"
    )
    parser.add_argument("--columns_target", type=str, default="H", required=False, help="List of target columns")
    parser.add_argument(
        "--columns_numerical",
        type=str,
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
        help="List of data columns with numeric values",
    )
    parser.add_argument("--learning_rate", "-lr", type=float, default=0.01, required=False, help="Learning rate")
    parser.add_argument("--n_models", "-n", type=int, default=2, required=False, help="Number of models to run")

    cli_args = parser.parse_args()

    def _csv_to_list(raw):
        # An empty or missing value maps to [] rather than [""].
        return raw.split(",") if raw else []

    run_ensemble_models_training(
        cli_args.data_path,
        _csv_to_list(cli_args.columns_numerical),
        _csv_to_list(cli_args.columns_target),
        cli_args.main_folder,
        cli_args.model_name,
        lr=cli_args.learning_rate,
        n_models=cli_args.n_models,
    )