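"""Gradio demo for iteratively adapting an alloy-hardness model to new experimental data.

Each uploaded dataset is aggregated with the data from all previous iterations, an
ensemble of models is retrained on the result, and the samples with the highest
prediction uncertainty are returned as suggested next experiments (an active-learning
loop).
"""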
import os

import cv2
import gradio as gr
import pandas as pd

import preprocess_data_main
from gradio_utils import load_theme
from inference_model_main import predict_from_ensemble_model, get_test_inference
from train_ensemble_models_main import run_ensemble_models_training
from utils import scale_numerical, unpickle_file


def get_training_data(n_iteration, main_folder, data_name, new_df):
    """
    Concatenate the dataframes stored by all previous iterations with the new
    dataframe, and store the new dataframe so that later iterations can reuse it.
    """
    df_list = [new_df]
    for i in range(n_iteration):
        previous_folder = os.path.join(main_folder, str(i))
        previous_df = pd.read_csv(os.path.join(previous_folder, data_name), sep=";")
        df_list.append(previous_df)
    training_df = pd.concat(df_list, ignore_index=True)
    # Store the new dataframe for later runs
    new_folder = os.path.join(main_folder, str(n_iteration))
    os.makedirs(new_folder, exist_ok=True)
    new_df.to_csv(os.path.join(new_folder, data_name), sep=";", index=False)
    return training_df


def upload_csv(x):
    if x is None:
        return None, gr.update(choices=[])
    # Try semicolon-separated first; fall back to comma if everything
    # ended up in a single column
    df = pd.read_csv(x.name, sep=";")
    if df.shape[1] == 1:
        df = pd.read_csv(x.name, sep=",")
    cols = list(df.columns)
    return df, gr.update(choices=cols)


def train_al_model(x, target_cols, n_iteration):
    """
    Train the ensemble on the aggregated training data and suggest the next experiments.

    x is the input dataframe, target_cols are the target columns selected in the UI,
    and n_iteration is the iteration counter kept in a hidden Gradio component.
    """
    n_iteration = int(n_iteration) + 1

    main_folder = "gradio_models/hardness"
    model_name = "model_hardness.h5"
    ensemble_model_name = f"ensemble_{model_name.split('.')[0]}.pkl"

    # Aggregate the new data with the data from previous iterations to improve the model
    new_training_df = get_training_data(n_iteration, main_folder, "training_data.csv", x)

    # Run the data preprocessing
    preprocessing_fn = preprocess_data_main.alloy_preprocessing
    df_preprocessed = preprocessing_fn(new_training_df)

    columns_numerical = [col for col in df_preprocessed.columns if col not in target_cols]
    # First, train the ensemble of models used to estimate prediction uncertainty
    run_ensemble_models_training(
        df_preprocessed,
        columns_numerical,
        target_cols,
        os.path.join(main_folder, str(n_iteration)),
        model_name,
        lr=0.01,
        n_models=3,
        save_explainer_single=False,
        save_explainer_ensemble=False,
        data_type="dataframe",
    )
    # Report the test metrics and the scatter plot. Since the train/test split differs
    # for every seed model, only the first seed model ("seed0") is evaluated for now.
    metrics = get_test_inference(
        os.path.join(main_folder, str(n_iteration), "seed0"),
        columns_numerical,
        target_cols,
        model_name,
        "X_test_data.pickle",
    )

    mape = metrics["mape"]
    # cv2 loads images as BGR; convert to RGB for display in the Gradio image component
    scatter_plot = cv2.imread(os.path.join(main_folder, str(n_iteration), "seed0", "plot_performance_test.png"))
    scatter_plot = cv2.cvtColor(scatter_plot, cv2.COLOR_BGR2RGB)

    # Second, compute predictions and uncertainty on a newly generated dataset.
    # For the demo the dataset is preloaded from a specific location; in the default
    # pipeline it should be generated according to the original data distribution.
    df_for_predict = pd.read_csv(os.path.join(main_folder, "inference_data.csv"), sep=";")

    df_for_predict_physics = preprocessing_fn(df_for_predict)
    df_for_predict_physics.drop(columns=target_cols, inplace=True)
    # Scale the inputs with the scaler fitted during training
    minmax_scaler_inputs = unpickle_file(
        os.path.join(main_folder, str(n_iteration), "seed0", "minmax_scaler_inputs.pickle")
    )
    df_for_predict_scaled = scale_numerical(
        df_for_predict_physics, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
    )

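    # With uncertainty_type="std", the uncertainty is presumably the spread of the
    # predictions across the ensemble members (see predict_from_ensemble_model)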
    predictions, uncertainty = predict_from_ensemble_model(
        os.path.join(main_folder, str(n_iteration), ensemble_model_name),
        df_for_predict_scaled,
        explainer=None,
        uncertainty_type="std",
    )

    # Return the samples with the highest prediction uncertainty as suggested experiments
    # TODO: link to the sampling code
    num_suggestions = 5
    df_for_predict["uncertainty"] = uncertainty
    df_suggestions = df_for_predict.sort_values(by="uncertainty", ascending=False).iloc[:num_suggestions]
    # Drop the uncertainty column and the derived physics features before display
    df_suggestions.drop(
        columns=[
            "uncertainty",
            "density",
            "young_modulus",
            "configuration_entropy",
            "valence_electron_concentration",
            "electronegativity",
        ],
        inplace=True,
    )
    suggestions_path = os.path.join(main_folder, str(n_iteration), "suggested_experiments.csv")
    df_suggestions.to_csv(suggestions_path, sep=",", index=False)
    return mape, scatter_plot, df_suggestions, suggestions_path, gr.update(value=n_iteration)


def create_gradio():
    osium_theme, css_styling = load_theme()
    page_title = "Update your model"

    with gr.Blocks(css=css_styling, title=page_title, theme=osium_theme) as demo:
        gr.Markdown("# <p style='text-align: center;'>Adapt your AI models</p>")
        gr.Markdown("Easily adapt your AI models with your new experimental data")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input files")
                input_file = gr.File(label="Your input files", file_count="single", elem_id="input_files")
        with gr.Row():
            clear_button = gr.Button("Clear")
            train_button = gr.Button("Train model", elem_id="submit")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input csv")
                input_csv = gr.DataFrame(elem_classes="input-csv")
            with gr.Column():
                gr.Markdown("### Choose your target properties")
                target_columns = gr.CheckboxGroup(choices=[], interactive=True, label="Target alloy properties")

            with gr.Column():
                gr.Markdown("### Your model adaptation")
                output_mape = gr.Number(label="Training results - average percentage error", precision=3)
                output_scatter = gr.Image(label="Predictions vs. ground truth", elem_classes="output-image")
                output_next_experiments = gr.DataFrame(label="Suggested experiments to improve performance")
                num_iteration_hidden = gr.Number(visible=False, value=0, precision=0)
                output_experiments_file = gr.File()
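        # Wire up the events: uploading a CSV fills the preview table and the target
        # choices; training returns the metrics, plot, suggestions, and the updated
        # iteration counter kept in the hidden number component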
        input_file.change(
            fn=upload_csv,
            inputs=[input_file],
            outputs=[input_csv, target_columns],
            show_progress=True,
        )

        train_button.click(
            fn=train_al_model,
            inputs=[input_csv, target_columns, num_iteration_hidden],
            outputs=[
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
                num_iteration_hidden,
            ],
            show_progress=True,
        )

        clear_button.click(
            fn=lambda: [None] * 7,
            inputs=[],
            outputs=[
                input_file,
                input_csv,
                target_columns,
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
            ],
        )

    return demo


if __name__ == "__main__":
    demo = create_gradio()
    demo.launch()