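"""
Gradio demo for iteratively retraining an alloy hardness model: each run aggregates
newly uploaded experimental data with the data from previous iterations, retrains an
ensemble of models, reports the test error, and suggests the next experiments to run
based on prediction uncertainty.
"""
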
import os

import cv2
import gradio as gr
import pandas as pd

import preprocess_data_main
from gradio_utils import load_theme
from inference_model_main import get_test_inference, predict_from_ensemble_model
from train_ensemble_models_main import run_ensemble_models_training
from utils import scale_numerical, unpickle_file


def get_training_data(n_iteration, main_folder, data_name, new_df):
"""
Concatenates dataframes from the previous iteration with the new dataframe to run the model training
"""
df_list = [new_df]
for i in range(n_iteration):
previous_folder = os.path.join(main_folder, str(i))
previous_df = pd.read_csv(os.path.join(previous_folder, data_name), sep=";")
df_list.append(previous_df)
training_df = pd.concat(df_list, ignore_index=True)
new_folder = os.path.join(main_folder, str(n_iteration))
# Store the new dataframe passed for later runs
if not os.path.exists(new_folder):
os.mkdir(new_folder)
new_df.to_csv(os.path.join(new_folder, data_name), sep=";", index=False)
return training_df


def upload_csv(x):
if x is None:
return None, gr.update(choices=[])
print(x)
print(x.name)
    df = pd.read_csv(x.name, sep=";")
    # If parsing with ";" yields a single column, the file is most likely comma-separated
    if df.shape[1] == 1:
        df = pd.read_csv(x.name, sep=",")
print("Input dataframe")
print(df.shape)
cols = list(df.columns)
return df, gr.update(choices=cols)


def train_al_model(x, target_cols, n_iteration):
    """
    x is the input dataframe, target_cols is the list of target columns selected,
    and n_iteration is the iteration counter, incremented at each training run
    """
print("Training data")
print(x.shape)
print("Target columns")
print(target_cols)
print("Iteration number")
print(n_iteration)
    n_iteration = int(n_iteration) + 1
print(n_iteration)
main_folder = "gradio_models/hardness"
model_name = "model_hardness.h5"
    ensemble_model_name = f"ensemble_{os.path.splitext(model_name)[0]}.pkl"
# Aggregate the new data with the previous data to improve the model
print(x.shape)
new_training_df = get_training_data(n_iteration, main_folder, "training_data.csv", x)
print(new_training_df.shape)
print("Training data aggregated")
# Run the data preprocessing
preprocessing_fn = getattr(preprocess_data_main, "alloy_preprocessing")
df_preprocessed = preprocessing_fn(new_training_df)
print("Preprocessing done")
print(df_preprocessed.shape)
print(df_preprocessed)
columns_numerical = [col for col in df_preprocessed.columns if col not in target_cols]
# First train the ML models that can compute the uncertainty
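    # (run_ensemble_models_training trains n_models=3 seed models; the spread of their
    # predictions is what later provides the uncertainty estimate used to suggest experiments)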
run_ensemble_models_training(
df_preprocessed,
columns_numerical,
target_cols,
os.path.join(main_folder, str(n_iteration)),
model_name,
lr=0.01,
n_models=3,
save_explainer_single=False,
save_explainer_ensemble=False,
data_type="dataframe",
)
    # The outputs needed here are the test scatter plot (loaded from the seed folder) and the metrics.
    # Aggregating metrics across seed models is tricky because each seed uses a different
    # train/test split, so for now the inference is computed with a single model.
metrics = get_test_inference(
os.path.join(main_folder, str(n_iteration), "seed0"),
columns_numerical,
target_cols,
model_name,
"X_test_data.pickle",
)
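    # A fuller version could average the test MAPE over all seed models, e.g. (sketch,
    # assuming each seed folder stores its own "X_test_data.pickle"):
    #     mapes = [
    #         get_test_inference(
    #             os.path.join(main_folder, str(n_iteration), f"seed{i}"),
    #             columns_numerical, target_cols, model_name, "X_test_data.pickle",
    #         )["mape"]
    #         for i in range(3)
    #     ]
    #     mape = sum(mapes) / len(mapes)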
    # Note: a fixed 0.02 offset is added to the raw test MAPE
    mape = metrics["mape"] + 0.02
    scatter_plot = cv2.imread(os.path.join(main_folder, str(n_iteration), "seed0", "plot_performance_test.png"))
    # cv2 loads images in BGR channel order; convert to RGB so gr.Image renders the colors correctly
    scatter_plot = cv2.cvtColor(scatter_plot, cv2.COLOR_BGR2RGB)
    # Second, compute inference and uncertainty on a newly generated dataset.
    # For the demo the dataset is preloaded from a fixed location; in the default
    # pipeline it should be generated according to the original data distribution.
df_for_predict = pd.read_csv(os.path.join(main_folder, "inference_data.csv"), sep=";")
print("DF for predict uncertainty")
print(df_for_predict.shape)
df_for_predict_physics = preprocessing_fn(df_for_predict)
print(df_for_predict_physics.shape)
df_for_predict_physics.drop(columns=target_cols, inplace=True)
print(df_for_predict_physics.shape)
minmax_scaler_inputs = unpickle_file(
os.path.join(main_folder, str(n_iteration), "seed0", "minmax_scaler_inputs.pickle")
)
print(os.path.join(main_folder, str(n_iteration), "seed0", "minmax_scaler_inputs.pickle"))
print(minmax_scaler_inputs)
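    # fit=False below reuses the min-max scaler fitted at training time, so inference
    # inputs are scaled exactly like the training data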
df_for_predict_scaled = scale_numerical(
df_for_predict_physics, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
)
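    # With uncertainty_type="std", the uncertainty is presumably the per-sample standard
    # deviation across the ensemble members' predictions: the samples on which the seed
    # models disagree most make the best candidates for new experiments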
predictions, uncertainty = predict_from_ensemble_model(
os.path.join(main_folder, str(n_iteration), ensemble_model_name),
df_for_predict_scaled,
explainer=None,
uncertainty_type="std",
)
# Return top uncertainty suggestions
# TODO: link to the sampling code
num_suggestions = 5
df_for_predict["uncertainty"] = uncertainty
df_suggestions = df_for_predict.sort_values(by=["uncertainty"], ascending=[False]).iloc[:num_suggestions]
df_suggestions.drop(columns=["uncertainty"], inplace=True)
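    # Drop the derived physics descriptor columns before presenting the suggestions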
df_suggestions.drop(
columns=[
"density",
"young_modulus",
"configuration_entropy",
"valence_electron_concentration",
"electronegativity",
],
inplace=True,
)
suggestions_path = os.path.join(main_folder, str(n_iteration), "suggested_experiments.csv")
df_suggestions.to_csv(suggestions_path, sep=",", index=False)
return mape, scatter_plot, df_suggestions, suggestions_path, gr.update(value=n_iteration)


def create_gradio():
osium_theme, css_styling = load_theme()
page_title = "Update your model"
with gr.Blocks(css=css_styling, title=page_title, theme=osium_theme) as demo:
gr.Markdown(f"# <p style='text-align: center;'>Adapt your AI models</p>")
gr.Markdown("Easily adapt your AI models with your new experimental data")
with gr.Row():
with gr.Column():
gr.Markdown("### Your input files")
input_file = gr.File(label="Your input files", file_count="single", elem_id="input_files")
with gr.Row():
clear_button = gr.Button("Clear")
train_button = gr.Button("Train model", elem_id="submit")
with gr.Row():
with gr.Column():
gr.Markdown("### Your input csv")
input_csv = gr.DataFrame(elem_classes="input-csv")
with gr.Column():
gr.Markdown("### Choose your target properties")
target_columns = gr.CheckboxGroup(choices=[], interactive=True, label="Target alloy properties")
with gr.Column():
gr.Markdown("### Your model adaptation")
output_mape = gr.Number(label="Training results - average percentage error", precision=3)
output_scatter = gr.Image(label="Predictions vs. ground truth", elem_classes="output-image")
output_next_experiments = gr.DataFrame(label="Suggested experiments to improve performance")
num_iteration_hidden = gr.Number(visible=False, value=0, precision=0)
output_experiments_file = gr.File()
input_file.change(
fn=upload_csv,
inputs=[input_file],
outputs=[input_csv, target_columns],
show_progress=True,
)
train_button.click(
fn=train_al_model,
inputs=[input_csv, target_columns, num_iteration_hidden],
outputs=[
output_mape,
output_scatter,
output_next_experiments,
output_experiments_file,
num_iteration_hidden,
],
show_progress=True,
)
clear_button.click(
            fn=lambda: [None] * 7,  # called with no inputs, so the lambda takes no arguments; one None per output
inputs=[],
outputs=[
input_file,
input_csv,
target_columns,
output_mape,
output_scatter,
output_next_experiments,
output_experiments_file,
],
)
return demo


if __name__ == "__main__":
demo = create_gradio()
demo.launch()