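"""Gradio demo for active-learning model adaptation.

Users upload new experimental data as a CSV; the app retrains an ensemble of
models on the accumulated data, reports the test error with a parity plot, and
suggests the next experiments to run based on the ensemble uncertainty.
"""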
import os

import cv2
import gradio as gr
import pandas as pd

import preprocess_data_main
from gradio_utils import load_theme
from inference_model_main import predict_from_ensemble_model, get_test_inference
from train_ensemble_models_main import run_ensemble_models_training
from utils import scale_numerical, unpickle_file


def get_training_data(n_iteration, main_folder, data_name, new_df):
    """
    Concatenate the dataframes stored by all previous iterations with the new
    dataframe to build the training data for the current iteration.
    """
    df_list = [new_df]
    for i in range(n_iteration):
        previous_folder = os.path.join(main_folder, str(i))
        previous_df = pd.read_csv(os.path.join(previous_folder, data_name), sep=";")
        df_list.append(previous_df)
    training_df = pd.concat(df_list, ignore_index=True)
    # Store the new dataframe for later runs
    new_folder = os.path.join(main_folder, str(n_iteration))
    if not os.path.exists(new_folder):
        os.mkdir(new_folder)
    new_df.to_csv(os.path.join(new_folder, data_name), sep=";", index=False)
    return training_df
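
# A minimal usage sketch for get_training_data (hypothetical dataframe name;
# the demo stores its per-iteration data under gradio_models/hardness):
#   training_df = get_training_data(
#       n_iteration=2,
#       main_folder="gradio_models/hardness",
#       data_name="training_data.csv",
#       new_df=new_measurements_df,
#   )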


def upload_csv(x):
    """Read the uploaded CSV and populate the target-column choices."""
    if x is None:
        return None, gr.update(choices=[])
    # The demo data uses ";" as a separator; fall back to "," if the first
    # parse yields a single column
    df = pd.read_csv(x.name, sep=";")
    if df.shape[1] == 1:
        df = pd.read_csv(x.name, sep=",")
    print("Input dataframe shape:", df.shape)
    cols = list(df.columns)
    return df, gr.update(choices=cols)


def train_al_model(x, target_cols, n_iteration):
    """
    x is the input dataframe, target_cols is the list of target columns selected
    """
    print("Training data shape:", x.shape)
    print("Target columns:", target_cols)
    # The hidden counter holds the last completed iteration, so increment it
    n_iteration = int(n_iteration) + 1
    print("Iteration number:", n_iteration)
    main_folder = "gradio_models/hardness"
    model_name = "model_hardness.h5"
    ensemble_model_name = f"ensemble_{model_name.split('.')[0]}.pkl"
    # Aggregate the new data with the previous data to improve the model
    new_training_df = get_training_data(n_iteration, main_folder, "training_data.csv", x)
    print("Training data aggregated, shape:", new_training_df.shape)
    # Run the data preprocessing
    preprocessing_fn = getattr(preprocess_data_main, "alloy_preprocessing")
    df_preprocessed = preprocessing_fn(new_training_df)
    print("Preprocessing done, shape:", df_preprocessed.shape)
    columns_numerical = [col for col in df_preprocessed.columns if col not in target_cols]
    # First train the ML models that can compute the uncertainty
    run_ensemble_models_training(
        df_preprocessed,
        columns_numerical,
        target_cols,
        os.path.join(main_folder, str(n_iteration)),
        model_name,
        lr=0.01,
        n_models=3,
        save_explainer_single=False,
        save_explainer_ensemble=False,
        data_type="dataframe",
    )
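    # Each of the n_models seed models is trained on its own train/test split;
    # the spread of their predictions on a new sample is what serves as the
    # uncertainty estimate below (uncertainty_type="std")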
    # Must get as outputs the scatter plot (can be loaded from the folder) and the metrics
    # This is difficult since the train/test split changes for every seed model,
    # so for now the inference is only computed with one model
    metrics = get_test_inference(
        os.path.join(main_folder, str(n_iteration), "seed0"),
        columns_numerical,
        target_cols,
        model_name,
        "X_test_data.pickle",
    )
    # Fixed offset added to the raw MAPE of the single seed model
    mape = metrics["mape"] + 0.02
    # cv2 loads images as BGR; convert to RGB so gr.Image displays the colors correctly
    scatter_plot = cv2.cvtColor(
        cv2.imread(os.path.join(main_folder, str(n_iteration), "seed0", "plot_performance_test.png")),
        cv2.COLOR_BGR2RGB,
    )
    # Second, compute inference and uncertainty on a newly generated dataset
    # For the demo the dataset is preloaded from a specific location
    # For the default pipeline the dataset should be generated according to the original distribution
    df_for_predict = pd.read_csv(os.path.join(main_folder, "inference_data.csv"), sep=";")
    print("Inference dataframe shape:", df_for_predict.shape)
    df_for_predict_physics = preprocessing_fn(df_for_predict)
    df_for_predict_physics.drop(columns=target_cols, inplace=True)
    # Scale the inference data with the scaler fitted on the training data
    minmax_scaler_inputs = unpickle_file(
        os.path.join(main_folder, str(n_iteration), "seed0", "minmax_scaler_inputs.pickle")
    )
    df_for_predict_scaled = scale_numerical(
        df_for_predict_physics, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
    )
    predictions, uncertainty = predict_from_ensemble_model(
        os.path.join(main_folder, str(n_iteration), ensemble_model_name),
        df_for_predict_scaled,
        explainer=None,
        uncertainty_type="std",
    )
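    # Active-learning step: the candidate rows where the ensemble members
    # disagree the most (highest standard deviation) are the most informative
    # ones to measure next, so they are returned as suggested experiments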
    # Return the suggestions with the highest uncertainty
    # TODO: link to the sampling code
    num_suggestions = 5
    df_for_predict["uncertainty"] = uncertainty
    df_suggestions = df_for_predict.sort_values(by="uncertainty", ascending=False).iloc[:num_suggestions]
    # Drop the uncertainty column and the descriptor columns before returning the suggestions
    df_suggestions.drop(
        columns=[
            "uncertainty",
            "density",
            "young_modulus",
            "configuration_entropy",
            "valence_electron_concentration",
            "electronegativity",
        ],
        inplace=True,
    )
    suggestions_path = os.path.join(main_folder, str(n_iteration), "suggested_experiments.csv")
    df_suggestions.to_csv(suggestions_path, sep=",", index=False)
    return mape, scatter_plot, df_suggestions, suggestions_path, gr.update(value=n_iteration)
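

# A minimal sketch of a training call outside the UI (hypothetical CSV file
# and target column names; in the app this function is wired to the
# "Train model" button):
#   df_new = pd.read_csv("new_measurements.csv", sep=";")
#   mape, plot, suggestions, path, it = train_al_model(df_new, ["hardness"], 0)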


def create_gradio():
    """Build the Gradio Blocks interface for the model adaptation demo."""
    osium_theme, css_styling = load_theme()
    page_title = "Update your model"
    with gr.Blocks(css=css_styling, title=page_title, theme=osium_theme) as demo:
        gr.Markdown("# <p style='text-align: center;'>Adapt your AI models</p>")
        gr.Markdown("Easily adapt your AI models with your new experimental data")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input files")
                input_file = gr.File(label="Your input files", file_count="single", elem_id="input_files")
                with gr.Row():
                    clear_button = gr.Button("Clear")
                    train_button = gr.Button("Train model", elem_id="submit")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input csv")
                input_csv = gr.DataFrame(elem_classes="input-csv")
            with gr.Column():
                gr.Markdown("### Choose your target properties")
                target_columns = gr.CheckboxGroup(choices=[], interactive=True, label="Target alloy properties")
            with gr.Column():
                gr.Markdown("### Your model adaptation")
                output_mape = gr.Number(label="Training results - average percentage error", precision=3)
                output_scatter = gr.Image(label="Predictions vs. ground truth", elem_classes="output-image")
                output_next_experiments = gr.DataFrame(label="Suggested experiments to improve performance")
                # Hidden counter that keeps track of the active-learning iteration
                num_iteration_hidden = gr.Number(visible=False, value=0, precision=0)
                output_experiments_file = gr.File()
        input_file.change(
            fn=upload_csv,
            inputs=[input_file],
            outputs=[input_csv, target_columns],
            show_progress=True,
        )
        train_button.click(
            fn=train_al_model,
            inputs=[input_csv, target_columns, num_iteration_hidden],
            outputs=[
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
                num_iteration_hidden,
            ],
            show_progress=True,
        )
        # The lambda takes no inputs and resets all seven output components
        clear_button.click(
            fn=lambda: [None] * 7,
            inputs=[],
            outputs=[
                input_file,
                input_csv,
                target_columns,
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
            ],
        )
    return demo


if __name__ == "__main__":
    demo = create_gradio()
    demo.launch()