Spaces:

sandl
/

demo_active_learning

Sleeping

App Files Files Community

demo_active_learning / app.py

bndl

Rename gradio_active_learning.py to app.py

9cc844c over 1 year ago

raw

history blame

8.69 kB

	from sklearn import ensemble
	import gradio as gr
	import pandas as pd
	import os
	import matplotlib.pyplot as plt
	import cv2

	from train_model_main import prepare_data, train_model
	from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
	from utils import scale_numerical, unpickle_file
	import numpy as np
	from gradio_utils import load_theme
	from train_ensemble_models_main import run_ensemble_models_training
	from inference_model_main import predict_from_ensemble_model, get_test_inference
	import preprocess_data_main


	def get_training_data(n_iteration, main_folder, data_name, new_df):
	    """
	    Build the training set for an iteration and store the newly provided data.

	    The new dataframe is concatenated (new rows first, index reset) with the
	    CSV files saved by the previous iterations, i.e.
	    ``<main_folder>/<i>/<data_name>`` for every ``i`` in ``range(n_iteration)``,
	    each read with ``;`` as separator. The new dataframe alone is then written
	    to ``<main_folder>/<n_iteration>/<data_name>`` so later runs can reuse it.

	    Args:
	        n_iteration: Index of the current iteration; also the number of
	            previous iteration folders that are read.
	        main_folder: Root folder holding one sub-folder per iteration.
	        data_name: File name of the CSV inside every iteration folder.
	        new_df: Dataframe holding the data of the current iteration.

	    Returns:
	        The concatenation of ``new_df`` and all previous iteration dataframes.

	    Raises:
	        FileNotFoundError: Propagated from ``pandas.read_csv`` when the CSV of
	            a previous iteration is missing.
	    """
	    previous_dfs = [
	        pd.read_csv(os.path.join(main_folder, str(i), data_name), sep=";")
	        for i in range(n_iteration)
	    ]
	    training_df = pd.concat([new_df, *previous_dfs], ignore_index=True)

	    # Store the new dataframe for later runs. makedirs also creates missing
	    # parent folders and tolerates an already existing iteration folder.
	    new_folder = os.path.join(main_folder, str(n_iteration))
	    os.makedirs(new_folder, exist_ok=True)
	    new_df.to_csv(os.path.join(new_folder, data_name), sep=";", index=False)
	    return training_df


	def upload_csv(x):
	    """
	    Read the uploaded CSV file and expose its columns as selectable targets.

	    The file is first parsed with ``;`` as separator; when that yields a single
	    column it is parsed again with ``,``.

	    Args:
	        x: Uploaded file object exposing the file path as ``x.name``, or
	            ``None`` when no file is set.

	    Returns:
	        A tuple ``(dataframe, update)`` where ``update`` is a ``gr.update``
	        setting the choices to the dataframe columns. Without a file the
	        tuple is ``(None, gr.update(choices=[]))``.
	    """
	    # Nothing uploaded: empty the table and the list of choices.
	    if x is None:
	        return None, gr.update(choices=[])

	    print(x)
	    csv_path = x.name
	    print(csv_path)

	    frame = pd.read_csv(csv_path, sep=";")
	    _, n_columns = frame.shape
	    if n_columns == 1:
	        # A single column means ";" was not the delimiter; retry with ",".
	        frame = pd.read_csv(csv_path, sep=",")

	    print("Input dataframe")
	    print(frame.shape)
	    return frame, gr.update(choices=list(frame.columns))


	def train_al_model(x, target_cols, n_iteration):
	    """
	    Run one active-learning iteration: retrain the models and suggest experiments.

	    Steps performed:
	      1. Increment the iteration counter and aggregate ``x`` with the data
	         stored by the previous iterations (see ``get_training_data``).
	      2. Preprocess the aggregated data and train the ensemble models into
	         ``gradio_models/hardness/<iteration>``.
	      3. Compute the test metrics and load the performance plot of the
	         ``seed0`` model only.
	      4. Predict on ``gradio_models/hardness/inference_data.csv`` with the
	         ensemble, rank the rows by uncertainty and keep the top 5 as
	         suggested experiments, which are also written to a CSV file.

	    Args:
	        x: Input dataframe with the new training data.
	        target_cols: Target columns selected by the user.
	        n_iteration: Number of the previous iteration (converted with ``int``).

	    Returns:
	        A tuple ``(mape, scatter_plot, df_suggestions, suggestions_path, update)``
	        where ``scatter_plot`` is the plot image as an RGB array (``None`` when
	        the image could not be read), ``suggestions_path`` is the path of the
	        written CSV and ``update`` is a ``gr.update`` carrying the incremented
	        iteration number.
	    """
	    print("Training data")
	    print(x.shape)
	    print("Target columns")
	    print(target_cols)

	    print("Iteration number")
	    print(n_iteration)
	    n_iteration = int(n_iteration) + 1
	    print(n_iteration)

	    main_folder = "gradio_models/hardness"
	    model_name = "model_hardness.h5"
	    ensemble_model_name = f"ensemble_{model_name.split('.')[0]}.pkl"
	    iteration_folder = os.path.join(main_folder, str(n_iteration))
	    seed_folder = os.path.join(iteration_folder, "seed0")

	    # Aggregate the new data with the previous data to improve the model
	    print(x.shape)
	    new_training_df = get_training_data(n_iteration, main_folder, "training_data.csv", x)
	    print(new_training_df.shape)
	    print("Training data aggregated")

	    # Run the data preprocessing
	    preprocessing_fn = getattr(preprocess_data_main, "alloy_preprocessing")
	    df_preprocessed = preprocessing_fn(new_training_df)
	    print("Preprocessing done")

	    print(df_preprocessed.shape)
	    print(df_preprocessed)

	    columns_numerical = [col for col in df_preprocessed.columns if col not in target_cols]
	    # First train the ML models that can compute the uncertainty
	    run_ensemble_models_training(
	        df_preprocessed,
	        columns_numerical,
	        target_cols,
	        iteration_folder,
	        model_name,
	        lr=0.01,
	        n_models=3,
	        save_explainer_single=False,
	        save_explainer_ensemble=False,
	        data_type="dataframe",
	    )
	    # Must get as outputs the scatter plot (can be loaded from the folder), and the metrics
	    # Difficult since the train/test split is changed for every seed model
	    # So for now only computes the inference with one model
	    metrics = get_test_inference(
	        seed_folder,
	        columns_numerical,
	        target_cols,
	        model_name,
	        "X_test_data.pickle",
	    )

	    # NOTE(review): a constant 0.02 is added to the reported MAPE; the reason is
	    # not visible in this file - confirm it is intended.
	    mape = metrics["mape"] + 0.02

	    # cv2.imread yields BGR channel order while the image output expects RGB,
	    # so convert to keep the plot colors correct. imread returns None when the
	    # file cannot be read; in that case no image is shown.
	    scatter_plot = cv2.imread(os.path.join(seed_folder, "plot_performance_test.png"))
	    if scatter_plot is not None:
	        scatter_plot = cv2.cvtColor(scatter_plot, cv2.COLOR_BGR2RGB)

	    # Second, compute inference and uncertainty on a newly generated dataset
	    # For the demo the dataset is preloaded from a specific location
	    # For the default pipeline the dataset should be generated according to the original distribution
	    df_for_predict = pd.read_csv(os.path.join(main_folder, "inference_data.csv"), sep=";")
	    print("DF for predict uncertainty")
	    print(df_for_predict.shape)

	    df_for_predict_physics = preprocessing_fn(df_for_predict)
	    print(df_for_predict_physics.shape)

	    # Kept in place on purpose: whether the preprocessing returns a new object
	    # or the dataframe it received is not visible from this file.
	    df_for_predict_physics.drop(columns=target_cols, inplace=True)
	    print(df_for_predict_physics.shape)
	    scaler_path = os.path.join(seed_folder, "minmax_scaler_inputs.pickle")
	    minmax_scaler_inputs = unpickle_file(scaler_path)
	    print(scaler_path)
	    print(minmax_scaler_inputs)
	    df_for_predict_scaled = scale_numerical(
	        df_for_predict_physics, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
	    )

	    predictions, uncertainty = predict_from_ensemble_model(
	        os.path.join(iteration_folder, ensemble_model_name),
	        df_for_predict_scaled,
	        explainer=None,
	        uncertainty_type="std",
	    )

	    # Return top uncertainty suggestions
	    # TODO: link to the sampling code
	    num_suggestions = 5
	    df_for_predict["uncertainty"] = uncertainty
	    most_uncertain = df_for_predict.sort_values(by=["uncertainty"], ascending=[False]).iloc[:num_suggestions]
	    # Drop the ranking helper and the listed property columns from the output
	    # in one non-inplace call instead of mutating a slice twice.
	    df_suggestions = most_uncertain.drop(
	        columns=[
	            "uncertainty",
	            "density",
	            "young_modulus",
	            "configuration_entropy",
	            "valence_electron_concentration",
	            "electronegativity",
	        ]
	    )
	    suggestions_path = os.path.join(iteration_folder, "suggested_experiments.csv")
	    df_suggestions.to_csv(suggestions_path, sep=",", index=False)
	    return mape, scatter_plot, df_suggestions, suggestions_path, gr.update(value=n_iteration)


	def create_gradio():
	    """
	    Build the Gradio Blocks interface of the model adaptation demo.

	    The page has an input column (file upload, "Clear" and "Train model"
	    buttons, a preview of the uploaded CSV and the target selection) and an
	    output column (error metric, scatter plot, suggested experiments and the
	    downloadable suggestions file). A hidden number component carries the
	    iteration counter between training runs.

	    Wiring:
	      * changing the input file calls ``upload_csv``;
	      * "Train model" calls ``train_al_model``;
	      * "Clear" resets the seven visible input and output components to ``None``.

	    Returns:
	        The ``gr.Blocks`` instance, not launched yet.
	    """
	    osium_theme, css_styling = load_theme()
	    page_title = "Update your model"

	    with gr.Blocks(css=css_styling, title=page_title, theme=osium_theme) as demo:
	        gr.Markdown("# <p style='text-align: center;'>Adapt your AI models</p>")
	        gr.Markdown("Easily adapt your AI models with your new experimental data")
	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("### Your input files")
	                input_file = gr.File(label="Your input files", file_count="single", elem_id="input_files")
	                with gr.Row():
	                    clear_button = gr.Button("Clear")
	                    train_button = gr.Button("Train model", elem_id="submit")
	                with gr.Row():
	                    with gr.Column():
	                        gr.Markdown("### Your input csv")
	                        input_csv = gr.DataFrame(elem_classes="input-csv")
	                    with gr.Column():
	                        gr.Markdown("### Choose your target properties")
	                        target_columns = gr.CheckboxGroup(
	                            choices=[], interactive=True, label="Target alloy properties"
	                        )

	            with gr.Column():
	                gr.Markdown("### Your model adaptation")
	                output_mape = gr.Number(label="Training results - average percentage error", precision=3)
	                output_scatter = gr.Image(label="Predictions vs. ground truth", elem_classes="output-image")
	                output_next_experiments = gr.DataFrame(label="Suggested experiments to improve performance")
	                # Hidden state: iteration counter passed to and updated by train_al_model.
	                num_iteration_hidden = gr.Number(visible=False, value=0, precision=0)
	                output_experiments_file = gr.File()

	        input_file.change(
	            fn=upload_csv,
	            inputs=[input_file],
	            outputs=[input_csv, target_columns],
	            show_progress=True,
	        )

	        train_button.click(
	            fn=train_al_model,
	            inputs=[input_csv, target_columns, num_iteration_hidden],
	            outputs=[
	                output_mape,
	                output_scatter,
	                output_next_experiments,
	                output_experiments_file,
	                num_iteration_hidden,
	            ],
	            show_progress=True,
	        )

	        cleared_components = [
	            input_file,
	            input_csv,
	            target_columns,
	            output_mape,
	            output_scatter,
	            output_next_experiments,
	            output_experiments_file,
	        ]
	        # The handler is registered without inputs, so it must accept no
	        # argument; it returns one None per component to reset.
	        clear_button.click(
	            fn=lambda: [None] * len(cleared_components),
	            inputs=[],
	            outputs=cleared_components,
	        )

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the Blocks interface and start the Gradio server.
	    create_gradio().launch()