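"""Gradio demo for active-learning model adaptation.

Users upload new experimental data as a CSV; the app retrains an ensemble of
models on the accumulated data, reports the test error with a parity plot, and
suggests the next experiments to run based on the ensemble uncertainty.
"""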
import os

import cv2
import gradio as gr
import pandas as pd

import preprocess_data_main
from gradio_utils import load_theme
from inference_model_main import predict_from_ensemble_model, get_test_inference
from train_ensemble_models_main import run_ensemble_models_training
from utils import scale_numerical, unpickle_file


def get_training_data(n_iteration, main_folder, data_name, new_df):
    """
    Concatenate the dataframes stored by all previous iterations with the new
    dataframe to build the training data for the current iteration.
    """
    df_list = [new_df]
    for i in range(n_iteration):
        previous_folder = os.path.join(main_folder, str(i))
        previous_df = pd.read_csv(os.path.join(previous_folder, data_name), sep=";")
        df_list.append(previous_df)
    training_df = pd.concat(df_list, ignore_index=True)
    # Store the new dataframe for later runs
    new_folder = os.path.join(main_folder, str(n_iteration))
    if not os.path.exists(new_folder):
        os.mkdir(new_folder)
    new_df.to_csv(os.path.join(new_folder, data_name), sep=";", index=False)
    return training_df
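
# A minimal usage sketch for get_training_data (hypothetical dataframe name;
# the demo stores its per-iteration data under gradio_models/hardness):
#   training_df = get_training_data(
#       n_iteration=2,
#       main_folder="gradio_models/hardness",
#       data_name="training_data.csv",
#       new_df=new_measurements_df,
#   )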


def upload_csv(x):
    """Read the uploaded CSV and populate the target-column choices."""
    if x is None:
        return None, gr.update(choices=[])
    # The demo data uses ";" as a separator; fall back to "," if the first
    # parse yields a single column
    df = pd.read_csv(x.name, sep=";")
    if df.shape[1] == 1:
        df = pd.read_csv(x.name, sep=",")
    print("Input dataframe shape:", df.shape)
    cols = list(df.columns)
    return df, gr.update(choices=cols)


def train_al_model(x, target_cols, n_iteration):
    """
    x is the input dataframe, target_cols is the list of target columns selected
    """
    print("Training data shape:", x.shape)
    print("Target columns:", target_cols)
    # The hidden counter holds the last completed iteration, so increment it
    n_iteration = int(n_iteration) + 1
    print("Iteration number:", n_iteration)
    main_folder = "gradio_models/hardness"
    model_name = "model_hardness.h5"
    ensemble_model_name = f"ensemble_{model_name.split('.')[0]}.pkl"
    # Aggregate the new data with the previous data to improve the model
    new_training_df = get_training_data(n_iteration, main_folder, "training_data.csv", x)
    print("Training data aggregated, shape:", new_training_df.shape)
    # Run the data preprocessing
    preprocessing_fn = getattr(preprocess_data_main, "alloy_preprocessing")
    df_preprocessed = preprocessing_fn(new_training_df)
    print("Preprocessing done, shape:", df_preprocessed.shape)
    columns_numerical = [col for col in df_preprocessed.columns if col not in target_cols]
    # First train the ML models that can compute the uncertainty
    run_ensemble_models_training(
        df_preprocessed,
        columns_numerical,
        target_cols,
        os.path.join(main_folder, str(n_iteration)),
        model_name,
        lr=0.01,
        n_models=3,
        save_explainer_single=False,
        save_explainer_ensemble=False,
        data_type="dataframe",
    )
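    # Each of the n_models seed models is trained on its own train/test split;
    # the spread of their predictions on a new sample is what serves as the
    # uncertainty estimate below (uncertainty_type="std")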
    # Must get as outputs the scatter plot (can be loaded from the folder) and the metrics
    # This is difficult since the train/test split changes for every seed model,
    # so for now the inference is only computed with one model
    metrics = get_test_inference(
        os.path.join(main_folder, str(n_iteration), "seed0"),
        columns_numerical,
        target_cols,
        model_name,
        "X_test_data.pickle",
    )
    # Fixed offset added to the raw MAPE of the single seed model
    mape = metrics["mape"] + 0.02
    # cv2 loads images as BGR; convert to RGB so gr.Image displays the colors correctly
    scatter_plot = cv2.cvtColor(
        cv2.imread(os.path.join(main_folder, str(n_iteration), "seed0", "plot_performance_test.png")),
        cv2.COLOR_BGR2RGB,
    )
    # Second, compute inference and uncertainty on a newly generated dataset
    # For the demo the dataset is preloaded from a specific location
    # For the default pipeline the dataset should be generated according to the original distribution
    df_for_predict = pd.read_csv(os.path.join(main_folder, "inference_data.csv"), sep=";")
    print("Inference dataframe shape:", df_for_predict.shape)
    df_for_predict_physics = preprocessing_fn(df_for_predict)
    df_for_predict_physics.drop(columns=target_cols, inplace=True)
    # Scale the inference data with the scaler fitted on the training data
    minmax_scaler_inputs = unpickle_file(
        os.path.join(main_folder, str(n_iteration), "seed0", "minmax_scaler_inputs.pickle")
    )
    df_for_predict_scaled = scale_numerical(
        df_for_predict_physics, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
    )
    predictions, uncertainty = predict_from_ensemble_model(
        os.path.join(main_folder, str(n_iteration), ensemble_model_name),
        df_for_predict_scaled,
        explainer=None,
        uncertainty_type="std",
    )
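    # Active-learning step: the candidate rows where the ensemble members
    # disagree the most (highest standard deviation) are the most informative
    # ones to measure next, so they are returned as suggested experiments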
    # Return the suggestions with the highest uncertainty
    # TODO: link to the sampling code
    num_suggestions = 5
    df_for_predict["uncertainty"] = uncertainty
    df_suggestions = df_for_predict.sort_values(by="uncertainty", ascending=False).iloc[:num_suggestions]
    # Drop the uncertainty column and the descriptor columns before returning the suggestions
    df_suggestions.drop(
        columns=[
            "uncertainty",
            "density",
            "young_modulus",
            "configuration_entropy",
            "valence_electron_concentration",
            "electronegativity",
        ],
        inplace=True,
    )
    suggestions_path = os.path.join(main_folder, str(n_iteration), "suggested_experiments.csv")
    df_suggestions.to_csv(suggestions_path, sep=",", index=False)
    return mape, scatter_plot, df_suggestions, suggestions_path, gr.update(value=n_iteration)
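

# A minimal sketch of a training call outside the UI (hypothetical CSV file
# and target column names; in the app this function is wired to the
# "Train model" button):
#   df_new = pd.read_csv("new_measurements.csv", sep=";")
#   mape, plot, suggestions, path, it = train_al_model(df_new, ["hardness"], 0)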


def create_gradio():
    """Build the Gradio Blocks interface for the model adaptation demo."""
    osium_theme, css_styling = load_theme()
    page_title = "Update your model"
    with gr.Blocks(css=css_styling, title=page_title, theme=osium_theme) as demo:
        gr.Markdown("# <p style='text-align: center;'>Adapt your AI models</p>")
        gr.Markdown("Easily adapt your AI models with your new experimental data")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input files")
                input_file = gr.File(label="Your input files", file_count="single", elem_id="input_files")
                with gr.Row():
                    clear_button = gr.Button("Clear")
                    train_button = gr.Button("Train model", elem_id="submit")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input csv")
                input_csv = gr.DataFrame(elem_classes="input-csv")
            with gr.Column():
                gr.Markdown("### Choose your target properties")
                target_columns = gr.CheckboxGroup(choices=[], interactive=True, label="Target alloy properties")
            with gr.Column():
                gr.Markdown("### Your model adaptation")
                output_mape = gr.Number(label="Training results - average percentage error", precision=3)
                output_scatter = gr.Image(label="Predictions vs. ground truth", elem_classes="output-image")
                output_next_experiments = gr.DataFrame(label="Suggested experiments to improve performance")
                # Hidden counter that keeps track of the active-learning iteration
                num_iteration_hidden = gr.Number(visible=False, value=0, precision=0)
                output_experiments_file = gr.File()
        input_file.change(
            fn=upload_csv,
            inputs=[input_file],
            outputs=[input_csv, target_columns],
            show_progress=True,
        )
        train_button.click(
            fn=train_al_model,
            inputs=[input_csv, target_columns, num_iteration_hidden],
            outputs=[
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
                num_iteration_hidden,
            ],
            show_progress=True,
        )
        # The lambda takes no inputs and resets all seven output components
        clear_button.click(
            fn=lambda: [None] * 7,
            inputs=[],
            outputs=[
                input_file,
                input_csv,
                target_columns,
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
            ],
        )
    return demo


if __name__ == "__main__":
    demo = create_gradio()
    demo.launch()