Spaces:

sandl
/

demo_active_learning

Sleeping

App Files Files Community

bndl committed on Dec 18, 2023

Commit

d7010e9

1 Parent(s): 5fd08c5

Upload 2 files

Browse files

Files changed (2) hide show

gradio_active_learning.py +231 -0
gradio_utils.py +66 -0

gradio_active_learning.py ADDED Viewed

	@@ -0,0 +1,231 @@

+from sklearn import ensemble
+import gradio as gr
+import pandas as pd
+import os
+import matplotlib.pyplot as plt
+import cv2
+from train_model_main import prepare_data, train_model
+from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
+from utils import scale_numerical, unpickle_file
+import numpy as np
+from gradio_utils import load_theme
+from train_ensemble_models_main import run_ensemble_models_training
+from inference_model_main import predict_from_ensemble_model, get_test_inference
+import preprocess_data_main
def get_training_data(n_iteration, main_folder, data_name, new_df):
    """
    Concatenates dataframes from the previous iterations with the new dataframe to run the model training

    For every i in range(n_iteration), the file <main_folder>/<i>/<data_name> is read as a ";"-separated csv.
    The returned dataframe holds the rows of new_df first, followed by the rows of iterations 0..n_iteration-1,
    with a fresh integer index.
    As a side effect, new_df is written to <main_folder>/<n_iteration>/<data_name> (";"-separated, no index)
    so that later iterations can reload it.
    Raises whatever pd.read_csv raises if one of the previous iteration files is missing.
    """
    previous_dfs = [
        pd.read_csv(os.path.join(main_folder, str(i), data_name), sep=";") for i in range(n_iteration)
    ]
    training_df = pd.concat([new_df, *previous_dfs], ignore_index=True)
    # Store the new dataframe passed for later runs
    # makedirs also creates missing parent folders (first run) and tolerates an already existing folder
    new_folder = os.path.join(main_folder, str(n_iteration))
    os.makedirs(new_folder, exist_ok=True)
    new_df.to_csv(os.path.join(new_folder, data_name), sep=";", index=False)
    return training_df
def upload_csv(x):
    """
    Loads the uploaded csv file and exposes its columns as selectable choices

    x is the uploaded file object (its .name attribute is used as the path), or None when nothing is uploaded.
    Returns the parsed dataframe and a gradio update listing its column names as choices;
    returns (None, empty choices) when x is None.
    """
    if x is None:
        return None, gr.update(choices=[])
    print(x)
    csv_path = x.name
    print(csv_path)
    # Try ";" first; a single parsed column means the separator was wrong, so fall back to ","
    for separator in (";", ","):
        frame = pd.read_csv(csv_path, sep=separator)
        if frame.shape[1] != 1:
            break
    print("Input dataframe")
    print(frame.shape)
    return frame, gr.update(choices=list(frame.columns))
def train_al_model(x, target_cols, n_iteration):
    """
    Runs one active learning round: retrains the ensemble on the aggregated data and suggests new experiments

    x is the input dataframe, target_cols is the target column(s) selected and n_iteration is the number of the
    previous round (it is incremented here and the result names the output sub-folder of this round).

    Returns, in this order:
    - the mape reported for the "seed0" model (see the review note on the offset below)
    - the test scatter plot as an RGB image array, or None if the image file could not be read
    - the dataframe of the suggested experiments (highest uncertainty first)
    - the path of the csv file where these suggestions were written
    - a gradio update carrying the incremented iteration number
    """
    print("Training data")
    print(x.shape)
    print("Target columns")
    print(target_cols)
    print("Iteration number")
    print(n_iteration)
    n_iteration = int(n_iteration) + 1
    print(n_iteration)
    main_folder = "gradio_models/hardness"
    model_name = "model_hardness.h5"
    ensemble_model_name = f"ensemble_{model_name.split('.')[0]}.pkl"
    iteration_folder = os.path.join(main_folder, str(n_iteration))
    seed_folder = os.path.join(iteration_folder, "seed0")
    # Aggregate the new data with the previous data to improve the model
    print(x.shape)
    new_training_df = get_training_data(n_iteration, main_folder, "training_data.csv", x)
    print(new_training_df.shape)
    print("Training data aggregated")
    # Run the data preprocessing
    preprocessing_fn = preprocess_data_main.alloy_preprocessing
    df_preprocessed = preprocessing_fn(new_training_df)
    print("Preprocessing done")
    print(df_preprocessed.shape)
    print(df_preprocessed)
    columns_numerical = [col for col in df_preprocessed.columns if col not in target_cols]
    # First train the ML models that can compute the uncertainty
    run_ensemble_models_training(
        df_preprocessed,
        columns_numerical,
        target_cols,
        iteration_folder,
        model_name,
        lr=0.01,
        n_models=3,
        save_explainer_single=False,
        save_explainer_ensemble=False,
        data_type="dataframe",
    )
    # Must get as outputs the scatter plot (can be loaded from the folder), and the metrics
    # Difficult since the train/test split is changed for every seed model
    # So for now only computes the inference with one model
    metrics = get_test_inference(
        seed_folder,
        columns_numerical,
        target_cols,
        model_name,
        "X_test_data.pickle",
    )
    # NOTE(review): a constant 0.02 is added to the computed mape; the reason is not visible here - confirm
    mape = metrics["mape"] + 0.02
    # cv2.imread returns the channels in BGR order (or None if the file cannot be read), while the image
    # is displayed by gradio as RGB: convert so that the plot colors are not swapped
    scatter_plot = cv2.imread(os.path.join(seed_folder, "plot_performance_test.png"))
    if scatter_plot is not None:
        scatter_plot = cv2.cvtColor(scatter_plot, cv2.COLOR_BGR2RGB)
    # Second, compute inference and uncertainty on a newly generated dataset
    # For the demo the dataset is preloaded from a specific location
    # For the default pipeline the dataset should be generated according to the original distribution
    df_for_predict = pd.read_csv(os.path.join(main_folder, "inference_data.csv"), sep=";")
    print("DF for predict uncertainty")
    print(df_for_predict.shape)
    df_for_predict_physics = preprocessing_fn(df_for_predict)
    print(df_for_predict_physics.shape)
    df_for_predict_physics.drop(columns=target_cols, inplace=True)
    print(df_for_predict_physics.shape)
    scaler_path = os.path.join(seed_folder, "minmax_scaler_inputs.pickle")
    minmax_scaler_inputs = unpickle_file(scaler_path)
    print(scaler_path)
    print(minmax_scaler_inputs)
    df_for_predict_scaled = scale_numerical(
        df_for_predict_physics, minmax_scaler_inputs.feature_names_in_, scaler=minmax_scaler_inputs, fit=False
    )
    predictions, uncertainty = predict_from_ensemble_model(
        os.path.join(iteration_folder, ensemble_model_name),
        df_for_predict_scaled,
        explainer=None,
        uncertainty_type="std",
    )
    # Return top uncertainty suggestions
    # TODO: link to the sampling code
    num_suggestions = 5
    df_for_predict["uncertainty"] = uncertainty
    # Non in-place drop: returns a new dataframe instead of modifying a slice of df_for_predict
    df_suggestions = (
        df_for_predict.sort_values(by=["uncertainty"], ascending=[False])
        .iloc[:num_suggestions]
        .drop(
            columns=[
                "uncertainty",
                "density",
                "young_modulus",
                "configuration_entropy",
                "valence_electron_concentration",
                "electronegativity",
            ]
        )
    )
    suggestions_path = os.path.join(iteration_folder, "suggested_experiments.csv")
    df_suggestions.to_csv(suggestions_path, sep=",", index=False)
    return mape, scatter_plot, df_suggestions, suggestions_path, gr.update(value=n_iteration)
def create_gradio():
    """
    Builds the gradio Blocks interface of the model adaptation demo

    The page holds a file upload, "Clear" and "Train model" buttons, a preview of the uploaded csv,
    the target columns selector and the training outputs (mape, scatter plot, suggested experiments and
    the downloadable suggestions file). A hidden number component carries the iteration counter between
    two clicks on "Train model".
    Returns the Blocks object; launching it is left to the caller.
    """
    osium_theme, css_styling = load_theme()
    page_title = "Update your model"
    with gr.Blocks(css=css_styling, title=page_title, theme=osium_theme) as demo:
        gr.Markdown("# <p style='text-align: center;'>Adapt your AI models</p>")
        gr.Markdown("Easily adapt your AI models with your new experimental data")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input files")
                input_file = gr.File(label="Your input files", file_count="single", elem_id="input_files")
        with gr.Row():
            clear_button = gr.Button("Clear")
            train_button = gr.Button("Train model", elem_id="submit")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Your input csv")
                input_csv = gr.DataFrame(elem_classes="input-csv")
            with gr.Column():
                gr.Markdown("### Choose your target properties")
                target_columns = gr.CheckboxGroup(choices=[], interactive=True, label="Target alloy properties")
            with gr.Column():
                gr.Markdown("### Your model adaptation")
                output_mape = gr.Number(label="Training results - average percentage error", precision=3)
                output_scatter = gr.Image(label="Predictions vs. ground truth", elem_classes="output-image")
                output_next_experiments = gr.DataFrame(label="Suggested experiments to improve performance")
                num_iteration_hidden = gr.Number(visible=False, value=0, precision=0)
                output_experiments_file = gr.File()
        # Parse the csv as soon as a file is uploaded and fill the target columns choices
        input_file.change(
            fn=upload_csv,
            inputs=[input_file],
            outputs=[input_csv, target_columns],
            show_progress=True,
        )
        # The hidden counter is both read and written so that each click starts a new iteration
        train_button.click(
            fn=train_al_model,
            inputs=[input_csv, target_columns, num_iteration_hidden],
            outputs=[
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
                num_iteration_hidden,
            ],
            show_progress=True,
        )
        # No input component is wired, so the callback must take no argument;
        # it returns one None per output component below
        clear_button.click(
            fn=lambda: [None] * 7,
            inputs=[],
            outputs=[
                input_file,
                input_csv,
                target_columns,
                output_mape,
                output_scatter,
                output_next_experiments,
                output_experiments_file,
            ],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the Blocks interface, then start the gradio server
    demo = create_gradio()
    demo.launch()

gradio_utils.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import gradio as gr
def add_gradio_component(config_dict, component_key):
    """
    Instantiates the gradio component described by config_dict[component_key]

    The "comp_type" entry selects the component class. "label", "precision" and "cat_values" are only read
    for the component types that use them.
    Returns the new component, or None (after printing a message) when the component type is not supported.
    """
    spec = config_dict[component_key]
    kind = spec["comp_type"]
    if kind == "Text":
        # The label doubles as the placeholder text
        return gr.Text(label=spec["label"], placeholder=spec["label"])
    if kind == "Number":
        return gr.Number(label=spec["label"], precision=spec["precision"])
    if kind == "Dropdown":
        return gr.Dropdown(label=spec["label"], choices=spec["cat_values"])
    if kind == "Image":
        return gr.Image(elem_classes="image-preview")
    if kind == "CheckboxGroup":
        return gr.CheckboxGroup(label=spec["label"], choices=spec["cat_values"])
    if kind == "Plot":
        return gr.Plot(label=spec["label"], type="matplotlib")
    if kind == "Dataframe":
        return gr.Dataframe(wrap=True, type="pandas")
    print(f"Found component type {kind} for {component_key}, which is not supported")
    return None
def load_theme():
    """
    Loads the Osium AI color theme

    Returns the gradio theme and the css string to pass to gr.Blocks.
    """
    neutral_palette = {
        "c50": "#e4f3fa",  # Dataframe background cell content - light mode only
        "c100": "#e4f3fa",  # Top corner of clear button in light mode + markdown text in dark mode
        "c200": "#a1c6db",  # Component borders
        "c300": "#FFFFFF",  #
        "c400": "#e4f3fa",  # Footer text
        "c500": "#0c1538",  # Text of component headers in light mode only
        "c600": "#a1c6db",  # Top corner of button in dark mode
        "c700": "#475383",  # Button text in light mode + component borders in dark mode
        "c800": "#0c1538",  # Markdown text in light mode
        "c900": "#a1c6db",  # Background of dataframe - dark mode
        "c950": "#0c1538",  # Background in dark mode only
    }
    # secondary color used for highlight box content when typing in light mode, and download option in dark mode
    # primary color used for login button in dark mode
    theme = gr.themes.Default(
        primary_hue="cyan",
        secondary_hue="cyan",
        neutral_hue=gr.themes.Color(**neutral_palette),
    )
    css_styling = """#submit {background: #1eccd8}
    #submit:hover {background: #a2f1f6}
    .output-image, .input-image, .image-preview {height: 350px !important}
    .output-plot {height: 250px !important}
    #interpretation {height: 250px !important}"""
    return theme, css_styling