"""Preprocess the raw alloy dataset: impute missing values, optionally add
physics-based features, and write the result to a semicolon-separated CSV."""

import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import argparse

from utils import encode_categorical, scale_numerical, fill_nans, read_data
from alloy_data_preprocessing import add_physics_features


def alloy_preprocessing(df):
    return add_physics_features(df)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        help="Path to the input data file",
        default="./data/Data_Osium.csv",
        required=False,
    )
    parser.add_argument(
        "--preprocessed_data_path",
        type=str,
        help="Path where the preprocessed training data is written",
        default="preprocessed_data.csv",
        required=False,
    )
    parser.add_argument(
        "--columns_not_training",
        type=str,
        help="Comma-separated list of data columns not used for training",
        default="",
        required=False,
    )
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="Comma-separated list of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )
    parser.add_argument(
        "--add_physics",
        type=str,
        help="Whether to add physics-based features ('y' to enable)",
        default="y",
        required=False,
    )
    args = parser.parse_args()

    df = read_data(args.data_path)

    # Drop columns that should not be used for training
    columns_not_training = args.columns_not_training.split(",") if args.columns_not_training else []
    df.drop(columns=columns_not_training, inplace=True)

    columns_numerical = args.columns_numerical.split(",") if args.columns_numerical else []

    # Fill NaN values: mode for categorical columns, mean for numerical columns
    for col in df.columns:
        if col not in columns_numerical:
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].mean())

    # Sanity check: no NaN values remain after imputation
    assert sum(np.sum(df.isna()) != 0) == 0

    # Optionally enrich the data with physics-based features
    if args.add_physics == "y":
        df = alloy_preprocessing(df)

    df.to_csv(args.preprocessed_data_path, sep=";", index=False)