demo_active_learning / preprocess_data_main.py
bndl's picture
Upload 3 files
19b61e8
raw
history blame
2.32 kB
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import argparse
from utils import encode_categorical, scale_numerical, fill_nans, read_data
from alloy_data_preprocessing import add_physics_features
def alloy_preprocessing(df):
    """Run the alloy-specific preprocessing step on ``df``.

    This is a thin wrapper: it forwards ``df`` to ``add_physics_features``
    and hands back whatever that function returns.
    """
    enriched = add_physics_features(df)
    return enriched
def build_parser():
    """Build the command-line parser for the preprocessing script.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--data_path``,
        ``--preprocessed_data_path``, ``--columns_not_training``,
        ``--columns_numerical`` and ``--add_physics``. Every option is
        optional and has a default.
    """
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        help="The path to your input data file",
        default="./data/Data_Osium.csv",
        required=False,
    )
    parser.add_argument(
        "--preprocessed_data_path",
        type=str,
        help="The path to your input data file preprocessed for training",
        default="preprocessed_data.csv",
        required=False,
    )
    parser.add_argument(
        "--columns_not_training",
        type=str,
        help="List of data columns not used for training",
        default="",
        required=False,
    )
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="List of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )
    parser.add_argument(
        "--add_physics",
        type=str,
        help="Whether to add physics based features",
        default="y",
        required=False,
    )
    return parser


def split_columns(raw):
    """Split a comma-separated column list into a list of names.

    Args:
        raw: The raw option value, e.g. ``"H,Temperature_C"``.

    Returns:
        The names in order, or an empty list when ``raw`` is empty (a plain
        ``"".split(",")`` would yield ``[""]``, which is not a real column).
    """
    return raw.split(",") if raw else []


def fill_missing_values(df, columns_numerical):
    """Fill NaN values of ``df`` column by column.

    Columns named in ``columns_numerical`` are filled with the column mean;
    every other column is filled with its most frequent value.

    Args:
        df: The DataFrame to impute. Its columns are reassigned in place.
        columns_numerical: Names of the columns to treat as numerical.

    Returns:
        The same ``df`` object, with the NaNs filled.

    Raises:
        ValueError: If any NaN is left after filling, which happens when a
            column has no non-NaN value to derive a fill value from.
    """
    numerical = set(columns_numerical)  # built once for O(1) membership tests
    for col in df.columns:
        if col in numerical:
            fill_value = df[col].mean()
        else:
            modes = df[col].mode()
            if modes.empty:
                # No non-NaN value exists, so there is no mode to fill with.
                # Leave the column as is; the check below reports it by name.
                continue
            fill_value = modes.iloc[0]
        df[col] = df[col].fillna(fill_value)

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    still_missing = df.columns[df.isna().any()].tolist()
    if still_missing:
        raise ValueError(
            f"Missing values remain after imputation in columns: {still_missing}"
        )
    return df


def main():
    """Preprocess the input data file and write the result as a CSV.

    Steps: read the data, drop the columns excluded from training, fill NaN
    values, optionally add the physics-based features (when ``--add_physics``
    is ``"y"``), then write the frame to ``--preprocessed_data_path`` using
    ``;`` as separator and without the index.
    """
    args = build_parser().parse_args()

    df = read_data(args.data_path)
    df = df.drop(columns=split_columns(args.columns_not_training))

    # Fill nan values
    df = fill_missing_values(df, split_columns(args.columns_numerical))

    if args.add_physics == "y":
        df = alloy_preprocessing(df)

    df.to_csv(args.preprocessed_data_path, sep=";", index=False)


if __name__ == "__main__":
    main()