Spaces:
Sleeping
Sleeping
import pandas as pd | |
import os | |
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import numpy as np | |
import pickle | |
import argparse | |
from utils import encode_categorical, scale_numerical, fill_nans, read_data | |
from alloy_data_preprocessing import add_physics_features | |
def alloy_preprocessing(df):
    """Run the alloy-specific preprocessing step on ``df``.

    This is a thin wrapper that hands ``df`` to ``add_physics_features``
    (imported from ``alloy_data_preprocessing``) and returns whatever that
    function returns.
    NOTE(review): presumably a DataFrame with extra physics-derived
    columns -- confirm against ``alloy_data_preprocessing``.
    """
    enriched = add_physics_features(df)
    return enriched
def parse_column_list(raw):
    """Split a comma-separated command-line value into column names.

    Args:
        raw: The raw option string, e.g. ``"colA,colB"``. May be empty.

    Returns:
        A list of column names. Empty entries (from an empty string, a
        trailing comma or a doubled comma) are discarded. Names are NOT
        stripped, because CSV headers may legitimately carry whitespace.
    """
    if not raw:
        return []
    return [name for name in raw.split(",") if name]


def impute_missing_values(df, columns_numerical):
    """Fill NaN values column by column, modifying ``df`` in place.

    Columns listed in ``columns_numerical`` are filled with their mean;
    every other column is filled with its most frequent value.

    Args:
        df: The DataFrame to impute.
        columns_numerical: Iterable of column names to treat as numeric.

    Returns:
        The same ``df`` object, for convenience.

    Raises:
        ValueError: If any column still contains NaN afterwards, which
            happens when a column has no non-missing value to derive a
            fill value from.
    """
    # Build the set once so the per-column membership test is O(1).
    numerical = set(columns_numerical)
    for col in df.columns:
        series = df[col]
        if col in numerical:
            fill_value = series.mean()
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely NaN: there is no mode to fill with.
                # Leave it alone; it is reported by the check below.
                continue
            fill_value = modes.iloc[0]
        df[col] = series.fillna(fill_value)

    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    still_missing = df.columns[df.isna().any()].tolist()
    if still_missing:
        raise ValueError(
            f"Columns still contain NaN values after imputation: {still_missing}"
        )
    return df


def main():
    """Parse CLI options, clean the input data and write the preprocessed CSV."""
    parser = argparse.ArgumentParser(description="Process parameters")
    parser.add_argument(
        "--data_path",
        type=str,
        help="The path to your input data file",
        default="./data/Data_Osium.csv",
        required=False,
    )
    parser.add_argument(
        "--preprocessed_data_path",
        type=str,
        help="The path to your input data file preprocessed for training",
        default="preprocessed_data.csv",
        required=False,
    )
    parser.add_argument(
        "--columns_not_training",
        type=str,
        help="List of data columns not used for training",
        default="",
        required=False,
    )
    parser.add_argument(
        "--columns_numerical",
        type=str,
        help="List of data columns with numeric values",
        default="%A,%B,%C,%D,%E,%F,%Phase_A,%Phase_B,%Phase_C,%Phase_D,%Phase_E,%Phase_F,%A_Matrice,%B_Matrice,%C_Matrice,%D_Matrice,%E_Matrice,%F_Matrice,H,Temperature_C",
        required=False,
    )
    parser.add_argument(
        "--add_physics",
        type=str,
        help="Whether to add physics based features",
        default="y",
        required=False,
    )
    args = parser.parse_args()

    # NOTE(review): read_data comes from utils; it is used here as if it
    # returns a pandas DataFrame -- confirm.
    df = read_data(args.data_path)

    columns_not_training = parse_column_list(args.columns_not_training)
    df = df.drop(columns=columns_not_training)

    columns_numerical = parse_column_list(args.columns_numerical)
    df = impute_missing_values(df, columns_numerical)

    if args.add_physics == "y":
        df = alloy_preprocessing(df)

    df.to_csv(args.preprocessed_data_path, sep=";", index=False)


if __name__ == "__main__":
    main()