snajmark's picture
Upload preprocessing_utils.py
78076bc
""" Utils functions for preprocessing"""
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Helper function to swap columns of the original dataset for their transformed version.

    The columns listed in ``transformed_cols`` are dropped from ``original_df`` and the
    columns of ``transformed_df`` are attached to what remains, matching rows on the index.
    The shapes of both inputs and of the result are printed.

    Args:
        original_df: DataFrame holding the untransformed data.
        transformed_df: DataFrame holding the transformed columns.
        transformed_cols: Labels of the columns of ``original_df`` to drop.

    Returns:
        A new DataFrame made of the remaining original columns followed by the
        transformed ones. Only index labels present in both frames are kept.
    """
    print(original_df.shape)
    print(transformed_df.shape)
    remaining = original_df.drop(columns=transformed_cols)
    if remaining.index.has_duplicates and remaining.index.equals(transformed_df.index):
        # An index-on-index merge pairs every row with *every* row sharing its label,
        # so repeated labels would multiply rows. Both frames carry the same index in
        # the same order here, so pair the rows by position and restore the index.
        df_final = remaining.reset_index(drop=True).merge(
            transformed_df.reset_index(drop=True), left_index=True, right_index=True
        )
        df_final.index = remaining.index
    else:
        df_final = remaining.merge(transformed_df, left_index=True, right_index=True)
    print(df_final.shape)
    return df_final
def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Returns the dataframe where the categorical columns have been replaced
    by their encoded version.

    Right now only OneHot is supported: ``method`` is only used in the printed
    message and a ``OneHotEncoder`` is always used when fitting.

    Args:
        df: Input DataFrame.
        categorical_cols: Labels of the columns to encode.
        method: Name of the encoding, shown in the printed message.
        encoder: Already fitted encoder, required when ``fit`` is False.
            It is ignored when ``fit`` is True, since a new encoder is created.
        fit: When True, a new ``OneHotEncoder`` is fitted on ``df`` before
            transforming. When False, the provided ``encoder`` is used as is.

    Returns:
        ``(encoded_df, encoder)`` when ``fit`` is True, ``encoded_df`` alone otherwise.

    Raises:
        ValueError: If ``fit`` is False and no ``encoder`` is provided.
    """
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    elif encoder is None:
        # Fail early with an explicit message rather than on ``None.transform``.
        raise ValueError("An already fitted encoder must be provided when fit=False")
    encoded = encoder.transform(df[categorical_cols])
    # A caller-supplied encoder may be configured for dense output, in which case
    # the result is already an array and has no ``toarray`` method.
    array_transformed = encoded.toarray() if hasattr(encoded, "toarray") else encoded
    df_encoded = pd.DataFrame(
        array_transformed,
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    return df_final
def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Returns the dataframe where the numerical columns have been replaced
    by their scaled version.

    Right now only MinMax is supported: ``method`` is only used in the printed
    message and a ``MinMaxScaler`` is always used when fitting.

    Args:
        df: Input DataFrame.
        numerical_cols: Labels of the columns to scale.
        method: Name of the scaling, shown in the printed message.
        scaler: Already fitted scaler, required when ``fit`` is False.
            It is ignored when ``fit`` is True, since a new scaler is created.
        fit: When True, a new ``MinMaxScaler`` is fitted on ``df`` before
            transforming. When False, the provided ``scaler`` is used as is.

    Returns:
        ``(scaled_df, scaler)`` when ``fit`` is True, ``scaled_df`` alone otherwise.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` is provided.
    """
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    elif scaler is None:
        # Fail early with an explicit message rather than on ``None.transform``.
        raise ValueError("An already fitted scaler must be provided when fit=False")
    array_transformed = scaler.transform(df[numerical_cols])
    df_transformed = pd.DataFrame(
        array_transformed,
        columns=numerical_cols,
        index=df.index,
    )
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    return df_final
def fill_nans(df, cols, method="mean"):
    """
    Returns a copy of the dataframe where the NaNs of the given columns are filled.

    Args:
        df: Input DataFrame; it is left untouched.
        cols: Labels of the columns to fill.
        method: "mean" fills with the column mean, "mode" fills with the most
            frequent value (the first one returned by ``Series.mode`` when
            several values tie). Any other value leaves the columns unchanged.

    Returns:
        A copy of ``df`` with the NaNs of the selected columns filled.
    """
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            df_filled[col] = df_filled[col].fillna(df[col].mean())
        elif method == "mode":
            modes = df[col].mode()
            # ``mode()`` returns a Series; passing it to ``fillna`` would align on
            # index labels and only fill the rows labelled 0, 1, ... Use the scalar
            # first mode instead. A column with no mode (all NaN) is left as is.
            if not modes.empty:
                df_filled[col] = df_filled[col].fillna(modes.iloc[0])
    return df_filled