Spaces:
Runtime error
Runtime error
""" Utils functions for preprocessing""" | |
import pandas as pd | |
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler | |
def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Replace the raw columns of a dataframe with their transformed version.

    The columns listed in ``transformed_cols`` are dropped from
    ``original_df`` and the columns of ``transformed_df`` are appended to the
    remaining ones, matching rows on the index. The shapes of both inputs and
    of the result are printed.

    Args:
        original_df: dataframe holding the untransformed columns.
        transformed_df: dataframe holding the transformed columns.
        transformed_cols: labels of the columns to drop from ``original_df``.

    Returns:
        A new dataframe with the remaining original columns followed by the
        transformed ones.
    """
    print(original_df.shape)
    print(transformed_df.shape)
    remaining = original_df.drop(columns=transformed_cols)

    aligned_with_duplicates = (
        not remaining.index.is_unique
        and remaining.index.equals(transformed_df.index)
    )
    if aligned_with_duplicates:
        # An index-on-index merge pairs every row with all rows sharing its
        # label, so repeated labels would multiply the rows. Both frames hold
        # the same labels in the same order here, so join them by position
        # and put the original index back afterwards.
        left = remaining.reset_index(drop=True)
        right = transformed_df.reset_index(drop=True)
        df_final = left.merge(right, left_index=True, right_index=True)
        df_final.index = remaining.index
    else:
        df_final = remaining.merge(
            transformed_df, left_index=True, right_index=True
        )

    print(df_final.shape)
    return df_final
def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Replace the categorical columns of a dataframe with their encoded version.

    Right now only one-hot encoding is implemented: ``method`` is only used
    in the printed message and does not change the encoding applied.

    Args:
        df: dataframe containing the columns to encode.
        categorical_cols: labels of the columns to encode.
        method: name of the encoding, used for the printed message only.
        encoder: already fitted encoder, used when ``fit`` is False. It is
            ignored when ``fit`` is True.
        fit: when True a new ``OneHotEncoder`` is created and fitted on
            ``df``; when False the given ``encoder`` is applied as is.

    Returns:
        ``(encoded_df, encoder)`` when ``fit`` is True, otherwise only
        ``encoded_df``.

    Raises:
        ValueError: if ``fit`` is False and no ``encoder`` is given.
    """
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    elif encoder is None:
        raise ValueError("A fitted encoder must be provided when fit=False")

    array_transformed = encoder.transform(df[categorical_cols])
    # A supplied encoder may be configured to return a dense array already;
    # only sparse results expose (and need) ``toarray``.
    if hasattr(array_transformed, "toarray"):
        array_transformed = array_transformed.toarray()

    df_encoded = pd.DataFrame(
        array_transformed,
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    return df_final
def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Replace the numerical columns of a dataframe with their scaled version.

    Right now only min-max scaling is implemented: ``method`` is only used in
    the printed message and does not change the scaling applied.

    Args:
        df: dataframe containing the columns to scale.
        numerical_cols: labels of the columns to scale.
        method: name of the scaling, used for the printed message only.
        scaler: already fitted scaler, used when ``fit`` is False. It is
            ignored when ``fit`` is True.
        fit: when True a new ``MinMaxScaler`` is created and fitted on
            ``df``; when False the given ``scaler`` is applied as is.

    Returns:
        ``(scaled_df, scaler)`` when ``fit`` is True, otherwise only
        ``scaled_df``.

    Raises:
        ValueError: if ``fit`` is False and no ``scaler`` is given.
    """
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    elif scaler is None:
        raise ValueError("A fitted scaler must be provided when fit=False")

    array_transformed = scaler.transform(df[numerical_cols])
    df_transformed = pd.DataFrame(
        array_transformed, columns=numerical_cols, index=df.index
    )
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    return df_final
def fill_nans(df, cols, method="mean"):
    """
    Return a copy of the dataframe with the missing values of ``cols`` filled.

    Args:
        df: input dataframe; it is not modified.
        cols: labels of the columns to fill.
        method: ``"mean"`` fills with the column mean, ``"mode"`` fills with
            the first value returned by ``Series.mode``. Any other value
            leaves the columns untouched.

    Returns:
        A copy of ``df`` where the requested columns have been filled.
    """
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            df_filled[col] = df_filled[col].fillna(df[col].mean())
        elif method == "mode":
            modes = df[col].mode()
            # ``mode`` returns a Series: passing it to ``fillna`` directly
            # would align it on the index labels instead of using it as a
            # fill value, so take a single scalar out of it. A column with
            # no mode (e.g. all values missing) is left as is.
            if not modes.empty:
                df_filled[col] = df_filled[col].fillna(modes.iloc[0])
    return df_filled