|
""" Utils functions for preprocessing""" |
|
import pandas as pd |
|
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler |
|
import pickle |
|
import tensorflow as tf |
|
import shap |
|
|
|
|
|
def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Swap the raw versions of some columns for their transformed versions.

    The columns listed in ``transformed_cols`` are dropped from
    ``original_df`` and the remaining columns are merged with
    ``transformed_df`` on the index. The shapes of both inputs and of the
    result are printed along the way.

    Args:
        original_df: Dataframe holding the untransformed columns.
        transformed_df: Dataframe holding the transformed replacement columns.
        transformed_cols: Names of the columns of ``original_df`` to drop.

    Returns:
        The merged dataframe.
    """
    for frame in (original_df, transformed_df):
        print(frame.shape)

    untouched = original_df.drop(columns=transformed_cols)
    # Index-on-index merge: rows are paired by index label, not by position.
    combined = untouched.merge(
        transformed_df,
        left_index=True,
        right_index=True,
    )

    print(combined.shape)
    return combined
|
|
|
|
|
def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Replace the categorical columns of ``df`` with their one-hot encoding.

    Only one-hot encoding is implemented: ``method`` appears in the progress
    message but does not select a different algorithm.

    Args:
        df: Input dataframe containing ``categorical_cols``.
        categorical_cols: Names of the columns to encode.
        method: Label printed in the progress message.
        encoder: Already fitted encoder, required when ``fit`` is False.
            Ignored when ``fit`` is True, because a new ``OneHotEncoder``
            is created in that case.
        fit: When True, fit a new encoder on ``df`` and return it as well.

    Returns:
        ``(encoded_df, encoder)`` when ``fit`` is True, otherwise only
        ``encoded_df``.

    Raises:
        ValueError: If ``fit`` is False and no ``encoder`` is supplied.
    """
    print(f"Running {method} encoding")

    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    elif encoder is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("A fitted encoder must be provided when fit=False")

    encoded = encoder.transform(df[categorical_cols])
    # A default OneHotEncoder yields a sparse matrix; an encoder supplied by
    # the caller may already return a dense array, which has no toarray().
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # Reuse the input index so the index-based merge lines the rows up.
    df_encoded = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)

    if fit:
        return df_final, encoder
    return df_final
|
|
|
|
|
def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Replace the numerical columns of ``df`` with their scaled values.

    Only min-max scaling is implemented: ``method`` appears in the progress
    message but does not select a different algorithm.

    Args:
        df: Input dataframe containing ``numerical_cols``.
        numerical_cols: Names of the columns to scale.
        method: Label printed in the progress message.
        scaler: Already fitted scaler, required when ``fit`` is False.
            Ignored when ``fit`` is True, because a new ``MinMaxScaler``
            is created in that case.
        fit: When True, fit a new scaler on ``df`` and return it as well.

    Returns:
        ``(scaled_df, scaler)`` when ``fit`` is True, otherwise only
        ``scaled_df``.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` is supplied.
    """
    print(f"Running {method} scaling")

    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    elif scaler is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("A fitted scaler must be provided when fit=False")

    array_transformed = scaler.transform(df[numerical_cols])

    # Reuse the input index so the index-based merge lines the rows up.
    df_transformed = pd.DataFrame(
        array_transformed,
        columns=numerical_cols,
        index=df.index,
    )
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)

    if fit:
        return df_final, scaler
    return df_final
|
|
|
|
|
def fill_nans(df, cols, method="mean"):
    """
    Return a copy of ``df`` with the missing values of ``cols`` filled in.

    Args:
        df: Input dataframe; it is not modified.
        cols: Names of the columns whose missing values are filled.
        method: ``"mean"`` fills with the column mean, ``"mode"`` fills with
            the first value returned by ``Series.mode()``. Any other value
            leaves the columns unchanged.

    Returns:
        A new dataframe with the requested columns filled.
    """
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")

    for col in cols:
        if method == "mean":
            fill_value = df[col].mean()
        elif method == "mode":
            modes = df[col].mode()
            if modes.empty:
                # Column holds only missing values: there is no mode to use.
                continue
            # mode() returns a Series. Passing it to fillna would align on
            # index labels and leave most NaNs in place, so use a scalar.
            fill_value = modes.iloc[0]
        else:
            # Unknown methods are a no-op, as before.
            continue
        df_filled[col] = df_filled[col].fillna(fill_value)

    return df_filled
|
|
|
def encode_and_predict(model_path, data, one_hot_scaler, minmax_scaler_inputs, minmax_scaler_targets, categorical_columns, numerical_columns, target_columns, explainer=None):
    """
    Preprocess ``data`` with already fitted transformers and run the model.

    The Keras model is loaded from ``model_path``. The categorical columns
    are then encoded with ``one_hot_scaler`` and the numerical columns
    scaled with ``minmax_scaler_inputs``, neither being refitted, before
    predicting.

    ``minmax_scaler_targets`` and ``target_columns`` are accepted but not
    used by this function.

    Args:
        model_path: Path passed to ``tf.keras.models.load_model``.
        data: Raw input dataframe.
        one_hot_scaler: Fitted encoder for ``categorical_columns``.
        minmax_scaler_inputs: Fitted scaler for ``numerical_columns``.
        minmax_scaler_targets: Unused.
        categorical_columns: Names of the categorical columns.
        numerical_columns: Names of the numerical columns.
        target_columns: Unused.
        explainer: Optional object exposing ``shap_values``.

    Returns:
        The model predictions when no (truthy) explainer is given; otherwise
        a tuple ``(predictions, processed_columns, shap_values)`` where the
        SHAP values are computed on the last 10 processed rows.
    """
    model = tf.keras.models.load_model(model_path)

    encoded = encode_categorical(
        data, categorical_columns, encoder=one_hot_scaler, fit=False
    )
    processed = scale_numerical(
        encoded, numerical_columns, scaler=minmax_scaler_inputs, fit=False
    )

    if not explainer:
        return model.predict(processed)

    predictions = model.predict(processed)
    feature_names = processed.columns
    # Explanations are restricted to the last 10 rows of the processed data.
    explanations = explainer.shap_values(processed[-10:])
    return predictions, feature_names, explanations
|
|
|
def predict(model_path, data, explainer=None, df_train=None):
    """
    Run the saved model on already preprocessed ``data``.

    Args:
        model_path: Path passed to ``tf.keras.models.load_model``.
        data: Model input; must expose ``columns`` when explanations are
            requested (presumably a dataframe -- confirm against callers).
        explainer: Optional object exposing ``shap_values``. Replaced by a
            new explainer when ``df_train`` is given.
        df_train: Optional training data. When provided, a
            ``shap.KernelExplainer`` is built from its first 10 rows.

    Returns:
        The model predictions when neither ``df_train`` nor a (truthy)
        ``explainer`` is given; otherwise a tuple
        ``(predictions, data.columns, shap_values)`` where the SHAP values
        are computed on the last 10 rows of ``data``.
    """
    model = tf.keras.models.load_model(model_path)

    if df_train is not None:
        # Training data takes precedence over any explainer passed in.
        explainer = shap.KernelExplainer(model.predict, df_train[:10])
    elif not explainer:
        return model.predict(data)

    predictions = model.predict(data)
    feature_names = data.columns
    explanations = explainer.shap_values(data[-10:])
    return predictions, feature_names, explanations
|
|
|
def unpickle_file(path):
    """
    Load and return the object pickled in the file at ``path``.

    Args:
        path: Location of the pickle file, opened in binary mode.

    Returns:
        The deserialized object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only use this
    # on files coming from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)
|
|