""" Utils functions for preprocessing"""
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pickle
import tensorflow as tf
import shap


def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Helper function to aggregate the columns transformed with the original dataset
    """
    print(original_df.shape)
    print(transformed_df.shape)
    df_final = original_df.drop(columns=transformed_cols)
    df_final = df_final.merge(transformed_df, left_index=True, right_index=True)
    print(df_final.shape)
    return df_final


def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Returns the dataframe where the categorical columns have been replaced
    according to the method selected

    Right now only OneHot is supported
    """
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    array_transformed = encoder.transform(df[categorical_cols]).toarray()
    df_encoded = pd.DataFrame(array_transformed, columns=encoder.get_feature_names_out(), index=df.index)
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    else:
        return df_final
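

# Usage sketch (column names are hypothetical): fit once on the training frame,
# then reuse the returned encoder for validation / serving data so both get the
# exact same one-hot columns.
#
#     df_train_enc, encoder = encode_categorical(df_train, ["city", "gender"])
#     df_test_enc = encode_categorical(df_test, ["city", "gender"],
#                                      encoder=encoder, fit=False)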


def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Returns the dataframe where the numerical columns have been scaled
    according to the method selected

    Right now only MinMax is supported
    """
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    array_transformed = scaler.transform(df[numerical_cols])
    df_transformed = pd.DataFrame(array_transformed, columns=numerical_cols, index=df.index)
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    else:
        return df_final
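

# Same fit / reuse pattern as encode_categorical (hypothetical columns):
#
#     df_train_scaled, scaler = scale_numerical(df_train, ["age", "income"])
#     df_test_scaled = scale_numerical(df_test, ["age", "income"],
#                                      scaler=scaler, fit=False)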


def fill_nans(df, cols, method="mean"):
    """
    Return a copy of df where NaNs in cols are filled with the column
    mean or mode, according to the method selected.
    """
    df_filled = df.copy()
    print(f"Fill NaNs in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            df_filled[col] = df_filled[col].fillna(df[col].mean())
        elif method == "mode":
            # mode() returns a Series; use its first (most frequent) value
            df_filled[col] = df_filled[col].fillna(df[col].mode().iloc[0])
    return df_filled
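
# Worked example (tiny hypothetical frame): with method="mean" the missing value
# is replaced by the mean of the non-missing ones.
#
#     df = pd.DataFrame({"age": [20.0, None, 40.0]})
#     fill_nans(df, ["age"])["age"].tolist()  # -> [20.0, 30.0, 40.0]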

def encode_and_predict(model_path, data, one_hot_scaler, minmax_scaler_inputs,
                       minmax_scaler_targets, categorical_columns, numerical_columns,
                       target_columns, explainer=None):
    """
    Encode and scale the raw inputs with the fitted transformers, then run the model.
    minmax_scaler_targets and target_columns are accepted but not used here.
    """
    model = tf.keras.models.load_model(model_path)
    data = encode_categorical(data, categorical_columns, encoder=one_hot_scaler, fit=False)
    data = scale_numerical(data, numerical_columns, scaler=minmax_scaler_inputs, fit=False)
    if explainer:
        # Also return the column order and SHAP values for the last 10 rows
        return model.predict(data), data.columns, explainer.shap_values(data[-10:])
    else:
        return model.predict(data)
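
# Inference sketch (paths and column lists are hypothetical): the encoder and
# scalers are typically the ones returned by the fit calls above, persisted with
# pickle during training and reloaded with unpickle_file.
#
#     preds = encode_and_predict(
#         "artifacts/model.keras", df_raw,
#         one_hot_scaler=encoder, minmax_scaler_inputs=scaler,
#         minmax_scaler_targets=None, categorical_columns=["city"],
#         numerical_columns=["age", "income"], target_columns=["price"],
#     )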

def predict(model_path, data, explainer=None, df_train=None):
    """
    Run the model on already preprocessed data, optionally returning SHAP values.

    If df_train is given, a KernelExplainer is built from its first 10 rows;
    otherwise an explainer that was built beforehand can be passed in directly.
    """
    model = tf.keras.models.load_model(model_path)

    if df_train is not None:
        explainer = shap.KernelExplainer(model.predict, df_train[:10])
        return model.predict(data), data.columns, explainer.shap_values(data[-10:])

    if explainer:
        return model.predict(data), data.columns, explainer.shap_values(data[-10:])
    else:
        return model.predict(data)
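
# Usage sketch (paths and frames are hypothetical); data must already be encoded
# and scaled, e.g. by encode_categorical / scale_numerical above:
#
#     preds = predict("artifacts/model.keras", df_test)
#     preds, cols, shap_vals = predict("artifacts/model.keras", df_test,
#                                      df_train=df_train)
#
# The second call builds a shap.KernelExplainer from the first 10 training rows
# and returns SHAP values for the last 10 rows of df_test.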

def unpickle_file(path):
    """Load and return a pickled object (e.g. a fitted encoder or scaler) from path."""
    with open(path, "rb") as file:
        unpickler = pickle.Unpickler(file)
        unpickled_file = unpickler.load()
    return unpickled_file
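

if __name__ == "__main__":
    # Minimal self-contained sketch of the preprocessing pipeline on synthetic
    # data; real column names, models and pickled artifacts are project-specific.
    demo = pd.DataFrame(
        {
            "city": ["Paris", "Lyon", "Paris", "Lyon"],
            "age": [22.0, None, 35.0, 41.0],
            "income": [30_000.0, 42_000.0, None, 55_000.0],
        }
    )
    demo = fill_nans(demo, ["age", "income"], method="mean")
    demo, encoder = encode_categorical(demo, ["city"])
    demo, scaler = scale_numerical(demo, ["age", "income"])
    print(demo.head())
    # A fitted encoder / scaler pickled during training can later be reloaded
    # with unpickle_file(...) and passed to encode_and_predict or predict.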