Spaces:
Runtime error
Runtime error
Upload preprocessing_utils.py
Browse files- preprocessing_utils.py +66 -0
preprocessing_utils.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" Utils functions for preprocessing"""
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
|
4 |
+
|
5 |
+
|
6 |
+
def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Replace the transformed columns of the original dataset with their new version.

    The columns listed in ``transformed_cols`` are dropped from ``original_df``
    and the columns of ``transformed_df`` are appended after the remaining
    ones. The shapes of both inputs and of the result are printed.

    Args:
        original_df: DataFrame holding the source columns.
        transformed_df: DataFrame holding the transformed columns.
        transformed_cols: Labels of the columns of ``original_df`` to drop.

    Returns:
        A new DataFrame; neither input is modified.

    Raises:
        KeyError: If a label in ``transformed_cols`` is not a column of
            ``original_df`` (raised by ``DataFrame.drop``).
    """
    print(original_df.shape)
    print(transformed_df.shape)
    df_final = original_df.drop(columns=transformed_cols)
    if df_final.index.equals(transformed_df.index):
        # Identical indexes: pair the rows by position. A label-based merge
        # would multiply rows whenever the index holds duplicate labels.
        row_index = df_final.index
        df_final = df_final.reset_index(drop=True).merge(
            transformed_df.reset_index(drop=True),
            left_index=True,
            right_index=True,
        )
        df_final.index = row_index
    else:
        # Indexes differ: keep the label-based inner join on the index.
        df_final = df_final.merge(transformed_df, left_index=True, right_index=True)
    print(df_final.shape)
    return df_final
|
16 |
+
|
17 |
+
|
18 |
+
def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Returns the dataframe where the categorical columns have been replaced
    by their one-hot encoded columns.

    Right now only OneHot is supported: ``method`` is only used in the
    printed message and does not change the encoding applied.

    Args:
        df: DataFrame containing the columns to encode.
        categorical_cols: Labels of the columns to encode.
        method: Name of the encoding, used in the printed message only.
        encoder: Already fitted encoder, required when ``fit`` is False.
            It is ignored (a new ``OneHotEncoder`` is created) when ``fit``
            is True.
        fit: Whether to create and fit a new encoder on ``df``.

    Returns:
        ``(encoded_df, encoder)`` when ``fit`` is True, otherwise only
        ``encoded_df``.

    Raises:
        ValueError: If ``fit`` is False and no ``encoder`` is given.
    """
    if not fit and encoder is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError("An already fitted encoder is required when fit=False")
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    array_transformed = encoder.transform(df[categorical_cols])
    # A caller-supplied encoder may already return a dense array, which has
    # no toarray(); only densify when the result is sparse.
    if hasattr(array_transformed, "toarray"):
        array_transformed = array_transformed.toarray()
    df_encoded = pd.DataFrame(array_transformed, columns=encoder.get_feature_names_out(), index=df.index)
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    return df_final
|
36 |
+
|
37 |
+
|
38 |
+
def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Returns the dataframe where the numerical columns have been replaced
    by their scaled version.

    Right now only MinMax is supported: ``method`` is only used in the
    printed message and does not change the scaling applied.

    Args:
        df: DataFrame containing the columns to scale.
        numerical_cols: Labels of the columns to scale.
        method: Name of the scaling, used in the printed message only.
        scaler: Already fitted scaler, required when ``fit`` is False.
            It is ignored (a new ``MinMaxScaler`` is created) when ``fit``
            is True.
        fit: Whether to create and fit a new scaler on ``df``.

    Returns:
        ``(scaled_df, scaler)`` when ``fit`` is True, otherwise only
        ``scaled_df``.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` is given.
    """
    if not fit and scaler is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError("An already fitted scaler is required when fit=False")
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    array_transformed = scaler.transform(df[numerical_cols])
    # The scaled columns keep their original names and row labels.
    df_transformed = pd.DataFrame(array_transformed, columns=numerical_cols, index=df.index)
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    return df_final
|
56 |
+
|
57 |
+
|
58 |
+
def fill_nans(df, cols, method="mean"):
    """
    Returns a copy of the dataframe where the missing values of the given
    columns have been filled.

    Args:
        df: DataFrame to fill; it is not modified.
        cols: Labels of the columns whose missing values are filled.
        method: ``"mean"`` fills with the column mean, ``"mode"`` fills with
            the first value returned by ``Series.mode``. Any other value
            leaves the columns untouched.

    Returns:
        The filled copy of ``df``.
    """
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            df_filled[col] = df_filled[col].fillna(df[col].mean())
        elif method == "mode":
            # mode() returns a Series. Passing it to fillna would align it on
            # the index and fill only the rows whose label matches, so a
            # scalar is extracted instead.
            modes = df[col].mode()
            # An all-missing column has no mode: leave it as it is.
            if not modes.empty:
                df_filled[col] = df_filled[col].fillna(modes.iloc[0])
    return df_filled
|