snajmark committed on
Commit
78076bc
·
1 Parent(s): 9d60b86

Upload preprocessing_utils.py

Browse files
Files changed (1) hide show
  1. preprocessing_utils.py +66 -0
preprocessing_utils.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Utils functions for preprocessing"""
2
+ import pandas as pd
3
+ from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
4
+
5
+
6
def aggregate_transform_df(original_df, transformed_df, transformed_cols):
    """
    Helper function to aggregate the transformed columns with the original dataset.

    The columns listed in ``transformed_cols`` are dropped from ``original_df``
    and the columns of ``transformed_df`` are attached to what remains, matching
    rows on the index (inner join). The shapes of both inputs and of the result
    are printed.

    Args:
        original_df: Dataframe holding the untransformed columns.
        transformed_df: Dataframe holding the transformed columns.
        transformed_cols: Labels of the columns of ``original_df`` to drop.

    Returns:
        A new dataframe with the remaining original columns followed by the
        transformed ones. The inputs are not modified.
    """
    print(original_df.shape)
    print(transformed_df.shape)
    remaining_df = original_df.drop(columns=transformed_cols)
    row_index = remaining_df.index
    if not row_index.is_unique and row_index.equals(transformed_df.index):
        # An index merge pairs each row with every row carrying the same label,
        # so repeated labels would multiply rows. Both frames have identical
        # indexes here, so pair the rows by position and restore the labels.
        df_final = remaining_df.reset_index(drop=True).merge(
            transformed_df.reset_index(drop=True),
            left_index=True,
            right_index=True,
        )
        df_final.index = row_index
    else:
        df_final = remaining_df.merge(transformed_df, left_index=True, right_index=True)
    print(df_final.shape)
    return df_final
16
+
17
+
18
def encode_categorical(df, categorical_cols, method="OneHot", encoder=None, fit=True):
    """
    Returns the dataframe where the categorical columns have been replaced
    according to the method selected

    Right now only OneHot is supported; ``method`` is only used in the
    printed message.

    Args:
        df: Input dataframe.
        categorical_cols: Labels of the columns to encode.
        method: Name of the encoding, shown in the log message.
        encoder: Already fitted encoder, required when ``fit`` is False.
            Ignored when ``fit`` is True, where a new OneHotEncoder is created.
        fit: When True, fit a new encoder on ``df[categorical_cols]``.

    Returns:
        ``(encoded_df, encoder)`` when ``fit`` is True, otherwise ``encoded_df``.

    Raises:
        ValueError: If ``fit`` is False and no encoder is given.
    """
    if not fit and encoder is None:
        raise ValueError("An already fitted encoder must be provided when fit=False")
    print(f"Running {method} encoding")
    if fit:
        encoder = OneHotEncoder()
        encoder.fit(df[categorical_cols])
    encoded = encoder.transform(df[categorical_cols])
    # A caller-supplied encoder may be configured to return dense output,
    # which has no ``toarray``; only convert when the result is sparse.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    df_encoded = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(), index=df.index)
    df_final = aggregate_transform_df(df, df_encoded, categorical_cols)
    if fit:
        return df_final, encoder
    return df_final
36
+
37
+
38
def scale_numerical(df, numerical_cols, method="MinMax", scaler=None, fit=True):
    """
    Returns the dataframe where the numerical columns have been scaled
    according to the method selected

    Right now only MinMax is supported; ``method`` is only used in the
    printed message.

    Args:
        df: Input dataframe.
        numerical_cols: Labels of the columns to scale.
        method: Name of the scaling, shown in the log message.
        scaler: Already fitted scaler, required when ``fit`` is False.
            Ignored when ``fit`` is True, where a new MinMaxScaler is created.
        fit: When True, fit a new scaler on ``df[numerical_cols]``.

    Returns:
        ``(scaled_df, scaler)`` when ``fit`` is True, otherwise ``scaled_df``.

    Raises:
        ValueError: If ``fit`` is False and no scaler is given.
    """
    if not fit and scaler is None:
        raise ValueError("An already fitted scaler must be provided when fit=False")
    print(f"Running {method} scaling")
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(df[numerical_cols])
    scaled = scaler.transform(df[numerical_cols])
    df_transformed = pd.DataFrame(scaled, columns=numerical_cols, index=df.index)
    df_final = aggregate_transform_df(df, df_transformed, numerical_cols)
    if fit:
        return df_final, scaler
    return df_final
56
+
57
+
58
def fill_nans(df, cols, method="mean"):
    """
    Returns a copy of the dataframe where the missing values of the given
    columns have been filled according to the method selected

    Supported methods are "mean" and "mode". With "mode" the first mode of
    the column is used; a column with no non-missing value is left unchanged.

    Args:
        df: Input dataframe. It is not modified.
        cols: Labels of the columns to fill.
        method: Either "mean" or "mode".

    Returns:
        A new dataframe with the missing values of ``cols`` filled.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    if method not in ("mean", "mode"):
        raise ValueError(f"Unsupported fill method {method!r}, expected 'mean' or 'mode'")
    df_filled = df.copy()
    print(f"Fill nans in {cols} with the {method} method")
    for col in cols:
        if method == "mean":
            fill_value = df[col].mean()
        else:
            modes = df[col].mode()
            if modes.empty:
                # Nothing but missing values: there is no mode to fill with.
                continue
            # mode() returns a Series; passing it to fillna would align it on
            # the index and fill only the matching labels, so use a scalar.
            fill_value = modes.iloc[0]
        df_filled[col] = df_filled[col].fillna(fill_value)
    return df_filled