Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
class LoanDataProcessor:
    """
    Preprocess loan data for machine learning models.

    Numerical columns are median-imputed and standardized; categorical
    columns are imputed with their most frequent value and one-hot encoded.
    The column lists are fixed in ``__init__``.
    """

    def __init__(self):
        """Initialize an unfitted data processor."""
        # Remains None until fit() succeeds; transform() and
        # get_feature_names() use it as the "has been fitted" flag.
        self.preprocessor = None
        self.categorical_features = ['gender', 'employment_status', 'payment_history']
        self.numerical_features = ['age', 'annual_income', 'credit_score', 'loan_amount',
                                   'interest_rate', 'loan_term', 'days_past_due',
                                   'previous_defaults', 'monthly_payment', 'debt_to_income']

    def fit(self, X):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data. It must contain every column listed in
            ``numerical_features`` and ``categorical_features``.

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor

        Notes:
        ------
        If fitting raises, ``self.preprocessor`` is left unchanged, so a
        failed first fit still reports "not fitted" afterwards.
        """
        # Numerical features: fill gaps with the median, then standardize.
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Categorical features: fill gaps with the mode, then one-hot encode.
        # Categories unseen during fit are encoded as all zeros.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # sparse_threshold=0 forces a dense result, matching the
        # numpy.ndarray return type documented on transform(); the default
        # (0.3) can yield a SciPy sparse matrix for high-cardinality data.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            sparse_threshold=0)

        # Fit a local object first and publish it only on success, so an
        # exception here never leaves an unfitted transformer on the instance.
        preprocessor.fit(X)
        self.preprocessor = preprocessor
        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray
            The transformed data (numerical columns first, then the
            one-hot encoded categorical columns)

        Raises:
        -------
        ValueError
            If fit() has not been called successfully.
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")
        return self.preprocessor.transform(X)

    def fit_transform(self, X):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        return self.fit(X).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            List of feature names after transformation, in output column
            order: the numerical feature names unchanged, followed by the
            one-hot expanded categorical names.

        Raises:
        -------
        ValueError
            If fit() has not been called successfully.
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        # Look the fitted pipeline up by its name rather than by position so
        # this keeps working if the transformer order in fit() ever changes.
        onehot = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
        encoded_names = onehot.get_feature_names_out(self.categorical_features)

        # Numerical names stay the same; categorical names are expanded.
        feature_names = list(self.numerical_features)
        feature_names.extend(encoded_names)
        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare. It is not modified; dropped columns are
            removed from a copy.
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple or pandas.DataFrame
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it's an identifier, not a feature.
        if 'customer_id' in data.columns:
            data = data.drop('customer_id', axis=1)

        if target_column in data.columns:
            X = data.drop(target_column, axis=1)
            y = data[target_column]
            return X, y
        return data