Spaces:

Nikhillmahesh701
/

Loan_Recovery

Sleeping

App Files Files Community

Loan_Recovery / src /preprocessing /data_processor.py

Nikhillmahesh701

Upload 13 files

9d99cff verified about 2 months ago

raw

history blame contribute delete

4.6 kB

	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.impute import SimpleImputer

	class LoanDataProcessor:
	    """
	    Class for preprocessing loan data for machine learning models.

	    Numerical columns are median-imputed and standardized; categorical
	    columns are imputed with their most frequent value and one-hot encoded.
	    Both steps are combined in a single scikit-learn ``ColumnTransformer``
	    stored on ``self.preprocessor`` once ``fit`` has succeeded.
	    """

	    # Default column schema, used when the caller does not supply one.
	    DEFAULT_CATEGORICAL_FEATURES = (
	        'gender', 'employment_status', 'payment_history',
	    )
	    DEFAULT_NUMERICAL_FEATURES = (
	        'age', 'annual_income', 'credit_score', 'loan_amount',
	        'interest_rate', 'loan_term', 'days_past_due',
	        'previous_defaults', 'monthly_payment', 'debt_to_income',
	    )

	    def __init__(self, categorical_features=None, numerical_features=None):
	        """
	        Initialize the data processor.

	        Parameters:
	        -----------
	        categorical_features : iterable of str, optional
	            Columns to one-hot encode. Defaults to
	            ``DEFAULT_CATEGORICAL_FEATURES``.
	        numerical_features : iterable of str, optional
	            Columns to impute and scale. Defaults to
	            ``DEFAULT_NUMERICAL_FEATURES``.
	        """
	        # Stays None until fit() completes successfully.
	        self.preprocessor = None
	        # Copy into fresh lists so instances never share (or alias) state.
	        self.categorical_features = list(
	            self.DEFAULT_CATEGORICAL_FEATURES
	            if categorical_features is None else categorical_features
	        )
	        self.numerical_features = list(
	            self.DEFAULT_NUMERICAL_FEATURES
	            if numerical_features is None else numerical_features
	        )

	    def _check_columns(self, X):
	        """Raise ValueError if X is a DataFrame lacking a configured column."""
	        columns = getattr(X, 'columns', None)
	        if columns is None:
	            # Not DataFrame-like: let scikit-learn report the problem itself.
	            return
	        expected = self.numerical_features + self.categorical_features
	        missing = [name for name in expected if name not in columns]
	        if missing:
	            raise ValueError(
	                f"Input data is missing required columns: {missing}"
	            )

	    def _require_fitted(self):
	        """Raise ValueError unless fit() has completed successfully."""
	        if self.preprocessor is None:
	            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

	    def fit(self, X: pd.DataFrame):
	        """
	        Fit the preprocessor on the training data.

	        Parameters:
	        -----------
	        X : pandas.DataFrame
	            The training data; must contain every configured feature column.

	        Returns:
	        --------
	        self : LoanDataProcessor
	            The fitted processor

	        Raises:
	        -------
	        ValueError
	            If a configured feature column is absent from ``X``.
	        """
	        self._check_columns(X)

	        # Define preprocessing for numerical features
	        numerical_transformer = Pipeline(steps=[
	            ('imputer', SimpleImputer(strategy='median')),
	            ('scaler', StandardScaler()),
	        ])

	        # Define preprocessing for categorical features
	        categorical_transformer = Pipeline(steps=[
	            ('imputer', SimpleImputer(strategy='most_frequent')),
	            ('onehot', OneHotEncoder(handle_unknown='ignore')),
	        ])

	        # Combine preprocessing steps. sparse_threshold=0 forces a dense
	        # result, so transform() returns an ndarray as documented even when
	        # the one-hot block is wide enough to drop the overall density
	        # below ColumnTransformer's default threshold.
	        preprocessor = ColumnTransformer(
	            transformers=[
	                ('num', numerical_transformer, self.numerical_features),
	                ('cat', categorical_transformer, self.categorical_features),
	            ],
	            sparse_threshold=0,
	        )

	        # Publish the transformer only after fitting succeeds; otherwise a
	        # failed fit would leave an unfitted object that slips past the
	        # "Call fit() first" guard in transform()/get_feature_names().
	        preprocessor.fit(X)
	        self.preprocessor = preprocessor

	        return self

	    def transform(self, X: pd.DataFrame):
	        """
	        Transform the data using the fitted preprocessor.

	        Parameters:
	        -----------
	        X : pandas.DataFrame
	            The data to transform

	        Returns:
	        --------
	        numpy.ndarray
	            The transformed data (dense)

	        Raises:
	        -------
	        ValueError
	            If ``fit`` has not been called, or a configured feature column
	            is absent from ``X``.
	        """
	        self._require_fitted()
	        self._check_columns(X)

	        return self.preprocessor.transform(X)

	    def fit_transform(self, X: pd.DataFrame):
	        """
	        Fit the preprocessor and transform the data.

	        Parameters:
	        -----------
	        X : pandas.DataFrame
	            The data to fit and transform

	        Returns:
	        --------
	        numpy.ndarray
	            The transformed data
	        """
	        return self.fit(X).transform(X)

	    def get_feature_names(self):
	        """
	        Get the names of the transformed features.

	        Returns:
	        --------
	        list
	            Numerical feature names (unchanged by scaling) followed by the
	            one-hot expanded categorical feature names.

	        Raises:
	        -------
	        ValueError
	            If ``fit`` has not been called.
	        """
	        self._require_fitted()

	        # Numerical feature names stay the same after imputation/scaling.
	        feature_names = list(self.numerical_features)

	        # With no categorical columns configured there is no fitted encoder
	        # to query, so only the numerical names are returned.
	        if self.categorical_features:
	            # Look the pipeline up by name rather than by position so the
	            # lookup does not depend on the order of the transformer list.
	            categorical_pipeline = self.preprocessor.named_transformers_['cat']
	            encoder = categorical_pipeline.named_steps['onehot']
	            feature_names.extend(
	                encoder.get_feature_names_out(self.categorical_features)
	            )

	        return feature_names

	    def prepare_data(self, data: pd.DataFrame, target_column='recovery_status'):
	        """
	        Prepare data for model training or prediction.

	        The input frame is not modified; ``drop`` returns new frames.

	        Parameters:
	        -----------
	        data : pandas.DataFrame
	            The data to prepare
	        target_column : str, optional
	            The name of the target column, by default 'recovery_status'

	        Returns:
	        --------
	        tuple or pandas.DataFrame
	            (X, y) if target_column is in data, otherwise just X
	        """
	        # Drop customer_id as it's an identifier, not a feature
	        if 'customer_id' in data.columns:
	            data = data.drop('customer_id', axis=1)

	        # Prediction-time data has no target: return the features alone.
	        if target_column not in data.columns:
	            return data

	        X = data.drop(target_column, axis=1)
	        y = data[target_column]
	        return X, y