import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

from src.preprocessing.data_processor import LoanDataProcessor

class LoanRecoveryModel:
    """
    Machine learning model for predicting loan recovery.
    """
    def __init__(self, model_type='random_forest'):
        """
        Initialize the loan recovery model.

        Parameters:
        -----------
        model_type : str, optional
            Type of model to use, by default 'random_forest'.
            Only 'random_forest' is supported.
        """
        self.model_type = 'random_forest'  # Always use Random Forest
        self.processor = LoanDataProcessor()

        # Initialize the Random Forest model
        self.model = RandomForestClassifier(random_state=42)
    def train(self, data, target_column='recovery_status', test_size=0.2, tune_hyperparameters=False):
        """
        Train the model on the provided data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The training data
        target_column : str, optional
            The name of the target column, by default 'recovery_status'
        test_size : float, optional
            Proportion of data to use for testing, by default 0.2
        tune_hyperparameters : bool, optional
            Whether to perform hyperparameter tuning, by default False

        Returns:
        --------
        dict
            Dictionary containing model performance metrics
        """
        # Prepare data
        X, y = self.processor.prepare_data(data, target_column)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Preprocess the data
        X_train_processed = self.processor.fit_transform(X_train)
        X_test_processed = self.processor.transform(X_test)

        # Tune hyperparameters if requested
        if tune_hyperparameters:
            self._tune_hyperparameters(X_train_processed, y_train)

        # Train the model
        self.model.fit(X_train_processed, y_train)

        # Evaluate the model on the held-out test set
        y_pred = self.model.predict(X_test_processed)
        y_prob = self.model.predict_proba(X_test_processed)[:, 1]

        # Calculate metrics
        metrics = {
            'accuracy': self.model.score(X_test_processed, y_test),
            'roc_auc': roc_auc_score(y_test, y_prob),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Feature importance
        if hasattr(self.model, 'feature_importances_'):
            feature_names = self.processor.get_feature_names()
            metrics['feature_importance'] = dict(zip(feature_names, self.model.feature_importances_))

        return metrics
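
    # A minimal training sketch (assumes `df` is a DataFrame whose schema
    # matches what LoanDataProcessor expects, including a binary
    # 'recovery_status' target column):
    #
    #     model = LoanRecoveryModel()
    #     metrics = model.train(df, tune_hyperparameters=False)
    #     print(f"ROC AUC: {metrics['roc_auc']:.3f}")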
    def predict(self, data):
        """
        Make predictions on new data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to make predictions on

        Returns:
        --------
        numpy.ndarray
            Array of predicted probabilities of recovery
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Prepare data; prepare_data returns (X, y) when the target column is
        # present and X alone when it is absent
        if 'recovery_status' in data.columns:
            X, _ = self.processor.prepare_data(data)
        else:
            X = self.processor.prepare_data(data)

        # Preprocess the data
        X_processed = self.processor.transform(X)

        # Return the predicted probability of the positive (recovered) class
        return self.model.predict_proba(X_processed)[:, 1]
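
    # Scoring sketch for new loans (assumes `new_df` carries the same feature
    # columns the model was trained on; 'recovery_status' may be absent):
    #
    #     probs = model.predict(new_df)
    #     new_df['recovery_probability'] = probs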
    def save_model(self, model_path, processor_path=None):
        """
        Save the trained model and preprocessor to disk.

        Parameters:
        -----------
        model_path : str
            Path to save the model
        processor_path : str, optional
            Path to save the preprocessor, by default None.
            If None, will use model_path with '_processor' appended
            (assumes model_path ends in '.pkl').
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Save the model
        joblib.dump(self.model, model_path)

        # Save the preprocessor
        if processor_path is None:
            processor_path = model_path.replace('.pkl', '_processor.pkl')
        joblib.dump(self.processor, processor_path)
    @classmethod
    def load_model(cls, model_path, processor_path=None):
        """
        Load a trained model and preprocessor from disk.

        Parameters:
        -----------
        model_path : str
            Path to the saved model
        processor_path : str, optional
            Path to the saved preprocessor, by default None.
            If None, will use model_path with '_processor' appended

        Returns:
        --------
        LoanRecoveryModel
            The loaded model
        """
        # Create a new instance
        instance = cls()

        # Load the model
        instance.model = joblib.load(model_path)

        # Load the preprocessor
        if processor_path is None:
            processor_path = model_path.replace('.pkl', '_processor.pkl')
        instance.processor = joblib.load(processor_path)

        return instance
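
    # Persistence round trip (sketch; 'models/loan_model.pkl' is a hypothetical
    # path — the processor then lands at 'models/loan_model_processor.pkl'):
    #
    #     model.save_model('models/loan_model.pkl')
    #     restored = LoanRecoveryModel.load_model('models/loan_model.pkl')
    #     probs = restored.predict(new_df)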
    def _tune_hyperparameters(self, X_train, y_train):
        """
        Perform hyperparameter tuning for the Random Forest model.

        Parameters:
        -----------
        X_train : numpy.ndarray
            The processed training features
        y_train : numpy.ndarray
            The training target values
        """
        # Random Forest hyperparameters
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        # Create grid search
        grid_search = GridSearchCV(
            self.model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1
        )

        # Fit grid search
        grid_search.fit(X_train, y_train)

        # Update model with best parameters
        self.model = grid_search.best_estimator_
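
    # Note on cost: the grid above is exhaustive — 3 * 4 * 3 * 3 = 108
    # parameter combinations, each fit 5 times under cv=5, i.e. 540 model
    # fits. For large datasets, sklearn's RandomizedSearchCV is a cheaper
    # drop-in alternative.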
    def plot_feature_importance(self, top_n=10):
        """
        Plot feature importance for the trained model.

        Parameters:
        -----------
        top_n : int, optional
            Number of top features to display, by default 10

        Returns:
        --------
        matplotlib.figure.Figure
            The feature importance plot
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")
        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model does not have feature importances.")

        # Get feature names and importances
        feature_names = self.processor.get_feature_names()
        importances = self.model.feature_importances_

        # Sort by importance (descending)
        indices = np.argsort(importances)[::-1]

        # Take top N features
        indices = indices[:top_n]

        # Create plot
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(range(len(indices)), importances[indices], align='center')
        ax.set_yticks(range(len(indices)))
        ax.set_yticklabels([feature_names[i] for i in indices])
        ax.set_xlabel('Feature Importance')
        ax.set_title('Top {} Feature Importances'.format(top_n))
        plt.tight_layout()
        return fig
    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix for model predictions.

        Parameters:
        -----------
        y_true : array-like
            True labels
        y_pred : array-like
            Predicted labels

        Returns:
        --------
        matplotlib.figure.Figure
            The confusion matrix plot
        """
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred)

        # Create plot
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        ax.set_xticklabels(['Not Recovered', 'Recovered'])
        ax.set_yticklabels(['Not Recovered', 'Recovered'])
        plt.tight_layout()
        return fig
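

if __name__ == "__main__":
    # Minimal end-to-end sketch. 'data/loans.csv' is a hypothetical path; the
    # file is assumed to contain the features LoanDataProcessor expects plus a
    # binary 'recovery_status' target column.
    df = pd.read_csv('data/loans.csv')

    model = LoanRecoveryModel()
    metrics = model.train(df, tune_hyperparameters=False)
    print(f"Accuracy: {metrics['accuracy']:.3f}  ROC AUC: {metrics['roc_auc']:.3f}")

    # Inspect the most informative features and persist the fitted artifacts
    fig = model.plot_feature_importance(top_n=10)
    fig.savefig('feature_importance.png')
    model.save_model('loan_model.pkl')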