import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import NotFittedError
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils.validation import check_is_fitted

from src.preprocessing.data_processor import LoanDataProcessor


class LoanRecoveryModel:
    """
    Machine learning model for predicting loan recovery.

    Combines a ``LoanDataProcessor`` (feature preparation) with a
    scikit-learn ``RandomForestClassifier`` and offers helpers to train,
    predict, persist and visualise the model.
    """

    _NOT_TRAINED_MESSAGE = "Model has not been trained. Call train() first."

    def __init__(self, model_type='random_forest'):
        """
        Initialize the loan recovery model.

        Parameters:
        -----------
        model_type : str, optional
            Type of model to use, by default 'random_forest'
            Only 'random_forest' is supported; any other value is ignored
            (with a warning) and a Random Forest is used instead.
        """
        if model_type != 'random_forest':
            # The value used to be discarded silently; keep the fallback but
            # tell the caller that the request was not honoured.
            warnings.warn(
                "Unsupported model_type {!r}; falling back to "
                "'random_forest'.".format(model_type),
                stacklevel=2,
            )

        self.model_type = 'random_forest'  # Always use Random Forest
        self.processor = LoanDataProcessor()

        # Unfitted estimator; train() (or load_model()) makes it usable.
        self.model = RandomForestClassifier(random_state=42)

    @staticmethod
    def _default_processor_path(model_path):
        """
        Derive the preprocessor file path that accompanies ``model_path``.

        '_processor' is inserted in the file name only (never in directory
        names): before the last '.pkl' when the name contains one, otherwise
        before the file extension. The result therefore always differs from
        ``model_path``, so the preprocessor can never overwrite the model.

        Examples: 'model.pkl' -> 'model_processor.pkl',
        'model.pkl.gz' -> 'model_processor.pkl.gz',
        'model.joblib' -> 'model_processor.joblib',
        'model' -> 'model_processor'.
        """
        directory, filename = os.path.split(os.fspath(model_path))
        stem, marker, tail = filename.rpartition('.pkl')
        if marker:
            derived = stem + '_processor' + marker + tail
        else:
            root, ext = os.path.splitext(filename)
            derived = root + '_processor' + ext
        return os.path.join(directory, derived)

    def _require_trained(self):
        """
        Raise ValueError unless ``self.model`` is a fitted estimator.

        ``__init__`` always creates an (unfitted) estimator, so checking for
        ``None`` alone can never detect an untrained model.
        """
        if self.model is None:
            raise ValueError(self._NOT_TRAINED_MESSAGE)
        try:
            check_is_fitted(self.model)
        except NotFittedError as err:
            raise ValueError(self._NOT_TRAINED_MESSAGE) from err

    def train(self, data, target_column='recovery_status', test_size=0.2,
              tune_hyperparameters=False):
        """
        Train the model on the provided data.

        The data is split into stratified train/test subsets, the
        preprocessor is fitted on the training subset only, and the model is
        evaluated on the held-out subset.

        Parameters:
        -----------
        data : pandas.DataFrame
            The training data
        target_column : str, optional
            The name of the target column, by default 'recovery_status'
        test_size : float, optional
            Proportion of data to use for testing, by default 0.2
        tune_hyperparameters : bool, optional
            Whether to perform hyperparameter tuning, by default False

        Returns:
        --------
        dict
            Model performance metrics with the keys 'accuracy', 'roc_auc',
            'classification_report', 'confusion_matrix' and, when the model
            exposes importances, 'feature_importance'.
        """
        # Separate features from the target.
        X, y = self.processor.prepare_data(data, target_column)

        # Stratified split keeps the class balance in both subsets.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Fit the preprocessor on the training subset only, then apply the
        # same fitted transformation to the test subset.
        X_train_processed = self.processor.fit_transform(X_train)
        X_test_processed = self.processor.transform(X_test)

        if tune_hyperparameters:
            self._tune_hyperparameters(X_train_processed, y_train)

        self.model.fit(X_train_processed, y_train)

        # Evaluate on the held-out subset.
        y_pred = self.model.predict(X_test_processed)
        # Column 1 of predict_proba is used as the positive-class score;
        # NOTE(review): this assumes a binary target - confirm.
        y_prob = self.model.predict_proba(X_test_processed)[:, 1]

        metrics = {
            'accuracy': self.model.score(X_test_processed, y_test),
            'roc_auc': roc_auc_score(y_test, y_prob),
            'classification_report': classification_report(
                y_test, y_pred, output_dict=True
            ),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
        }

        if hasattr(self.model, 'feature_importances_'):
            feature_names = self.processor.get_feature_names()
            metrics['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_)
            )

        return metrics

    def predict(self, data):
        """
        Make predictions on new data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to make predictions on

        Returns:
        --------
        numpy.ndarray
            Array of predicted probabilities of recovery

        Raises:
        -------
        ValueError
            If the model has not been trained or loaded yet
        """
        self._require_trained()

        # prepare_data is unpacked as (X, y) when the target column is
        # present and used as a single value otherwise.
        # NOTE(review): this relies on LoanDataProcessor.prepare_data
        # returning different shapes in the two cases - confirm.
        if 'recovery_status' in data.columns:
            X, _ = self.processor.prepare_data(data)
        else:
            X = self.processor.prepare_data(data)

        X_processed = self.processor.transform(X)

        return self.model.predict_proba(X_processed)[:, 1]

    def save_model(self, model_path, processor_path=None):
        """
        Save the trained model and preprocessor to disk.

        Parameters:
        -----------
        model_path : str
            Path to save the model
        processor_path : str, optional
            Path to save the preprocessor, by default None
            If None, it is derived from model_path by inserting '_processor'
            into the file name (e.g. 'model.pkl' -> 'model_processor.pkl')

        Raises:
        -------
        ValueError
            If the model has not been trained or loaded yet
        """
        self._require_trained()

        joblib.dump(self.model, model_path)

        if processor_path is None:
            processor_path = self._default_processor_path(model_path)

        joblib.dump(self.processor, processor_path)

    @classmethod
    def load_model(cls, model_path, processor_path=None):
        """
        Load a trained model and preprocessor from disk.

        Parameters:
        -----------
        model_path : str
            Path to the saved model
        processor_path : str, optional
            Path to the saved preprocessor, by default None
            If None, it is derived from model_path by inserting '_processor'
            into the file name (e.g. 'model.pkl' -> 'model_processor.pkl')

        Returns:
        --------
        LoanRecoveryModel
            The loaded model
        """
        instance = cls()

        # SECURITY: joblib.load unpickles arbitrary objects; only load files
        # that come from a trusted source.
        instance.model = joblib.load(model_path)

        if processor_path is None:
            processor_path = cls._default_processor_path(model_path)

        instance.processor = joblib.load(processor_path)

        return instance

    def _tune_hyperparameters(self, X_train, y_train):
        """
        Perform hyperparameter tuning for Random Forest model.

        Runs an exhaustive 5-fold grid search scored by ROC AUC over
        3 * 4 * 3 * 3 = 108 parameter combinations (540 fits) and replaces
        ``self.model`` with the best estimator found.

        Parameters:
        -----------
        X_train : numpy.ndarray
            The processed training features
        y_train : numpy.ndarray
            The training target values
        """
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }

        grid_search = GridSearchCV(
            self.model,
            param_grid,
            cv=5,
            scoring='roc_auc',
            n_jobs=-1,  # use all available cores
        )

        grid_search.fit(X_train, y_train)

        self.model = grid_search.best_estimator_

    def plot_feature_importance(self, top_n=10):
        """
        Plot feature importance for the trained model.

        Parameters:
        -----------
        top_n : int, optional
            Number of top features to display, by default 10

        Returns:
        --------
        matplotlib.figure.Figure
            The feature importance plot

        Raises:
        -------
        ValueError
            If the model has not been trained, or does not expose
            feature importances
        """
        self._require_trained()

        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model does not have feature importances.")

        feature_names = self.processor.get_feature_names()
        importances = self.model.feature_importances_

        # Indices of the most important features, in descending order.
        indices = np.argsort(importances)[::-1][:top_n]
        positions = range(len(indices))

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(positions, importances[indices], align='center')
        ax.set_yticks(positions)
        ax.set_yticklabels([feature_names[i] for i in indices])
        ax.set_xlabel('Feature Importance')
        # Report the number of bars actually drawn, which is smaller than
        # top_n when the model has fewer features.
        ax.set_title('Top {} Feature Importances'.format(len(indices)))

        # Lay out the figure created here, not whatever figure happens to be
        # pyplot's "current" one.
        fig.tight_layout()
        return fig

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix for model predictions.

        Parameters:
        -----------
        y_true : array-like
            True labels
        y_pred : array-like
            Predicted labels

        Returns:
        --------
        matplotlib.figure.Figure
            The confusion matrix plot
        """
        cm = confusion_matrix(y_true, y_pred)

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        # Two fixed tick labels: NOTE(review) this assumes a binary problem
        # whose classes sort as (not recovered, recovered) - confirm.
        ax.set_xticklabels(['Not Recovered', 'Recovered'])
        ax.set_yticklabels(['Not Recovered', 'Recovered'])

        fig.tight_layout()
        return fig