Nikhillmahesh701 committed (verified)
Commit 9d99cff · Parent(s): fa19a7e

Upload 13 files
src/__pycache__/create_comprehensive_image.cpython-311.pyc ADDED
Binary file (3.38 kB)

src/__pycache__/detailed_model_comparison.cpython-311.pyc ADDED
Binary file (12.7 kB)

src/__pycache__/model_comparison.cpython-311.pyc ADDED
Binary file (9.22 kB)

src/__pycache__/prediction_process.cpython-311.pyc ADDED
Binary file (7.38 kB)

src/__pycache__/system_summary.cpython-311.pyc ADDED
Binary file (5.87 kB)

src/__pycache__/train_model.cpython-311.pyc ADDED
Binary file (4.81 kB)

src/models/__pycache__/loan_recovery_model.cpython-311.pyc ADDED
Binary file (11.4 kB)
src/models/loan_recovery_model.py ADDED
@@ -0,0 +1,274 @@
import os

import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

from src.preprocessing.data_processor import LoanDataProcessor


class LoanRecoveryModel:
    """Machine learning model for predicting loan recovery."""

    def __init__(self, model_type='random_forest'):
        """
        Initialize the loan recovery model.

        Parameters:
        -----------
        model_type : str, optional
            Type of model to use, by default 'random_forest'.
            Only 'random_forest' is supported.
        """
        self.model_type = 'random_forest'  # Only Random Forest is supported
        self.processor = LoanDataProcessor()
        self.model = RandomForestClassifier(random_state=42)

    def train(self, data, target_column='recovery_status', test_size=0.2,
              tune_hyperparameters=False):
        """
        Train the model on the provided data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The training data
        target_column : str, optional
            The name of the target column, by default 'recovery_status'
        test_size : float, optional
            Proportion of data to use for testing, by default 0.2
        tune_hyperparameters : bool, optional
            Whether to perform hyperparameter tuning, by default False

        Returns:
        --------
        dict
            Dictionary containing model performance metrics
        """
        # Split features from target
        X, y = self.processor.prepare_data(data, target_column)

        # Split into training and testing sets, stratified on the target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y)

        # Fit the preprocessor on the training split only to avoid leakage
        X_train_processed = self.processor.fit_transform(X_train)
        X_test_processed = self.processor.transform(X_test)

        # Tune hyperparameters if requested
        if tune_hyperparameters:
            self._tune_hyperparameters(X_train_processed, y_train)

        # Train the model
        self.model.fit(X_train_processed, y_train)

        # Evaluate on the held-out test set
        y_pred = self.model.predict(X_test_processed)
        y_prob = self.model.predict_proba(X_test_processed)[:, 1]

        metrics = {
            'accuracy': self.model.score(X_test_processed, y_test),
            'roc_auc': roc_auc_score(y_test, y_prob),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Attach feature importances when the estimator exposes them
        if hasattr(self.model, 'feature_importances_'):
            feature_names = self.processor.get_feature_names()
            metrics['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_))

        return metrics

    def predict(self, data):
        """
        Make predictions on new data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to make predictions on

        Returns:
        --------
        numpy.ndarray
            Array of predicted probabilities of recovery
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Drop the target column if it is present in the incoming data
        if 'recovery_status' in data.columns:
            X, _ = self.processor.prepare_data(data)
        else:
            X = self.processor.prepare_data(data)

        # Preprocess, then return the probability of the positive class
        X_processed = self.processor.transform(X)
        return self.model.predict_proba(X_processed)[:, 1]

    def save_model(self, model_path, processor_path=None):
        """
        Save the trained model and preprocessor to disk.

        Parameters:
        -----------
        model_path : str
            Path to save the model
        processor_path : str, optional
            Path to save the preprocessor, by default None.
            If None, uses model_path with '_processor' appended before the extension.
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Save the model
        joblib.dump(self.model, model_path)

        # Derive the processor path from the model path if not given
        if processor_path is None:
            base, ext = os.path.splitext(model_path)
            processor_path = f"{base}_processor{ext or '.pkl'}"

        joblib.dump(self.processor, processor_path)

    @classmethod
    def load_model(cls, model_path, processor_path=None):
        """
        Load a trained model and preprocessor from disk.

        Parameters:
        -----------
        model_path : str
            Path to the saved model
        processor_path : str, optional
            Path to the saved preprocessor, by default None.
            If None, uses model_path with '_processor' appended before the extension.

        Returns:
        --------
        LoanRecoveryModel
            The loaded model
        """
        # Create a new instance and restore its fitted components
        instance = cls()
        instance.model = joblib.load(model_path)

        if processor_path is None:
            base, ext = os.path.splitext(model_path)
            processor_path = f"{base}_processor{ext or '.pkl'}"

        instance.processor = joblib.load(processor_path)
        return instance

    def _tune_hyperparameters(self, X_train, y_train):
        """
        Perform hyperparameter tuning for the Random Forest model.

        Parameters:
        -----------
        X_train : numpy.ndarray
            The processed training features
        y_train : numpy.ndarray
            The training target values
        """
        # Random Forest hyperparameter grid
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        # 5-fold cross-validated grid search, optimizing ROC AUC
        grid_search = GridSearchCV(
            self.model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        # Keep the best estimator found by the search
        self.model = grid_search.best_estimator_

    def plot_feature_importance(self, top_n=10):
        """
        Plot feature importance for the trained model.

        Parameters:
        -----------
        top_n : int, optional
            Number of top features to display, by default 10

        Returns:
        --------
        matplotlib.figure.Figure
            The feature importance plot
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model does not have feature importances.")

        # Sort features by importance and keep the top N
        feature_names = self.processor.get_feature_names()
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1][:top_n]

        # Horizontal bar chart of the most important features
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(range(len(indices)), importances[indices], align='center')
        ax.set_yticks(range(len(indices)))
        ax.set_yticklabels([feature_names[i] for i in indices])
        ax.set_xlabel('Feature Importance')
        ax.set_title(f'Top {top_n} Feature Importances')
        plt.tight_layout()

        return fig

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot the confusion matrix for model predictions.

        Parameters:
        -----------
        y_true : array-like
            True labels
        y_pred : array-like
            Predicted labels

        Returns:
        --------
        matplotlib.figure.Figure
            The confusion matrix plot
        """
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred)

        # Annotated heatmap with class names on both axes
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        ax.set_xticklabels(['Not Recovered', 'Recovered'])
        ax.set_yticklabels(['Not Recovered', 'Recovered'])
        plt.tight_layout()

        return fig
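
For orientation, a minimal end-to-end sketch of how this class is used, assuming the synthetic generator from src/utils/data_generator.py below and execution from the repository root so the src. imports resolve:

import os

from src.models.loan_recovery_model import LoanRecoveryModel
from src.utils.data_generator import generate_loan_data

# Train on synthetic data and inspect the held-out metrics
data = generate_loan_data(n_samples=500)
model = LoanRecoveryModel()
metrics = model.train(data)
print(f"ROC AUC: {metrics['roc_auc']:.3f}")

# Persist the model together with its fitted preprocessor, then reload
os.makedirs('models', exist_ok=True)
model.save_model('models/loan_recovery_random_forest.pkl')
restored = LoanRecoveryModel.load_model('models/loan_recovery_random_forest.pkl')

# predict() returns recovery probabilities in [0, 1] for new applicants
probs = restored.predict(data.drop(columns=['recovery_status']))
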
src/preprocessing/__pycache__/data_processor.cpython-311.pyc ADDED
Binary file (5.53 kB)
 
src/preprocessing/data_processor.py ADDED
@@ -0,0 +1,144 @@
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


class LoanDataProcessor:
    """Class for preprocessing loan data for machine learning models."""

    def __init__(self):
        """Initialize the data processor."""
        self.preprocessor = None
        self.categorical_features = ['gender', 'employment_status', 'payment_history']
        self.numerical_features = ['age', 'annual_income', 'credit_score', 'loan_amount',
                                   'interest_rate', 'loan_term', 'days_past_due',
                                   'previous_defaults', 'monthly_payment', 'debt_to_income']

    def fit(self, X):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor
        """
        # Numerical features: impute with the median, then standardize
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Categorical features: impute with the mode, then one-hot encode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine both pipelines into a single column transformer
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

        self.preprocessor.fit(X)
        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        return self.preprocessor.transform(X)

    def fit_transform(self, X):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        return self.fit(X).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            List of feature names after transformation
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        feature_names = []

        # Numerical feature names pass through unchanged
        feature_names.extend(self.numerical_features)

        # Categorical feature names are expanded by the one-hot encoder
        onehot = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
        feature_names.extend(onehot.get_feature_names_out(self.categorical_features))

        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple or pandas.DataFrame
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it is an identifier, not a feature
        if 'customer_id' in data.columns:
            data = data.drop('customer_id', axis=1)

        if target_column in data.columns:
            X = data.drop(target_column, axis=1)
            y = data[target_column]
            return X, y
        else:
            return data
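
A short sketch of the processor in isolation, assuming the synthetic generator from src/utils/data_generator.py; the exact output width depends on how many categorical levels appear in the data:

from src.preprocessing.data_processor import LoanDataProcessor
from src.utils.data_generator import generate_loan_data

data = generate_loan_data(n_samples=100)
processor = LoanDataProcessor()

# prepare_data drops customer_id and splits off the target column
X, y = processor.prepare_data(data)

# The 10 numeric columns are imputed and scaled; the 3 categorical columns
# are one-hot encoded, so the processed matrix is wider than the input
X_processed = processor.fit_transform(X)
print(X.shape, '->', X_processed.shape)
print(processor.get_feature_names()[:12])
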
src/train_model.py ADDED
@@ -0,0 +1,87 @@
import os

import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use a non-interactive backend
import matplotlib.pyplot as plt

from src.utils.data_generator import generate_loan_data
from src.models.loan_recovery_model import LoanRecoveryModel


def train_and_save_model(data_path=None, model_type='random_forest', tune_hyperparameters=False):
    """
    Train a loan recovery model and save it to disk.

    Parameters:
    -----------
    data_path : str, optional
        Path to the loan data CSV file, by default None.
        If None, synthetic data is generated.
    model_type : str, optional
        Type of model to train, by default 'random_forest'
    tune_hyperparameters : bool, optional
        Whether to tune hyperparameters, by default False

    Returns:
    --------
    dict
        Dictionary containing model performance metrics
    """
    # Create output directories if they don't exist
    os.makedirs('data', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Load the data, or generate and save a synthetic dataset
    if data_path and os.path.exists(data_path):
        print(f"Loading data from {data_path}")
        data = pd.read_csv(data_path)
    else:
        print("Generating synthetic loan data")
        data = generate_loan_data(n_samples=1000)

        # Save generated data
        data_path = 'data/loan_data.csv'
        data.to_csv(data_path, index=False)
        print(f"Saved generated data to {data_path}")

    # Print data summary
    print(f"\nData shape: {data.shape}")
    print(f"Recovery rate: {data['recovery_status'].mean() * 100:.2f}%")

    # Train model
    print(f"\nTraining {model_type} model...")
    model = LoanRecoveryModel(model_type=model_type)
    metrics = model.train(data, tune_hyperparameters=tune_hyperparameters)

    # Print performance metrics
    print("\nModel Performance:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print("\nClassification Report:")
    for label, values in metrics['classification_report'].items():
        if label in ['0', '1']:
            label_name = 'Not Recovered' if label == '0' else 'Recovered'
            print(f"{label_name}:")
            print(f"  Precision: {values['precision']:.4f}")
            print(f"  Recall: {values['recall']:.4f}")
            print(f"  F1-score: {values['f1-score']:.4f}")

    # Save model
    model_path = f"models/loan_recovery_{model_type}.pkl"
    model.save_model(model_path)
    print(f"\nSaved model to {model_path}")

    # Plot feature importance if available
    if 'feature_importance' in metrics:
        fig = model.plot_feature_importance(top_n=10)
        fig_path = f"models/feature_importance_{model_type}.png"
        fig.savefig(fig_path)
        plt.close(fig)
        print(f"Saved feature importance plot to {fig_path}")

    return metrics


if __name__ == "__main__":
    # Train the Random Forest model (the only supported type)
    print(f"\n{'=' * 50}")
    print("Training Random Forest Model")
    print(f"{'=' * 50}")
    train_and_save_model(model_type='random_forest', tune_hyperparameters=True)
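
Because the script uses absolute src. imports, it is intended to be launched as a module from the repository root; a usage sketch (the invocation style is an assumption, not documented in this commit):

# From the repository root:
#   python -m src.train_model
# Or programmatically, reusing an existing CSV instead of synthetic data:
from src.train_model import train_and_save_model

metrics = train_and_save_model(data_path='data/loan_data.csv')
print(f"Test accuracy: {metrics['accuracy']:.4f}")
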
src/utils/__pycache__/data_generator.cpython-311.pyc ADDED
Binary file (10.8 kB)
 
src/utils/data_generator.py ADDED
@@ -0,0 +1,202 @@
import os
import random

import pandas as pd
import numpy as np


def generate_loan_data(n_samples=1000, seed=42):
    """
    Generate synthetic loan data for the loan recovery system.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic loan data
    """
    np.random.seed(seed)
    random.seed(seed)

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )
    annual_incomes = []
    for status in employment_statuses:
        if status == 'Employed':
            income = np.random.normal(60000, 20000)
        elif status == 'Self-employed':
            income = np.random.normal(75000, 30000)
        elif status == 'Unemployed':
            income = np.random.normal(15000, 10000)
        else:  # Retired
            income = np.random.normal(40000, 15000)
        # Guard against negative draws from the normal distribution, which
        # would break the loan-amount and debt-to-income calculations below
        annual_incomes.append(max(income, 5000))

    # Credit information: higher income tends to mean a higher credit score
    credit_scores = []
    for income in annual_incomes:
        base_score = 300 + (income / 100000) * 400
        credit_scores.append(min(850, max(300, int(np.random.normal(base_score, 50)))))

    # Loan information: higher income and credit score allow larger loans
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        max_loan = income * (0.5 + (credit - 300) / 850)
        # Keep the upper bound above the 5,000 minimum loan amount
        loan_amounts.append(np.random.uniform(5000, max(max_loan, 6000)))

    # Lower credit scores get higher interest rates (roughly 5% to 15%)
    interest_rates = []
    for credit in credit_scores:
        base_rate = 15 - (credit - 300) * (10 / 550)
        interest_rates.append(max(5, min(15, base_rate + np.random.normal(0, 1))))

    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

    # Loan performance: better credit scores get better payment histories
    payment_histories = []
    for credit in credit_scores:
        if credit > 750:
            payment_histories.append(np.random.choice(
                ['Excellent', 'Good', 'Fair'], p=[0.8, 0.15, 0.05]))
        elif credit > 650:
            payment_histories.append(np.random.choice(
                ['Excellent', 'Good', 'Fair', 'Poor'], p=[0.4, 0.4, 0.15, 0.05]))
        elif credit > 550:
            payment_histories.append(np.random.choice(
                ['Good', 'Fair', 'Poor'], p=[0.3, 0.5, 0.2]))
        else:
            payment_histories.append(np.random.choice(
                ['Fair', 'Poor', 'Very Poor'], p=[0.3, 0.5, 0.2]))

    # Days past due, increasing with worse payment history
    days_past_due = []
    for history in payment_histories:
        if history == 'Excellent':
            days_past_due.append(np.random.choice(
                [0, 0, 0, 0, np.random.randint(1, 10)],
                p=[0.9, 0.025, 0.025, 0.025, 0.025]))
        elif history == 'Good':
            days_past_due.append(np.random.choice(
                [0, np.random.randint(1, 15), np.random.randint(15, 30)],
                p=[0.7, 0.2, 0.1]))
        elif history == 'Fair':
            days_past_due.append(np.random.choice(
                [0, np.random.randint(1, 30), np.random.randint(30, 60)],
                p=[0.5, 0.3, 0.2]))
        elif history == 'Poor':
            days_past_due.append(np.random.choice(
                [np.random.randint(1, 30), np.random.randint(30, 90), np.random.randint(90, 120)],
                p=[0.3, 0.4, 0.3]))
        else:  # Very Poor
            days_past_due.append(np.random.choice(
                [np.random.randint(30, 90), np.random.randint(90, 180), np.random.randint(180, 365)],
                p=[0.2, 0.4, 0.4]))

    # Previous defaults, more likely with low credit or a poor history
    previous_defaults = []
    for credit, history in zip(credit_scores, payment_histories):
        if credit < 500 or history in ['Poor', 'Very Poor']:
            previous_defaults.append(np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1]))
        elif credit < 650:
            previous_defaults.append(np.random.choice([0, 1], p=[0.8, 0.2]))
        else:
            previous_defaults.append(np.random.choice([0, 1], p=[0.95, 0.05]))

    # Recovery status (target variable), driven by credit score, payment
    # history, days past due and previous defaults
    recovery_status = []
    for credit, history, dpd, defaults in zip(
            credit_scores, payment_histories, days_past_due, previous_defaults):
        recovery_prob = 0.9  # Base probability

        # Adjust based on credit score
        if credit < 500:
            recovery_prob -= 0.3
        elif credit < 650:
            recovery_prob -= 0.1

        # Adjust based on payment history
        if history == 'Very Poor':
            recovery_prob -= 0.4
        elif history == 'Poor':
            recovery_prob -= 0.2
        elif history == 'Fair':
            recovery_prob -= 0.1

        # Adjust based on days past due
        if dpd > 180:
            recovery_prob -= 0.4
        elif dpd > 90:
            recovery_prob -= 0.3
        elif dpd > 30:
            recovery_prob -= 0.15
        elif dpd > 0:
            recovery_prob -= 0.05

        # Adjust based on previous defaults
        recovery_prob -= 0.1 * defaults

        # Clamp the probability to [0.05, 0.95]
        recovery_prob = max(0.05, min(0.95, recovery_prob))

        recovery_status.append(np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob]))

    # Create DataFrame
    data = {
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status  # 1 = recovered, 0 = not recovered
    }

    df = pd.DataFrame(data)

    # Monthly payment from the standard amortization (annuity) formula
    monthly_rate = df['interest_rate'] / 100 / 12
    df['monthly_payment'] = (df['loan_amount'] * monthly_rate *
                             (1 + monthly_rate) ** df['loan_term']) / \
                            ((1 + monthly_rate) ** df['loan_term'] - 1)

    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    df['annual_income'] = df['annual_income'].round(2)
    df['loan_amount'] = df['loan_amount'].round(2)
    df['interest_rate'] = df['interest_rate'].round(2)
    df['monthly_payment'] = df['monthly_payment'].round(2)
    df['debt_to_income'] = df['debt_to_income'].round(4)

    return df


if __name__ == "__main__":
    # Generate sample data
    loan_data = generate_loan_data(n_samples=1000)

    # Save to CSV
    os.makedirs('data', exist_ok=True)
    loan_data.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(loan_data)} loan records and saved to data/loan_data.csv")

    # Display sample
    print("\nSample data:")
    print(loan_data.head())

    # Display summary statistics
    print("\nSummary statistics:")
    print(loan_data.describe())

    # Display recovery rate
    recovery_rate = loan_data['recovery_status'].mean() * 100
    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")
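
The monthly_payment column follows the standard amortization formula M = P * r * (1 + r)^n / ((1 + r)^n - 1), where P is the principal, r the monthly rate, and n the term in months. A quick standalone sanity check with illustrative values:

# Sanity check of the amortization formula used above
P, annual_rate, n = 10000.0, 12.0, 36   # principal, annual rate in %, months
r = annual_rate / 100 / 12              # monthly rate
M = P * r * (1 + r) ** n / ((1 + r) ** n - 1)
print(f"monthly payment: {M:.2f}")      # ~332.14 for these inputs
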