Nikhillmahesh701 committed (verified)
Commit 9d99cff · Parent(s): fa19a7e

Upload 13 files
src/__pycache__/create_comprehensive_image.cpython-311.pyc ADDED
Binary file (3.38 kB)

src/__pycache__/detailed_model_comparison.cpython-311.pyc ADDED
Binary file (12.7 kB)

src/__pycache__/model_comparison.cpython-311.pyc ADDED
Binary file (9.22 kB)

src/__pycache__/prediction_process.cpython-311.pyc ADDED
Binary file (7.38 kB)

src/__pycache__/system_summary.cpython-311.pyc ADDED
Binary file (5.87 kB)

src/__pycache__/train_model.cpython-311.pyc ADDED
Binary file (4.81 kB)

src/models/__pycache__/loan_recovery_model.cpython-311.pyc ADDED
Binary file (11.4 kB)
src/models/loan_recovery_model.py ADDED
@@ -0,0 +1,274 @@
import os

import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

from src.preprocessing.data_processor import LoanDataProcessor


class LoanRecoveryModel:
    """Machine learning model for predicting loan recovery."""

    def __init__(self, model_type='random_forest'):
        """
        Initialize the loan recovery model.

        Parameters:
        -----------
        model_type : str, optional
            Type of model to use, by default 'random_forest'.
            Only 'random_forest' is supported.
        """
        self.model_type = 'random_forest'  # Only Random Forest is supported
        self.processor = LoanDataProcessor()
        self.model = RandomForestClassifier(random_state=42)

    def train(self, data, target_column='recovery_status', test_size=0.2,
              tune_hyperparameters=False):
        """
        Train the model on the provided data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The training data
        target_column : str, optional
            The name of the target column, by default 'recovery_status'
        test_size : float, optional
            Proportion of data to use for testing, by default 0.2
        tune_hyperparameters : bool, optional
            Whether to perform hyperparameter tuning, by default False

        Returns:
        --------
        dict
            Dictionary containing model performance metrics
        """
        # Split features from target
        X, y = self.processor.prepare_data(data, target_column)

        # Split into training and testing sets, stratified on the target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y)

        # Fit the preprocessor on the training split only to avoid leakage
        X_train_processed = self.processor.fit_transform(X_train)
        X_test_processed = self.processor.transform(X_test)

        # Tune hyperparameters if requested
        if tune_hyperparameters:
            self._tune_hyperparameters(X_train_processed, y_train)

        # Train the model
        self.model.fit(X_train_processed, y_train)

        # Evaluate on the held-out test set
        y_pred = self.model.predict(X_test_processed)
        y_prob = self.model.predict_proba(X_test_processed)[:, 1]

        metrics = {
            'accuracy': self.model.score(X_test_processed, y_test),
            'roc_auc': roc_auc_score(y_test, y_prob),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Attach feature importances when the estimator exposes them
        if hasattr(self.model, 'feature_importances_'):
            feature_names = self.processor.get_feature_names()
            metrics['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_))

        return metrics

    def predict(self, data):
        """
        Make predictions on new data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to make predictions on

        Returns:
        --------
        numpy.ndarray
            Array of predicted probabilities of recovery
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Drop the target column if it is present in the incoming data
        if 'recovery_status' in data.columns:
            X, _ = self.processor.prepare_data(data)
        else:
            X = self.processor.prepare_data(data)

        # Preprocess, then return the probability of the positive class
        X_processed = self.processor.transform(X)
        return self.model.predict_proba(X_processed)[:, 1]

    def save_model(self, model_path, processor_path=None):
        """
        Save the trained model and preprocessor to disk.

        Parameters:
        -----------
        model_path : str
            Path to save the model
        processor_path : str, optional
            Path to save the preprocessor, by default None.
            If None, uses model_path with '_processor' appended before the extension.
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Save the model
        joblib.dump(self.model, model_path)

        # Derive the processor path from the model path if not given
        if processor_path is None:
            base, ext = os.path.splitext(model_path)
            processor_path = f"{base}_processor{ext or '.pkl'}"

        joblib.dump(self.processor, processor_path)

    @classmethod
    def load_model(cls, model_path, processor_path=None):
        """
        Load a trained model and preprocessor from disk.

        Parameters:
        -----------
        model_path : str
            Path to the saved model
        processor_path : str, optional
            Path to the saved preprocessor, by default None.
            If None, uses model_path with '_processor' appended before the extension.

        Returns:
        --------
        LoanRecoveryModel
            The loaded model
        """
        # Create a new instance and restore its fitted components
        instance = cls()
        instance.model = joblib.load(model_path)

        if processor_path is None:
            base, ext = os.path.splitext(model_path)
            processor_path = f"{base}_processor{ext or '.pkl'}"

        instance.processor = joblib.load(processor_path)
        return instance

    def _tune_hyperparameters(self, X_train, y_train):
        """
        Perform hyperparameter tuning for the Random Forest model.

        Parameters:
        -----------
        X_train : numpy.ndarray
            The processed training features
        y_train : numpy.ndarray
            The training target values
        """
        # Random Forest hyperparameter grid
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        # 5-fold cross-validated grid search, optimizing ROC AUC
        grid_search = GridSearchCV(
            self.model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        # Keep the best estimator found by the search
        self.model = grid_search.best_estimator_

    def plot_feature_importance(self, top_n=10):
        """
        Plot feature importance for the trained model.

        Parameters:
        -----------
        top_n : int, optional
            Number of top features to display, by default 10

        Returns:
        --------
        matplotlib.figure.Figure
            The feature importance plot
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model does not have feature importances.")

        # Sort features by importance and keep the top N
        feature_names = self.processor.get_feature_names()
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1][:top_n]

        # Horizontal bar chart of the most important features
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(range(len(indices)), importances[indices], align='center')
        ax.set_yticks(range(len(indices)))
        ax.set_yticklabels([feature_names[i] for i in indices])
        ax.set_xlabel('Feature Importance')
        ax.set_title(f'Top {top_n} Feature Importances')
        plt.tight_layout()

        return fig

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot the confusion matrix for model predictions.

        Parameters:
        -----------
        y_true : array-like
            True labels
        y_pred : array-like
            Predicted labels

        Returns:
        --------
        matplotlib.figure.Figure
            The confusion matrix plot
        """
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred)

        # Annotated heatmap with class names on both axes
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        ax.set_xticklabels(['Not Recovered', 'Recovered'])
        ax.set_yticklabels(['Not Recovered', 'Recovered'])
        plt.tight_layout()

        return fig
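
For orientation, a minimal end-to-end sketch of how this class is used, assuming the synthetic generator from src/utils/data_generator.py below and execution from the repository root so the src. imports resolve:

import os

from src.models.loan_recovery_model import LoanRecoveryModel
from src.utils.data_generator import generate_loan_data

# Train on synthetic data and inspect the held-out metrics
data = generate_loan_data(n_samples=500)
model = LoanRecoveryModel()
metrics = model.train(data)
print(f"ROC AUC: {metrics['roc_auc']:.3f}")

# Persist the model together with its fitted preprocessor, then reload
os.makedirs('models', exist_ok=True)
model.save_model('models/loan_recovery_random_forest.pkl')
restored = LoanRecoveryModel.load_model('models/loan_recovery_random_forest.pkl')

# predict() returns recovery probabilities in [0, 1] for new applicants
probs = restored.predict(data.drop(columns=['recovery_status']))
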
src/preprocessing/__pycache__/data_processor.cpython-311.pyc ADDED
Binary file (5.53 kB)
 
src/preprocessing/data_processor.py ADDED
@@ -0,0 +1,144 @@
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


class LoanDataProcessor:
    """Class for preprocessing loan data for machine learning models."""

    def __init__(self):
        """Initialize the data processor."""
        self.preprocessor = None
        self.categorical_features = ['gender', 'employment_status', 'payment_history']
        self.numerical_features = ['age', 'annual_income', 'credit_score', 'loan_amount',
                                   'interest_rate', 'loan_term', 'days_past_due',
                                   'previous_defaults', 'monthly_payment', 'debt_to_income']

    def fit(self, X):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor
        """
        # Numerical features: impute with the median, then standardize
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Categorical features: impute with the mode, then one-hot encode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine both pipelines into a single column transformer
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

        self.preprocessor.fit(X)
        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        return self.preprocessor.transform(X)

    def fit_transform(self, X):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        return self.fit(X).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            List of feature names after transformation
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        feature_names = []

        # Numerical feature names pass through unchanged
        feature_names.extend(self.numerical_features)

        # Categorical feature names are expanded by the one-hot encoder
        onehot = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
        feature_names.extend(onehot.get_feature_names_out(self.categorical_features))

        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple or pandas.DataFrame
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it is an identifier, not a feature
        if 'customer_id' in data.columns:
            data = data.drop('customer_id', axis=1)

        if target_column in data.columns:
            X = data.drop(target_column, axis=1)
            y = data[target_column]
            return X, y
        else:
            return data
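
A short sketch of the processor in isolation, assuming the synthetic generator from src/utils/data_generator.py; the exact output width depends on how many categorical levels appear in the data:

from src.preprocessing.data_processor import LoanDataProcessor
from src.utils.data_generator import generate_loan_data

data = generate_loan_data(n_samples=100)
processor = LoanDataProcessor()

# prepare_data drops customer_id and splits off the target column
X, y = processor.prepare_data(data)

# The 10 numeric columns are imputed and scaled; the 3 categorical columns
# are one-hot encoded, so the processed matrix is wider than the input
X_processed = processor.fit_transform(X)
print(X.shape, '->', X_processed.shape)
print(processor.get_feature_names()[:12])
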
src/train_model.py ADDED
@@ -0,0 +1,87 @@
import os

import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use a non-interactive backend
import matplotlib.pyplot as plt

from src.utils.data_generator import generate_loan_data
from src.models.loan_recovery_model import LoanRecoveryModel


def train_and_save_model(data_path=None, model_type='random_forest', tune_hyperparameters=False):
    """
    Train a loan recovery model and save it to disk.

    Parameters:
    -----------
    data_path : str, optional
        Path to the loan data CSV file, by default None.
        If None, synthetic data is generated.
    model_type : str, optional
        Type of model to train, by default 'random_forest'
    tune_hyperparameters : bool, optional
        Whether to tune hyperparameters, by default False

    Returns:
    --------
    dict
        Dictionary containing model performance metrics
    """
    # Create output directories if they don't exist
    os.makedirs('data', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Load the data, or generate and save a synthetic dataset
    if data_path and os.path.exists(data_path):
        print(f"Loading data from {data_path}")
        data = pd.read_csv(data_path)
    else:
        print("Generating synthetic loan data")
        data = generate_loan_data(n_samples=1000)

        # Save generated data
        data_path = 'data/loan_data.csv'
        data.to_csv(data_path, index=False)
        print(f"Saved generated data to {data_path}")

    # Print data summary
    print(f"\nData shape: {data.shape}")
    print(f"Recovery rate: {data['recovery_status'].mean() * 100:.2f}%")

    # Train model
    print(f"\nTraining {model_type} model...")
    model = LoanRecoveryModel(model_type=model_type)
    metrics = model.train(data, tune_hyperparameters=tune_hyperparameters)

    # Print performance metrics
    print("\nModel Performance:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print("\nClassification Report:")
    for label, values in metrics['classification_report'].items():
        if label in ['0', '1']:
            label_name = 'Not Recovered' if label == '0' else 'Recovered'
            print(f"{label_name}:")
            print(f"  Precision: {values['precision']:.4f}")
            print(f"  Recall: {values['recall']:.4f}")
            print(f"  F1-score: {values['f1-score']:.4f}")

    # Save model
    model_path = f"models/loan_recovery_{model_type}.pkl"
    model.save_model(model_path)
    print(f"\nSaved model to {model_path}")

    # Plot feature importance if available
    if 'feature_importance' in metrics:
        fig = model.plot_feature_importance(top_n=10)
        fig_path = f"models/feature_importance_{model_type}.png"
        fig.savefig(fig_path)
        plt.close(fig)
        print(f"Saved feature importance plot to {fig_path}")

    return metrics


if __name__ == "__main__":
    # Train the Random Forest model (the only supported type)
    print(f"\n{'=' * 50}")
    print("Training Random Forest Model")
    print(f"{'=' * 50}")
    train_and_save_model(model_type='random_forest', tune_hyperparameters=True)
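
Because the script uses absolute src. imports, it is intended to be launched as a module from the repository root; a usage sketch (the invocation style is an assumption, not documented in this commit):

# From the repository root:
#   python -m src.train_model
# Or programmatically, reusing an existing CSV instead of synthetic data:
from src.train_model import train_and_save_model

metrics = train_and_save_model(data_path='data/loan_data.csv')
print(f"Test accuracy: {metrics['accuracy']:.4f}")
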
src/utils/__pycache__/data_generator.cpython-311.pyc ADDED
Binary file (10.8 kB)
 
src/utils/data_generator.py ADDED
@@ -0,0 +1,202 @@
import os
import random

import pandas as pd
import numpy as np


def generate_loan_data(n_samples=1000, seed=42):
    """
    Generate synthetic loan data for the loan recovery system.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic loan data
    """
    np.random.seed(seed)
    random.seed(seed)

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )
    annual_incomes = []
    for status in employment_statuses:
        if status == 'Employed':
            income = np.random.normal(60000, 20000)
        elif status == 'Self-employed':
            income = np.random.normal(75000, 30000)
        elif status == 'Unemployed':
            income = np.random.normal(15000, 10000)
        else:  # Retired
            income = np.random.normal(40000, 15000)
        # Guard against negative draws from the normal distribution, which
        # would break the loan-amount and debt-to-income calculations below
        annual_incomes.append(max(income, 5000))

    # Credit information: higher income tends to mean a higher credit score
    credit_scores = []
    for income in annual_incomes:
        base_score = 300 + (income / 100000) * 400
        credit_scores.append(min(850, max(300, int(np.random.normal(base_score, 50)))))

    # Loan information: higher income and credit score allow larger loans
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        max_loan = income * (0.5 + (credit - 300) / 850)
        # Keep the upper bound above the 5,000 minimum loan amount
        loan_amounts.append(np.random.uniform(5000, max(max_loan, 6000)))

    # Lower credit scores get higher interest rates (roughly 5% to 15%)
    interest_rates = []
    for credit in credit_scores:
        base_rate = 15 - (credit - 300) * (10 / 550)
        interest_rates.append(max(5, min(15, base_rate + np.random.normal(0, 1))))

    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

    # Loan performance: better credit scores get better payment histories
    payment_histories = []
    for credit in credit_scores:
        if credit > 750:
            payment_histories.append(np.random.choice(
                ['Excellent', 'Good', 'Fair'], p=[0.8, 0.15, 0.05]))
        elif credit > 650:
            payment_histories.append(np.random.choice(
                ['Excellent', 'Good', 'Fair', 'Poor'], p=[0.4, 0.4, 0.15, 0.05]))
        elif credit > 550:
            payment_histories.append(np.random.choice(
                ['Good', 'Fair', 'Poor'], p=[0.3, 0.5, 0.2]))
        else:
            payment_histories.append(np.random.choice(
                ['Fair', 'Poor', 'Very Poor'], p=[0.3, 0.5, 0.2]))

    # Days past due, increasing with worse payment history
    days_past_due = []
    for history in payment_histories:
        if history == 'Excellent':
            days_past_due.append(np.random.choice(
                [0, 0, 0, 0, np.random.randint(1, 10)],
                p=[0.9, 0.025, 0.025, 0.025, 0.025]))
        elif history == 'Good':
            days_past_due.append(np.random.choice(
                [0, np.random.randint(1, 15), np.random.randint(15, 30)],
                p=[0.7, 0.2, 0.1]))
        elif history == 'Fair':
            days_past_due.append(np.random.choice(
                [0, np.random.randint(1, 30), np.random.randint(30, 60)],
                p=[0.5, 0.3, 0.2]))
        elif history == 'Poor':
            days_past_due.append(np.random.choice(
                [np.random.randint(1, 30), np.random.randint(30, 90), np.random.randint(90, 120)],
                p=[0.3, 0.4, 0.3]))
        else:  # Very Poor
            days_past_due.append(np.random.choice(
                [np.random.randint(30, 90), np.random.randint(90, 180), np.random.randint(180, 365)],
                p=[0.2, 0.4, 0.4]))

    # Previous defaults, more likely with low credit or a poor history
    previous_defaults = []
    for credit, history in zip(credit_scores, payment_histories):
        if credit < 500 or history in ['Poor', 'Very Poor']:
            previous_defaults.append(np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1]))
        elif credit < 650:
            previous_defaults.append(np.random.choice([0, 1], p=[0.8, 0.2]))
        else:
            previous_defaults.append(np.random.choice([0, 1], p=[0.95, 0.05]))

    # Recovery status (target variable), driven by credit score, payment
    # history, days past due and previous defaults
    recovery_status = []
    for credit, history, dpd, defaults in zip(
            credit_scores, payment_histories, days_past_due, previous_defaults):
        recovery_prob = 0.9  # Base probability

        # Adjust based on credit score
        if credit < 500:
            recovery_prob -= 0.3
        elif credit < 650:
            recovery_prob -= 0.1

        # Adjust based on payment history
        if history == 'Very Poor':
            recovery_prob -= 0.4
        elif history == 'Poor':
            recovery_prob -= 0.2
        elif history == 'Fair':
            recovery_prob -= 0.1

        # Adjust based on days past due
        if dpd > 180:
            recovery_prob -= 0.4
        elif dpd > 90:
            recovery_prob -= 0.3
        elif dpd > 30:
            recovery_prob -= 0.15
        elif dpd > 0:
            recovery_prob -= 0.05

        # Adjust based on previous defaults
        recovery_prob -= 0.1 * defaults

        # Clamp the probability to [0.05, 0.95]
        recovery_prob = max(0.05, min(0.95, recovery_prob))

        recovery_status.append(np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob]))

    # Create DataFrame
    data = {
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status  # 1 = recovered, 0 = not recovered
    }

    df = pd.DataFrame(data)

    # Monthly payment from the standard amortization (annuity) formula
    monthly_rate = df['interest_rate'] / 100 / 12
    df['monthly_payment'] = (df['loan_amount'] * monthly_rate *
                             (1 + monthly_rate) ** df['loan_term']) / \
                            ((1 + monthly_rate) ** df['loan_term'] - 1)

    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    df['annual_income'] = df['annual_income'].round(2)
    df['loan_amount'] = df['loan_amount'].round(2)
    df['interest_rate'] = df['interest_rate'].round(2)
    df['monthly_payment'] = df['monthly_payment'].round(2)
    df['debt_to_income'] = df['debt_to_income'].round(4)

    return df


if __name__ == "__main__":
    # Generate sample data
    loan_data = generate_loan_data(n_samples=1000)

    # Save to CSV
    os.makedirs('data', exist_ok=True)
    loan_data.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(loan_data)} loan records and saved to data/loan_data.csv")

    # Display sample
    print("\nSample data:")
    print(loan_data.head())

    # Display summary statistics
    print("\nSummary statistics:")
    print(loan_data.describe())

    # Display recovery rate
    recovery_rate = loan_data['recovery_status'].mean() * 100
    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")
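
The monthly_payment column follows the standard amortization formula M = P * r * (1 + r)^n / ((1 + r)^n - 1), where P is the principal, r the monthly rate, and n the term in months. A quick standalone sanity check with illustrative values:

# Sanity check of the amortization formula used above
P, annual_rate, n = 10000.0, 12.0, 36   # principal, annual rate in %, months
r = annual_rate / 100 / 12              # monthly rate
M = P * r * (1 + r) ** n / ((1 + r) ** n - 1)
print(f"monthly payment: {M:.2f}")      # ~332.14 for these inputs
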