Upload 13 files
- src/__pycache__/create_comprehensive_image.cpython-311.pyc +0 -0
- src/__pycache__/detailed_model_comparison.cpython-311.pyc +0 -0
- src/__pycache__/model_comparison.cpython-311.pyc +0 -0
- src/__pycache__/prediction_process.cpython-311.pyc +0 -0
- src/__pycache__/system_summary.cpython-311.pyc +0 -0
- src/__pycache__/train_model.cpython-311.pyc +0 -0
- src/models/__pycache__/loan_recovery_model.cpython-311.pyc +0 -0
- src/models/loan_recovery_model.py +274 -0
- src/preprocessing/__pycache__/data_processor.cpython-311.pyc +0 -0
- src/preprocessing/data_processor.py +144 -0
- src/train_model.py +87 -0
- src/utils/__pycache__/data_generator.cpython-311.pyc +0 -0
- src/utils/data_generator.py +202 -0
src/__pycache__/create_comprehensive_image.cpython-311.pyc
ADDED
Binary file (3.38 kB).

src/__pycache__/detailed_model_comparison.cpython-311.pyc
ADDED
Binary file (12.7 kB).

src/__pycache__/model_comparison.cpython-311.pyc
ADDED
Binary file (9.22 kB).

src/__pycache__/prediction_process.cpython-311.pyc
ADDED
Binary file (7.38 kB).

src/__pycache__/system_summary.cpython-311.pyc
ADDED
Binary file (5.87 kB).

src/__pycache__/train_model.cpython-311.pyc
ADDED
Binary file (4.81 kB).

src/models/__pycache__/loan_recovery_model.cpython-311.pyc
ADDED
Binary file (11.4 kB).
src/models/loan_recovery_model.py
ADDED
@@ -0,0 +1,274 @@
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from src.preprocessing.data_processor import LoanDataProcessor

class LoanRecoveryModel:
    """
    Machine learning model for predicting loan recovery.
    """

    def __init__(self, model_type='random_forest'):
        """
        Initialize the loan recovery model.

        Parameters:
        -----------
        model_type : str, optional
            Type of model to use, by default 'random_forest'
            Only 'random_forest' is supported
        """
        self.model_type = 'random_forest'  # Always use Random Forest
        self.model = None
        self.processor = LoanDataProcessor()

        # Initialize the Random Forest model
        self.model = RandomForestClassifier(random_state=42)

    def train(self, data, target_column='recovery_status', test_size=0.2, tune_hyperparameters=False):
        """
        Train the model on the provided data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The training data
        target_column : str, optional
            The name of the target column, by default 'recovery_status'
        test_size : float, optional
            Proportion of data to use for testing, by default 0.2
        tune_hyperparameters : bool, optional
            Whether to perform hyperparameter tuning, by default False

        Returns:
        --------
        dict
            Dictionary containing model performance metrics
        """
        # Prepare data
        X, y = self.processor.prepare_data(data, target_column)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

        # Preprocess the data
        X_train_processed = self.processor.fit_transform(X_train)
        X_test_processed = self.processor.transform(X_test)

        # Tune hyperparameters if requested
        if tune_hyperparameters:
            self._tune_hyperparameters(X_train_processed, y_train)

        # Train the model
        self.model.fit(X_train_processed, y_train)

        # Evaluate the model
        y_pred = self.model.predict(X_test_processed)
        y_prob = self.model.predict_proba(X_test_processed)[:, 1]

        # Calculate metrics
        metrics = {
            'accuracy': self.model.score(X_test_processed, y_test),
            'roc_auc': roc_auc_score(y_test, y_prob),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Feature importance
        if hasattr(self.model, 'feature_importances_'):
            feature_names = self.processor.get_feature_names()
            metrics['feature_importance'] = dict(zip(feature_names, self.model.feature_importances_))

        return metrics

    def predict(self, data):
        """
        Make predictions on new data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to make predictions on

        Returns:
        --------
        numpy.ndarray
            Array of predicted probabilities of recovery
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Prepare data
        if 'recovery_status' in data.columns:
            X, _ = self.processor.prepare_data(data)
        else:
            X = self.processor.prepare_data(data)

        # Preprocess the data
        X_processed = self.processor.transform(X)

        # Make predictions
        return self.model.predict_proba(X_processed)[:, 1]

    def save_model(self, model_path, processor_path=None):
        """
        Save the trained model and preprocessor to disk.

        Parameters:
        -----------
        model_path : str
            Path to save the model
        processor_path : str, optional
            Path to save the preprocessor, by default None
            If None, will use model_path with '_processor' appended
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Save the model
        joblib.dump(self.model, model_path)

        # Save the preprocessor
        if processor_path is None:
            processor_path = model_path.replace('.pkl', '_processor.pkl')

        joblib.dump(self.processor, processor_path)

    @classmethod
    def load_model(cls, model_path, processor_path=None):
        """
        Load a trained model and preprocessor from disk.

        Parameters:
        -----------
        model_path : str
            Path to the saved model
        processor_path : str, optional
            Path to the saved preprocessor, by default None
            If None, will use model_path with '_processor' appended

        Returns:
        --------
        LoanRecoveryModel
            The loaded model
        """
        # Create a new instance
        instance = cls()

        # Load the model
        instance.model = joblib.load(model_path)

        # Load the preprocessor
        if processor_path is None:
            processor_path = model_path.replace('.pkl', '_processor.pkl')

        instance.processor = joblib.load(processor_path)

        return instance

    def _tune_hyperparameters(self, X_train, y_train):
        """
        Perform hyperparameter tuning for Random Forest model.

        Parameters:
        -----------
        X_train : numpy.ndarray
            The processed training features
        y_train : numpy.ndarray
            The training target values
        """
        # Random Forest hyperparameters
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        # Create grid search
        grid_search = GridSearchCV(
            self.model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1
        )

        # Fit grid search
        grid_search.fit(X_train, y_train)

        # Update model with best parameters
        self.model = grid_search.best_estimator_

    def plot_feature_importance(self, top_n=10):
        """
        Plot feature importance for the trained model.

        Parameters:
        -----------
        top_n : int, optional
            Number of top features to display, by default 10

        Returns:
        --------
        matplotlib.figure.Figure
            The feature importance plot
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model does not have feature importances.")

        # Get feature names and importances
        feature_names = self.processor.get_feature_names()
        importances = self.model.feature_importances_

        # Sort by importance
        indices = np.argsort(importances)[::-1]

        # Take top N features
        indices = indices[:top_n]

        # Create plot
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(range(len(indices)), importances[indices], align='center')
        ax.set_yticks(range(len(indices)))
        ax.set_yticklabels([feature_names[i] for i in indices])
        ax.set_xlabel('Feature Importance')
        ax.set_title('Top {} Feature Importances'.format(top_n))
        plt.tight_layout()

        return fig

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix for model predictions.

        Parameters:
        -----------
        y_true : array-like
            True labels
        y_pred : array-like
            Predicted labels

        Returns:
        --------
        matplotlib.figure.Figure
            The confusion matrix plot
        """
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred)

        # Create plot
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        ax.set_xticklabels(['Not Recovered', 'Recovered'])
        ax.set_yticklabels(['Not Recovered', 'Recovered'])
        plt.tight_layout()

        return fig
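For orientation, a minimal usage sketch of LoanRecoveryModel (illustrative only, not part of this commit). It assumes the code runs from the repository root so the src package imports resolve, and uses the synthetic generator from src/utils/data_generator.py:

import os
from src.utils.data_generator import generate_loan_data
from src.models.loan_recovery_model import LoanRecoveryModel

# Generate a small synthetic dataset for a quick smoke test
data = generate_loan_data(n_samples=500)

# Train without the (slow) grid search
model = LoanRecoveryModel()
metrics = model.train(data, tune_hyperparameters=False)
print(f"ROC AUC: {metrics['roc_auc']:.3f}")

# Persist; save_model() also writes the fitted processor next to the model
os.makedirs('models', exist_ok=True)
model.save_model('models/demo_model.pkl')

# Reload and score new accounts; the target column may be absent at prediction time
restored = LoanRecoveryModel.load_model('models/demo_model.pkl')
probs = restored.predict(data.drop(columns=['recovery_status']).head(5))
print(probs)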
src/preprocessing/__pycache__/data_processor.cpython-311.pyc
ADDED
Binary file (5.53 kB).
src/preprocessing/data_processor.py
ADDED
@@ -0,0 +1,144 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

class LoanDataProcessor:
    """
    Class for preprocessing loan data for machine learning models.
    """

    def __init__(self):
        """Initialize the data processor."""
        self.preprocessor = None
        self.categorical_features = ['gender', 'employment_status', 'payment_history']
        self.numerical_features = ['age', 'annual_income', 'credit_score', 'loan_amount',
                                   'interest_rate', 'loan_term', 'days_past_due',
                                   'previous_defaults', 'monthly_payment', 'debt_to_income']

    def fit(self, X):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor
        """
        # Define preprocessing for numerical features
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

        # Fit the preprocessor
        self.preprocessor.fit(X)

        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        return self.preprocessor.transform(X)

    def fit_transform(self, X):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        return self.fit(X).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            List of feature names after transformation
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        # Get feature names from the column transformer
        feature_names = []

        # Get numerical feature names (these stay the same)
        feature_names.extend(self.numerical_features)

        # Get categorical feature names (these are expanded by one-hot encoding)
        categorical_features = self.preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(
            self.categorical_features)
        feature_names.extend(categorical_features)

        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it's not a feature
        if 'customer_id' in data.columns:
            data = data.drop('customer_id', axis=1)

        if target_column in data.columns:
            X = data.drop(target_column, axis=1)
            y = data[target_column]
            return X, y
        else:
            return data
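A standalone sketch of the preprocessing pipeline (illustrative, not part of the commit); it assumes a DataFrame carrying the column names hard-coded in __init__, such as the output of generate_loan_data:

from src.preprocessing.data_processor import LoanDataProcessor
from src.utils.data_generator import generate_loan_data

df = generate_loan_data(n_samples=200)

processor = LoanDataProcessor()
X, y = processor.prepare_data(df)       # drops customer_id, splits off the target
X_matrix = processor.fit_transform(X)   # impute + scale numerics, one-hot categoricals

# One-hot encoding widens the 3 categorical columns, so the matrix has more
# than the original 13 feature columns.
print(X_matrix.shape)
print(processor.get_feature_names()[:5])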
src/train_model.py
ADDED
@@ -0,0 +1,87 @@
import os
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
from src.utils.data_generator import generate_loan_data
from src.models.loan_recovery_model import LoanRecoveryModel

def train_and_save_model(data_path=None, model_type='random_forest', tune_hyperparameters=False):
    """
    Train a loan recovery model and save it to disk.

    Parameters:
    -----------
    data_path : str, optional
        Path to the loan data CSV file, by default None
        If None, generates synthetic data
    model_type : str, optional
        Type of model to train, by default 'random_forest'
    tune_hyperparameters : bool, optional
        Whether to tune hyperparameters, by default False

    Returns:
    --------
    dict
        Dictionary containing model performance metrics
    """
    # Create directories if they don't exist
    os.makedirs('data', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Load or generate data
    if data_path and os.path.exists(data_path):
        print(f"Loading data from {data_path}")
        data = pd.read_csv(data_path)
    else:
        print("Generating synthetic loan data")
        data = generate_loan_data(n_samples=1000)

        # Save generated data
        data_path = 'data/loan_data.csv'
        data.to_csv(data_path, index=False)
        print(f"Saved generated data to {data_path}")

    # Print data summary
    print(f"\nData shape: {data.shape}")
    print(f"Recovery rate: {data['recovery_status'].mean() * 100:.2f}%")

    # Train model
    print(f"\nTraining {model_type} model...")
    model = LoanRecoveryModel(model_type=model_type)
    metrics = model.train(data, tune_hyperparameters=tune_hyperparameters)

    # Print performance metrics
    print("\nModel Performance:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print("\nClassification Report:")
    for label, values in metrics['classification_report'].items():
        if label in ['0', '1']:
            label_name = 'Not Recovered' if label == '0' else 'Recovered'
            print(f"{label_name}:")
            print(f"  Precision: {values['precision']:.4f}")
            print(f"  Recall: {values['recall']:.4f}")
            print(f"  F1-score: {values['f1-score']:.4f}")

    # Save model
    model_path = f"models/loan_recovery_{model_type}.pkl"
    model.save_model(model_path)
    print(f"\nSaved model to {model_path}")

    # Plot feature importance if available
    if 'feature_importance' in metrics:
        fig = model.plot_feature_importance(top_n=10)
        fig_path = f"models/feature_importance_{model_type}.png"
        fig.savefig(fig_path)
        plt.close(fig)
        print(f"Saved feature importance plot to {fig_path}")

    return metrics

if __name__ == "__main__":
    # Train only Random Forest model
    print(f"\n{'='*50}")
    print("Training Random Forest Model")
    print(f"{'='*50}")
    train_and_save_model(model_type='random_forest', tune_hyperparameters=True)
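Since the imports go through the src package, the script is presumably launched from the repository root (for example with python -m src.train_model). A hedged sketch of calling it programmatically against an existing CSV:

from src.train_model import train_and_save_model

# Reuse a previously saved CSV; the function falls back to synthetic data if
# the path is missing. Skipping the grid search avoids fitting 108 parameter
# combinations x 5 CV folds.
metrics = train_and_save_model(
    data_path='data/loan_data.csv',
    tune_hyperparameters=False,
)
print(sorted(metrics.keys()))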
src/utils/__pycache__/data_generator.cpython-311.pyc
ADDED
Binary file (10.8 kB).
src/utils/data_generator.py
ADDED
@@ -0,0 +1,202 @@
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_loan_data(n_samples=1000, seed=42):
    """
    Generate synthetic loan data for the loan recovery system.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic loan data
    """
    np.random.seed(seed)
    random.seed(seed)

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )
    annual_incomes = []
    for status in employment_statuses:
        if status == 'Employed':
            annual_incomes.append(np.random.normal(60000, 20000))
        elif status == 'Self-employed':
            annual_incomes.append(np.random.normal(75000, 30000))
        elif status == 'Unemployed':
            annual_incomes.append(np.random.normal(15000, 10000))
        else:  # Retired
            annual_incomes.append(np.random.normal(40000, 15000))

    # Credit information
    credit_scores = []
    for income in annual_incomes:
        base_score = 300 + (income / 100000) * 400  # Higher income tends to have higher credit score
        credit_scores.append(min(850, max(300, int(np.random.normal(base_score, 50)))))

    # Loan information
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        # Higher income and credit score can get larger loans
        max_loan = income * (0.5 + (credit - 300) / 850)
        loan_amounts.append(np.random.uniform(5000, max_loan))

    interest_rates = []
    for credit in credit_scores:
        # Lower credit scores get higher interest rates
        base_rate = 15 - (credit - 300) * (10 / 550)  # Range from ~5% to ~15%
        interest_rates.append(max(5, min(15, base_rate + np.random.normal(0, 1))))

    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

    # Loan performance
    payment_histories = []
    for credit in credit_scores:
        # Better credit scores tend to have better payment histories
        if credit > 750:
            payment_histories.append(np.random.choice(['Excellent', 'Good', 'Fair'], p=[0.8, 0.15, 0.05]))
        elif credit > 650:
            payment_histories.append(np.random.choice(['Excellent', 'Good', 'Fair', 'Poor'], p=[0.4, 0.4, 0.15, 0.05]))
        elif credit > 550:
            payment_histories.append(np.random.choice(['Good', 'Fair', 'Poor'], p=[0.3, 0.5, 0.2]))
        else:
            payment_histories.append(np.random.choice(['Fair', 'Poor', 'Very Poor'], p=[0.3, 0.5, 0.2]))

    days_past_due = []
    for history in payment_histories:
        if history == 'Excellent':
            days_past_due.append(np.random.choice([0, 0, 0, 0, np.random.randint(1, 10)], p=[0.9, 0.025, 0.025, 0.025, 0.025]))
        elif history == 'Good':
            days_past_due.append(np.random.choice([0, np.random.randint(1, 15), np.random.randint(15, 30)], p=[0.7, 0.2, 0.1]))
        elif history == 'Fair':
            days_past_due.append(np.random.choice([0, np.random.randint(1, 30), np.random.randint(30, 60)], p=[0.5, 0.3, 0.2]))
        elif history == 'Poor':
            days_past_due.append(np.random.choice([np.random.randint(1, 30), np.random.randint(30, 90), np.random.randint(90, 120)], p=[0.3, 0.4, 0.3]))
        else:  # Very Poor
            days_past_due.append(np.random.choice([np.random.randint(30, 90), np.random.randint(90, 180), np.random.randint(180, 365)], p=[0.2, 0.4, 0.4]))

    # Previous defaults
    previous_defaults = []
    for credit, history in zip(credit_scores, payment_histories):
        if credit < 500 or history in ['Poor', 'Very Poor']:
            previous_defaults.append(np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1]))
        elif credit < 650:
            previous_defaults.append(np.random.choice([0, 1], p=[0.8, 0.2]))
        else:
            previous_defaults.append(np.random.choice([0, 1], p=[0.95, 0.05]))

    # Recovery status (target variable)
    recovery_status = []
    for credit, history, dpd, defaults in zip(credit_scores, payment_histories, days_past_due, previous_defaults):
        # Factors affecting recovery:
        # 1. Credit score
        # 2. Payment history
        # 3. Days past due
        # 4. Previous defaults

        recovery_prob = 0.9  # Base probability

        # Adjust based on credit score
        if credit < 500:
            recovery_prob -= 0.3
        elif credit < 650:
            recovery_prob -= 0.1

        # Adjust based on payment history
        if history == 'Very Poor':
            recovery_prob -= 0.4
        elif history == 'Poor':
            recovery_prob -= 0.2
        elif history == 'Fair':
            recovery_prob -= 0.1

        # Adjust based on days past due
        if dpd > 180:
            recovery_prob -= 0.4
        elif dpd > 90:
            recovery_prob -= 0.3
        elif dpd > 30:
            recovery_prob -= 0.15
        elif dpd > 0:
            recovery_prob -= 0.05

        # Adjust based on previous defaults
        recovery_prob -= 0.1 * defaults

        # Ensure probability is between 0 and 1
        recovery_prob = max(0.05, min(0.95, recovery_prob))

        recovery_status.append(np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob]))

    # Create DataFrame
    data = {
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status  # 1 = recovered, 0 = not recovered
    }

    df = pd.DataFrame(data)

    # Add some additional calculated features
    df['monthly_payment'] = (df['loan_amount'] * (df['interest_rate']/100/12) *
                             (1 + df['interest_rate']/100/12)**(df['loan_term'])) / \
                            ((1 + df['interest_rate']/100/12)**(df['loan_term']) - 1)

    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    df['annual_income'] = df['annual_income'].round(2)
    df['loan_amount'] = df['loan_amount'].round(2)
    df['interest_rate'] = df['interest_rate'].round(2)
    df['monthly_payment'] = df['monthly_payment'].round(2)
    df['debt_to_income'] = df['debt_to_income'].round(4)

    return df

if __name__ == "__main__":
    # Generate sample data
    loan_data = generate_loan_data(n_samples=1000)

    # Save to CSV
    import os
    os.makedirs('data', exist_ok=True)
    loan_data.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(loan_data)} loan records and saved to data/loan_data.csv")

    # Display sample
    print("\nSample data:")
    print(loan_data.head())

    # Display summary statistics
    print("\nSummary statistics:")
    print(loan_data.describe())

    # Display recovery rate
    recovery_rate = loan_data['recovery_status'].mean() * 100
    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")
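A quick sanity check (illustrative only, not part of the commit): because each record's recovery label is drawn from a probability clipped to [0.05, 0.95], the overall recovery rate should stay roughly stable across seeds:

from src.utils.data_generator import generate_loan_data

for seed in (0, 1, 42):
    df = generate_loan_data(n_samples=2000, seed=seed)
    print(seed, f"recovery rate: {df['recovery_status'].mean():.3f}")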