Nikhillmahesh701 committed on
Commit
a47b6e9
·
verified ·
1 Parent(s): 9f70e0c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +388 -0
app.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib
5
+ matplotlib.use('Agg') # Use non-interactive backend
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import os
9
+ import joblib
10
+ from src.models.loan_recovery_model import LoanRecoveryModel
11
+ from src.utils.data_generator import generate_loan_data
12
+ from src.preprocessing.data_processor import LoanDataProcessor
13
+
14
# Configure the Streamlit page: page title and icon, a wide layout, and a
# sidebar that starts expanded.
st.set_page_config(
    page_title="Smart Loan Recovery System",
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="expanded"
)
21
+
22
+ # Define functions
23
@st.cache_data
def load_sample_data():
    """Return the sample loan dataset, creating it on first use.

    Reads ``data/loan_data.csv`` when that file exists. Otherwise a
    1000-sample dataset is produced by ``generate_loan_data``, written to
    that path (creating the ``data`` directory if necessary) and returned.
    """
    csv_file = "data/loan_data.csv"

    # Guard clause: reuse the dataset persisted by an earlier run.
    if os.path.exists(csv_file):
        return pd.read_csv(csv_file)

    generated = generate_loan_data(n_samples=1000)
    os.makedirs("data", exist_ok=True)
    generated.to_csv(csv_file, index=False)
    return generated
34
+
35
@st.cache_resource
def load_model(model_type="random_forest"):
    """Load the persisted recovery model, training one first if none exists.

    Args:
        model_type: Model family name; it selects the pickle file
            ``models/loan_recovery_<model_type>.pkl``.

    Returns:
        Whatever ``LoanRecoveryModel.load_model`` returns for that path.
    """
    model_path = f"models/loan_recovery_{model_type}.pkl"
    model_on_disk = os.path.exists(model_path)

    if not model_on_disk:
        st.info(f"Model not found. Training a new {model_type} model...")
        # Imported lazily so the training code is only loaded when needed.
        # NOTE(review): presumably train_and_save_model writes to model_path;
        # confirm against src/train_model.py.
        from src.train_model import train_and_save_model
        train_and_save_model(model_type=model_type)

    return LoanRecoveryModel.load_model(model_path)
47
+
48
def predict_recovery(model, data):
    """Return the model's predictions for ``data``.

    Args:
        model: Any object exposing ``predict(data)``.
        data: Input forwarded unchanged to ``model.predict``.

    Returns:
        The value returned by ``model.predict(data)``.
    """
    return model.predict(data)
52
+
53
def plot_recovery_distribution(data):
    """Plot how many loans were and were not recovered.

    Args:
        data: DataFrame with a ``recovery_status`` column encoded as
            0 (not recovered) / 1 (recovered).

    Returns:
        The matplotlib figure holding the bar chart.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # value_counts() orders by frequency and omits classes that never occur,
    # so pin the order to [0, 1]; otherwise the fixed labels below could be
    # attached to the wrong bar or outnumber the bars.
    recovery_counts = (
        data['recovery_status'].value_counts().reindex([0, 1], fill_value=0)
    )
    labels = ['Not Recovered', 'Recovered']
    ax.bar(labels, recovery_counts.values)
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Loan Recovery Status')

    total = len(data)
    for i, v in enumerate(recovery_counts.values):
        # Absolute count just above the bar.
        ax.text(i, v + 5, str(v), ha='center')

        # Share of all rows inside the bar; an empty frame shows 0.0%
        # instead of dividing by zero.
        percentage = v / total * 100 if total else 0.0
        ax.text(i, v/2, f"{percentage:.1f}%", ha='center', color='white', fontweight='bold')

    return fig
71
+
72
def plot_feature_importance(model):
    """Return the model's own feature-importance plot for its top 10 features.

    Args:
        model: Any object exposing ``plot_feature_importance(top_n=...)``.

    Returns:
        The value returned by ``model.plot_feature_importance(top_n=10)``.
    """
    importance_plot = model.plot_feature_importance(top_n=10)
    return importance_plot
75
+
76
def plot_recovery_by_feature(data, feature, is_categorical=False):
    """Plot the recovery rate (%) for each value or bin of ``feature``.

    Categorical features get one bar per category, sorted by recovery rate.
    A few small-range numeric features get one bar per distinct value; any
    other numeric feature is split into up to five quantile bins.

    ``data`` is not modified.

    Args:
        data: DataFrame containing ``feature`` and a ``recovery_status``
            column whose mean is read as the recovery rate.
        feature: Name of the column to group by.
        is_categorical: Treat ``feature`` as categorical when True.

    Returns:
        The matplotlib figure holding the bar chart.
    """
    # Numeric features plotted per value rather than quantile-binned.
    discrete_features = ('age', 'loan_term', 'previous_defaults', 'days_past_due')
    feature_title = feature.replace("_", " ").title()

    fig, ax = plt.subplots(figsize=(10, 6))

    # Group by a separate key Series instead of writing a helper column into
    # ``data``, so the caller's frame is left untouched.
    if is_categorical:
        grouped = data['recovery_status'].groupby(data[feature], observed=False)
        recovery_rate = grouped.mean().sort_values()
        tick_labels = [str(value) for value in recovery_rate.index]
        rotate_labels = len(recovery_rate) > 5
    elif feature in discrete_features:
        grouped = data['recovery_status'].groupby(data[feature])
        recovery_rate = grouped.mean().sort_index()
        tick_labels = list(recovery_rate.index)
        rotate_labels = False
    else:
        bins = pd.qcut(data[feature], 5, duplicates='drop')
        grouped = data['recovery_status'].groupby(bins, observed=False)
        recovery_rate = grouped.mean().sort_index()
        tick_labels = [
            f"{bin_range.left:.1f}-{bin_range.right:.1f}"
            if hasattr(bin_range, 'left') and hasattr(bin_range, 'right')
            else str(bin_range)
            for bin_range in recovery_rate.index
        ]
        rotate_labels = True

    # Reorder the counts to follow the bars; for sorted categorical bars the
    # group order differs from the bar order.
    counts = grouped.size().reindex(recovery_rate.index)

    positions = range(len(recovery_rate))
    ax.bar(positions, recovery_rate.values * 100)
    ax.set_ylabel('Recovery Rate (%)')
    ax.set_title(f'Recovery Rate by {feature_title}')
    ax.set_ylim(0, 100)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)

    if rotate_labels:
        for tick in ax.get_xticklabels():
            tick.set_rotation(45)
            tick.set_ha('right')

    # Sample size inside each bar.
    for i, count in enumerate(counts.values):
        ax.text(i, 5, f"n={count}", ha='center', color='white', fontweight='bold')

    ax.set_xlabel(feature_title)

    fig.tight_layout()
    return fig
143
+
144
+ # Main application
145
def _amortized_monthly_payment(loan_amount, interest_rate, loan_term):
    """Return the fixed monthly payment of an amortized loan.

    Works on scalars and on equal-length array-likes such as DataFrame
    columns.

    Args:
        loan_amount: Loan principal.
        interest_rate: Annual interest rate in percent.
        loan_term: Number of monthly payments.

    Returns:
        numpy.ndarray of payments (0-d for scalar input).
    """
    principal = np.asarray(loan_amount, dtype=float)
    term = np.asarray(loan_term, dtype=float)
    monthly_rate = np.asarray(interest_rate, dtype=float) / 100 / 12
    growth = (1 + monthly_rate) ** term

    # Both branches are evaluated for every element, so silence the
    # division warnings raised by the branch that is not selected.
    with np.errstate(divide='ignore', invalid='ignore'):
        amortized = principal * monthly_rate * growth / (growth - 1)
        interest_free = principal / term

    # At a 0% rate the annuity formula is 0/0; the payment is then simply
    # the principal spread evenly over the term.
    return np.where(monthly_rate == 0, interest_free, amortized)


def _show_figure(fig):
    """Render ``fig`` in the app, then close it.

    Closing matters because the script reruns on every interaction and
    pyplot keeps every figure it created alive until it is closed.
    """
    st.pyplot(fig)
    plt.close(fig)


def _render_single_loan(model):
    """Render the single-loan form and, on request, its prediction."""
    st.subheader("Enter Loan Information")

    col1, col2, col3 = st.columns(3)

    with col1:
        age = st.number_input("Age", min_value=18, max_value=100, value=35)
        gender = st.selectbox("Gender", ["Male", "Female"])
        employment_status = st.selectbox(
            "Employment Status",
            ["Employed", "Self-employed", "Unemployed", "Retired"]
        )
        annual_income = st.number_input("Annual Income ($)", min_value=0, value=60000)

    with col2:
        credit_score = st.slider("Credit Score", 300, 850, 650)
        loan_amount = st.number_input("Loan Amount ($)", min_value=1000, value=20000)
        interest_rate = st.slider("Interest Rate (%)", 1.0, 25.0, 8.0, 0.1)
        loan_term = st.selectbox("Loan Term (months)", [12, 24, 36, 48, 60])

    with col3:
        payment_history = st.selectbox(
            "Payment History",
            ["Excellent", "Good", "Fair", "Poor", "Very Poor"]
        )
        days_past_due = st.number_input("Days Past Due", min_value=0, value=0)
        previous_defaults = st.number_input("Previous Defaults", min_value=0, max_value=10, value=0)

    # Calculate derived features
    monthly_payment = float(
        _amortized_monthly_payment(loan_amount, interest_rate, loan_term)
    )
    # max(1, ...) avoids dividing by a zero income.
    debt_to_income = (monthly_payment * 12) / max(1, annual_income)

    # Display calculated values
    st.subheader("Calculated Values")
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Monthly Payment", f"${monthly_payment:.2f}")
    with col2:
        st.metric("Debt-to-Income Ratio", f"{debt_to_income*100:.2f}%")

    # Create input dataframe
    input_data = pd.DataFrame({
        'age': [age],
        'gender': [gender],
        'employment_status': [employment_status],
        'annual_income': [annual_income],
        'credit_score': [credit_score],
        'loan_amount': [loan_amount],
        'interest_rate': [interest_rate],
        'loan_term': [loan_term],
        'payment_history': [payment_history],
        'days_past_due': [days_past_due],
        'previous_defaults': [previous_defaults],
        'monthly_payment': [monthly_payment],
        'debt_to_income': [debt_to_income]
    })

    # Nothing more to show until the user asks for a prediction.
    if not st.button("Predict Recovery Probability"):
        return

    with st.spinner("Calculating recovery probability..."):
        recovery_prob = predict_recovery(model, input_data)[0]

    # Display result
    st.subheader("Prediction Result")

    # Gauge: a grey 0-100 track with the probability drawn on top, green
    # at or above the 50% threshold and red below it.
    fig, ax = plt.subplots(figsize=(10, 2))
    ax.barh([0], [100], color='lightgray', height=0.5)
    ax.barh([0], [recovery_prob * 100], color='green' if recovery_prob >= 0.5 else 'red', height=0.5)
    ax.set_xlim(0, 100)
    ax.set_yticks([])
    ax.set_xticks([0, 25, 50, 75, 100])
    ax.set_xticklabels(['0%', '25%', '50%', '75%', '100%'])
    ax.axvline(50, color='gray', linestyle='--', alpha=0.5)
    ax.text(recovery_prob * 100, 0, f"{recovery_prob*100:.1f}%",
            ha='center', va='center', fontweight='bold', color='black')

    _show_figure(fig)

    # Recommendation
    st.subheader("Recovery Assessment")
    if recovery_prob >= 0.8:
        st.success("High probability of recovery. Standard collection procedures recommended.")
    elif recovery_prob >= 0.5:
        st.info("Moderate probability of recovery. Consider offering a payment plan.")
    elif recovery_prob >= 0.3:
        st.warning("Low probability of recovery. Consider debt restructuring or settlement offers.")
    else:
        st.error("Very low probability of recovery. Consider debt write-off or third-party collection.")

    # Risk factors: fixed rule-of-thumb checks on the entered values,
    # independent of the model output.
    st.subheader("Key Risk Factors")
    risk_factors = []

    if credit_score < 600:
        risk_factors.append("Low credit score")
    if days_past_due > 30:
        risk_factors.append("Significant payment delay")
    if previous_defaults > 0:
        risk_factors.append("History of defaults")
    if debt_to_income > 0.4:
        risk_factors.append("High debt-to-income ratio")
    if payment_history in ["Poor", "Very Poor"]:
        risk_factors.append("Poor payment history")

    if risk_factors:
        for factor in risk_factors:
            st.write(f"• {factor}")
    else:
        st.write("No significant risk factors identified.")


def _render_batch_prediction(model, data):
    """Render the CSV upload flow and, on request, the batch predictions."""
    st.subheader("Upload CSV File")
    st.write("""
    Upload a CSV file with loan information. The file should contain the following columns:
    age, gender, employment_status, annual_income, credit_score, loan_amount, interest_rate,
    loan_term, payment_history, days_past_due, previous_defaults
    """)

    # Sample file download; capped so datasets with fewer than five rows
    # do not make sample() raise.
    sample_data = data.sample(min(5, len(data))).drop(
        ['customer_id', 'recovery_status'], axis=1, errors='ignore'
    )

    @st.cache_data
    def convert_df_to_csv(df):
        return df.to_csv(index=False).encode('utf-8')

    csv = convert_df_to_csv(sample_data)
    st.download_button(
        "Download Sample CSV",
        csv,
        "sample_loans.csv",
        "text/csv",
        key='download-csv'
    )

    # File upload
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    # Load and display the data
    batch_data = pd.read_csv(uploaded_file)
    st.write("Preview of uploaded data:")
    st.dataframe(batch_data.head())

    # Check for required columns
    required_cols = ['age', 'gender', 'employment_status', 'annual_income',
                     'credit_score', 'loan_amount', 'interest_rate',
                     'loan_term', 'payment_history', 'days_past_due',
                     'previous_defaults']

    missing_cols = [col for col in required_cols if col not in batch_data.columns]

    if missing_cols:
        st.error(f"Missing required columns: {', '.join(missing_cols)}")
        return

    # A header-only file would otherwise end in a 0/0 summary.
    if batch_data.empty:
        st.error("The uploaded file contains no loan records.")
        return

    # Calculate derived features if not present
    if 'monthly_payment' not in batch_data.columns:
        batch_data['monthly_payment'] = _amortized_monthly_payment(
            batch_data['loan_amount'],
            batch_data['interest_rate'],
            batch_data['loan_term'],
        )

    if 'debt_to_income' not in batch_data.columns:
        # clip(lower=1) mirrors the max(1, income) guard of the single-loan form.
        batch_data['debt_to_income'] = (
            (batch_data['monthly_payment'] * 12)
            / batch_data['annual_income'].clip(lower=1)
        )

    if not st.button("Run Batch Prediction"):
        return

    with st.spinner("Processing batch predictions..."):
        # asarray so the threshold comparison below also works if the model
        # returns a plain sequence.
        recovery_probs = np.asarray(predict_recovery(model, batch_data))

        # Add predictions to the dataframe
        batch_data['recovery_probability'] = recovery_probs
        batch_data['recovery_prediction'] = (recovery_probs >= 0.5).astype(int)

    # Display results
    st.subheader("Prediction Results")
    st.dataframe(batch_data)

    # Summary statistics
    st.subheader("Summary")
    avg_prob = batch_data['recovery_probability'].mean() * 100
    predicted_recoveries = batch_data['recovery_prediction'].sum()
    recovery_rate = predicted_recoveries / len(batch_data) * 100

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Average Recovery Probability", f"{avg_prob:.2f}%")
    with col2:
        st.metric("Predicted Recovery Rate", f"{recovery_rate:.2f}% ({predicted_recoveries}/{len(batch_data)})")

    # Distribution of probabilities
    st.subheader("Distribution of Recovery Probabilities")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(batch_data['recovery_probability'], bins=20, kde=True, ax=ax)
    ax.set_xlabel("Recovery Probability")
    ax.set_ylabel("Count")
    ax.axvline(0.5, color='red', linestyle='--')
    ax.text(0.5, ax.get_ylim()[1]*0.9, "Decision Threshold",
            rotation=90, va='top', ha='right', color='red')
    _show_figure(fig)

    # Download results
    csv = convert_df_to_csv(batch_data)
    st.download_button(
        "Download Results CSV",
        csv,
        "loan_recovery_predictions.csv",
        "text/csv",
        key='download-results'
    )


# Main application
def main():
    """Run the Streamlit app.

    Draws the header, loads the sample data and the random-forest model,
    then shows either the single-loan form or the batch CSV flow depending
    on the selected prediction type.
    """
    # Header
    st.title("Smart Loan Recovery System")
    st.image("https://img.icons8.com/color/96/000000/loan.png", width=100)

    # Load data and model
    data = load_sample_data()

    # Load Random Forest model only
    model = load_model("random_forest")

    # Prediction page
    st.title("Predict Loan Recovery")

    st.write("""
    Use this tool to predict the probability of recovering a loan based on customer and loan information.
    You can either:
    1. Enter information for a single loan
    2. Upload a CSV file with multiple loans
    """)

    prediction_type = st.radio("Prediction Type", ["Single Loan", "Batch Prediction"])

    if prediction_type == "Single Loan":
        _render_single_loan(model)
    else:  # Batch prediction
        _render_batch_prediction(model, data)
384
+
385
+
386
+
387
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()