# Scraped page header (not part of the program): Spaces: Sleeping | File size: 7,660 Bytes | 9d99cff | line-number gutter 1-203 omitted
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
# Floor applied to every sampled income. The per-status normal draws can come
# out negative (most often for 'Unemployed'), which would otherwise produce
# negative loan amounts and meaningless debt-to-income ratios.
_MIN_ANNUAL_INCOME = 1000.0

# Smallest loan the generator issues; also the lower bound of the uniform draw.
_MIN_LOAN_AMOUNT = 5000.0

# (mean, standard deviation) of the normal income distribution per status.
_INCOME_PARAMS = {
    'Employed': (60000, 20000),
    'Self-employed': (75000, 30000),
    'Unemployed': (15000, 10000),
    'Retired': (40000, 15000),
}


def _sample_payment_history(credit):
    """Draw a payment-history label; higher credit scores favour better labels."""
    if credit > 750:
        return np.random.choice(['Excellent', 'Good', 'Fair'], p=[0.8, 0.15, 0.05])
    if credit > 650:
        return np.random.choice(['Excellent', 'Good', 'Fair', 'Poor'], p=[0.4, 0.4, 0.15, 0.05])
    if credit > 550:
        return np.random.choice(['Good', 'Fair', 'Poor'], p=[0.3, 0.5, 0.2])
    return np.random.choice(['Fair', 'Poor', 'Very Poor'], p=[0.3, 0.5, 0.2])


def _sample_days_past_due(history):
    """Draw a days-past-due value; worse payment histories yield longer delays."""
    if history == 'Excellent':
        return np.random.choice(
            [0, 0, 0, 0, np.random.randint(1, 10)],
            p=[0.9, 0.025, 0.025, 0.025, 0.025],
        )
    if history == 'Good':
        return np.random.choice(
            [0, np.random.randint(1, 15), np.random.randint(15, 30)],
            p=[0.7, 0.2, 0.1],
        )
    if history == 'Fair':
        return np.random.choice(
            [0, np.random.randint(1, 30), np.random.randint(30, 60)],
            p=[0.5, 0.3, 0.2],
        )
    if history == 'Poor':
        return np.random.choice(
            [np.random.randint(1, 30), np.random.randint(30, 90), np.random.randint(90, 120)],
            p=[0.3, 0.4, 0.3],
        )
    # Very Poor
    return np.random.choice(
        [np.random.randint(30, 90), np.random.randint(90, 180), np.random.randint(180, 365)],
        p=[0.2, 0.4, 0.4],
    )


def _sample_previous_defaults(credit, history):
    """Draw the number of previous defaults (0-3) from credit score and history."""
    if credit < 500 or history in ['Poor', 'Very Poor']:
        return np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1])
    if credit < 650:
        return np.random.choice([0, 1], p=[0.8, 0.2])
    return np.random.choice([0, 1], p=[0.95, 0.05])


def _recovery_probability(credit, history, dpd, defaults):
    """Return the probability that a loan is recovered, clamped to [0.05, 0.95]."""
    prob = 0.9  # Base probability

    # Adjust based on credit score
    if credit < 500:
        prob -= 0.3
    elif credit < 650:
        prob -= 0.1

    # Adjust based on payment history
    if history == 'Very Poor':
        prob -= 0.4
    elif history == 'Poor':
        prob -= 0.2
    elif history == 'Fair':
        prob -= 0.1

    # Adjust based on days past due
    if dpd > 180:
        prob -= 0.4
    elif dpd > 90:
        prob -= 0.3
    elif dpd > 30:
        prob -= 0.15
    elif dpd > 0:
        prob -= 0.05

    # Adjust based on previous defaults
    prob -= 0.1 * defaults

    # Keep both outcomes possible for every record.
    return max(0.05, min(0.95, prob))


def generate_loan_data(n_samples=1000, seed=42):
    """
    Generate synthetic loan data for the loan recovery system.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate
    seed : int
        Random seed for reproducibility (seeds both ``numpy.random`` and
        ``random``)

    Returns:
    --------
    pandas.DataFrame
        One row per loan with the columns customer_id, age, gender,
        employment_status, annual_income, credit_score, loan_amount,
        interest_rate, loan_term, payment_history, days_past_due,
        previous_defaults, recovery_status (1 = recovered, 0 = not
        recovered), monthly_payment and debt_to_income.

    Notes:
    ------
    Sampled incomes are floored at ``_MIN_ANNUAL_INCOME`` and loan amounts
    never fall below ``_MIN_LOAN_AMOUNT``, so incomes, loan amounts, monthly
    payments and debt-to-income ratios are always positive.
    """
    np.random.seed(seed)
    random.seed(seed)

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )

    # One normal draw per customer, floored so that no income is negative
    # or close to zero.
    annual_incomes = [
        max(_MIN_ANNUAL_INCOME, np.random.normal(*_INCOME_PARAMS[status]))
        for status in employment_statuses
    ]

    # Credit information
    credit_scores = []
    for income in annual_incomes:
        # Higher income tends to have higher credit score
        base_score = 300 + (income / 100000) * 400
        credit_scores.append(min(850, max(300, int(np.random.normal(base_score, 50)))))

    # Loan information
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        # Higher income and credit score can get larger loans
        max_loan = income * (0.5 + (credit - 300) / 850)
        # np.random.uniform does not reject high < low; it would silently
        # return values below the minimum, so pin the upper bound.
        upper = max(_MIN_LOAN_AMOUNT, max_loan)
        loan_amounts.append(np.random.uniform(_MIN_LOAN_AMOUNT, upper))

    interest_rates = []
    for credit in credit_scores:
        # Lower credit scores get higher interest rates:
        # 15% at a score of 300 down to 5% at a score of 850, plus noise.
        base_rate = 15 - (credit - 300) * (10 / 550)
        interest_rates.append(max(5, min(15, base_rate + np.random.normal(0, 1))))

    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

    # Loan performance
    payment_histories = [_sample_payment_history(credit) for credit in credit_scores]
    days_past_due = [_sample_days_past_due(history) for history in payment_histories]

    # Previous defaults
    previous_defaults = [
        _sample_previous_defaults(credit, history)
        for credit, history in zip(credit_scores, payment_histories)
    ]

    # Recovery status (target variable)
    recovery_status = []
    for credit, history, dpd, defaults in zip(
        credit_scores, payment_histories, days_past_due, previous_defaults
    ):
        recovery_prob = _recovery_probability(credit, history, dpd, defaults)
        recovery_status.append(
            np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob])
        )

    # Create DataFrame
    df = pd.DataFrame({
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status,  # 1 = recovered, 0 = not recovered
    })

    # Standard amortised payment: P * r * (1 + r)^n / ((1 + r)^n - 1),
    # with r the monthly rate and n the term in months.
    monthly_rate = df['interest_rate'] / 100 / 12
    growth = (1 + monthly_rate) ** df['loan_term']
    df['monthly_payment'] = df['loan_amount'] * monthly_rate * growth / (growth - 1)
    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    df['annual_income'] = df['annual_income'].round(2)
    df['loan_amount'] = df['loan_amount'].round(2)
    df['interest_rate'] = df['interest_rate'].round(2)
    df['monthly_payment'] = df['monthly_payment'].round(2)
    df['debt_to_income'] = df['debt_to_income'].round(4)

    return df
if __name__ == "__main__":
    import os

    # Build the dataset with the default seed.
    records = generate_loan_data(n_samples=1000)

    # Write it out as CSV, creating the target directory on first run.
    os.makedirs('data', exist_ok=True)
    records.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(records)} loan records and saved to data/loan_data.csv")

    # Show the first rows, then the per-column summary statistics.
    print("\nSample data:", records.head(), sep="\n")
    print("\nSummary statistics:", records.describe(), sep="\n")

    # recovery_status is 0/1, so its mean is the recovered share.
    recovered_share = records['recovery_status'].mean()
    recovery_rate = recovered_share * 100
    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")
|