Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from datetime import datetime, timedelta | |
import random | |
# Lower bounds that keep every generated record financially plausible.
_MIN_ANNUAL_INCOME = 5000.0  # floor applied to incomes drawn from a normal distribution
_MIN_LOAN_AMOUNT = 5000.0    # smallest loan amount the generator issues

# (mean, standard deviation) of annual income for each employment status.
_INCOME_PARAMS = {
    'Employed': (60000, 20000),
    'Self-employed': (75000, 30000),
    'Unemployed': (15000, 10000),
    'Retired': (40000, 15000),
}

# (credit score that must be exceeded, labels, probabilities), checked top-down.
_HISTORY_TIERS = (
    (750, ['Excellent', 'Good', 'Fair'], [0.8, 0.15, 0.05]),
    (650, ['Excellent', 'Good', 'Fair', 'Poor'], [0.4, 0.4, 0.15, 0.05]),
    (550, ['Good', 'Fair', 'Poor'], [0.3, 0.5, 0.2]),
)
# Used when the credit score is 550 or lower.
_HISTORY_FALLBACK = (['Fair', 'Poor', 'Very Poor'], [0.3, 0.5, 0.2])

# Days-past-due candidates per payment history. ``None`` stands for 0 days and
# a ``(low, high)`` pair is sampled with ``np.random.randint(low, high)``.
_DPD_BUCKETS = {
    'Excellent': ((None, None, None, None, (1, 10)),
                  [0.9, 0.025, 0.025, 0.025, 0.025]),
    'Good': ((None, (1, 15), (15, 30)), [0.7, 0.2, 0.1]),
    'Fair': ((None, (1, 30), (30, 60)), [0.5, 0.3, 0.2]),
    'Poor': (((1, 30), (30, 90), (90, 120)), [0.3, 0.4, 0.3]),
    'Very Poor': (((30, 90), (90, 180), (180, 365)), [0.2, 0.4, 0.4]),
}


def _sample_payment_history(credit):
    """Draw a payment-history label whose distribution depends on the credit score."""
    for threshold, labels, probs in _HISTORY_TIERS:
        if credit > threshold:
            return np.random.choice(labels, p=probs)
    labels, probs = _HISTORY_FALLBACK
    return np.random.choice(labels, p=probs)


def _sample_days_past_due(history):
    """Draw a days-past-due value from the buckets configured for ``history``."""
    buckets, probs = _DPD_BUCKETS.get(str(history), _DPD_BUCKETS['Very Poor'])
    # Every ranged bucket is sampled (left to right) before one is selected.
    candidates = [0 if b is None else np.random.randint(*b) for b in buckets]
    return np.random.choice(candidates, p=probs)


def _sample_previous_defaults(credit, history):
    """Draw the number of previous defaults for one borrower."""
    if credit < 500 or history in ['Poor', 'Very Poor']:
        return np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1])
    if credit < 650:
        return np.random.choice([0, 1], p=[0.8, 0.2])
    return np.random.choice([0, 1], p=[0.95, 0.05])


def _recovery_probability(credit, history, dpd, defaults):
    """Return the probability (clamped to [0.05, 0.95]) that a loan is recovered."""
    prob = 0.9  # Base probability

    # Penalty for a weak credit score
    if credit < 500:
        prob -= 0.3
    elif credit < 650:
        prob -= 0.1

    # Penalty for a weak payment history
    if history == 'Very Poor':
        prob -= 0.4
    elif history == 'Poor':
        prob -= 0.2
    elif history == 'Fair':
        prob -= 0.1

    # Penalty that grows with the number of days past due
    if dpd > 180:
        prob -= 0.4
    elif dpd > 90:
        prob -= 0.3
    elif dpd > 30:
        prob -= 0.15
    elif dpd > 0:
        prob -= 0.05

    # Each previous default costs a further 10 percentage points
    prob -= 0.1 * defaults

    # Keep both outcomes possible: never below 5% and never above 95%
    return max(0.05, min(0.95, prob))


def generate_loan_data(n_samples=1000, seed=42):
    """
    Generate synthetic loan data for the loan recovery system.

    Incomes are floored at 5000 and the upper bound of the loan-amount draw is
    never allowed below the 5000 minimum loan, so every record has a positive
    income, a loan of at least 5000, and a positive monthly payment and
    debt-to-income ratio.

    Calling this function reseeds the global NumPy and ``random`` generators.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic loan data with the columns
        customer_id, age, gender, employment_status, annual_income,
        credit_score, loan_amount, interest_rate, loan_term,
        payment_history, days_past_due, previous_defaults,
        recovery_status (1 = recovered, 0 = not recovered),
        monthly_payment and debt_to_income
    """
    np.random.seed(seed)
    random.seed(seed)

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )
    # A normal draw can be zero or negative (most often for 'Unemployed'), so
    # the income is floored to keep later loan and ratio maths meaningful.
    annual_incomes = [
        max(_MIN_ANNUAL_INCOME, np.random.normal(*_INCOME_PARAMS[str(status)]))
        for status in employment_statuses
    ]

    # Credit information: higher income tends to have higher credit score
    credit_scores = [
        min(850, max(300, int(np.random.normal(300 + (income / 100000) * 400, 50))))
        for income in annual_incomes
    ]

    # Loan information: higher income and credit score can get larger loans
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        # Never let the upper bound drop below the lower bound of the draw.
        max_loan = max(_MIN_LOAN_AMOUNT, income * (0.5 + (credit - 300) / 850))
        loan_amounts.append(np.random.uniform(_MIN_LOAN_AMOUNT, max_loan))

    # Lower credit scores get higher interest rates (clamped to 5%..15%)
    interest_rates = [
        max(5, min(15, 15 - (credit - 300) * (10 / 550) + np.random.normal(0, 1)))
        for credit in credit_scores
    ]
    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

    # Loan performance
    payment_histories = [_sample_payment_history(credit) for credit in credit_scores]
    days_past_due = [_sample_days_past_due(history) for history in payment_histories]

    # Previous defaults
    previous_defaults = [
        _sample_previous_defaults(credit, history)
        for credit, history in zip(credit_scores, payment_histories)
    ]

    # Recovery status (target variable)
    recovery_status = []
    for credit, history, dpd, defaults in zip(
            credit_scores, payment_histories, days_past_due, previous_defaults):
        recovery_prob = _recovery_probability(credit, history, dpd, defaults)
        recovery_status.append(
            np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob])
        )

    # Create DataFrame
    df = pd.DataFrame({
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status,  # 1 = recovered, 0 = not recovered
    })

    # Amortised monthly payment: P * r * (1 + r)^n / ((1 + r)^n - 1),
    # where r is the monthly rate and n the term in months.
    monthly_rate = df['interest_rate'] / 100 / 12
    growth = (1 + monthly_rate) ** df['loan_term']
    df['monthly_payment'] = (df['loan_amount'] * monthly_rate * growth) / (growth - 1)
    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    for column, decimals in (
        ('annual_income', 2),
        ('loan_amount', 2),
        ('interest_rate', 2),
        ('monthly_payment', 2),
        ('debt_to_income', 4),
    ):
        df[column] = df[column].round(decimals)

    return df
if __name__ == "__main__":
    import os

    # Build the 1000-record synthetic dataset.
    loan_data = generate_loan_data(n_samples=1000)

    # Write it to data/loan_data.csv, creating the folder when it is missing.
    os.makedirs('data', exist_ok=True)
    loan_data.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(loan_data)} loan records and saved to data/loan_data.csv")

    # Echo a preview of the first rows, then the describe() summary.
    for heading, table in (
        ("\nSample data:", loan_data.head()),
        ("\nSummary statistics:", loan_data.describe()),
    ):
        print(heading)
        print(table)

    # Mean of the 0/1 recovery_status column, expressed as a percentage.
    recovery_rate = loan_data['recovery_status'].mean() * 100
    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")