Spaces:

Nikhillmahesh701
/

Loan_Recovery

Sleeping

App Files Files Community

Loan_Recovery / src /utils /data_generator.py

Nikhillmahesh701

Upload 13 files

9d99cff verified about 2 months ago

raw

history blame contribute delete

7.66 kB

	import pandas as pd
	import numpy as np
	from datetime import datetime, timedelta
	import random

	def generate_loan_data(n_samples=1000, seed=42):
	    """
	    Generate synthetic loan data for the loan recovery system.

	    Each record is built as a chain of random draws: employment status ->
	    annual income -> credit score -> loan amount / interest rate ->
	    payment history -> days past due -> previous defaults -> recovery
	    status, so later fields are correlated with earlier ones.

	    Parameters:
	    -----------
	    n_samples : int
	        Number of loan records to generate (must be >= 0)
	    seed : int
	        Random seed for reproducibility. Note that this reseeds the
	        global ``numpy.random`` and ``random`` generators.

	    Returns:
	    --------
	    pandas.DataFrame
	        DataFrame containing synthetic loan data, one row per loan, with
	        the columns customer_id, age, gender, employment_status,
	        annual_income, credit_score, loan_amount, interest_rate,
	        loan_term, payment_history, days_past_due, previous_defaults,
	        recovery_status (1 = recovered, 0 = not recovered),
	        monthly_payment and debt_to_income.

	    Raises:
	    -------
	    ValueError
	        If n_samples is negative.
	    """
	    if n_samples < 0:
	        raise ValueError("n_samples must be non-negative")

	    np.random.seed(seed)
	    random.seed(seed)

	    # Customer information
	    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
	    ages = np.random.randint(22, 65, n_samples)  # upper bound is exclusive
	    genders = np.random.choice(['Male', 'Female'], n_samples)

	    # Employment information
	    employment_statuses = np.random.choice(
	        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
	        n_samples,
	        p=[0.65, 0.20, 0.10, 0.05],
	    )
	    annual_incomes = [_draw_annual_income(status) for status in employment_statuses]

	    # Credit information
	    credit_scores = [_draw_credit_score(income) for income in annual_incomes]

	    # Loan information
	    loan_amounts = [
	        _draw_loan_amount(income, credit)
	        for income, credit in zip(annual_incomes, credit_scores)
	    ]
	    interest_rates = [_draw_interest_rate(credit) for credit in credit_scores]
	    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

	    # Loan performance
	    payment_histories = [_draw_payment_history(credit) for credit in credit_scores]
	    days_past_due = [_draw_days_past_due(history) for history in payment_histories]
	    previous_defaults = [
	        _draw_previous_defaults(credit, history)
	        for credit, history in zip(credit_scores, payment_histories)
	    ]

	    # Recovery status (target variable)
	    recovery_status = []
	    for credit, history, dpd, defaults in zip(
	        credit_scores, payment_histories, days_past_due, previous_defaults
	    ):
	        recovery_prob = _recovery_probability(credit, history, dpd, defaults)
	        recovery_status.append(
	            np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob])
	        )

	    # Float columns are converted explicitly so that arithmetic and rounding
	    # below also work when n_samples == 0 (empty lists would otherwise
	    # become object-dtype columns).
	    df = pd.DataFrame({
	        'customer_id': customer_ids,
	        'age': ages,
	        'gender': genders,
	        'employment_status': employment_statuses,
	        'annual_income': np.asarray(annual_incomes, dtype=float),
	        'credit_score': credit_scores,
	        'loan_amount': np.asarray(loan_amounts, dtype=float),
	        'interest_rate': np.asarray(interest_rates, dtype=float),
	        'loan_term': loan_terms,
	        'payment_history': payment_histories,
	        'days_past_due': days_past_due,
	        'previous_defaults': previous_defaults,
	        'recovery_status': recovery_status,  # 1 = recovered, 0 = not recovered
	    })

	    # Standard amortised payment: P * r * (1 + r)^n / ((1 + r)^n - 1), where
	    # r is the monthly rate. r is never zero because rates are clamped to >= 5%.
	    monthly_rate = df['interest_rate'] / 100 / 12
	    growth = (1 + monthly_rate) ** df['loan_term']
	    df['monthly_payment'] = (df['loan_amount'] * monthly_rate * growth) / (growth - 1)

	    # Annualised payment relative to income; income is strictly positive.
	    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

	    # Round numeric columns for readability (after the derived columns have
	    # been computed from the unrounded values).
	    df['annual_income'] = df['annual_income'].round(2)
	    df['loan_amount'] = df['loan_amount'].round(2)
	    df['interest_rate'] = df['interest_rate'].round(2)
	    df['monthly_payment'] = df['monthly_payment'].round(2)
	    df['debt_to_income'] = df['debt_to_income'].round(4)

	    return df


	# Smallest annual income a record may have. The normal draws below are
	# unbounded, so without a floor some incomes come out negative or zero.
	_MIN_ANNUAL_INCOME = 1000.0

	# Smallest loan amount; also the lower bound of the uniform loan draw.
	_MIN_LOAN_AMOUNT = 5000.0

	# (mean, standard deviation) of annual income per employment status.
	_INCOME_PARAMS = {
	    'Employed': (60000, 20000),
	    'Self-employed': (75000, 30000),
	    'Unemployed': (15000, 10000),
	    'Retired': (40000, 15000),
	}

	# (exclusive credit-score floor, labels, probabilities), checked top-down.
	_PAYMENT_HISTORY_TIERS = (
	    (750, ['Excellent', 'Good', 'Fair'], [0.8, 0.15, 0.05]),
	    (650, ['Excellent', 'Good', 'Fair', 'Poor'], [0.4, 0.4, 0.15, 0.05]),
	    (550, ['Good', 'Fair', 'Poor'], [0.3, 0.5, 0.2]),
	)
	_PAYMENT_HISTORY_LOWEST = (['Fair', 'Poor', 'Very Poor'], [0.3, 0.5, 0.2])

	# Per payment history: delinquency buckets and their probabilities. A bucket
	# is either None (0 days) or a (low, high) pair passed to np.random.randint.
	_DPD_BUCKETS = {
	    'Excellent': ((None, (1, 10)), (0.975, 0.025)),
	    'Good': ((None, (1, 15), (15, 30)), (0.7, 0.2, 0.1)),
	    'Fair': ((None, (1, 30), (30, 60)), (0.5, 0.3, 0.2)),
	    'Poor': (((1, 30), (30, 90), (90, 120)), (0.3, 0.4, 0.3)),
	    'Very Poor': (((30, 90), (90, 180), (180, 365)), (0.2, 0.4, 0.4)),
	}


	def _draw_annual_income(status):
	    """Draw an annual income for an employment status, floored at a positive minimum."""
	    # Any status not listed is treated like 'Retired'.
	    mean, spread = _INCOME_PARAMS.get(str(status), _INCOME_PARAMS['Retired'])
	    return max(_MIN_ANNUAL_INCOME, np.random.normal(mean, spread))


	def _draw_credit_score(income):
	    """Draw a credit score in [300, 850] whose mean rises with income."""
	    base_score = 300 + (income / 100000) * 400
	    return min(850, max(300, int(np.random.normal(base_score, 50))))


	def _draw_loan_amount(income, credit):
	    """Draw a loan amount between the minimum loan and an income/credit-based cap."""
	    max_loan = income * (0.5 + (credit - 300) / 850)
	    # Keep the upper bound >= the lower bound: np.random.uniform does not
	    # reject high < low and would return amounts below the minimum.
	    return np.random.uniform(_MIN_LOAN_AMOUNT, max(_MIN_LOAN_AMOUNT, max_loan))


	def _draw_interest_rate(credit):
	    """Draw an interest rate in [5, 15] percent; lower credit scores pay more."""
	    base_rate = 15 - (credit - 300) * (10 / 550)
	    return max(5, min(15, base_rate + np.random.normal(0, 1)))


	def _draw_payment_history(credit):
	    """Draw a payment-history label from the tier matching the credit score."""
	    for floor, labels, weights in _PAYMENT_HISTORY_TIERS:
	        if credit > floor:
	            return np.random.choice(labels, p=weights)
	    labels, weights = _PAYMENT_HISTORY_LOWEST
	    return np.random.choice(labels, p=weights)


	def _draw_days_past_due(history):
	    """Draw days past due from the delinquency buckets of a payment history."""
	    buckets, weights = _DPD_BUCKETS[str(history)]
	    # One candidate value is drawn per bucket, then one bucket is picked.
	    candidates = [0 if bucket is None else np.random.randint(*bucket) for bucket in buckets]
	    return np.random.choice(candidates, p=weights)


	def _draw_previous_defaults(credit, history):
	    """Draw the number of previous defaults given credit score and payment history."""
	    if credit < 500 or history in ('Poor', 'Very Poor'):
	        return np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1])
	    if credit < 650:
	        return np.random.choice([0, 1], p=[0.8, 0.2])
	    return np.random.choice([0, 1], p=[0.95, 0.05])


	def _recovery_probability(credit, history, dpd, defaults):
	    """Return the recovery probability, clamped to [0.05, 0.95]."""
	    recovery_prob = 0.9  # Base probability

	    # Adjust based on credit score
	    if credit < 500:
	        recovery_prob -= 0.3
	    elif credit < 650:
	        recovery_prob -= 0.1

	    # Adjust based on payment history
	    if history == 'Very Poor':
	        recovery_prob -= 0.4
	    elif history == 'Poor':
	        recovery_prob -= 0.2
	    elif history == 'Fair':
	        recovery_prob -= 0.1

	    # Adjust based on days past due
	    if dpd > 180:
	        recovery_prob -= 0.4
	    elif dpd > 90:
	        recovery_prob -= 0.3
	    elif dpd > 30:
	        recovery_prob -= 0.15
	    elif dpd > 0:
	        recovery_prob -= 0.05

	    # Adjust based on previous defaults
	    recovery_prob -= 0.1 * defaults

	    # Never fully certain either way: keep the probability in [0.05, 0.95].
	    return max(0.05, min(0.95, recovery_prob))

	if __name__ == "__main__":
	    import os

	    # Destination of the generated dataset, relative to the working directory.
	    csv_path = 'data/loan_data.csv'

	    # Build the synthetic dataset with the default seed.
	    loan_data = generate_loan_data(n_samples=1000)

	    # Make sure the target folder exists, then write the CSV without the index.
	    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
	    loan_data.to_csv(csv_path, index=False)
	    print(f"Generated {len(loan_data)} loan records and saved to data/loan_data.csv")

	    # Preview the first rows.
	    print("\nSample data:")
	    print(loan_data.head())

	    # Descriptive statistics of the numeric columns.
	    print("\nSummary statistics:")
	    print(loan_data.describe())

	    # Share of loans flagged as recovered, as a percentage.
	    recovery_rate = loan_data['recovery_status'].mean() * 100
	    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")