import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_loan_data(n_samples: int = 1000, seed: int = 42) -> pd.DataFrame:
    """
    Generate synthetic loan data for the loan recovery system.

    Each record is built in dependent stages: employment status drives
    income, income drives credit score, income and credit score drive the
    loan amount, credit score drives interest rate and payment history,
    payment history drives days past due, and all of the performance
    fields drive the final recovery outcome.

    Note that this function reseeds the *global* ``numpy.random`` and
    ``random`` generators, so it affects any other code sharing them.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate (must not be negative)
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic loan data with the columns
        customer_id, age, gender, employment_status, annual_income,
        credit_score, loan_amount, interest_rate, loan_term,
        payment_history, days_past_due, previous_defaults,
        recovery_status (1 = recovered, 0 = not recovered),
        monthly_payment and debt_to_income.

    Raises:
    -------
    ValueError
        If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    np.random.seed(seed)
    random.seed(seed)

    # Lower bounds that keep the generated records financially sensible.
    # Incomes are drawn from unbounded normal distributions, so without a
    # floor they can be zero or negative, which in turn produces negative
    # loan amounts and negative / infinite debt-to-income ratios.
    min_annual_income = 1000.0
    min_loan_amount = 5000.0

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)  # upper bound is exclusive: 22..64
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )
    # (mean, standard deviation) of annual income for each employment status
    income_params = {
        'Employed': (60000, 20000),
        'Self-employed': (75000, 30000),
        'Unemployed': (15000, 10000),
        'Retired': (40000, 15000),
    }
    # One normal draw per customer, floored so income is always positive.
    annual_incomes = [
        max(min_annual_income, np.random.normal(*income_params[status]))
        for status in employment_statuses
    ]

    # Credit information
    credit_scores = []
    for income in annual_incomes:
        base_score = 300 + (income / 100000) * 400  # Higher income tends to have higher credit score
        # Add noise, truncate to an integer and clamp to the 300-850 range
        credit_scores.append(min(850, max(300, int(np.random.normal(base_score, 50)))))

    # Loan information
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        # Higher income and credit score can get larger loans
        max_loan = income * (0.5 + (credit - 300) / 850)
        # The upper bound must never fall below the lower bound:
        # uniform(low, high) with high < low is officially undefined in NumPy
        # and previously yielded loans below the intended minimum.
        upper_bound = max(min_loan_amount, max_loan)
        loan_amounts.append(np.random.uniform(min_loan_amount, upper_bound))

    interest_rates = []
    for credit in credit_scores:
        # Lower credit scores get higher interest rates
        base_rate = 15 - (credit - 300) * (10 / 550)  # Range from ~5% to ~15%
        # Add noise, then clamp to the 5%-15% band
        interest_rates.append(max(5, min(15, base_rate + np.random.normal(0, 1))))

    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)  # months

    # Loan performance
    payment_histories = []
    for credit in credit_scores:
        # Better credit scores tend to have better payment histories
        if credit > 750:
            payment_histories.append(np.random.choice(['Excellent', 'Good', 'Fair'], p=[0.8, 0.15, 0.05]))
        elif credit > 650:
            payment_histories.append(np.random.choice(['Excellent', 'Good', 'Fair', 'Poor'], p=[0.4, 0.4, 0.15, 0.05]))
        elif credit > 550:
            payment_histories.append(np.random.choice(['Good', 'Fair', 'Poor'], p=[0.3, 0.5, 0.2]))
        else:
            payment_histories.append(np.random.choice(['Fair', 'Poor', 'Very Poor'], p=[0.3, 0.5, 0.2]))

    # Each branch first draws one candidate value per delinquency bucket and
    # then picks a single bucket according to the given probabilities.
    days_past_due = []
    for history in payment_histories:
        if history == 'Excellent':
            days_past_due.append(np.random.choice([0, 0, 0, 0, np.random.randint(1, 10)], p=[0.9, 0.025, 0.025, 0.025, 0.025]))
        elif history == 'Good':
            days_past_due.append(np.random.choice([0, np.random.randint(1, 15), np.random.randint(15, 30)], p=[0.7, 0.2, 0.1]))
        elif history == 'Fair':
            days_past_due.append(np.random.choice([0, np.random.randint(1, 30), np.random.randint(30, 60)], p=[0.5, 0.3, 0.2]))
        elif history == 'Poor':
            days_past_due.append(np.random.choice([np.random.randint(1, 30), np.random.randint(30, 90), np.random.randint(90, 120)], p=[0.3, 0.4, 0.3]))
        else:  # Very Poor
            days_past_due.append(np.random.choice([np.random.randint(30, 90), np.random.randint(90, 180), np.random.randint(180, 365)], p=[0.2, 0.4, 0.4]))

    # Previous defaults
    previous_defaults = []
    for credit, history in zip(credit_scores, payment_histories):
        if credit < 500 or history in ['Poor', 'Very Poor']:
            previous_defaults.append(np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1]))
        elif credit < 650:
            previous_defaults.append(np.random.choice([0, 1], p=[0.8, 0.2]))
        else:
            previous_defaults.append(np.random.choice([0, 1], p=[0.95, 0.05]))

    # Recovery status (target variable)
    recovery_status = []
    for credit, history, dpd, defaults in zip(credit_scores, payment_histories, days_past_due, previous_defaults):
        # Factors affecting recovery:
        # 1. Credit score
        # 2. Payment history
        # 3. Days past due
        # 4. Previous defaults

        recovery_prob = 0.9  # Base probability

        # Adjust based on credit score
        if credit < 500:
            recovery_prob -= 0.3
        elif credit < 650:
            recovery_prob -= 0.1

        # Adjust based on payment history
        if history == 'Very Poor':
            recovery_prob -= 0.4
        elif history == 'Poor':
            recovery_prob -= 0.2
        elif history == 'Fair':
            recovery_prob -= 0.1

        # Adjust based on days past due
        if dpd > 180:
            recovery_prob -= 0.4
        elif dpd > 90:
            recovery_prob -= 0.3
        elif dpd > 30:
            recovery_prob -= 0.15
        elif dpd > 0:
            recovery_prob -= 0.05

        # Adjust based on previous defaults
        recovery_prob -= 0.1 * defaults

        # Clamp to [0.05, 0.95] so that neither outcome is ever certain
        recovery_prob = max(0.05, min(0.95, recovery_prob))

        recovery_status.append(np.random.choice([1, 0], p=[recovery_prob, 1-recovery_prob]))

    # Create DataFrame
    data = {
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status  # 1 = recovered, 0 = not recovered
    }

    df = pd.DataFrame(data)

    # Add some additional calculated features
    # Standard amortised payment: P * r * (1 + r)^n / ((1 + r)^n - 1),
    # with r the monthly rate and n the term in months. The rate is clamped
    # to at least 5% above, so the denominator is never zero.
    monthly_rate = df['interest_rate'] / 100 / 12
    growth = (1 + monthly_rate) ** df['loan_term']
    df['monthly_payment'] = (df['loan_amount'] * monthly_rate * growth) / (growth - 1)

    # Annualised payments relative to income; income is strictly positive
    # thanks to the floor applied above.
    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    df['annual_income'] = df['annual_income'].round(2)
    df['loan_amount'] = df['loan_amount'].round(2)
    df['interest_rate'] = df['interest_rate'].round(2)
    df['monthly_payment'] = df['monthly_payment'].round(2)
    df['debt_to_income'] = df['debt_to_income'].round(4)

    return df

if __name__ == "__main__":
    import os

    # Build the synthetic data set and persist it under ./data
    records = generate_loan_data(n_samples=1000)
    os.makedirs('data', exist_ok=True)
    records.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(records)} loan records and saved to data/loan_data.csv")

    # Preview the first rows, then the per-column summary statistics
    print("\nSample data:", records.head(), sep="\n")
    print("\nSummary statistics:", records.describe(), sep="\n")

    # Share of loans flagged as recovered, expressed as a percentage
    pct_recovered = 100 * records['recovery_status'].mean()
    print(f"\nOverall recovery rate: {pct_recovered:.2f}%")