import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

class LoanDataProcessor:
    """
    Class for preprocessing loan data for machine learning models.

    Numerical columns are median-imputed and standardised; categorical
    columns are imputed with the most frequent value and one-hot encoded.
    Both branches are combined in a single scikit-learn ColumnTransformer.
    """

    # Default column sets; copied per instance so mutating one processor's
    # lists never leaks into another.
    DEFAULT_CATEGORICAL_FEATURES = ('gender', 'employment_status', 'payment_history')
    DEFAULT_NUMERICAL_FEATURES = ('age', 'annual_income', 'credit_score', 'loan_amount',
                                  'interest_rate', 'loan_term', 'days_past_due',
                                  'previous_defaults', 'monthly_payment', 'debt_to_income')

    def __init__(self, categorical_features=None, numerical_features=None):
        """
        Initialize the data processor.

        Parameters:
        -----------
        categorical_features : iterable of str, optional
            Columns to one-hot encode. Defaults to the standard loan
            categorical columns when omitted.
        numerical_features : iterable of str, optional
            Columns to impute and scale. Defaults to the standard loan
            numerical columns when omitted.
        """
        # Stays None until fit() completes successfully.
        self.preprocessor = None
        self.categorical_features = list(
            self.DEFAULT_CATEGORICAL_FEATURES if categorical_features is None
            else categorical_features)
        self.numerical_features = list(
            self.DEFAULT_NUMERICAL_FEATURES if numerical_features is None
            else numerical_features)

    def _check_columns(self, X):
        """Raise ValueError naming any configured feature column absent from X."""
        columns = getattr(X, 'columns', None)
        if columns is None:
            # Not a DataFrame-like input; let scikit-learn report the problem.
            return
        required = [*self.numerical_features, *self.categorical_features]
        missing = [name for name in required if name not in columns]
        if missing:
            raise ValueError(f"Input data is missing required columns: {missing}")

    def fit(self, X):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor

        Raises:
        -------
        ValueError
            If X lacks any of the configured feature columns.
        """
        # Fail early with a message that lists every missing column.
        self._check_columns(X)

        # Define preprocessing for numerical features
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

        # Fit on a local first: if fitting raises, self.preprocessor must not
        # be left pointing at an unfitted transformer, otherwise the
        # "not fitted" guards in transform()/get_feature_names() are bypassed.
        preprocessor.fit(X)
        self.preprocessor = preprocessor

        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray or scipy sparse matrix
            The transformed data. ColumnTransformer may return a sparse
            matrix when the one-hot block makes the result sparse enough
            (see its ``sparse_threshold`` setting).

        Raises:
        -------
        ValueError
            If fit() has not been called successfully.
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        return self.preprocessor.transform(X)

    def fit_transform(self, X):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform

        Returns:
        --------
        numpy.ndarray or scipy sparse matrix
            The transformed data
        """
        return self.fit(X).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            Numerical feature names (unchanged) followed by the one-hot
            encoded categorical feature names.

        Raises:
        -------
        ValueError
            If fit() has not been called successfully.
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        # Numerical feature names stay the same after scaling.
        # NOTE(review): this assumes the imputer kept every numerical column
        # (i.e. none was entirely missing in the training data) - confirm.
        feature_names = list(self.numerical_features)

        # Categorical names are expanded by one-hot encoding. Look the
        # pipeline up by name rather than by position in transformers_.
        if self.categorical_features:
            encoder = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
            feature_names.extend(
                list(encoder.get_feature_names_out(self.categorical_features)))

        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        The input frame is not modified; dropped columns are removed from
        a new frame.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple or pandas.DataFrame
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it's not a feature
        if 'customer_id' in data.columns:
            data = data.drop('customer_id', axis=1)

        # Prediction-time data has no target: return the features alone.
        if target_column not in data.columns:
            return data

        X = data.drop(target_column, axis=1)
        y = data[target_column]
        return X, y