# Spaces: 0xnu / fraud-detection (Sleeping)
# File size: 7,171 Bytes
# 4de4bb8

import joblib
import pandas as pd
import numpy as np

def get_model_feature_names(preprocessor):
    """Return the feature names recorded on ``preprocessor``, if any.

    Args:
        preprocessor: Any object; its ``feature_names`` attribute is read
            when present.

    Returns:
        The value of ``preprocessor.feature_names`` when the attribute exists
        and is truthy; otherwise ``None``. ``None`` is also returned when
        reading or truth-testing the attribute raises an ordinary exception.
    """
    try:
        names = getattr(preprocessor, 'feature_names', None)
        return names if names else None
    except Exception:
        # Best-effort lookup: a failure to read the attribute is reported as
        # "unknown". Only Exception subclasses are swallowed, so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        return None

def create_complete_feature_set(transaction_data, expected_features):
    """Return a copy of ``transaction_data`` with every expected feature set.

    Values already present in ``transaction_data`` are kept as they are. Each
    name in ``expected_features`` that is absent is added, using the built-in
    default for that name when there is one and otherwise a fallback chosen
    from the name's prefix. The input mapping itself is not modified (a
    shallow ``.copy()`` is taken).

    Args:
        transaction_data: Mapping of feature name to value; must provide
            ``.copy()``.
        expected_features: Iterable of feature names that must be present in
            the result.

    Returns:
        The copied mapping, with the missing features appended in the order
        they appear in ``expected_features``.
    """
    filled = transaction_data.copy()

    # Named transaction, card, address/distance, e-mail and device defaults.
    defaults = dict(
        TransactionAmt=100.0,
        TransactionDT=86400,
        ProductCD='W',
        card1=13553,
        card2=150.0,
        card3=150.0,
        card4='visa',
        card5=142.0,
        card6='credit',
        addr1=325.0,
        addr2=87.0,
        dist1=19.0,
        dist2=19.0,
        P_emaildomain='gmail.com',
        R_emaildomain='gmail.com',
        DeviceType='desktop',
        DeviceInfo='Windows',
    )

    # Count features C1-C14: 0.0 unless overridden to 1.0.
    defaults.update((f'C{n}', 0.0) for n in range(1, 15))
    defaults.update(
        dict.fromkeys(('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'), 1.0)
    )

    # Time delta features D1-D15: 0.0 except D5.
    defaults.update((f'D{n}', 0.0) for n in range(1, 16))
    defaults['D5'] = 20.0

    # Match features M1-M9: 'F' except the first four.
    defaults.update((f'M{n}', 'F') for n in range(1, 10))
    defaults.update(M1='T', M2='T', M3='T', M4='M0')

    # V1-V339 and id_01-id_38.
    defaults.update((f'V{n}', 1.0) for n in range(1, 340))
    defaults.update((f'id_{n:02d}', 0.0) for n in range(1, 39))

    # Fallbacks for names without an explicit default, tried in this order;
    # anything that matches no prefix gets the numeric default 0.0.
    prefix_fallbacks = (
        ('V', 1.0),
        ('id_', 0.0),
        ('C', 0.0),
        ('D', 0.0),
        ('M', 'F'),
    )

    for name in expected_features:
        if name in filled:
            continue
        if name in defaults:
            filled[name] = defaults[name]
            continue
        filled[name] = next(
            (value for prefix, value in prefix_fallbacks if name.startswith(prefix)),
            0.0,
        )

    return filled

def debug_feature_mismatch(model_features, data_features):
    """Report which feature names differ between the model and the data.

    Prints a short summary to stdout and returns the same information.

    Args:
        model_features: Iterable of feature names the model expects, or
            ``None`` (treated as no features).
        data_features: Iterable of feature names present in the data, or
            ``None`` (treated as no features).

    Returns:
        A dict with keys ``'missing'`` (names expected by the model but absent
        from the data), ``'extra'`` (names in the data the model does not
        expect), ``'model_count'`` and ``'data_count'`` (numbers of distinct
        names on each side). Both lists are sorted by the names' string form.
    """
    # Compare against None instead of using truthiness: array-like inputs
    # such as numpy arrays raise when evaluated in a boolean context.
    model_set = set(model_features) if model_features is not None else set()
    data_set = set(data_features) if data_features is not None else set()

    # Sorted so the printed sample and the returned lists are reproducible;
    # plain set iteration order for strings can differ between runs.
    missing_in_data = sorted(model_set - data_set, key=str)
    extra_in_data = sorted(data_set - model_set, key=str)

    print(f"Model expects {len(model_set)} features")
    print(f"Data has {len(data_set)} features")
    print(f"Missing in data: {len(missing_in_data)} features")
    print(f"Extra in data: {len(extra_in_data)} features")

    if missing_in_data:
        print(f"First 10 missing features: {missing_in_data[:10]}")

    return {
        'missing': missing_in_data,
        'extra': extra_in_data,
        'model_count': len(model_set),
        'data_count': len(data_set)
    }

def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
    """Predict on one transaction after filling in the expected features.

    The expected feature names are taken from ``preprocessor``, the
    transaction is completed with defaults, turned into a one-row DataFrame
    (with a placeholder ``TransactionID`` column when none is present), passed
    through ``preprocessor.preprocess(..., fit=False)`` and scored with
    ``model.predict_proba``. A feature-alignment report is printed along the
    way.

    Args:
        model: Object exposing ``predict_proba``.
        preprocessor: Object exposing ``preprocess(df, fit=False)`` that
            returns a pair whose first item has ``.columns``.
        transaction_data: Mapping of feature name to value for one transaction.

    Returns:
        On success: ``{'success': True, 'probability': float, 'debug_info':
        dict}``, where the probability is element ``[0, 1]`` of the
        ``predict_proba`` result. On any exception: ``{'success': False,
        'error': str, 'debug_info': None}``; exceptions are not re-raised.
    """
    try:
        feature_names = get_model_feature_names(preprocessor)
        if feature_names is None:
            raise ValueError("Could not determine expected features from preprocessor")

        # One-row frame holding the supplied values plus defaults.
        row = create_complete_feature_set(transaction_data, feature_names)
        frame = pd.DataFrame([row])
        if 'TransactionID' not in frame.columns:
            frame['TransactionID'] = 'temp_id'

        processed, _ = preprocessor.preprocess(frame, fit=False)

        # Report how the processed columns line up with the expected names.
        alignment = debug_feature_mismatch(feature_names, processed.columns.tolist())
        print(f"Feature alignment: {alignment}")

        fraud_probability = float(model.predict_proba(processed)[0, 1])
    except Exception as exc:
        return {'success': False, 'error': str(exc), 'debug_info': None}

    return {
        'success': True,
        'probability': fraud_probability,
        'debug_info': alignment,
    }

# Function to inspect your saved model's expected features
def inspect_model_features(model_path, preprocessor_path):
    """Load a saved model and preprocessor and print the expected features.

    Prints the model's type name, the first and last 20 feature names and a
    count of names per prefix (``V``, ``id_``, ``C``, ``D``, ``M``).

    Args:
        model_path: Path passed to ``joblib.load`` for the model.
        preprocessor_path: Path passed to ``joblib.load`` for the preprocessor.

    Returns:
        ``preprocessor.feature_names`` when the preprocessor has that
        attribute, otherwise ``None``. ``None`` is also returned, after
        printing the error, when loading or inspection raises.
    """

    def _v_order(name):
        """Order 'V<number>' names by their number; other names sort last."""
        suffix = name[1:]
        if suffix.isdecimal():
            return (0, int(suffix), name)
        return (1, 0, name)

    try:
        print("Loading model and preprocessor...")
        # NOTE(security): joblib.load unpickles arbitrary objects; only point
        # this at files from a trusted source.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)

        print(f"Model type: {type(model).__name__}")

        # Try to get feature names from preprocessor
        if hasattr(preprocessor, 'feature_names'):
            features = preprocessor.feature_names
            print(f"Preprocessor has {len(features)} feature names")
            print("First 20 features:", features[:20])
            print("Last 20 features:", features[-20:])

            # Analyze feature patterns
            v_features = [f for f in features if f.startswith('V')]
            id_features = [f for f in features if f.startswith('id_')]
            c_features = [f for f in features if f.startswith('C')]
            d_features = [f for f in features if f.startswith('D')]
            m_features = [f for f in features if f.startswith('M')]

            print(f"\nFeature breakdown:")
            if v_features:
                # Compare by numeric suffix so that e.g. V339 ranks above V99;
                # plain string comparison would report the wrong bounds.
                lowest = min(v_features, key=_v_order)
                highest = max(v_features, key=_v_order)
                print(f"V features: {len(v_features)} (range: {lowest} to {highest})")
            else:
                # min()/max() raise on an empty list, which would otherwise
                # abort the report and discard the loaded feature names.
                print("V features: 0")
            print(f"ID features: {len(id_features)}")
            print(f"C features: {len(c_features)}")
            print(f"D features: {len(d_features)}")
            print(f"M features: {len(m_features)}")

            return features
        else:
            print("Preprocessor doesn't have feature_names attribute")
            return None

    except Exception as e:
        print(f"Error inspecting model: {e}")
        return None

if __name__ == "__main__":
    # Script entry point: inspect the features listed by the saved
    # preprocessor, using the artefact file names below.
    model_path, preprocessor_path = (
        "fraud_detection_model_xgboost_20250727_145448.joblib",
        "preprocessor_20250727_145448.joblib",
    )

    features = inspect_model_features(
        model_path=model_path,
        preprocessor_path=preprocessor_path,
    )