File size: 7,171 Bytes
4de4bb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import joblib
import pandas as pd
import numpy as np

def get_model_feature_names(preprocessor):
    """Return the feature names stored on ``preprocessor``, or ``None``.

    The names are read from the ``feature_names`` attribute.

    Args:
        preprocessor: Any object; only its ``feature_names`` attribute is read.

    Returns:
        The ``feature_names`` value when it is present and non-empty.
        Containers whose truth value cannot be evaluated directly (for
        example multi-element array-likes) are converted to a ``list`` first.
        ``None`` is returned when the attribute is missing, ``None``, empty,
        or cannot be read.
    """
    try:
        names = getattr(preprocessor, 'feature_names', None)
    except Exception:
        # A property that raises is treated like a missing attribute.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

    if names is None:
        return None

    try:
        is_empty = not names
    except Exception:
        # Some containers refuse a plain truth test; fall back to a list so
        # emptiness can be decided and downstream code gets a simple sequence.
        try:
            names = list(names)
        except Exception:
            return None
        is_empty = not names

    return None if is_empty else names

def _build_feature_defaults():
    """Build the table of default values for the known raw feature columns."""
    return {
        # Transaction basics
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',

        # Card features
        'card1': 13553, 'card2': 150.0, 'card3': 150.0, 'card4': 'visa',
        'card5': 142.0, 'card6': 'credit',

        # Address and distance
        'addr1': 325.0, 'addr2': 87.0, 'dist1': 19.0, 'dist2': 19.0,

        # Email domains
        'P_emaildomain': 'gmail.com', 'R_emaildomain': 'gmail.com',

        # Device info
        'DeviceType': 'desktop', 'DeviceInfo': 'Windows',

        # Count features (C1-C14): 0.0 unless overridden just below
        **{f'C{i}': 0.0 for i in range(1, 15)},
        'C1': 1.0, 'C2': 1.0, 'C6': 1.0, 'C9': 1.0,
        'C11': 1.0, 'C12': 1.0, 'C13': 1.0, 'C14': 1.0,

        # Time delta features (D1-D15): 0.0 unless overridden just below
        **{f'D{i}': 0.0 for i in range(1, 16)},
        'D5': 20.0,

        # Match features (M1-M9): 'F' unless overridden just below
        **{f'M{i}': 'F' for i in range(1, 10)},
        'M1': 'T', 'M2': 'T', 'M3': 'T', 'M4': 'M0',

        # V features (V1-V339)
        **{f'V{i}': 1.0 for i in range(1, 340)},

        # Identity features (id_01-id_38)
        **{f'id_{i:02d}': 0.0 for i in range(1, 39)},
    }


# Built once at import time; the values are immutable scalars and strings, so
# sharing the table between calls is safe.
_FEATURE_DEFAULTS = _build_feature_defaults()


def create_complete_feature_set(transaction_data, expected_features):
    """Return a copy of ``transaction_data`` with every expected feature present.

    Values already present in ``transaction_data`` are never overwritten.
    Each missing feature is filled from the defaults table; names not in the
    table fall back on their prefix: ``'M...'`` gets ``'F'``, ``'V...'`` gets
    ``1.0`` and everything else gets ``0.0``.

    Args:
        transaction_data: Mapping of feature name to value. It must provide
            ``copy()``, ``in`` and item assignment; the input itself is not
            modified.
        expected_features: Iterable of feature names to guarantee in the
            result. ``None`` is treated as "nothing to add".

    Returns:
        The copied mapping with the missing expected features filled in.
        Features outside ``expected_features`` are not added.
    """
    complete_data = transaction_data.copy()

    if expected_features is None:
        return complete_data

    for feature in expected_features:
        if feature in complete_data:
            continue

        if feature in _FEATURE_DEFAULTS:
            complete_data[feature] = _FEATURE_DEFAULTS[feature]
        elif feature.startswith('M'):
            complete_data[feature] = 'F'
        elif feature.startswith('V'):
            complete_data[feature] = 1.0
        else:
            # 'id_', 'C', 'D' and any unrecognised name share the numeric default.
            complete_data[feature] = 0.0

    return complete_data

def _ordered_unique(features):
    """Return the distinct items of ``features`` in first-seen order."""
    if features is None:
        return []
    return list(dict.fromkeys(features))


def debug_feature_mismatch(model_features, data_features):
    """Report which features differ between the model and the data.

    A summary is printed to stdout and the same information is returned.
    Duplicates are ignored. The ``missing`` and ``extra`` lists follow the
    order of the corresponding input, so the output is reproducible from run
    to run.

    Args:
        model_features: Iterable of feature names the model expects, or ``None``.
        data_features: Iterable of feature names present in the data, or ``None``.

    Returns:
        A dict with keys ``'missing'`` (in the model but not the data),
        ``'extra'`` (in the data but not the model), ``'model_count'`` and
        ``'data_count'`` (numbers of distinct names on each side).
    """
    model_unique = _ordered_unique(model_features)
    data_unique = _ordered_unique(data_features)

    # Sets give O(1) membership tests; the ordered lists drive the output order.
    model_set = set(model_unique)
    data_set = set(data_unique)

    missing_in_data = [name for name in model_unique if name not in data_set]
    extra_in_data = [name for name in data_unique if name not in model_set]

    print(f"Model expects {len(model_set)} features")
    print(f"Data has {len(data_set)} features")
    print(f"Missing in data: {len(missing_in_data)} features")
    print(f"Extra in data: {len(extra_in_data)} features")

    if missing_in_data:
        print(f"First 10 missing features: {missing_in_data[:10]}")

    return {
        'missing': missing_in_data,
        'extra': extra_in_data,
        'model_count': len(model_set),
        'data_count': len(data_set),
    }

def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
    """Score one transaction after filling in the features the model expects.

    The expected feature names come from ``get_model_feature_names``. Missing
    values are filled by ``create_complete_feature_set``, and the resulting
    single-row DataFrame (given a placeholder ``TransactionID`` when none is
    supplied) is passed through ``preprocessor.preprocess(df, fit=False)``.
    A feature-alignment report is printed before the model is asked for
    probabilities.

    Args:
        model: Object exposing ``predict_proba``; element ``[0, 1]`` of its
            result is reported as the probability.
        preprocessor: Object exposing ``preprocess(df, fit=False)`` that
            returns a 2-tuple whose first element has a ``columns`` attribute.
        transaction_data: Mapping of feature name to value for one transaction.

    Returns:
        On success: ``{'success': True, 'probability': float, 'debug_info': dict}``.
        On any exception: ``{'success': False, 'error': str, 'debug_info': None}``.
    """
    try:
        feature_names = get_model_feature_names(preprocessor)
        if feature_names is None:
            raise ValueError("Could not determine expected features from preprocessor")

        # One-row frame holding the caller's values plus defaults for the rest.
        filled_row = create_complete_feature_set(transaction_data, feature_names)
        frame = pd.DataFrame([filled_row])
        if 'TransactionID' not in frame.columns:
            frame['TransactionID'] = 'temp_id'

        processed, _ = preprocessor.preprocess(frame, fit=False)

        # Report how the processed columns line up with the expected names.
        alignment = debug_feature_mismatch(feature_names, processed.columns.tolist())
        print(f"Feature alignment: {alignment}")

        probability = float(model.predict_proba(processed)[0, 1])
    except Exception as exc:
        return {
            'success': False,
            'error': str(exc),
            'debug_info': None
        }

    return {
        'success': True,
        'probability': probability,
        'debug_info': alignment
    }

def _v_feature_order(name):
    """Sort key ordering ``V<number>`` names numerically, other names after."""
    suffix = name[1:]
    if suffix.isdecimal():
        return (0, int(suffix), name)
    return (1, 0, name)


# Function to inspect your saved model's expected features
def inspect_model_features(model_path, preprocessor_path):
    """Load a saved model and preprocessor and print a feature summary.

    The summary covers the model's type, the first and last 20 names in
    ``preprocessor.feature_names``, and how many names start with ``V``,
    ``id_``, ``C``, ``D`` and ``M``.

    Args:
        model_path: Path of the joblib file holding the model.
        preprocessor_path: Path of the joblib file holding the preprocessor.

    Returns:
        ``preprocessor.feature_names`` on success. ``None`` is returned when
        the attribute is missing or ``None``, or when loading or inspection
        raises; the error is printed rather than propagated.
    """
    try:
        print("Loading model and preprocessor...")
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only point this at files from a trusted source.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)

        print(f"Model type: {type(model).__name__}")

        if not hasattr(preprocessor, 'feature_names'):
            print("Preprocessor doesn't have feature_names attribute")
            return None

        features = preprocessor.feature_names
        if features is None:
            # Say so explicitly instead of failing later inside len().
            print("Preprocessor feature_names is None")
            return None

        print(f"Preprocessor has {len(features)} feature names")
        print("First 20 features:", features[:20])
        print("Last 20 features:", features[-20:])

        # Group the names by prefix.
        v_features = [f for f in features if f.startswith('V')]
        id_features = [f for f in features if f.startswith('id_')]
        c_features = [f for f in features if f.startswith('C')]
        d_features = [f for f in features if f.startswith('D')]
        m_features = [f for f in features if f.startswith('M')]

        print("\nFeature breakdown:")
        if v_features:
            # Compare by numeric suffix so that V339 ranks above V99.
            lowest = min(v_features, key=_v_feature_order)
            highest = max(v_features, key=_v_feature_order)
            print(f"V features: {len(v_features)} (range: {lowest} to {highest})")
        else:
            # min()/max() raise on an empty list; report the count only.
            print("V features: 0")
        print(f"ID features: {len(id_features)}")
        print(f"C features: {len(c_features)}")
        print(f"D features: {len(d_features)}")
        print(f"M features: {len(m_features)}")

        return features

    except Exception as e:
        print(f"Error inspecting model: {e}")
        return None

if __name__ == "__main__":
    # Script entry point: print a summary of the features stored in the
    # saved preprocessor that accompanies the saved model.
    model_path, preprocessor_path = (
        "fraud_detection_model_xgboost_20250727_145448.joblib",
        "preprocessor_20250727_145448.joblib",
    )

    features = inspect_model_features(model_path, preprocessor_path)