"""Helpers for aligning transaction data with a saved fraud model's features.

The module can fill in default values for features a transaction does not
provide, report mismatches between the features a model expects and the
features actually present, run a guarded prediction, and inspect the feature
names stored on a saved preprocessor.
"""

import re

import numpy as np
import pandas as pd

try:
    import joblib
except ImportError:
    # joblib is needed only by inspect_model_features(); keep the prediction
    # helpers importable in environments where it is not installed.
    joblib = None


# Default value for every feature the helpers know by name. Built once at
# import time; the values are immutable scalars, so sharing them is safe.
# Later keys deliberately override the blanket per-family defaults above them.
_FEATURE_DEFAULTS = {
    # Transaction basics
    'TransactionAmt': 100.0,
    'TransactionDT': 86400,
    'ProductCD': 'W',
    # Card features
    'card1': 13553,
    'card2': 150.0,
    'card3': 150.0,
    'card4': 'visa',
    'card5': 142.0,
    'card6': 'credit',
    # Address and distance
    'addr1': 325.0,
    'addr2': 87.0,
    'dist1': 19.0,
    'dist2': 19.0,
    # Email domains
    'P_emaildomain': 'gmail.com',
    'R_emaildomain': 'gmail.com',
    # Device info
    'DeviceType': 'desktop',
    'DeviceInfo': 'Windows',
    # Count features (C1-C14)
    **{f'C{i}': 0.0 for i in range(1, 15)},
    'C1': 1.0,
    'C2': 1.0,
    'C6': 1.0,
    'C9': 1.0,
    'C11': 1.0,
    'C12': 1.0,
    'C13': 1.0,
    'C14': 1.0,
    # Time delta features (D1-D15)
    **{f'D{i}': 0.0 for i in range(1, 16)},
    'D5': 20.0,
    # Match features (M1-M9)
    **{f'M{i}': 'F' for i in range(1, 10)},
    'M1': 'T',
    'M2': 'T',
    'M3': 'T',
    'M4': 'M0',
    # V features (V1-V339)
    **{f'V{i}': 1.0 for i in range(1, 340)},
    # Identity features (id_01-id_38)
    **{f'id_{i:02d}': 0.0 for i in range(1, 39)},
}


def _fallback_default(feature):
    """Return the default for a feature that has no entry in the defaults table.

    Names starting with 'V' default to 1.0 and names starting with 'M' to 'F';
    everything else (including 'C', 'D' and 'id_' names) defaults to 0.0.
    """
    name = str(feature)
    if name.startswith('V'):
        return 1.0
    if name.startswith('M'):
        return 'F'
    return 0.0


def _as_feature_set(features):
    """Return *features* as a set, treating None as empty.

    Avoids truth-testing the input, which raises for multi-element arrays.
    """
    return set() if features is None else set(features)


def _indexed_features(features, prefix):
    """Return names of the form ``<prefix><digits>...`` sorted by that number.

    Requiring a digit right after the prefix keeps names such as 'DeviceType'
    out of the 'D' family, and numeric sorting puts 'V9' before 'V10'.
    """
    pattern = re.compile(rf'{re.escape(prefix)}(\d+)')
    indexed = []
    for name in features:
        match = pattern.match(str(name))
        if match:
            indexed.append((int(match.group(1)), name))
    indexed.sort(key=lambda pair: pair[0])
    return [name for _, name in indexed]


def get_model_feature_names(preprocessor):
    """Extract the exact feature names that the model expects.

    Args:
        preprocessor: Object that may carry a ``feature_names`` attribute.

    Returns:
        A list of the feature names, or None when the attribute is missing,
        None, empty, or not iterable.
    """
    names = getattr(preprocessor, 'feature_names', None)
    if names is None:
        return None
    try:
        # Normalising to a list avoids truth-testing array-like values, which
        # raises ValueError for arrays holding more than one element.
        names = list(names)
    except TypeError:
        return None
    return names or None


def create_complete_feature_set(transaction_data, expected_features):
    """Create a complete feature set matching exactly what the model expects.

    Args:
        transaction_data: Mapping of feature name to value for one
            transaction. It is copied, never modified. None is treated as
            an empty transaction.
        expected_features: Iterable of feature names the model expects. None
            is treated as no expected features.

    Returns:
        A copy of ``transaction_data`` in which every expected feature that
        was absent has been given a default value. Features that were already
        present, and features that are not expected, are left untouched.
    """
    complete_data = {} if transaction_data is None else transaction_data.copy()
    if expected_features is None:
        return complete_data

    for feature in expected_features:
        if feature in complete_data:
            continue
        if feature in _FEATURE_DEFAULTS:
            complete_data[feature] = _FEATURE_DEFAULTS[feature]
        else:
            complete_data[feature] = _fallback_default(feature)

    return complete_data


def debug_feature_mismatch(model_features, data_features):
    """Debug feature mismatches between model and data.

    Prints a short summary and returns the same information.

    Args:
        model_features: Iterable of feature names the model expects, or None.
        data_features: Iterable of feature names present in the data, or None.

    Returns:
        A dict with keys 'missing' (expected but absent), 'extra' (present
        but not expected), 'model_count' and 'data_count'. The two lists are
        sorted so that the output is reproducible between runs.
    """
    model_set = _as_feature_set(model_features)
    data_set = _as_feature_set(data_features)

    # key=str keeps the sort well-defined even if names have mixed types.
    missing_in_data = sorted(model_set - data_set, key=str)
    extra_in_data = sorted(data_set - model_set, key=str)

    print(f"Model expects {len(model_set)} features")
    print(f"Data has {len(data_set)} features")
    print(f"Missing in data: {len(missing_in_data)} features")
    print(f"Extra in data: {len(extra_in_data)} features")

    if missing_in_data:
        print(f"First 10 missing features: {missing_in_data[:10]}")

    return {
        'missing': missing_in_data,
        'extra': extra_in_data,
        'model_count': len(model_set),
        'data_count': len(data_set),
    }


def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
    """Safely make predictions by ensuring feature alignment.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its first row
            is reported as the probability.
        preprocessor: Object exposing ``feature_names`` and
            ``preprocess(df, fit=False)``, which must return a 2-tuple whose
            first item has a ``columns`` attribute.
        transaction_data: Mapping of feature name to value for one
            transaction.

    Returns:
        On success: ``{'success': True, 'probability': float,
        'debug_info': dict}``. On any failure: ``{'success': False,
        'error': str, 'debug_info': ...}`` where ``debug_info`` is the
        feature-alignment report if the failure happened after it was
        computed, otherwise None. Exceptions are never propagated.
    """
    # Kept outside the try block so a failure in predict_proba can still
    # report the alignment diagnostics gathered before it.
    debug_info = None
    try:
        # Get expected features from preprocessor
        expected_features = get_model_feature_names(preprocessor)
        if expected_features is None:
            raise ValueError("Could not determine expected features from preprocessor")

        # Create complete feature set
        complete_data = create_complete_feature_set(transaction_data, expected_features)

        # Create a single-row DataFrame
        df = pd.DataFrame([complete_data])

        # Add TransactionID if not present
        if 'TransactionID' not in df.columns:
            df['TransactionID'] = 'temp_id'

        # Preprocess
        X_processed, _ = preprocessor.preprocess(df, fit=False)

        # Debug feature alignment
        debug_info = debug_feature_mismatch(expected_features, X_processed.columns.tolist())
        print(f"Feature alignment: {debug_info}")

        # Make prediction
        prediction_proba = model.predict_proba(X_processed)[0, 1]

        return {
            'success': True,
            'probability': float(prediction_proba),
            'debug_info': debug_info,
        }

    except Exception as e:
        # Deliberate catch-all: this function reports failures instead of
        # raising them.
        return {
            'success': False,
            'error': str(e),
            'debug_info': debug_info,
        }


def inspect_model_features(model_path, preprocessor_path):
    """Inspect what features your saved model actually expects.

    Loads the model and preprocessor with joblib and prints the model type,
    the first and last 20 feature names, and a per-family feature count.

    Args:
        model_path: Path of the joblib file holding the model.
        preprocessor_path: Path of the joblib file holding the preprocessor.

    Returns:
        The preprocessor's ``feature_names`` value, or None if it is missing
        or None, or if anything fails while loading or inspecting.
    """
    try:
        print("Loading model and preprocessor...")
        if joblib is None:
            raise ImportError("joblib is required to load the saved model and preprocessor")

        # NOTE: joblib.load unpickles arbitrary Python objects; only point
        # this at files from a trusted source.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)

        print(f"Model type: {type(model).__name__}")

        # Try to get feature names from preprocessor
        if not hasattr(preprocessor, 'feature_names'):
            print("Preprocessor doesn't have feature_names attribute")
            return None

        features = preprocessor.feature_names
        if features is None:
            print("Preprocessor feature_names is None")
            return None

        print(f"Preprocessor has {len(features)} feature names")
        print("First 20 features:", features[:20])
        print("Last 20 features:", features[-20:])

        # Analyze feature patterns
        v_features = _indexed_features(features, 'V')
        id_features = [f for f in features if str(f).startswith('id_')]
        c_features = _indexed_features(features, 'C')
        d_features = _indexed_features(features, 'D')
        m_features = _indexed_features(features, 'M')

        print("\nFeature breakdown:")
        if v_features:
            # v_features is sorted numerically, so the ends are the true range.
            print(f"V features: {len(v_features)} (range: {v_features[0]} to {v_features[-1]})")
        else:
            print("V features: 0")
        print(f"ID features: {len(id_features)}")
        print(f"C features: {len(c_features)}")
        print(f"D features: {len(d_features)}")
        print(f"M features: {len(m_features)}")

        return features

    except Exception as e:
        print(f"Error inspecting model: {e}")
        return None


if __name__ == "__main__":
    # Run this to inspect your model's expected features
    model_path = "fraud_detection_model_xgboost_20250727_145448.joblib"
    preprocessor_path = "preprocessor_20250727_145448.joblib"

    features = inspect_model_features(model_path, preprocessor_path)