# Page header captured from the hosting site's file view (not part of the program):
# Spaces: Sleeping; File size: 7,171 Bytes; commit 4de4bb8.
# The viewer's line-number gutter (1-194) has been dropped.
import joblib
import pandas as pd
import numpy as np
def get_model_feature_names(preprocessor):
    """Return the feature names stored on ``preprocessor``, or ``None``.

    Reads the ``feature_names`` attribute of ``preprocessor``.

    Args:
        preprocessor: Any object; only its ``feature_names`` attribute is read.

    Returns:
        * the attribute value itself when it is truthy (e.g. a non-empty list);
        * a ``list`` built from the attribute when it cannot be truth-tested
          (e.g. a multi-element numpy array) but has a non-zero length;
        * ``None`` when the attribute is missing, falsy/empty, or reading it
          raises an ordinary exception.
    """
    try:
        names = getattr(preprocessor, 'feature_names', None)
    except Exception:
        # A misbehaving property must not break the caller. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return None

    try:
        is_populated = bool(names)
    except Exception:
        # Some array-likes refuse truth testing ("truth value ... is
        # ambiguous"); decide by length instead of discarding the names.
        try:
            return list(names) if len(names) else None
        except Exception:
            return None

    return names if is_populated else None
def _build_feature_defaults():
    """Build the table of per-feature default values used to pad a transaction."""
    defaults = {
        # Transaction basics
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',
        # Card features
        'card1': 13553, 'card2': 150.0, 'card3': 150.0,
        'card4': 'visa', 'card5': 142.0, 'card6': 'credit',
        # Address and distance
        'addr1': 325.0, 'addr2': 87.0, 'dist1': 19.0, 'dist2': 19.0,
        # Email domains
        'P_emaildomain': 'gmail.com', 'R_emaildomain': 'gmail.com',
        # Device info
        'DeviceType': 'desktop', 'DeviceInfo': 'Windows',
    }
    # Count features (C1-C14): 0.0 unless listed explicitly.
    defaults.update({f'C{i}': 0.0 for i in range(1, 15)})
    defaults.update(dict.fromkeys(
        ('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'), 1.0))
    # Time delta features (D1-D15): 0.0 except D5.
    defaults.update({f'D{i}': 0.0 for i in range(1, 16)})
    defaults['D5'] = 20.0
    # Match features (M1-M9): 'F' unless listed explicitly.
    defaults.update({f'M{i}': 'F' for i in range(1, 10)})
    defaults.update({'M1': 'T', 'M2': 'T', 'M3': 'T', 'M4': 'M0'})
    # All possible V features (V1-V339 based on error)
    defaults.update({f'V{i}': 1.0 for i in range(1, 340)})
    # All possible identity features (id_01-id_38)
    defaults.update({f'id_{i:02d}': 0.0 for i in range(1, 39)})
    return defaults


# Built once at import time; the table is only ever read, and every value is
# an immutable scalar, so sharing it between calls is safe.
_FEATURE_DEFAULTS = _build_feature_defaults()

# Fallbacks for names absent from _FEATURE_DEFAULTS, tried in this order.
_PREFIX_FALLBACKS = (
    ('V', 1.0),
    ('id_', 0.0),
    ('C', 0.0),
    ('D', 0.0),
    ('M', 'F'),
)


def create_complete_feature_set(transaction_data, expected_features):
    """Return a copy of ``transaction_data`` padded with every expected feature.

    Values already present in ``transaction_data`` are never overwritten.
    Each missing feature is filled from, in order: the explicit defaults
    table, a default chosen by name prefix (``V`` -> 1.0, ``M`` -> 'F',
    ``id_``/``C``/``D`` -> 0.0), and finally the numeric default 0.0.

    Args:
        transaction_data: Mapping of feature name to value. It must provide
            ``.copy()``; the input itself is not modified.
        expected_features: Iterable of feature names the model expects.
            ``None`` is treated as "nothing to add".

    Returns:
        The padded copy; added features appear in ``expected_features`` order.
    """
    complete_data = transaction_data.copy()
    if expected_features is None:
        return complete_data

    for feature in expected_features:
        if feature in complete_data:
            continue
        if feature in _FEATURE_DEFAULTS:
            complete_data[feature] = _FEATURE_DEFAULTS[feature]
            continue
        # Unknown name: pick a default from its prefix, else numeric 0.0.
        complete_data[feature] = next(
            (value for prefix, value in _PREFIX_FALLBACKS
             if feature.startswith(prefix)),
            0.0,
        )
    return complete_data
def debug_feature_mismatch(model_features, data_features):
    """Print and return the differences between two feature-name collections.

    Args:
        model_features: Iterable of feature names the model expects, or None.
        data_features: Iterable of feature names present in the data, or None.

    Returns:
        A dict with:
            'missing': names in ``model_features`` but not ``data_features``,
                in ``model_features`` order;
            'extra': names in ``data_features`` but not ``model_features``,
                in ``data_features`` order;
            'model_count' / 'data_count': number of distinct names on each side.
    """
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so the
    # report is identical from run to run. Testing ``is None`` instead of
    # truthiness also accepts array-like inputs.
    model_names = [] if model_features is None else list(dict.fromkeys(model_features))
    data_names = [] if data_features is None else list(dict.fromkeys(data_features))

    model_set = set(model_names)
    data_set = set(data_names)
    missing_in_data = [name for name in model_names if name not in data_set]
    extra_in_data = [name for name in data_names if name not in model_set]

    print(f"Model expects {len(model_set)} features")
    print(f"Data has {len(data_set)} features")
    print(f"Missing in data: {len(missing_in_data)} features")
    print(f"Extra in data: {len(extra_in_data)} features")
    if missing_in_data:
        print(f"First 10 missing features: {missing_in_data[:10]}")

    return {
        'missing': missing_in_data,
        'extra': extra_in_data,
        'model_count': len(model_set),
        'data_count': len(data_set),
    }
def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
    """Pad a transaction to the expected feature set, preprocess it, and predict.

    Never raises for ordinary errors: any ``Exception`` raised along the way
    is reported in the returned dict instead.

    Args:
        model: Object exposing ``predict_proba(X)`` whose result can be
            indexed with ``[0, 1]``.
        preprocessor: Object exposing ``feature_names`` and
            ``preprocess(df, fit=False)``; the latter must return a 2-tuple
            whose first item has a ``.columns`` attribute.
        transaction_data: Mapping of feature name to value for one transaction.

    Returns:
        On success: ``{'success': True, 'probability': float, 'debug_info': dict}``.
        On failure: ``{'success': False, 'error': str, 'debug_info': None}``.
    """
    try:
        # Get expected features from preprocessor
        expected_features = get_model_feature_names(preprocessor)
        if expected_features is None:
            raise ValueError("Could not determine expected features from preprocessor")

        # Create complete feature set and wrap it as a one-row DataFrame
        complete_data = create_complete_feature_set(transaction_data, expected_features)
        df = pd.DataFrame([complete_data])

        # Add a placeholder TransactionID if not present
        # (presumably the preprocessor requires the column; confirm).
        if 'TransactionID' not in df.columns:
            df['TransactionID'] = 'temp_id'

        # Preprocess without refitting
        X_processed, _ = preprocessor.preprocess(df, fit=False)

        # Debug feature alignment
        debug_info = debug_feature_mismatch(expected_features, X_processed.columns.tolist())
        print(f"Feature alignment: {debug_info}")

        # Row 0, column 1 of the probability matrix
        # (presumably the positive/fraud class; confirm against the model).
        prediction_proba = model.predict_proba(X_processed)[0, 1]

        return {
            'success': True,
            'probability': float(prediction_proba),
            'debug_info': debug_info,
        }
    except Exception as e:
        # Deliberate catch-all boundary: callers receive an error dict.
        return {
            'success': False,
            'error': str(e),
            'debug_info': None,
        }
def _numeric_suffix_key(name):
    """Sort key ordering names by stem, then by trailing number (V2 < V10)."""
    stem = name.rstrip('0123456789')
    digits = name[len(stem):]
    return (stem, int(digits) if digits else -1, name)


# Function to inspect your saved model's expected features
def inspect_model_features(model_path, preprocessor_path):
    """Load a saved model and preprocessor and print what features they expect.

    Args:
        model_path: Path of the joblib file holding the model.
        preprocessor_path: Path of the joblib file holding the preprocessor.

    Returns:
        ``preprocessor.feature_names`` when the attribute exists; ``None`` when
        it does not or when any ``Exception`` occurs (the error is printed).
    """
    try:
        print("Loading model and preprocessor...")
        # NOTE(security): joblib.load unpickles its input, which can execute
        # arbitrary code. Only load files from a trusted source.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)
        print(f"Model type: {type(model).__name__}")

        # Try to get feature names from preprocessor
        if not hasattr(preprocessor, 'feature_names'):
            print("Preprocessor doesn't have feature_names attribute")
            return None

        features = preprocessor.feature_names
        print(f"Preprocessor has {len(features)} feature names")
        print("First 20 features:", features[:20])
        print("Last 20 features:", features[-20:])

        # Analyze feature patterns
        v_features = [f for f in features if f.startswith('V')]
        id_features = [f for f in features if f.startswith('id_')]
        c_features = [f for f in features if f.startswith('C')]
        d_features = [f for f in features if f.startswith('D')]
        m_features = [f for f in features if f.startswith('M')]

        print("\nFeature breakdown:")
        if v_features:
            # Compare by numeric suffix so the range reads "V1 to V339" rather
            # than the lexicographic "V1 to V99".
            lowest = min(v_features, key=_numeric_suffix_key)
            highest = max(v_features, key=_numeric_suffix_key)
            print(f"V features: {len(v_features)} (range: {lowest} to {highest})")
        else:
            # min()/max() raise ValueError on an empty list; without this
            # branch a model with no V features would return None.
            print(f"V features: {len(v_features)}")
        print(f"ID features: {len(id_features)}")
        print(f"C features: {len(c_features)}")
        print(f"D features: {len(d_features)}")
        print(f"M features: {len(m_features)}")
        return features
    except Exception as e:
        print(f"Error inspecting model: {e}")
        return None
if __name__ == "__main__":
    # Run this to inspect your model's expected features.
    # Both file names carry no directory part, so they are resolved against
    # the current working directory.
    model_path = "fraud_detection_model_xgboost_20250727_145448.joblib"
    preprocessor_path = "preprocessor_20250727_145448.joblib"
    features = inspect_model_features(model_path, preprocessor_path)