fraud-detection / feature_detector.py
0xnu's picture
Upload 5 files
4de4bb8 verified
raw
history blame
7.17 kB
import joblib
import pandas as pd
import numpy as np
def get_model_feature_names(preprocessor):
    """Return the feature names stored on ``preprocessor``, or ``None``.

    Args:
        preprocessor: Any object; its ``feature_names`` attribute is read
            if present.

    Returns:
        The ``feature_names`` value exactly as stored when it is present
        and non-empty; ``None`` when the attribute is missing, is ``None``,
        is empty, or cannot be read.
    """
    try:
        names = getattr(preprocessor, 'feature_names', None)
    except Exception:
        # An attribute/property that raises on access is treated the same
        # as a missing attribute (best-effort lookup).
        return None

    if names is None:
        return None

    # Use len() instead of truthiness: array-like containers (e.g. NumPy
    # arrays) raise ValueError when evaluated as a boolean, which used to be
    # swallowed and made valid feature names look like "no names".
    try:
        is_empty = len(names) == 0
    except TypeError:
        # Unsized value: fall back to plain truthiness.
        is_empty = not names

    return None if is_empty else names
def create_complete_feature_set(transaction_data, expected_features):
    """Return a copy of ``transaction_data`` with every expected feature present.

    Values already supplied by the caller are kept untouched. Each name in
    ``expected_features`` that is absent is filled from a table of explicit
    per-feature defaults; names not in that table fall back to a default
    chosen by name prefix, and finally to ``0.0``.

    Args:
        transaction_data: Mapping of feature name to value. It is copied via
            its ``.copy()`` method, so the caller's object is not modified.
        expected_features: Iterable of feature names to guarantee.

    Returns:
        The copied mapping, with missing expected features added in the
        order they appear in ``expected_features``.
    """
    filled = transaction_data.copy()

    # Explicit defaults keyed by exact feature name.
    known_defaults = {
        # Transaction basics
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',
        # Card features
        'card1': 13553,
        'card2': 150.0,
        'card3': 150.0,
        'card4': 'visa',
        'card5': 142.0,
        'card6': 'credit',
        # Address and distance
        'addr1': 325.0,
        'addr2': 87.0,
        'dist1': 19.0,
        'dist2': 19.0,
        # Email domains
        'P_emaildomain': 'gmail.com',
        'R_emaildomain': 'gmail.com',
        # Device info
        'DeviceType': 'desktop',
        'DeviceInfo': 'Windows',
    }

    # Count features C1-C14: zero baseline, then selected overrides.
    known_defaults.update({f'C{n}': 0.0 for n in range(1, 15)})
    known_defaults.update(
        dict.fromkeys(('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'), 1.0)
    )

    # Time delta features D1-D15.
    known_defaults.update({f'D{n}': 0.0 for n in range(1, 16)})
    known_defaults['D5'] = 20.0

    # Match features M1-M9.
    known_defaults.update({f'M{n}': 'F' for n in range(1, 10)})
    known_defaults.update({'M1': 'T', 'M2': 'T', 'M3': 'T', 'M4': 'M0'})

    # V1-V339 and id_01-id_38.
    known_defaults.update({f'V{n}': 1.0 for n in range(1, 340)})
    known_defaults.update({f'id_{n:02d}': 0.0 for n in range(1, 39)})

    # Fallbacks for names outside the table, checked in this order.
    prefix_fallbacks = (
        ('V', 1.0),
        ('id_', 0.0),
        ('C', 0.0),
        ('D', 0.0),
        ('M', 'F'),
    )

    for name in expected_features:
        if name in filled:
            continue
        if name in known_defaults:
            filled[name] = known_defaults[name]
            continue
        # No prefix match means a plain numeric default.
        filled[name] = next(
            (value for prefix, value in prefix_fallbacks if name.startswith(prefix)),
            0.0,
        )

    return filled
def debug_feature_mismatch(model_features, data_features):
    """Report which feature names differ between the model and the data.

    Prints a short summary to stdout and returns the same information.

    Args:
        model_features: Iterable of feature names the model expects, or
            ``None`` (treated as empty).
        data_features: Iterable of feature names present in the data, or
            ``None`` (treated as empty).

    Returns:
        dict with keys ``'missing'`` (names expected but absent from the
        data), ``'extra'`` (names in the data the model does not expect),
        ``'model_count'`` and ``'data_count'`` (numbers of distinct names).
        Both lists are sorted so the output is reproducible between runs.
    """
    # Compare against None rather than relying on truthiness: array-like
    # inputs (e.g. NumPy arrays) raise ValueError when used as a boolean.
    model_set = set(model_features) if model_features is not None else set()
    data_set = set(data_features) if data_features is not None else set()

    # Set iteration order is arbitrary; sort (by string form, so mixed-type
    # names cannot break the comparison) to get deterministic results.
    missing_in_data = sorted(model_set - data_set, key=str)
    extra_in_data = sorted(data_set - model_set, key=str)

    print(f"Model expects {len(model_set)} features")
    print(f"Data has {len(data_set)} features")
    print(f"Missing in data: {len(missing_in_data)} features")
    print(f"Extra in data: {len(extra_in_data)} features")
    if missing_in_data:
        print(f"First 10 missing features: {missing_in_data[:10]}")

    return {
        'missing': missing_in_data,
        'extra': extra_in_data,
        'model_count': len(model_set),
        'data_count': len(data_set),
    }
def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
    """Score one transaction after padding it to the expected feature set.

    Any exception raised along the way is caught and reported in the
    returned dict instead of propagating to the caller.

    Args:
        model: Object exposing ``predict_proba``; the value at ``[0, 1]`` of
            its result is reported as the probability.
        preprocessor: Object exposing ``preprocess(df, fit=False)`` that
            returns a pair whose first item has a ``columns`` attribute
            (presumably a DataFrame -- confirm against the preprocessor).
        transaction_data: Mapping of feature name to value for one
            transaction.

    Returns:
        On success: ``{'success': True, 'probability': float,
        'debug_info': dict}``. On failure: ``{'success': False,
        'error': str, 'debug_info': None}``.
    """
    try:
        feature_names = get_model_feature_names(preprocessor)
        if feature_names is None:
            raise ValueError("Could not determine expected features from preprocessor")

        # One-row frame holding the caller's values plus filled-in defaults.
        row = create_complete_feature_set(transaction_data, feature_names)
        frame = pd.DataFrame([row])
        if 'TransactionID' not in frame.columns:
            # Placeholder ID for rows that arrive without one.
            frame['TransactionID'] = 'temp_id'

        processed, _ = preprocessor.preprocess(frame, fit=False)

        # Report how the processed columns line up with the expected names.
        alignment = debug_feature_mismatch(feature_names, processed.columns.tolist())
        print(f"Feature alignment: {alignment}")

        fraud_probability = float(model.predict_proba(processed)[0, 1])
    except Exception as exc:
        return {
            'success': False,
            'error': str(exc),
            'debug_info': None,
        }
    else:
        return {
            'success': True,
            'probability': fraud_probability,
            'debug_info': alignment,
        }
def _v_feature_sort_key(name):
    """Order ``V<number>`` names numerically; other names sort after them."""
    suffix = name[1:]
    if suffix.isdecimal():
        return (0, int(suffix), name)
    return (1, 0, name)


# Function to inspect your saved model's expected features
def inspect_model_features(model_path, preprocessor_path):
    """Load a saved model and preprocessor and print the expected features.

    Prints the model's type name, the first and last 20 feature names, and a
    count of names per prefix group (V, id_, C, D, M). Any error is printed
    and turned into a ``None`` return value rather than raised.

    Args:
        model_path: Path of the joblib file holding the model.
        preprocessor_path: Path of the joblib file holding the preprocessor.

    Returns:
        The preprocessor's ``feature_names`` value, or ``None`` when the
        attribute is missing or unset, or when loading/inspection fails.
    """
    try:
        print("Loading model and preprocessor...")
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only point this at artifacts you trust.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)
        print(f"Model type: {type(model).__name__}")

        # Try to get feature names from preprocessor
        if not hasattr(preprocessor, 'feature_names'):
            print("Preprocessor doesn't have feature_names attribute")
            return None

        features = preprocessor.feature_names
        if features is None:
            # Attribute exists but was never populated; len(None) would
            # otherwise surface as a confusing TypeError message.
            print("Preprocessor feature_names is not set")
            return None

        print(f"Preprocessor has {len(features)} feature names")
        print("First 20 features:", features[:20])
        print("Last 20 features:", features[-20:])

        # Analyze feature patterns
        v_features = [f for f in features if f.startswith('V')]
        id_features = [f for f in features if f.startswith('id_')]
        c_features = [f for f in features if f.startswith('C')]
        d_features = [f for f in features if f.startswith('D')]
        m_features = [f for f in features if f.startswith('M')]

        print("\nFeature breakdown:")
        if v_features:
            # Compare by numeric suffix: plain string comparison would rank
            # 'V99' above 'V339' and report a misleading range.
            lowest = min(v_features, key=_v_feature_sort_key)
            highest = max(v_features, key=_v_feature_sort_key)
            print(f"V features: {len(v_features)} (range: {lowest} to {highest})")
        else:
            # min()/max() raise ValueError on an empty list, which used to
            # abort the whole inspection.
            print("V features: 0")
        print(f"ID features: {len(id_features)}")
        print(f"C features: {len(c_features)}")
        print(f"D features: {len(d_features)}")
        print(f"M features: {len(m_features)}")
        return features
    except Exception as e:
        print(f"Error inspecting model: {e}")
        return None
if __name__ == "__main__":
    # Script entry point: print the feature summary for the saved artifacts.
    # Both file names are relative paths.
    preprocessor_path = "preprocessor_20250727_145448.joblib"
    model_path = "fraud_detection_model_xgboost_20250727_145448.joblib"
    features = inspect_model_features(
        model_path=model_path,
        preprocessor_path=preprocessor_path,
    )