# fraud-detection / feature_detector.py
# 0xnu's picture
# Upload 5 files
# 22773a1 verified
# raw
# history blame
# 10.7 kB
import joblib
import pandas as pd
import numpy as np
def get_model_feature_names(preprocessor):
    """Return the feature names stored on ``preprocessor``, or ``None``.

    The value of the ``feature_names`` attribute is returned unchanged
    (same object, same order) when it is present and non-empty.

    Args:
        preprocessor: Any object; it is expected to expose a
            ``feature_names`` attribute holding the ordered names.

    Returns:
        The ``feature_names`` value, or ``None`` when the attribute is
        missing, is ``None``, is empty, or cannot be read.
    """
    try:
        names = getattr(preprocessor, 'feature_names', None)
        if names is None:
            return None
        try:
            # Use len() rather than truthiness: bool() of a multi-element
            # NumPy array or pandas Index raises ValueError, which used to
            # be swallowed and reported as "no feature names".
            has_names = len(names) > 0
        except TypeError:
            # Unsized value: fall back to ordinary truthiness.
            has_names = bool(names)
    except Exception:
        # Best-effort lookup: any failure while reading the attribute is
        # treated as "names unavailable" (but Ctrl-C still propagates).
        return None
    return names if has_names else None
def create_complete_feature_set(transaction_data, expected_features):
    """Build a value for every expected feature, keyed in the expected order.

    For each name in ``expected_features`` the value is taken, in order of
    preference, from ``transaction_data``, from a built-in table of default
    values, or from a fallback chosen by the name's prefix.

    Args:
        transaction_data: Mapping of feature name to supplied value.
        expected_features: Iterable of feature names; its order becomes the
            key order of the result.

    Returns:
        dict mapping each expected feature name to its resolved value.
    """
    # Baseline values used when the transaction does not supply a feature.
    baseline = {
        # Transaction basics
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',
        # Card features
        'card1': 13553,
        'card2': 150.0,
        'card3': 150.0,
        'card4': 'visa',
        'card5': 142.0,
        'card6': 'credit',
        # Address and distance
        'addr1': 325.0,
        'addr2': 87.0,
        'dist1': 19.0,
        'dist2': 19.0,
        # Email domains
        'P_emaildomain': 'gmail.com',
        'R_emaildomain': 'gmail.com',
        # Device info
        'DeviceType': 'desktop',
        'DeviceInfo': 'Windows',
    }
    # Count features C1-C14: zero, except for a handful that default to one.
    baseline.update((f'C{n}', 0.0) for n in range(1, 15))
    baseline.update((f'C{n}', 1.0) for n in (1, 2, 6, 9, 11, 12, 13, 14))
    # Time-delta features D1-D15: zero, except D5.
    baseline.update((f'D{n}', 0.0) for n in range(1, 16))
    baseline['D5'] = 20.0
    # Match features M1-M9: 'F', except M1-M3 ('T') and M4 ('M0').
    baseline.update((f'M{n}', 'F') for n in range(1, 10))
    baseline.update(M1='T', M2='T', M3='T', M4='M0')
    # V1-V339 (range reportedly taken from a model error message).
    baseline.update((f'V{n}', 1.0) for n in range(1, 340))
    # Identity features id_01-id_38.
    baseline.update((f'id_{n:02d}', 0.0) for n in range(1, 39))

    def resolve(name):
        # Supplied value wins, then the baseline table.
        if name in transaction_data:
            return transaction_data[name]
        if name in baseline:
            return baseline[name]
        # Unknown name: pick a fallback from its prefix. Only 'V' and 'M'
        # differ from the numeric default; 'id_', 'C', 'D' and everything
        # else all resolve to 0.0.
        if name.startswith('V'):
            return 1.0
        if name.startswith('M'):
            return 'F'
        return 0.0

    return {name: resolve(name) for name in expected_features}
def create_ordered_dataframe(transaction_data, expected_features):
    """Return a one-row DataFrame whose columns follow ``expected_features``.

    Missing features are filled in by ``create_complete_feature_set``.

    Args:
        transaction_data: Mapping of feature name to supplied value.
        expected_features: Feature names in the column order to produce.

    Returns:
        pandas.DataFrame with a single row and ``expected_features`` columns.
    """
    filled = create_complete_feature_set(transaction_data, expected_features)
    # Re-key in the requested order before handing the row to pandas.
    row = {name: filled[name] for name in expected_features}
    return pd.DataFrame([row], columns=expected_features)
def safe_predict_with_exact_features(model, preprocessor, transaction_data):
    """Predict on one transaction using the preprocessor's feature order.

    The preprocessor is consulted only for its feature names; the row is
    assembled by ``create_complete_feature_set`` and passed straight to
    ``model.predict_proba``. Progress details are printed along the way.

    Args:
        model: Object exposing ``predict_proba``.
        preprocessor: Object from which the expected feature names are read.
        transaction_data: Mapping of feature name to supplied value.

    Returns:
        ``{'success': True, 'probability': float}`` on success, otherwise
        ``{'success': False, 'error': str}``. Exceptions are not propagated.
    """
    try:
        feature_order = get_model_feature_names(preprocessor)
        if feature_order is None:
            raise ValueError("Could not determine expected features from preprocessor")

        print(f"Model expects {len(feature_order)} features")
        print(f"First 10 features: {feature_order[:10]}")
        print(f"Last 10 features: {feature_order[-10:]}")

        # Assemble the row directly, without running the preprocessor's
        # own transformations.
        row = create_complete_feature_set(transaction_data, feature_order)
        frame = pd.DataFrame([row], columns=feature_order)

        print(f"Created DataFrame with shape: {frame.shape}")
        columns_match = list(frame.columns) == feature_order
        print(f"DataFrame columns match expected: {columns_match}")

        # Column 1 of the first (only) row of predict_proba's output.
        probability = float(model.predict_proba(frame)[0, 1])
    except Exception as exc:
        message = str(exc)
        print(f"Prediction error: {message}")
        return {'success': False, 'error': message}

    return {'success': True, 'probability': probability}
def _bypass_default_features():
    """Return the fallback value for every known raw and engineered feature."""
    defaults = {
        # Core transaction features
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',
        # Card features
        'card1': 13553, 'card2': 150.0, 'card3': 150.0, 'card4': 'visa',
        'card5': 142.0, 'card6': 'credit',
        # Address features
        'addr1': 325.0, 'addr2': 87.0, 'dist1': 19.0, 'dist2': 19.0,
        # Email features
        'P_emaildomain': 'gmail.com', 'R_emaildomain': 'gmail.com',
        # Device features
        'DeviceType': 'desktop', 'DeviceInfo': 'Windows',
        # Engineered features, evaluated on the default raw values above
        'TransactionAmt_log': np.log1p(100.0),
        'TransactionAmt_sqrt': np.sqrt(100.0),
        'TransactionDT_hour': (86400 / 3600) % 24,
        'TransactionDT_day': (86400 / (3600 * 24)) % 7,
        # NOTE(review): this ratio is never recomputed from the transaction's
        # own card1/card2 -- confirm whether it should be.
        'card1_card2_ratio': 13553 / (150.0 + 1),
        'addr_match': 0,
    }
    for i in range(1, 15):
        defaults[f'C{i}'] = 1.0 if i in (1, 2, 6, 9, 11, 12, 13, 14) else 0.0
    for i in range(1, 16):
        defaults[f'D{i}'] = 20.0 if i == 5 else 0.0
    for i in range(1, 10):
        # Numeric stand-ins for the categorical M flags: M1-M3 -> 1.0
        # (encoded 'T'), M4 -> 0.0 (encoded 'M0'), M5-M9 -> 0.0 (encoded 'F').
        defaults[f'M{i}'] = 1.0 if i <= 3 else 0.0
    for i in range(1, 340):
        defaults[f'V{i}'] = 1.0
    for i in range(1, 39):
        defaults[f'id_{i:02d}'] = 0.0
    return defaults


def bypass_preprocessing_predict(model, preprocessor, transaction_data):
    """Predict on one transaction without running the preprocessor.

    Only ``preprocessor.feature_names`` is used, to fix the column order.
    Each expected feature is resolved as follows:

    * the four engineered features (``TransactionAmt_log``,
      ``TransactionAmt_sqrt``, ``TransactionDT_hour``, ``TransactionDT_day``)
      are always derived from the transaction's ``TransactionAmt`` /
      ``TransactionDT`` (defaults 100.0 / 86400 when absent);
    * any other feature present in ``transaction_data`` is used as given;
    * everything else falls back to a built-in default, or 0.0.

    Args:
        model: Object exposing ``predict_proba``.
        preprocessor: Object exposing a non-empty ``feature_names`` sequence.
        transaction_data: dict of feature name to supplied value.

    Returns:
        ``{'success': True, 'probability': float}`` on success, otherwise
        ``{'success': False, 'error': str}``. Exceptions are not propagated.
    """
    try:
        # len() instead of truthiness so array-like name containers (NumPy
        # array, pandas Index) are accepted rather than raising ValueError.
        expected_features = getattr(preprocessor, 'feature_names', None)
        if expected_features is None or len(expected_features) == 0:
            raise ValueError("Cannot determine expected features from preprocessor")
        print(f"Expected features count: {len(expected_features)}")

        defaults = _bypass_default_features()

        # Engineered features are computed lazily, and regardless of whether
        # the caller passed them in. Previously the derivation ran only when
        # the engineered name itself was a key of transaction_data, so the
        # usual input (raw fields only) silently got values for a 100.0
        # amount at DT 86400.
        derived = {
            'TransactionAmt_log':
                lambda: np.log1p(transaction_data.get('TransactionAmt', 100.0)),
            'TransactionAmt_sqrt':
                lambda: np.sqrt(transaction_data.get('TransactionAmt', 100.0)),
            'TransactionDT_hour':
                lambda: (transaction_data.get('TransactionDT', 86400) / 3600) % 24,
            'TransactionDT_day':
                lambda: (transaction_data.get('TransactionDT', 86400) / (3600 * 24)) % 7,
        }

        # Build the feature vector in the exact expected order.
        feature_values = []
        for feature in expected_features:
            if feature in derived:
                value = derived[feature]()
            elif feature in transaction_data:
                value = transaction_data[feature]
            else:
                value = defaults.get(feature, 0.0)
            feature_values.append(value)

        X = pd.DataFrame([feature_values], columns=expected_features)
        print(f"Final feature matrix shape: {X.shape}")
        print(f"Sample values: {X.iloc[0, :5].values}")

        # Column 1 of the first (only) row of predict_proba's output.
        prediction_proba = model.predict_proba(X)[0, 1]
        return {
            'success': True,
            'probability': float(prediction_proba)
        }
    except Exception as e:
        print(f"Bypass prediction error: {str(e)}")
        return {
            'success': False,
            'error': str(e)
        }
# Function to inspect your saved model's expected features
def inspect_model_features(model_path, preprocessor_path):
    """Load a saved model and preprocessor and report the feature order.

    Prints the model's type and the first/last 20 feature names, and writes
    the full indexed list to ``model_feature_order.txt`` in the current
    working directory.

    Args:
        model_path: Path of the joblib file holding the model.
        preprocessor_path: Path of the joblib file holding the preprocessor.

    Returns:
        The preprocessor's ``feature_names`` value, or ``None`` when the
        attribute is absent or any step fails (the error is printed).
    """
    try:
        print("Loading model and preprocessor...")
        # NOTE: joblib.load unpickles the file, so only point this at
        # artefacts from a trusted source.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)
        print(f"Model type: {type(model).__name__}")

        if not hasattr(preprocessor, 'feature_names'):
            print("Preprocessor doesn't have feature_names attribute")
            return None

        features = preprocessor.feature_names
        print(f"Preprocessor has {len(features)} feature names")
        print("First 20 features:", features[:20])
        print("Last 20 features:", features[-20:])

        # One "<index>: <name>" line per feature, in model order.
        with open('model_feature_order.txt', 'w') as out:
            out.writelines(
                f"{position:4d}: {name}\n"
                for position, name in enumerate(features)
            )
        print("Feature order saved to model_feature_order.txt")
        return features
    except Exception as exc:
        print(f"Error inspecting model: {exc}")
        return None
if __name__ == "__main__":
    # Script entry point: dump the feature order expected by one specific
    # pair of saved artefacts. Both file names are hard-coded and share the
    # suffix 20250727_145448 (presumably the training-run timestamp); edit
    # them to inspect a different model.
    model_path, preprocessor_path = (
        "fraud_detection_model_xgboost_20250727_145448.joblib",
        "preprocessor_20250727_145448.joblib",
    )
    features = inspect_model_features(
        model_path=model_path,
        preprocessor_path=preprocessor_path,
    )