Spaces:
Sleeping
Sleeping
import joblib | |
import pandas as pd | |
import numpy as np | |
def get_model_feature_names(preprocessor):
    """Return the feature names stored on the preprocessor, or None.

    Reads ``preprocessor.feature_names``. The stored object is returned
    unchanged when it is truthy. Array-like containers that refuse truth
    testing (they raise ``ValueError`` on ``bool()``) are converted to a
    plain list instead of being discarded.

    Args:
        preprocessor: Any object; only its ``feature_names`` attribute is read.

    Returns:
        The feature names, or None when the attribute is missing, None,
        empty, or cannot be read.
    """
    try:
        names = getattr(preprocessor, 'feature_names', None)
        if names is None:
            return None
        try:
            return names if names else None
        except ValueError:
            # Multi-element numpy arrays and pandas Index objects raise on
            # truth testing; judge emptiness on a plain list instead.
            names = list(names)
            return names or None
    except Exception:
        # Best-effort lookup: a failing attribute access means "unknown",
        # but KeyboardInterrupt/SystemExit must still propagate.
        return None
def create_complete_feature_set(transaction_data, expected_features):
    """Return a copy of ``transaction_data`` with every expected feature present.

    Values already supplied by the caller are kept. Each missing feature
    gets its entry from the defaults table; names absent from that table
    fall back to a default chosen by name prefix, and finally to ``0.0``.

    Args:
        transaction_data: Mapping of feature name to value; it is copied
            with ``.copy()`` and not modified.
        expected_features: Iterable of feature names to guarantee.

    Returns:
        The copied mapping with the missing features filled in.
    """
    # Explicit defaults for individually named columns.
    defaults = {
        # Transaction basics
        'TransactionAmt': 100.0,
        'TransactionDT': 86400,
        'ProductCD': 'W',
        # Card features
        'card1': 13553,
        'card2': 150.0,
        'card3': 150.0,
        'card4': 'visa',
        'card5': 142.0,
        'card6': 'credit',
        # Address and distance
        'addr1': 325.0,
        'addr2': 87.0,
        'dist1': 19.0,
        'dist2': 19.0,
        # Email domains
        'P_emaildomain': 'gmail.com',
        'R_emaildomain': 'gmail.com',
        # Device info
        'DeviceType': 'desktop',
        'DeviceInfo': 'Windows',
    }

    # Count features C1-C14: zero unless listed below.
    defaults.update((f'C{n}', 0.0) for n in range(1, 15))
    for name in ('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'):
        defaults[name] = 1.0

    # Time delta features D1-D15.
    defaults.update((f'D{n}', 0.0) for n in range(1, 16))
    defaults['D5'] = 20.0

    # Match features M1-M9.
    defaults.update((f'M{n}', 'F') for n in range(1, 10))
    defaults.update({'M1': 'T', 'M2': 'T', 'M3': 'T', 'M4': 'M0'})

    # V1-V339 and id_01-id_38.
    defaults.update((f'V{n}', 1.0) for n in range(1, 340))
    defaults.update((f'id_{n:02d}', 0.0) for n in range(1, 39))

    # Prefix-based fallbacks for names missing from the table above,
    # checked in this order.
    prefix_fallbacks = (
        ('V', 1.0),
        ('id_', 0.0),
        ('C', 0.0),
        ('D', 0.0),
        ('M', 'F'),
    )

    filled = transaction_data.copy()
    for name in expected_features:
        if name in filled:
            continue
        if name in defaults:
            filled[name] = defaults[name]
            continue
        # Anything that matches no prefix is treated as numeric.
        filled[name] = next(
            (value for prefix, value in prefix_fallbacks if name.startswith(prefix)),
            0.0,
        )
    return filled
def _unique_in_order(features):
    """Return the distinct items of ``features`` in first-seen order."""
    if features is None:
        return []
    try:
        if not features:
            return []
    except ValueError:
        # Array-like containers (numpy arrays, pandas Index) refuse truth
        # testing; fall through and simply iterate them.
        pass
    return list(dict.fromkeys(features))


def debug_feature_mismatch(model_features, data_features):
    """Report which features differ between the model and the data.

    Prints a short summary and returns the details. The lists keep the
    first-seen order of their source (model order for ``missing``, data
    order for ``extra``), so the printed "first 10" and the returned lists
    are reproducible from run to run rather than following set iteration
    order.

    Args:
        model_features: Iterable of feature names the model expects;
            None or empty is treated as no features.
        data_features: Iterable of feature names present in the data;
            None or empty is treated as no features.

    Returns:
        dict with keys ``missing`` (expected but absent from the data),
        ``extra`` (present in the data but not expected), ``model_count``
        and ``data_count`` (numbers of distinct names on each side).
    """
    model_unique = _unique_in_order(model_features)
    data_unique = _unique_in_order(data_features)
    model_set = set(model_unique)
    data_set = set(data_unique)

    missing_in_data = [name for name in model_unique if name not in data_set]
    extra_in_data = [name for name in data_unique if name not in model_set]

    print(f"Model expects {len(model_set)} features")
    print(f"Data has {len(data_set)} features")
    print(f"Missing in data: {len(missing_in_data)} features")
    print(f"Extra in data: {len(extra_in_data)} features")
    if missing_in_data:
        print(f"First 10 missing features: {missing_in_data[:10]}")

    return {
        'missing': missing_in_data,
        'extra': extra_in_data,
        'model_count': len(model_set),
        'data_count': len(data_set),
    }
def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
    """Predict on one transaction after filling in the features the model expects.

    Any exception raised along the way is caught and reported in the
    returned dict rather than propagated.

    Args:
        model: Object exposing ``predict_proba``.
        preprocessor: Object exposing ``preprocess(df, fit=False)`` and
            carrying the expected feature names.
        transaction_data: Mapping of feature name to value for one transaction.

    Returns:
        On success: ``{'success': True, 'probability': float, 'debug_info': dict}``.
        On failure: ``{'success': False, 'error': str, 'debug_info': None}``.
    """
    try:
        feature_names = get_model_feature_names(preprocessor)
        if feature_names is None:
            raise ValueError("Could not determine expected features from preprocessor")

        # Single-row frame holding the caller's values plus filled defaults.
        row = create_complete_feature_set(transaction_data, feature_names)
        frame = pd.DataFrame([row])

        # Insert a placeholder ID when the caller did not supply one.
        if 'TransactionID' not in frame.columns:
            frame['TransactionID'] = 'temp_id'

        processed, _ = preprocessor.preprocess(frame, fit=False)

        # Compare what came out of preprocessing with the expected names.
        alignment = debug_feature_mismatch(feature_names, processed.columns.tolist())
        print(f"Feature alignment: {alignment}")

        # Row 0, column 1 of the probability matrix
        # (presumably the positive class - confirm against the model).
        proba = float(model.predict_proba(processed)[0, 1])
    except Exception as exc:
        return {
            'success': False,
            'error': str(exc),
            'debug_info': None,
        }

    return {
        'success': True,
        'probability': proba,
        'debug_info': alignment,
    }
def _v_feature_sort_key(name):
    """Order names like 'V12' by their numeric suffix; other names sort after them."""
    suffix = name[1:]
    if suffix.isdecimal():
        return (0, int(suffix))
    return (1, name)


# Function to inspect your saved model's expected features
def inspect_model_features(model_path, preprocessor_path):
    """Load a saved model and preprocessor and print the expected features.

    Prints the model type, the first and last 20 feature names and a count
    of names per prefix (V, id_, C, D, M). The V range is ordered by numeric
    suffix, and the range is omitted when there are no V features, so an
    empty group no longer aborts the inspection.

    Args:
        model_path: Path of the joblib-serialized model.
        preprocessor_path: Path of the joblib-serialized preprocessor.

    Returns:
        ``preprocessor.feature_names``, or None when the attribute is
        missing or anything fails while loading or inspecting.
    """
    try:
        print("Loading model and preprocessor...")
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load artifacts from a trusted source.
        model = joblib.load(model_path)
        preprocessor = joblib.load(preprocessor_path)
        print(f"Model type: {type(model).__name__}")

        # Try to get feature names from preprocessor
        if not hasattr(preprocessor, 'feature_names'):
            print("Preprocessor doesn't have feature_names attribute")
            return None

        features = preprocessor.feature_names
        print(f"Preprocessor has {len(features)} feature names")
        print("First 20 features:", features[:20])
        print("Last 20 features:", features[-20:])

        # Analyze feature patterns
        v_features = [f for f in features if f.startswith('V')]
        id_features = [f for f in features if f.startswith('id_')]
        c_features = [f for f in features if f.startswith('C')]
        d_features = [f for f in features if f.startswith('D')]
        m_features = [f for f in features if f.startswith('M')]

        print("\nFeature breakdown:")
        if v_features:
            # Plain string min/max would rank 'V99' above 'V339'.
            lowest = min(v_features, key=_v_feature_sort_key)
            highest = max(v_features, key=_v_feature_sort_key)
            print(f"V features: {len(v_features)} (range: {lowest} to {highest})")
        else:
            # min()/max() raise ValueError on an empty list.
            print("V features: 0")
        print(f"ID features: {len(id_features)}")
        print(f"C features: {len(c_features)}")
        print(f"D features: {len(d_features)}")
        print(f"M features: {len(m_features)}")
        return features
    except Exception as e:
        print(f"Error inspecting model: {e}")
        return None
if __name__ == "__main__":
    # Script entry point: print the features expected by the saved
    # model/preprocessor pair named below.
    model_path = "fraud_detection_model_xgboost_20250727_145448.joblib"
    preprocessor_path = "preprocessor_20250727_145448.joblib"
    features = inspect_model_features(
        model_path,
        preprocessor_path,
    )