Spaces:

0xnu
/

fraud-detection

Sleeping

App Files Files Community

fraud-detection / feature_detector.py

0xnu

Upload 5 files

22773a1 verified about 2 months ago

raw

history blame

10.7 kB

	import joblib
	import pandas as pd
	import numpy as np

	def get_model_feature_names(preprocessor):
	    """Return the feature names stored on ``preprocessor``, or ``None``.

	    The names are read from the ``feature_names`` attribute and handed back
	    unchanged (same object, same order).

	    Args:
	        preprocessor: Any object; only its ``feature_names`` attribute is read.

	    Returns:
	        The ``feature_names`` value when it is present and non-empty,
	        otherwise ``None``. This function never raises for a missing or
	        unreadable attribute.
	    """
	    try:
	        feature_names = getattr(preprocessor, 'feature_names', None)
	    except Exception:
	        # Best-effort lookup: an attribute/property that raises is treated
	        # the same as "no feature names available".
	        return None

	    if feature_names is None:
	        return None

	    # Check emptiness with len() instead of truthiness: bool() of a
	    # multi-element numpy array raises ValueError, which used to be
	    # swallowed and reported as "no features".
	    try:
	        is_empty = len(feature_names) == 0
	    except TypeError:
	        # Unsized value: fall back to plain truthiness.
	        is_empty = not feature_names

	    return None if is_empty else feature_names

	def create_complete_feature_set(transaction_data, expected_features):
	    """Resolve a value for every expected feature, keyed in the expected order.

	    For each name in ``expected_features`` the value comes from, in order of
	    preference: ``transaction_data``, the built-in defaults table below, or
	    a fallback picked from the name's prefix.

	    Args:
	        transaction_data: Mapping of feature name to supplied value.
	        expected_features: Iterable of feature names in the wanted order.

	    Returns:
	        dict mapping each expected feature to its resolved value; keys are
	        inserted in ``expected_features`` order.
	    """
	    # Explicit defaults for the named (non-family) columns.
	    known_defaults = {
	        'TransactionAmt': 100.0,
	        'TransactionDT': 86400,
	        'ProductCD': 'W',
	        'card1': 13553,
	        'card2': 150.0,
	        'card3': 150.0,
	        'card4': 'visa',
	        'card5': 142.0,
	        'card6': 'credit',
	        'addr1': 325.0,
	        'addr2': 87.0,
	        'dist1': 19.0,
	        'dist2': 19.0,
	        'P_emaildomain': 'gmail.com',
	        'R_emaildomain': 'gmail.com',
	        'DeviceType': 'desktop',
	        'DeviceInfo': 'Windows',
	    }

	    # Blanket value for each numbered family first ...
	    known_defaults.update(dict.fromkeys((f'C{n}' for n in range(1, 15)), 0.0))
	    known_defaults.update(dict.fromkeys((f'D{n}' for n in range(1, 16)), 0.0))
	    known_defaults.update(dict.fromkeys((f'M{n}' for n in range(1, 10)), 'F'))
	    known_defaults.update(dict.fromkeys((f'V{n}' for n in range(1, 340)), 1.0))
	    known_defaults.update(dict.fromkeys((f'id_{n:02d}' for n in range(1, 39)), 0.0))

	    # ... then the individual members that deviate from their family value.
	    known_defaults.update(dict.fromkeys(
	        ('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'), 1.0))
	    known_defaults['D5'] = 20.0
	    known_defaults.update({'M1': 'T', 'M2': 'T', 'M3': 'T', 'M4': 'M0'})

	    def _resolve(name):
	        """Pick the value for one feature name."""
	        if name in transaction_data:
	            return transaction_data[name]
	        if name in known_defaults:
	            return known_defaults[name]
	        # Name outside the table: choose by prefix. Only 'V...' and 'M...'
	        # differ from the numeric 0.0 used for everything else
	        # (including 'id_', 'C' and 'D' names).
	        if name.startswith('V'):
	            return 1.0
	        if name.startswith('M'):
	            return 'F'
	        return 0.0

	    return {name: _resolve(name) for name in expected_features}

	def create_ordered_dataframe(transaction_data, expected_features):
	    """Return a one-row DataFrame whose columns follow ``expected_features``.

	    Args:
	        transaction_data: Mapping of feature name to supplied value.
	        expected_features: Feature names, in the column order wanted.

	    Returns:
	        pandas.DataFrame with a single row; missing features are filled in
	        by ``create_complete_feature_set``.
	    """
	    resolved = create_complete_feature_set(transaction_data, expected_features)

	    # Re-key in the requested order, then pin the column order explicitly.
	    row = {name: resolved[name] for name in expected_features}
	    return pd.DataFrame([row], columns=expected_features)

	def safe_predict_with_exact_features(model, preprocessor, transaction_data):
	    """Predict on one transaction using the preprocessor's feature order.

	    The preprocessor is only consulted for its feature names; the feature
	    row is built directly and handed to ``model.predict_proba`` without
	    running any preprocessing step.

	    Args:
	        model: Object exposing ``predict_proba``; element ``[0, 1]`` of its
	            result is reported as the probability.
	        preprocessor: Object passed to ``get_model_feature_names``.
	        transaction_data: Mapping of feature name to supplied value.

	    Returns:
	        ``{'success': True, 'probability': float}`` when the prediction
	        runs, otherwise ``{'success': False, 'error': str}``. Exceptions are
	        reported in the result instead of being propagated.
	    """
	    try:
	        expected_features = get_model_feature_names(preprocessor)
	        if expected_features is None:
	            raise ValueError("Could not determine expected features from preprocessor")

	        # Diagnostic output describing the expected feature list.
	        print(f"Model expects {len(expected_features)} features")
	        print(f"First 10 features: {expected_features[:10]}")
	        print(f"Last 10 features: {expected_features[-10:]}")

	        # Single-row frame with columns pinned to the expected order.
	        row_values = create_complete_feature_set(transaction_data, expected_features)
	        X_ordered = pd.DataFrame([row_values], columns=expected_features)

	        print(f"Created DataFrame with shape: {X_ordered.shape}")
	        print(f"DataFrame columns match expected: {list(X_ordered.columns) == expected_features}")

	        # Row 0, column 1 of the probability matrix is what gets reported.
	        class_probabilities = model.predict_proba(X_ordered)
	        fraud_probability = class_probabilities[0, 1]
	        return {'success': True, 'probability': float(fraud_probability)}

	    except Exception as e:
	        print(f"Prediction error: {str(e)}")
	        return {'success': False, 'error': str(e)}

	def _bypass_default_features():
	    """Build the fallback value table used by ``bypass_preprocessing_predict``."""
	    defaults = {
	        # Core transaction features
	        'TransactionAmt': 100.0,
	        'TransactionDT': 86400,
	        'ProductCD': 'W',

	        # Card features
	        'card1': 13553, 'card2': 150.0, 'card3': 150.0, 'card4': 'visa',
	        'card5': 142.0, 'card6': 'credit',

	        # Address features
	        'addr1': 325.0, 'addr2': 87.0, 'dist1': 19.0, 'dist2': 19.0,

	        # Email features
	        'P_emaildomain': 'gmail.com', 'R_emaildomain': 'gmail.com',

	        # Device features
	        'DeviceType': 'desktop', 'DeviceInfo': 'Windows',

	        # Engineered features that are not re-derived from raw inputs here
	        'card1_card2_ratio': 13553 / (150.0 + 1),
	        'addr_match': 0,
	    }

	    # Count features C1-C14
	    count_ones = {1, 2, 6, 9, 11, 12, 13, 14}
	    for i in range(1, 15):
	        defaults[f'C{i}'] = 1.0 if i in count_ones else 0.0

	    # Time delta features D1-D15
	    for i in range(1, 16):
	        defaults[f'D{i}'] = 20.0 if i == 5 else 0.0

	    # Match features M1-M9 as numbers: M1-M3 -> 1.0 (encoded T),
	    # M4 -> 0.0 (encoded M0), M5-M9 -> 0.0 (encoded F)
	    for i in range(1, 10):
	        defaults[f'M{i}'] = 1.0 if i <= 3 else 0.0

	    # V features (1-339) - default to 1.0
	    for i in range(1, 340):
	        defaults[f'V{i}'] = 1.0

	    # Identity features id_01..id_38
	    for i in range(1, 39):
	        defaults[f'id_{i:02d}'] = 0.0

	    return defaults


	def bypass_preprocessing_predict(model, preprocessor, transaction_data):
	    """Make prediction by bypassing preprocessing and using direct feature mapping.

	    The feature order is read from ``preprocessor.feature_names``. Each
	    feature value is resolved as follows:

	    * ``TransactionAmt_log``, ``TransactionAmt_sqrt``, ``TransactionDT_hour``
	      and ``TransactionDT_day`` are always derived from the supplied
	      ``TransactionAmt`` / ``TransactionDT`` (defaults 100.0 / 86400 when
	      absent), so they stay consistent with the raw inputs;
	    * otherwise a value supplied in ``transaction_data`` is used as-is;
	    * otherwise the built-in default is used, or 0.0 for unknown names.

	    Args:
	        model: Object exposing ``predict_proba``; element ``[0, 1]`` of its
	            result is reported as the probability.
	        preprocessor: Object whose ``feature_names`` attribute lists the
	            expected features in order.
	        transaction_data: Mapping of feature name to supplied value.

	    Returns:
	        ``{'success': True, 'probability': float}`` when the prediction
	        runs, otherwise ``{'success': False, 'error': str}``. Exceptions are
	        reported in the result instead of being propagated.
	    """
	    try:
	        # Get the exact features and their order from the preprocessor.
	        # Emptiness is tested with len() because bool() of a multi-element
	        # numpy array raises.
	        expected_features = getattr(preprocessor, 'feature_names', None)
	        if expected_features is None or len(expected_features) == 0:
	            raise ValueError("Cannot determine expected features from preprocessor")

	        print(f"Expected features count: {len(expected_features)}")

	        defaults = _bypass_default_features()

	        # Derivations are lazy so a bad raw value only matters when the
	        # model actually expects the corresponding engineered feature.
	        engineered = {
	            'TransactionAmt_log':
	                lambda: np.log1p(transaction_data.get('TransactionAmt', 100.0)),
	            'TransactionAmt_sqrt':
	                lambda: np.sqrt(transaction_data.get('TransactionAmt', 100.0)),
	            'TransactionDT_hour':
	                lambda: (transaction_data.get('TransactionDT', 86400) / 3600) % 24,
	            'TransactionDT_day':
	                lambda: (transaction_data.get('TransactionDT', 86400) / (3600 * 24)) % 7,
	        }

	        # Create feature vector in exact order
	        feature_values = []
	        for feature in expected_features:
	            if feature in engineered:
	                value = engineered[feature]()
	            elif feature in transaction_data:
	                value = transaction_data[feature]
	            else:
	                value = defaults.get(feature, 0.0)
	            feature_values.append(value)

	        # Create properly ordered DataFrame
	        X = pd.DataFrame([feature_values], columns=expected_features)

	        print(f"Final feature matrix shape: {X.shape}")
	        print(f"Sample values: {X.iloc[0, :5].values}")

	        # Make prediction
	        prediction_proba = model.predict_proba(X)[0, 1]

	        return {
	            'success': True,
	            'probability': float(prediction_proba),
	        }

	    except Exception as e:
	        print(f"Bypass prediction error: {str(e)}")
	        return {
	            'success': False,
	            'error': str(e),
	        }

	# Diagnostic helper: report the feature list carried by a saved preprocessor
	def inspect_model_features(model_path, preprocessor_path):
	    """Load a saved model/preprocessor pair and report the feature names.

	    Prints the model's type name and the first/last 20 feature names, and
	    writes the full indexed list to ``model_feature_order.txt`` in the
	    current working directory.

	    Args:
	        model_path: Path given to ``joblib.load`` for the model.
	        preprocessor_path: Path given to ``joblib.load`` for the preprocessor.

	    Returns:
	        The preprocessor's ``feature_names`` value, or ``None`` when the
	        attribute is missing or any step fails (the error is printed).
	    """
	    try:
	        print("Loading model and preprocessor...")
	        model = joblib.load(model_path)
	        preprocessor = joblib.load(preprocessor_path)

	        print(f"Model type: {type(model).__name__}")

	        # Nothing to report when the preprocessor carries no feature list.
	        if not hasattr(preprocessor, 'feature_names'):
	            print("Preprocessor doesn't have feature_names attribute")
	            return None

	        features = preprocessor.feature_names
	        print(f"Preprocessor has {len(features)} feature names")
	        print("First 20 features:", features[:20])
	        print("Last 20 features:", features[-20:])

	        # Dump "<index>: <name>" lines so the order can be checked offline.
	        with open('model_feature_order.txt', 'w') as out:
	            out.writelines(
	                f"{i:4d}: {feature}\n" for i, feature in enumerate(features)
	            )
	        print("Feature order saved to model_feature_order.txt")

	        return features

	    except Exception as e:
	        print(f"Error inspecting model: {e}")
	        return None

	if __name__ == "__main__":
	    # Script entry point: inspect the saved artifacts named below.
	    # Both paths are relative, so they resolve against the working directory.
	    model_path = "fraud_detection_model_xgboost_20250727_145448.joblib"
	    preprocessor_path = "preprocessor_20250727_145448.joblib"

	    features = inspect_model_features(
	        model_path=model_path,
	        preprocessor_path=preprocessor_path,
	    )