Spaces:

0xnu
/

fraud-detection

Sleeping

App Files Files Community

fraud-detection / feature_detector.py

0xnu

Upload 5 files

4de4bb8 verified about 1 month ago

raw

history blame

7.17 kB

	import joblib
	import pandas as pd
	import numpy as np

	def get_model_feature_names(preprocessor):
	    """Return the feature names stored on ``preprocessor``, or ``None``.

	    The names are read from the ``feature_names`` attribute. ``None`` is
	    returned when the attribute is missing, is ``None``, is empty, or cannot
	    be read. The lookup is best-effort: ordinary errors never propagate.

	    Args:
	        preprocessor: Any object that may expose a ``feature_names`` attribute.

	    Returns:
	        The ``feature_names`` value unchanged when it has a plain truth value
	        (e.g. a list), a ``list`` copy when it is an array-like whose truth
	        value is ambiguous, or ``None`` when no names are available.
	    """
	    try:
	        names = getattr(preprocessor, 'feature_names', None)
	        if names is None:
	            return None
	        try:
	            return names if names else None
	        except ValueError:
	            # NumPy arrays and pandas Index objects refuse ``bool()`` when
	            # they hold more than one element. Size them explicitly and
	            # hand back a plain list so callers can safely truth-test it.
	            return list(names) if len(names) > 0 else None
	    except Exception:
	        # Best-effort lookup: a misbehaving attribute means "unknown".
	        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit through.
	        return None

	def create_complete_feature_set(transaction_data, expected_features):
	    """Pad a transaction record so it contains every expected feature.

	    A copy of ``transaction_data`` is made and the input itself is left
	    untouched. Each name in ``expected_features`` that is absent from the
	    copy is filled in, first from a table of explicit defaults and otherwise
	    from a fallback chosen by the name's prefix (``0.0`` when no prefix
	    matches). Values already present are never overwritten.

	    Args:
	        transaction_data: Mapping of feature name to value; must support
	            ``.copy()``, ``in`` and item assignment.
	        expected_features: Iterable of feature names to guarantee.

	    Returns:
	        The padded copy of ``transaction_data``.
	    """
	    padded = transaction_data.copy()

	    # Explicit defaults for individually named features.
	    known_defaults = {
	        # Transaction basics
	        'TransactionAmt': 100.0,
	        'TransactionDT': 86400,
	        'ProductCD': 'W',
	        # Card features
	        'card1': 13553,
	        'card2': 150.0,
	        'card3': 150.0,
	        'card4': 'visa',
	        'card5': 142.0,
	        'card6': 'credit',
	        # Address and distance
	        'addr1': 325.0,
	        'addr2': 87.0,
	        'dist1': 19.0,
	        'dist2': 19.0,
	        # Email domains
	        'P_emaildomain': 'gmail.com',
	        'R_emaildomain': 'gmail.com',
	        # Device info
	        'DeviceType': 'desktop',
	        'DeviceInfo': 'Windows',
	    }

	    # Count features C1-C14: zero unless listed as one.
	    known_defaults.update(dict.fromkeys((f'C{n}' for n in range(1, 15)), 0.0))
	    known_defaults.update(
	        dict.fromkeys(('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'), 1.0)
	    )

	    # Time delta features D1-D15: zero except D5.
	    known_defaults.update(dict.fromkeys((f'D{n}' for n in range(1, 16)), 0.0))
	    known_defaults['D5'] = 20.0

	    # Match features M1-M9: 'F' unless overridden below.
	    known_defaults.update(dict.fromkeys((f'M{n}' for n in range(1, 10)), 'F'))
	    known_defaults.update({'M1': 'T', 'M2': 'T', 'M3': 'T', 'M4': 'M0'})

	    # V1-V339 and id_01-id_38.
	    known_defaults.update(dict.fromkeys((f'V{n}' for n in range(1, 340)), 1.0))
	    known_defaults.update(dict.fromkeys((f'id_{n:02d}' for n in range(1, 39)), 0.0))

	    # Fallbacks for names outside the table, tried in this order.
	    prefix_fallbacks = (
	        ('V', 1.0),
	        ('id_', 0.0),
	        ('C', 0.0),
	        ('D', 0.0),
	        ('M', 'F'),
	    )

	    for name in expected_features:
	        if name in padded:
	            continue
	        if name in known_defaults:
	            padded[name] = known_defaults[name]
	            continue
	        # No explicit default: pick by prefix, else a numeric zero.
	        padded[name] = next(
	            (value for prefix, value in prefix_fallbacks if name.startswith(prefix)),
	            0.0,
	        )

	    return padded

	def debug_feature_mismatch(model_features, data_features):
	    """Report which feature names differ between the model and the data.

	    Prints a short summary and returns the same information as a dict.

	    Args:
	        model_features: Iterable of feature names the model expects, or
	            ``None`` (treated as empty).
	        data_features: Iterable of feature names present in the data, or
	            ``None`` (treated as empty).

	    Returns:
	        A dict with keys ``'missing'`` (names expected but absent from the
	        data), ``'extra'`` (names in the data the model does not expect),
	        ``'model_count'`` and ``'data_count'`` (numbers of distinct names).
	        Both lists are sorted so the output is the same on every run.
	    """
	    # Compare against None instead of truth-testing: NumPy arrays and pandas
	    # Index objects raise ValueError when used in a boolean context.
	    model_set = set(model_features) if model_features is not None else set()
	    data_set = set(data_features) if data_features is not None else set()

	    # Set iteration order is arbitrary, so sort for reproducible output.
	    # key=str keeps mixed-type names (e.g. ints and strings) comparable.
	    missing_in_data = sorted(model_set - data_set, key=str)
	    extra_in_data = sorted(data_set - model_set, key=str)

	    print(f"Model expects {len(model_set)} features")
	    print(f"Data has {len(data_set)} features")
	    print(f"Missing in data: {len(missing_in_data)} features")
	    print(f"Extra in data: {len(extra_in_data)} features")

	    if missing_in_data:
	        print(f"First 10 missing features: {missing_in_data[:10]}")

	    return {
	        'missing': missing_in_data,
	        'extra': extra_in_data,
	        'model_count': len(model_set),
	        'data_count': len(data_set),
	    }

	def safe_predict_with_feature_matching(model, preprocessor, transaction_data):
	    """Score one transaction after padding it to the expected feature set.

	    The expected feature names come from ``get_model_feature_names``. The
	    transaction is padded with ``create_complete_feature_set``, wrapped in a
	    one-row DataFrame, run through ``preprocessor.preprocess(..., fit=False)``
	    and passed to ``model.predict_proba``.

	    Args:
	        model: Object exposing ``predict_proba``.
	        preprocessor: Object exposing ``preprocess(df, fit=False)``, which
	            must return a pair whose first item has a ``columns`` attribute.
	        transaction_data: Mapping of feature name to value for one transaction.

	    Returns:
	        On success, ``{'success': True, 'probability': float, 'debug_info': dict}``
	        where the probability is element ``[0, 1]`` of the ``predict_proba``
	        output. On any exception,
	        ``{'success': False, 'error': str, 'debug_info': None}``.
	    """
	    try:
	        feature_names = get_model_feature_names(preprocessor)
	        if feature_names is None:
	            raise ValueError("Could not determine expected features from preprocessor")

	        # Pad the record and wrap it as a single-row frame.
	        padded_row = create_complete_feature_set(transaction_data, feature_names)
	        frame = pd.DataFrame([padded_row])

	        # Add a placeholder TransactionID when the caller did not supply one.
	        if 'TransactionID' not in frame.columns:
	            frame['TransactionID'] = 'temp_id'

	        X_processed, _ = preprocessor.preprocess(frame, fit=False)

	        # Report how the processed columns line up with the expected names.
	        processed_columns = X_processed.columns.tolist()
	        alignment = debug_feature_mismatch(feature_names, processed_columns)
	        print(f"Feature alignment: {alignment}")

	        # Row 0, column 1 of the predict_proba output.
	        positive_proba = model.predict_proba(X_processed)[0, 1]

	        outcome = {
	            'success': True,
	            'probability': float(positive_proba),
	            'debug_info': alignment,
	        }
	        return outcome

	    except Exception as exc:
	        # Boundary handler: report the failure to the caller instead of raising.
	        failure = {
	            'success': False,
	            'error': str(exc),
	            'debug_info': None,
	        }
	        return failure

	# Function to inspect your saved model's expected features
	def inspect_model_features(model_path, preprocessor_path):
	    """Load a saved model and preprocessor and print the expected features.

	    Prints the model type, the first and last 20 feature names, and a count
	    of features per name prefix (``V``, ``id_``, ``C``, ``D``, ``M``).

	    Args:
	        model_path: Path of the joblib file holding the model.
	        preprocessor_path: Path of the joblib file holding the preprocessor.

	    Returns:
	        The preprocessor's ``feature_names`` value, or ``None`` when the
	        attribute is missing or anything fails while loading or inspecting
	        (the error is printed rather than raised).
	    """
	    try:
	        print("Loading model and preprocessor...")
	        # NOTE(security): joblib.load unpickles the file and can execute
	        # arbitrary code; only point this at files from a trusted source.
	        model = joblib.load(model_path)
	        preprocessor = joblib.load(preprocessor_path)

	        print(f"Model type: {type(model).__name__}")

	        # Try to get feature names from preprocessor
	        if not hasattr(preprocessor, 'feature_names'):
	            print("Preprocessor doesn't have feature_names attribute")
	            return None

	        features = preprocessor.feature_names
	        print(f"Preprocessor has {len(features)} feature names")
	        print("First 20 features:", features[:20])
	        print("Last 20 features:", features[-20:])

	        # Analyze feature patterns
	        v_features = [f for f in features if f.startswith('V')]
	        id_features = [f for f in features if f.startswith('id_')]
	        c_features = [f for f in features if f.startswith('C')]
	        d_features = [f for f in features if f.startswith('D')]
	        m_features = [f for f in features if f.startswith('M')]

	        print("\nFeature breakdown:")
	        if v_features:
	            # Order by length first so 'V9' sorts before 'V10'; a plain string
	            # min/max would report e.g. 'V99' as the top of a V1..V339 range.
	            def numeric_order(name):
	                return (len(name), name)

	            lowest = min(v_features, key=numeric_order)
	            highest = max(v_features, key=numeric_order)
	            print(f"V features: {len(v_features)} (range: {lowest} to {highest})")
	        else:
	            # min()/max() raise on an empty list, which used to abort the
	            # whole inspection and hide an otherwise valid feature list.
	            print("V features: 0")
	        print(f"ID features: {len(id_features)}")
	        print(f"C features: {len(c_features)}")
	        print(f"D features: {len(d_features)}")
	        print(f"M features: {len(m_features)}")

	        return features

	    except Exception as e:
	        # Diagnostic helper: report the problem and signal failure with None.
	        print(f"Error inspecting model: {e}")
	        return None

	if __name__ == "__main__":
	    # Script entry point: print the features the saved artifacts expect.
	    # Both paths are relative to the current working directory.
	    model_path = "fraud_detection_model_xgboost_20250727_145448.joblib"
	    preprocessor_path = "preprocessor_20250727_145448.joblib"

	    features = inspect_model_features(
	        model_path=model_path,
	        preprocessor_path=preprocessor_path,
	    )