Spaces:
Sleeping
Sleeping
Upload feature_utils.py
Browse files- feature_utils.py +136 -0
feature_utils.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
def get_model_expected_features():
    """Return all features that the trained model expects.

    The names are grouped, in order, as: transaction, card, address,
    distance, email, count (C1-C14), time delta (D1-D15), match (M1-M9),
    Vesta (V1-V100), identity (id_01-id_38) and device columns.

    Returns:
        list: A newly built list of 192 feature-name strings in the group
        order described above. A fresh list is created on every call, so
        callers may mutate the result freely.
    """
    # Based on the original training data, these are typical features.
    expected_features = [
        # Basic transaction features
        'TransactionAmt', 'TransactionDT',

        # Card features
        'card1', 'card2', 'card3', 'card4', 'card5', 'card6',

        # Address features
        'addr1', 'addr2',

        # Distance features
        'dist1', 'dist2',

        # Email features
        'P_emaildomain', 'R_emaildomain',

        # Count features (C1-C14)
        *(f'C{i}' for i in range(1, 15)),

        # Time delta features (D1-D15)
        *(f'D{i}' for i in range(1, 16)),

        # Match features (M1-M9)
        *(f'M{i}' for i in range(1, 10)),

        # Vesta features (sample - there are many more V features)
        *(f'V{i}' for i in range(1, 101)),

        # Identity features (id_01 to id_38); the index is zero-padded
        # to two digits.
        *(f'id_{i:02d}' for i in range(1, 39)),

        # Device features
        'DeviceType', 'DeviceInfo',
    ]

    return expected_features
|
55 |
+
|
56 |
+
def fill_missing_features(transaction_data):
    """Fill missing features with appropriate default values.

    Args:
        transaction_data: Mapping (or iterable of key/value pairs) of
            feature name to value for the features the caller knows.
            ``None`` is treated as an empty mapping.

    Returns:
        dict: A new dict holding every provided entry plus a default for
        each expected feature that was not provided. Keys that are not in
        the expected feature list are kept as-is. The input is not mutated.
    """
    # Get all expected features
    expected_features = get_model_expected_features()

    # Default values for different feature types.
    # In a dict display later keys win, so the explicit entries that follow
    # each ``**{...}`` expansion override the blanket value for that family.
    defaults = {
        # Numeric features default to 0 or reasonable values
        'card1': 13553, 'card2': 150.0, 'card3': 150.0, 'card5': 142.0,
        'addr1': 325.0, 'addr2': 87.0,
        'dist1': 19.0, 'dist2': 19.0,

        # Count features (C1-C14) - mostly 0 or 1
        **{f'C{i}': 0.0 for i in range(1, 15)},
        'C1': 1.0, 'C2': 1.0, 'C6': 1.0, 'C9': 1.0,
        'C11': 1.0, 'C12': 1.0, 'C13': 1.0, 'C14': 1.0,

        # Time delta features (D1-D15) - mostly 0
        **{f'D{i}': 0.0 for i in range(1, 16)},
        'D5': 20.0,  # Common non-zero value

        # Match features (M1-M9) - mostly F with some T
        **{f'M{i}': 'F' for i in range(1, 10)},
        'M1': 'T', 'M2': 'T', 'M3': 'T',
        'M4': 'M0',  # Special case

        # Vesta features (V1-V100) - default to 1.0
        **{f'V{i}': 1.0 for i in range(1, 101)},

        # Identity features (id_01 to id_38) - default to 0.0
        **{f'id_{i:02d}': 0.0 for i in range(1, 39)},

        # Categorical features
        'card4': 'visa',
        'card6': 'credit',
        'P_emaildomain': 'gmail.com',
        'R_emaildomain': 'gmail.com',
        'DeviceType': 'desktop',
        'DeviceInfo': 'Windows',

        # Transaction defaults
        'TransactionDT': 86400,  # Default timestamp
    }

    # Create complete transaction data, starting from what was provided.
    complete_data = {}
    if transaction_data is not None:
        complete_data.update(transaction_data)

    # Then fill missing features with defaults. A feature with no entry in
    # ``defaults`` (e.g. 'TransactionAmt') falls back to 0.0.
    for feature in expected_features:
        if feature not in complete_data:
            complete_data[feature] = defaults.get(feature, 0.0)

    return complete_data
|
112 |
+
|
113 |
+
def create_simple_transaction(amount, card_type="visa", email_domain="gmail.com", hour=12):
    """Create a transaction with minimal inputs and smart defaults.

    Args:
        amount: Transaction amount; converted with ``float()`` and stored
            as 'TransactionAmt'.
        card_type: Value stored as 'card4'.
        email_domain: Value stored as both 'P_emaildomain' and
            'R_emaildomain'.
        hour: Multiplied by 3600 and stored as 'TransactionDT'
            (presumably an hour of day converted to seconds - confirm
            against the model's training data).

    Returns:
        dict: The result of ``fill_missing_features`` applied to the five
        fields above, i.e. those fields plus defaults for every other
        expected feature.

    Raises:
        TypeError, ValueError: If ``amount`` cannot be converted to float.
    """
    transaction_data = {
        'TransactionAmt': float(amount),
        'TransactionDT': hour * 3600,
        'card4': card_type,
        'P_emaildomain': email_domain,
        'R_emaildomain': email_domain,
    }

    # Fill all missing features
    return fill_missing_features(transaction_data)
|
126 |
+
|
127 |
+
def validate_features(df, expected_features):
    """Validate that DataFrame has all expected features.

    Args:
        df: Object exposing an iterable ``columns`` attribute (e.g. a
            pandas DataFrame).
        expected_features: Iterable of column names that must be present.

    Returns:
        dict: A report with three keys:
            'missing'  - sorted list of expected names absent from ``df``;
            'extra'    - sorted list of ``df`` columns that are not expected;
            'is_valid' - True when nothing is missing (extra columns do not
                         make the frame invalid).
    """
    expected = set(expected_features)
    actual = set(df.columns)

    # Sort so the report is deterministic; plain ``list(set)`` order is
    # arbitrary. ``key=str`` keeps the sort from raising if column labels
    # mix types (e.g. ints and strings).
    missing_features = sorted(expected - actual, key=str)
    extra_features = sorted(actual - expected, key=str)

    return {
        'missing': missing_features,
        'extra': extra_features,
        'is_valid': not missing_features,
    }
|