0xnu committed on
Commit
496ab6f
·
verified ·
1 Parent(s): 4dc9c64

Upload feature_utils.py

Browse files
Files changed (1) hide show
  1. feature_utils.py +136 -0
feature_utils.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
def get_model_expected_features():
    """Return the ordered list of feature names the trained model expects.

    The names are grouped (in order) as: transaction basics, card fields,
    address, distance, e-mail domains, C counts, D time deltas, M match
    flags, V (Vesta) features, identity fields and device fields.

    Returns:
        A new list of 192 feature-name strings on every call.
    """
    def _numbered(prefix, count, pad=1):
        # Names prefix1 .. prefix<count>; `pad` zero-pads the number
        # (used for the id_01 .. id_38 identity columns).
        return [f'{prefix}{n:0{pad}d}' for n in range(1, count + 1)]

    # Basic transaction features
    names = ['TransactionAmt', 'TransactionDT']

    # Card features (card1-card6)
    names += _numbered('card', 6)

    # Address and distance features
    names += ['addr1', 'addr2']
    names += ['dist1', 'dist2']

    # Email features
    names += ['P_emaildomain', 'R_emaildomain']

    # Count (C1-C14), time delta (D1-D15) and match (M1-M9) features
    names += _numbered('C', 14)
    names += _numbered('D', 15)
    names += _numbered('M', 9)

    # Vesta features (sample only - there are many more V features)
    names += _numbered('V', 100)

    # Identity features (id_01 to id_38)
    names += _numbered('id_', 38, pad=2)

    # Device features
    names += ['DeviceType', 'DeviceInfo']

    return names
55
+
56
def fill_missing_features(transaction_data):
    """Return a copy of ``transaction_data`` with every expected feature present.

    Keys already supplied by the caller are kept untouched (including keys
    that are not expected features). Each expected feature that is absent
    is added with its entry from the defaults table below, or ``0.0`` when
    the table has no entry for it (e.g. ``TransactionAmt``).

    Args:
        transaction_data: Mapping of feature name to value.

    Returns:
        A new dict: the caller's keys first, followed by the filled-in
        features in the order given by ``get_model_expected_features()``.
    """
    # Group-wide fallbacks first; individual overrides are applied afterwards.
    fallback = {}
    fallback.update(dict.fromkeys((f'C{n}' for n in range(1, 15)), 0.0))
    fallback.update(dict.fromkeys((f'D{n}' for n in range(1, 16)), 0.0))
    fallback.update(dict.fromkeys((f'M{n}' for n in range(1, 10)), 'F'))
    fallback.update(dict.fromkeys((f'V{n}' for n in range(1, 101)), 1.0))
    fallback.update(dict.fromkeys((f'id_{n:02d}' for n in range(1, 39)), 0.0))

    # Count features that default to 1 instead of 0.
    for count_name in ('C1', 'C2', 'C6', 'C9', 'C11', 'C12', 'C13', 'C14'):
        fallback[count_name] = 1.0

    # Common non-zero time delta.
    fallback['D5'] = 20.0

    # Match flags: the first three default to T; M4 is a special case.
    for match_name in ('M1', 'M2', 'M3'):
        fallback[match_name] = 'T'
    fallback['M4'] = 'M0'

    # Numeric card / address / distance defaults.
    fallback.update({
        'card1': 13553,
        'card2': 150.0,
        'card3': 150.0,
        'card5': 142.0,
        'addr1': 325.0,
        'addr2': 87.0,
        'dist1': 19.0,
        'dist2': 19.0,
    })

    # Categorical defaults.
    fallback.update({
        'card4': 'visa',
        'card6': 'credit',
        'P_emaildomain': 'gmail.com',
        'R_emaildomain': 'gmail.com',
        'DeviceType': 'desktop',
        'DeviceInfo': 'Windows',
    })

    # Default timestamp.
    fallback['TransactionDT'] = 86400

    # Start from the caller's values so they always win over the defaults.
    filled = dict(transaction_data)

    # setdefault only inserts when the key is absent, so supplied values
    # (whatever they are) are never overwritten.
    for name in get_model_expected_features():
        filled.setdefault(name, fallback.get(name, 0.0))

    return filled
112
+
113
def create_simple_transaction(amount, card_type="visa", email_domain="gmail.com", hour=12):
    """Build a full transaction record from a handful of inputs.

    Args:
        amount: Transaction amount; converted with ``float()``.
        card_type: Value stored under ``card4``.
        email_domain: Value stored under both ``P_emaildomain`` and
            ``R_emaildomain``.
        hour: Multiplied by 3600 and stored under ``TransactionDT``.

    Returns:
        The dict produced by ``fill_missing_features`` for these seed values.
    """
    seed = dict(
        TransactionAmt=float(amount),
        TransactionDT=hour * 3600,
        card4=card_type,
        P_emaildomain=email_domain,
        R_emaildomain=email_domain,
    )

    # Every feature not set above is populated by fill_missing_features.
    return fill_missing_features(seed)
126
+
127
def validate_features(df, expected_features):
    """Check which expected features a DataFrame has, lacks, or adds.

    Args:
        df: Object exposing an iterable ``columns`` attribute (e.g. a
            pandas DataFrame).
        expected_features: Iterable of expected column names. It is read
            only once, so one-shot iterables such as generators are safe.

    Returns:
        A dict with:
            'missing': expected names absent from ``df.columns``, in the
                order they appear in ``expected_features``;
            'extra': column names not in ``expected_features``, in column
                order;
            'is_valid': True when nothing is missing.
        Both lists are de-duplicated.
    """
    # Materialise each input once. dict.fromkeys drops duplicates while
    # keeping first-seen order, so the report is deterministic across runs
    # (a plain set would give hash-dependent ordering).
    expected = list(dict.fromkeys(expected_features))
    present = list(dict.fromkeys(df.columns))

    # Sets are used only for O(1) membership tests, never for ordering.
    expected_lookup = set(expected)
    present_lookup = set(present)

    missing = [name for name in expected if name not in present_lookup]
    extra = [name for name in present if name not in expected_lookup]

    return {
        'missing': missing,
        'extra': extra,
        'is_valid': not missing,
    }