File size: 12,134 Bytes
bf7e729
 
 
 
 
7fcaffe
bf7e729
 
 
 
7fcaffe
bf7e729
 
 
 
 
 
 
 
 
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
7fcaffe
 
1999f4d
7fcaffe
1999f4d
 
 
7fcaffe
 
 
 
 
 
 
bf7e729
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
7fcaffe
 
 
 
 
 
 
bf7e729
 
7fcaffe
 
bf7e729
1999f4d
7fcaffe
bf7e729
7fcaffe
 
 
 
 
bf7e729
 
 
7fcaffe
 
 
 
 
 
 
 
bf7e729
7fcaffe
 
 
 
 
1999f4d
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
bf7e729
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
 
7fcaffe
bf7e729
 
 
7fcaffe
bf7e729
7fcaffe
 
bf7e729
 
7fcaffe
 
bf7e729
7fcaffe
 
bf7e729
 
7fcaffe
 
 
bf7e729
7fcaffe
 
 
 
bf7e729
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
 
 
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
 
7fcaffe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf7e729
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import pandas as pd
import os
from datetime import datetime

def _clean_numeric_column(series):
    """Turn a column of loosely formatted values into numbers.

    Every value is stringified, all characters other than the digits 0-9 and
    '.' are removed, and the remainder is parsed with ``pd.to_numeric``.
    Whatever still cannot be parsed (for example text with no digits at all)
    comes back as NaN.

    Note that '-' is among the removed characters, so a leading minus sign
    does not survive the cleaning.
    """
    digits_only = series.astype(str).str.replace(r'[^0-9.]', '', regex=True)
    return pd.to_numeric(digits_only, errors='coerce')

def _calculate_age(dob_str, fight_date_str):
    """Return a fighter's age in years on the fight date.

    Args:
        dob_str: Date of birth. Missing values (None, NaN, empty string)
            yield None.
        fight_date_str: Date of the fight.

    Both arguments may be a string in one of the formats 'Jan 05, 1990',
    'January 05, 1990' or '1990-01-05', or an already-parsed datetime
    (``pd.Timestamp`` included).

    Returns:
        The age as a float (elapsed days / 365.25), or None when either date
        is missing or cannot be interpreted.
    """
    if pd.isna(dob_str) or not dob_str:
        return None

    def _parse(value):
        # Already a datetime/Timestamp: use it unless it is NaT.
        if isinstance(value, datetime):
            return None if pd.isna(value) else value
        for fmt in ('%b %d, %Y', '%B %d, %Y', '%Y-%m-%d'):
            try:
                return datetime.strptime(value, fmt)
            except (ValueError, TypeError):
                # Wrong format (ValueError) or not a string (TypeError).
                continue
        return None

    dob = _parse(dob_str)
    fight_date = _parse(fight_date_str)
    if dob is None or fight_date is None:
        return None
    try:
        return (fight_date - dob).days / 365.25
    except TypeError:
        # e.g. mixing timezone-aware and naive datetimes
        return None

def _get_days_since_last_fight(current_date, past_fights):
    """Return the number of days between ``current_date`` and the fighter's previous fight.

    Args:
        current_date: Date of the fight being evaluated.
        past_fights: Fight dicts, each carrying a 'date_obj' entry. The list
            may be in any order and may contain the current fight or later
            fights; only fights dated strictly before ``current_date`` are
            considered.

    Returns:
        Days since the most recent earlier fight, or None when there is none.
    """
    if not past_fights:
        return None
    # Do not rely on list order: pick the latest date that precedes the fight.
    earlier_dates = [f['date_obj'] for f in past_fights if f['date_obj'] < current_date]
    if not earlier_dates:
        return None
    return (current_date - max(earlier_dates)).days

def _get_win_streak(fighter_name, current_date, past_fights):
    """Count the fighter's consecutive wins immediately before ``current_date``.

    Args:
        fighter_name: Name compared against each fight's 'winner' entry.
        current_date: Only fights dated strictly before this are counted.
        past_fights: Fight dicts with 'date_obj' and 'winner' entries, in any
            order.

    Returns:
        The number of most-recent earlier fights in a row whose 'winner'
        equals ``fighter_name``. Any other result ends the streak.
    """
    earlier = [f for f in (past_fights or []) if f['date_obj'] < current_date]
    # Sort explicitly so the streak is counted newest-first even when the
    # caller's list is not chronological.
    earlier.sort(key=lambda f: f['date_obj'])

    streak = 0
    for fight in reversed(earlier):
        if fight['winner'] != fighter_name:
            break
        streak += 1
    return streak

def _to_int_safe(value):
    """Convert ``value`` to an int, falling back to 0 for anything unusable.

    Numeric strings and floats are truncated toward zero via
    ``int(float(value))``. Falsy values, NaN-like values, non-numeric input
    and infinities all return 0.
    """
    try:
        if not value or pd.isna(value):
            return 0
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() of +/-inf (also reachable via strings like '1e999').
        return 0

def _mmss_to_seconds(clock):
    """Convert an 'M:SS' clock string to seconds; return None if it is not in that form."""
    if not isinstance(clock, str) or ':' not in clock:
        return None
    try:
        mins, secs = map(int, clock.split(':'))
    except ValueError:
        # Non-numeric parts or more than one ':'.
        return None
    return mins * 60 + secs


def _get_fighter_history_stats(fighter_name, current_fight_date, past_fights, fighters_df, n_fights=5):
    """Summarise a fighter's recent form from fights before ``current_fight_date``.

    Args:
        fighter_name: Fighter whose history is summarised.
        current_fight_date: Only fights with 'date_obj' strictly earlier than
            this are used.
        past_fights: Fight dicts (any order). Each is read for 'date_obj',
            'fighter_1', 'fighter_2', 'winner', 'method', 'round' and,
            optionally, 'round_time' plus per-corner columns prefixed 'f1_'
            / 'f2_' (kd, sig_str_landed, sig_str_attempted, td_landed,
            td_attempted, sub_attempts, ctrl_time).
        fighters_df: DataFrame indexed by fighter name; its 'elo' column, if
            present, supplies opponent ratings.
        n_fights: Size of the "last N fights" window.

    Returns:
        A dict with raw totals over the window, 'fights_in_last_year'
        (counted over all earlier fights, not just the window) and derived
        ``*_last_n`` rates. Rates that cannot be computed default to 0.0,
        accuracy/defense ratios to 0.5 and the average opponent ELO to 1500.0.

        Defensive figures are computed from the opponent's columns:
        strikes absorbed per minute use the opponent's landed significant
        strikes, and the two defense ratios are 1 - (opponent landed /
        opponent attempted).
    """
    earlier = sorted(
        (f for f in past_fights if f['date_obj'] < current_fight_date),
        key=lambda f: f['date_obj'],
    )
    # Guard n_fights <= 0: a plain [-0:] slice would select every fight.
    last_n_fights = earlier[-n_fights:] if n_fights > 0 else []

    stats = {
        'wins_last_n': 0,
        'ko_wins': 0,
        'total_finishes': 0,
        'first_round_finishes': 0,
        'knockdowns_scored': 0,
        'knockdowns_absorbed': 0,
        'sig_str_landed': 0,
        'sig_str_attempted': 0,
        'takedowns_landed': 0,
        'takedowns_attempted': 0,
        'sub_attempts': 0,
        'ctrl_time_sec': 0,
        'total_fight_time_sec': 0,
        'fights_in_last_year': 0,
        'avg_opp_elo_last_n': 0
    }

    one_year_ago = current_fight_date - pd.Timedelta(days=365)
    stats['fights_in_last_year'] = sum(1 for f in earlier if f['date_obj'] >= one_year_ago)

    # Opponent-side totals feed the defensive metrics; kept local so the
    # returned key set stays as before.
    opp_sig_landed = 0
    opp_sig_attempted = 0
    opp_td_landed = 0
    opp_td_attempted = 0

    total_opp_elo = 0
    rated_opponents = 0
    has_elo = 'elo' in fighters_df.columns

    for fight in last_n_fights:
        is_fighter_1 = fight['fighter_1'] == fighter_name
        own, opp = ('f1', 'f2') if is_fighter_1 else ('f2', 'f1')
        opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
        round_num = _to_int_safe(fight['round'])

        # Wins and finishes
        if fight['winner'] == fighter_name:
            stats['wins_last_n'] += 1
            method = str(fight['method'])
            # Any method mentioning "decision" (e.g. 'Decision - Unanimous')
            # is not a finish, not only the exact string 'Decision'.
            if 'decision' not in method.lower():
                stats['total_finishes'] += 1
                # Compare numerically so both '1' and 1 count as round one.
                if round_num == 1:
                    stats['first_round_finishes'] += 1
            if 'KO' in method:
                stats['ko_wins'] += 1

        # Offensive totals (fighter's own corner)
        stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{own}_kd'))
        stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp}_kd'))
        stats['sig_str_landed'] += _to_int_safe(fight.get(f'{own}_sig_str_landed'))
        stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{own}_sig_str_attempted'))
        stats['takedowns_landed'] += _to_int_safe(fight.get(f'{own}_td_landed'))
        stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{own}_td_attempted'))
        stats['sub_attempts'] += _to_int_safe(fight.get(f'{own}_sub_attempts'))

        # Defensive totals (opponent's corner)
        opp_sig_landed += _to_int_safe(fight.get(f'{opp}_sig_str_landed'))
        opp_sig_attempted += _to_int_safe(fight.get(f'{opp}_sig_str_attempted'))
        opp_td_landed += _to_int_safe(fight.get(f'{opp}_td_landed'))
        opp_td_attempted += _to_int_safe(fight.get(f'{opp}_td_attempted'))

        # Control time
        ctrl_sec = _mmss_to_seconds(fight.get(f'{own}_ctrl_time', '0:00'))
        if ctrl_sec is not None:
            stats['ctrl_time_sec'] += ctrl_sec

        # Fight duration: completed rounds are counted as 300 s each, plus
        # the clock of the final round. An unparseable round counts as zero
        # completed rounds rather than a negative duration.
        final_round_sec = _mmss_to_seconds(fight.get('round_time', '0:00'))
        if final_round_sec is not None:
            stats['total_fight_time_sec'] += max(round_num - 1, 0) * 300 + final_round_sec

        # Opponent ELO
        # NOTE(review): this reads the single rating stored in fighters_df,
        # which may not be the opponent's rating at the time of the fight.
        if has_elo and opponent_name in fighters_df.index:
            opp_elo = fighters_df.loc[opponent_name, 'elo']
            if not pd.isna(opp_elo):
                total_opp_elo += opp_elo
                rated_opponents += 1

    n_actual_fights = len(last_n_fights)

    def _per_fight(total):
        return total / n_actual_fights if n_actual_fights > 0 else 0.0

    stats['finish_rate_last_n'] = _per_fight(stats['total_finishes'])
    stats['first_round_finish_rate_last_n'] = _per_fight(stats['first_round_finishes'])
    stats['ko_percent_last_n'] = _per_fight(stats['ko_wins'])
    stats['avg_knockdowns_per_fight_last_n'] = _per_fight(stats['knockdowns_scored'])
    stats['knockdowns_absorbed_per_fight_last_n'] = _per_fight(stats['knockdowns_absorbed'])
    # Average only over opponents that actually have a rating.
    stats['avg_opp_elo_last_n'] = total_opp_elo / rated_opponents if rated_opponents > 0 else 1500.0

    total_mins = stats['total_fight_time_sec'] / 60

    def _per_minute(total):
        return total / total_mins if total_mins > 0 else 0.0

    stats['sig_str_landed_per_min_last_n'] = _per_minute(stats['sig_str_landed'])
    stats['sig_str_absorbed_per_min_last_n'] = _per_minute(opp_sig_landed)
    stats['sub_attempts_per_min_last_n'] = _per_minute(stats['sub_attempts'])
    stats['avg_ctrl_time_sec_per_min_last_n'] = _per_minute(stats['ctrl_time_sec'])

    stats['sig_str_defense_last_n'] = (
        1 - opp_sig_landed / opp_sig_attempted if opp_sig_attempted > 0 else 0.5
    )
    stats['takedown_accuracy_last_n'] = (
        stats['takedowns_landed'] / stats['takedowns_attempted']
        if stats['takedowns_attempted'] > 0 else 0.5
    )
    stats['takedown_defense_last_n'] = (
        1 - opp_td_landed / opp_td_attempted if opp_td_attempted > 0 else 0.5
    )

    return stats

def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """Transform raw fight records into an ML-ready feature matrix.

    Args:
        fights_to_process: List of fight dicts. Each needs 'fighter_1',
            'fighter_2' and 'event_date'; 'winner' and the per-fight stat
            columns are read when present. The list is iterated more than
            once, so it must not be a one-shot iterator.
        fighters_csv_path: Path to a CSV with at least 'first_name' and
            'last_name' columns; 'height_cm', 'reach_in', 'elo' and 'dob'
            are used when available.

    Returns:
        A tuple ``(X, y, metadata)``:
        * ``X``: fighter_1-minus-fighter_2 difference features, columns in
          alphabetical order, NaN replaced by 0.
        * ``y``: 1 when 'winner' equals fighter_1, otherwise 0 (so any other
          outcome is labelled 0 as well).
        * ``metadata``: the 'date', 'fighter_1', 'fighter_2' and 'winner'
          columns for each retained fight.
        Empty objects are returned when no fight could be processed.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.

    Side effects:
        A parsed 'date_obj' entry is added to the fight dicts that are
        touched during processing.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    # Load and prepare fighter data
    fighters_df = pd.read_csv(fighters_csv_path)
    fighters_df['full_name'] = fighters_df['first_name'] + ' ' + fighters_df['last_name']
    fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_df.columns:
            fighters_df[col] = _clean_numeric_column(fighters_df[col])

    # Group fights by participant once, instead of rescanning the whole list
    # for every fight. Each list keeps the input order.
    fights_by_fighter = {}
    for bout in fights_to_process:
        for name in {bout['fighter_1'], bout['fighter_2']}:
            fights_by_fighter.setdefault(name, []).append(bout)

    def _earlier_fights(history, cutoff):
        """Fights strictly before ``cutoff``, oldest first."""
        return sorted(
            (f for f in history if f['date_obj'] < cutoff),
            key=lambda f: f['date_obj'],
        )

    known_fighters = fighters_df.index
    processed_fights = []
    for fight in fights_to_process:
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        # Skip if either fighter is missing from the fighters table
        if f1_name not in known_fighters or f2_name not in known_fighters:
            continue

        f1_stats = fighters_df.loc[f1_name]
        f2_stats = fighters_df.loc[f2_name]

        fight_date = pd.to_datetime(fight['event_date'])
        fight['date_obj'] = fight_date

        f1_hist = fights_by_fighter[f1_name]
        f2_hist = fights_by_fighter[f2_name]

        # Make sure every historical fight has a parsed date before comparing
        for hist_fight in f1_hist + f2_hist:
            if 'date_obj' not in hist_fight:
                hist_fight['date_obj'] = pd.to_datetime(hist_fight['event_date'])

        # The layoff and streak helpers need only earlier fights, in
        # chronological order; the raw history also holds this fight and
        # later ones.
        f1_prior = _earlier_fights(f1_hist, fight_date)
        f2_prior = _earlier_fights(f2_hist, fight_date)

        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_prior, fighters_df)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_prior, fighters_df)

        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
        # With one age unknown, substituting 0 for it would create a
        # decades-wide fake gap; treat the difference as unknown (0) instead.
        age_diff = f1_age - f2_age if f1_age is not None and f2_age is not None else 0

        # ~1.5 years when the fighter has no previous fight on record
        f1_days_since_last = _get_days_since_last_fight(fight_date, f1_prior)
        if f1_days_since_last is None:
            f1_days_since_last = 547
        f2_days_since_last = _get_days_since_last_fight(fight_date, f2_prior)
        if f2_days_since_last is None:
            f2_days_since_last = 547

        f1_win_streak = _get_win_streak(f1_name, fight_date, f1_prior)
        f2_win_streak = _get_win_streak(f2_name, fight_date, f2_prior)

        feature_dict = {
            'winner': 1 if fight.get('winner') == f1_name else 0,
            'date': fight['event_date'],
            'fighter_1': f1_name,
            'fighter_2': f2_name,

            # Physical differences
            'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff': age_diff,
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),

            # Career momentum
            'days_since_last_fight_diff': f1_days_since_last - f2_days_since_last,
            'win_streak_diff': f1_win_streak - f2_win_streak,
            'fights_last_year_diff': f1_hist_stats['fights_in_last_year'] - f2_hist_stats['fights_in_last_year'],

            # Performance differences
            'finish_rate_diff': f1_hist_stats['finish_rate_last_n'] - f2_hist_stats['finish_rate_last_n'],
            'ko_rate_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_per_min_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            'td_accuracy_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
            'control_time_diff': f1_hist_stats['avg_ctrl_time_sec_per_min_last_n'] - f2_hist_stats['avg_ctrl_time_sec_per_min_last_n'],

            # Defense differences
            'sig_str_defense_diff': f1_hist_stats['sig_str_defense_last_n'] - f2_hist_stats['sig_str_defense_last_n'],
            'td_defense_diff': f1_hist_stats['takedown_defense_last_n'] - f2_hist_stats['takedown_defense_last_n'],
            'knockdowns_absorbed_diff': f1_hist_stats['knockdowns_absorbed_per_fight_last_n'] - f2_hist_stats['knockdowns_absorbed_per_fight_last_n'],
        }

        processed_fights.append(feature_dict)

    if not processed_fights:
        # Explicit dtype: a bare pd.Series() warns on older pandas versions.
        return pd.DataFrame(), pd.Series(dtype='int64'), pd.DataFrame()

    df = pd.DataFrame(processed_fights)
    metadata = df[['date', 'fighter_1', 'fighter_2', 'winner']]

    y = df['winner']
    X = df.drop(columns=['winner', 'date', 'fighter_1', 'fighter_2'])
    X = X.reindex(sorted(X.columns), axis=1)  # Ensure consistent column order

    # Missing physical attributes propagate as NaN; replace them with 0
    X = X.fillna(0)

    return X, y, metadata