"""Feature engineering for fight-outcome models.

Turns raw fight records plus a fighter attribute table into a numeric matrix
of "fighter 1 minus fighter 2" difference features.
"""

import os
from collections import defaultdict
from datetime import datetime

import pandas as pd

# Date layouts tried first when parsing date strings (abbreviated and full
# month names). Anything else falls back to pandas' general parser.
_DATE_FORMATS = ('%b %d, %Y', '%B %d, %Y')

# Rating assumed when no opponent rating is available.
_DEFAULT_ELO = 1500.0

# Lay-off assumed for a fighter with no earlier fight (~1.5 years).
_DEFAULT_LAYOFF_DAYS = 547

# Every completed round is counted as five minutes when computing durations.
_ROUND_LENGTH_SEC = 300

# Value used for accuracy/defense rates when there were no attempts at all.
_NEUTRAL_RATE = 0.5


def _clean_numeric_column(series):
    """Convert a column of messy strings into numbers.

    Every character other than a digit or '.' is removed before conversion,
    so unit suffixes and thousands separators disappear. Note that minus
    signs are removed as well. Values that still cannot be parsed become NaN.

    Args:
        series: pandas Series of arbitrary dtype.

    Returns:
        A numeric pandas Series aligned with the input.
    """
    series_str = series.astype(str)
    digits_only = series_str.str.replace(r'[^0-9.]', '', regex=True)
    return pd.to_numeric(digits_only, errors='coerce')


def _parse_date(value):
    """Parse a date string or datetime-like value; return None if unusable."""
    if value is None:
        return None
    if isinstance(value, datetime):
        # pd.Timestamp (and NaT) are datetime subclasses; reject NaT here.
        return None if pd.isna(value) else value
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    # Fall back to pandas for other layouts such as ISO 'YYYY-MM-DD'.
    try:
        parsed = pd.to_datetime(text, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        return None
    return None if pd.isna(parsed) else parsed


def _calculate_age(dob_str, fight_date_str):
    """Calculate age in years on the day of a fight.

    Both arguments may be strings such as 'Jan 5, 1990' or 'January 5, 1990',
    any other layout pandas can parse, or already-parsed datetime objects.

    Args:
        dob_str: Date of birth.
        fight_date_str: Date of the fight.

    Returns:
        Age in fractional years (days / 365.25), or None when either date is
        missing or cannot be parsed.
    """
    dob = _parse_date(dob_str)
    fight_date = _parse_date(fight_date_str)
    if dob is None or fight_date is None:
        return None
    try:
        return (fight_date - dob).days / 365.25
    except TypeError:
        # e.g. mixing timezone-aware and naive values.
        return None


def _get_days_since_last_fight(current_date, past_fights):
    """Calculate days between a fighter's most recent earlier fight and now.

    Only fights strictly before ``current_date`` are considered, and the list
    does not need to be sorted, so passing a fighter's full record (including
    the current or later fights) cannot leak future information.

    Args:
        current_date: Date of the fight being featurized.
        past_fights: Iterable of fight dicts carrying a 'date_obj' entry.

    Returns:
        Number of days as an int, or None if there is no earlier fight.
    """
    if not past_fights:
        return None
    earlier_dates = [
        f['date_obj'] for f in past_fights if f['date_obj'] < current_date
    ]
    if not earlier_dates:
        return None
    return (current_date - max(earlier_dates)).days


def _get_win_streak(fighter_name, current_date, past_fights):
    """Count consecutive wins immediately before ``current_date``.

    Fights are ordered chronologically here, so the caller may pass them in
    any order. Any result other than a win by ``fighter_name`` ends the
    streak.

    Args:
        fighter_name: Name compared against each fight's 'winner'.
        current_date: Only fights strictly before this date are counted.
        past_fights: Iterable of fight dicts with 'date_obj' and 'winner'.

    Returns:
        Length of the current win streak (0 if none).
    """
    earlier = sorted(
        (f for f in (past_fights or []) if f['date_obj'] < current_date),
        key=lambda f: f['date_obj'],
    )
    streak = 0
    for fight in reversed(earlier):
        if fight.get('winner') != fighter_name:
            break
        streak += 1
    return streak


def _to_int_safe(value):
    """Safely convert a value to integer, returning 0 for invalid values.

    None, NaN, empty strings, non-numeric text and infinities all yield 0.
    """
    try:
        if not value or pd.isna(value):
            return 0
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return 0


def _clock_to_seconds(value):
    """Convert an 'M:SS' string to seconds; return None if it is not one."""
    if not isinstance(value, str) or ':' not in value:
        return None
    try:
        mins, secs = (int(part) for part in value.split(':'))
    except ValueError:
        # Wrong number of parts or non-numeric parts (e.g. '--:--').
        return None
    return mins * 60 + secs


def _ratio(numerator, denominator, default=0.0):
    """Return numerator / denominator, or ``default`` when there is no base."""
    return numerator / denominator if denominator > 0 else default


def _get_fighter_history_stats(fighter_name, current_fight_date, past_fights,
                               fighters_df, n_fights=5):
    """Calculate historical performance statistics for a fighter.

    Only fights strictly before ``current_fight_date`` are used. Raw counters
    and per-fight / per-minute rates are computed over the last ``n_fights``
    of those; 'fights_in_last_year' looks at all earlier fights.

    Args:
        fighter_name: Fighter whose record is summarized.
        current_fight_date: Date of the fight being featurized.
        past_fights: Fight dicts with 'date_obj', 'fighter_1', 'fighter_2'
            and optionally 'winner', 'method', 'round', 'round_time' and
            per-corner 'f1_*' / 'f2_*' stat entries.
        fighters_df: DataFrame indexed by fighter name; an 'elo' column is
            used when present.
            NOTE(review): this looks like a single rating per fighter rather
            than the rating at the time of each fight - confirm it does not
            leak future results.
        n_fights: Size of the recent-fight window.

    Returns:
        Dict of raw counters and derived '*_last_n' rates. Rates fall back to
        0.0 (0.5 for accuracy/defense, 1500.0 for opponent rating) when there
        is nothing to average over.
    """
    earlier_fights = sorted(
        (f for f in past_fights if f['date_obj'] < current_fight_date),
        key=lambda f: f['date_obj'],
    )
    # A plain [-0:] slice would return the whole list, hence the guard.
    last_n_fights = earlier_fights[-n_fights:] if n_fights > 0 else []

    stats = {
        'wins_last_n': 0,
        'ko_wins': 0,
        'total_finishes': 0,
        'first_round_finishes': 0,
        'knockdowns_scored': 0,
        'knockdowns_absorbed': 0,
        'sig_str_landed': 0,
        'sig_str_attempted': 0,
        'takedowns_landed': 0,
        'takedowns_attempted': 0,
        'sub_attempts': 0,
        'ctrl_time_sec': 0,
        'total_fight_time_sec': 0,
        'fights_in_last_year': 0,
        'avg_opp_elo_last_n': 0,
        # Opponent-side counters, needed for the absorbed/defense metrics.
        'sig_str_absorbed': 0,
        'opp_sig_str_attempted': 0,
        'opp_takedowns_landed': 0,
        'opp_takedowns_attempted': 0,
    }

    one_year_ago = current_fight_date - pd.Timedelta(days=365)
    stats['fights_in_last_year'] = sum(
        1 for f in earlier_fights if f['date_obj'] >= one_year_ago
    )

    has_elo = 'elo' in fighters_df.columns
    total_opp_elo = 0.0
    rated_opponents = 0

    for fight in last_n_fights:
        is_fighter_1 = fight['fighter_1'] == fighter_name
        own, opp = ('f1', 'f2') if is_fighter_1 else ('f2', 'f1')
        opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
        round_num = _to_int_safe(fight.get('round'))

        # Win/loss and finishes. The method is matched by substring so that
        # values such as 'Decision - Unanimous' are not counted as finishes.
        method = fight.get('method')
        method = method if isinstance(method, str) else ''
        if fight.get('winner') == fighter_name:
            stats['wins_last_n'] += 1
            if method and 'Decision' not in method:
                stats['total_finishes'] += 1
                # Compare numerically: the round may be stored as 1 or '1'.
                if round_num == 1:
                    stats['first_round_finishes'] += 1
            if 'KO' in method:
                stats['ko_wins'] += 1

        # Striking and grappling, own side.
        stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{own}_kd'))
        stats['sig_str_landed'] += _to_int_safe(fight.get(f'{own}_sig_str_landed'))
        stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{own}_sig_str_attempted'))
        stats['takedowns_landed'] += _to_int_safe(fight.get(f'{own}_td_landed'))
        stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{own}_td_attempted'))
        stats['sub_attempts'] += _to_int_safe(fight.get(f'{own}_sub_attempts'))

        # Striking and grappling, opponent side.
        stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp}_kd'))
        stats['sig_str_absorbed'] += _to_int_safe(fight.get(f'{opp}_sig_str_landed'))
        stats['opp_sig_str_attempted'] += _to_int_safe(fight.get(f'{opp}_sig_str_attempted'))
        stats['opp_takedowns_landed'] += _to_int_safe(fight.get(f'{opp}_td_landed'))
        stats['opp_takedowns_attempted'] += _to_int_safe(fight.get(f'{opp}_td_attempted'))

        # Control time.
        ctrl_sec = _clock_to_seconds(fight.get(f'{own}_ctrl_time', '0:00'))
        if ctrl_sec is not None:
            stats['ctrl_time_sec'] += ctrl_sec

        # Fight duration: completed rounds plus time elapsed in the last one.
        # Skipped entirely when the final-round clock is unreadable.
        last_round_sec = _clock_to_seconds(fight.get('round_time', '0:00'))
        if last_round_sec is not None:
            completed_rounds = max(round_num - 1, 0)
            stats['total_fight_time_sec'] += (
                completed_rounds * _ROUND_LENGTH_SEC + last_round_sec
            )

        # Opponent rating: only opponents with a known rating are averaged.
        if has_elo and opponent_name in fighters_df.index:
            opp_elo = fighters_df.loc[opponent_name, 'elo']
            if not pd.isna(opp_elo):
                total_opp_elo += opp_elo
                rated_opponents += 1

    # Per-fight rates.
    n_actual_fights = len(last_n_fights)
    stats['finish_rate_last_n'] = _ratio(stats['total_finishes'], n_actual_fights)
    stats['first_round_finish_rate_last_n'] = _ratio(
        stats['first_round_finishes'], n_actual_fights)
    stats['ko_percent_last_n'] = _ratio(stats['ko_wins'], n_actual_fights)
    stats['avg_knockdowns_per_fight_last_n'] = _ratio(
        stats['knockdowns_scored'], n_actual_fights)
    stats['knockdowns_absorbed_per_fight_last_n'] = _ratio(
        stats['knockdowns_absorbed'], n_actual_fights)
    stats['avg_opp_elo_last_n'] = _ratio(total_opp_elo, rated_opponents, _DEFAULT_ELO)

    # Per-minute rates.
    total_mins = stats['total_fight_time_sec'] / 60
    stats['sig_str_landed_per_min_last_n'] = _ratio(stats['sig_str_landed'], total_mins)
    stats['sig_str_absorbed_per_min_last_n'] = _ratio(stats['sig_str_absorbed'], total_mins)
    stats['sub_attempts_per_min_last_n'] = _ratio(stats['sub_attempts'], total_mins)
    stats['avg_ctrl_time_sec_per_min_last_n'] = _ratio(stats['ctrl_time_sec'], total_mins)

    # Accuracy uses the fighter's own attempts; defense is the share of the
    # opponents' attempts that did NOT land.
    stats['takedown_accuracy_last_n'] = _ratio(
        stats['takedowns_landed'], stats['takedowns_attempted'], _NEUTRAL_RATE)
    if stats['opp_sig_str_attempted'] > 0:
        stats['sig_str_defense_last_n'] = (
            1 - stats['sig_str_absorbed'] / stats['opp_sig_str_attempted'])
    else:
        stats['sig_str_defense_last_n'] = _NEUTRAL_RATE
    if stats['opp_takedowns_attempted'] > 0:
        stats['takedown_defense_last_n'] = (
            1 - stats['opp_takedowns_landed'] / stats['opp_takedowns_attempted'])
    else:
        stats['takedown_defense_last_n'] = _NEUTRAL_RATE

    return stats


def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """Transform fight data into ML-ready features.

    Each fight whose two fighters both appear in the fighters CSV becomes one
    row of "fighter 1 minus fighter 2" difference features. Historical
    features use only that fighter's fights strictly before the fight date.

    Side effect: a parsed 'date_obj' entry is written into the fight dicts
    that get processed or appear in a processed fighter's history.

    Args:
        fights_to_process: Sequence of fight dicts with at least
            'fighter_1', 'fighter_2' and 'event_date'; 'winner' and the
            per-fight stat entries are used when present.
        fighters_csv_path: Path to a CSV with 'first_name' and 'last_name'
            columns and optionally 'height_cm', 'reach_in', 'elo', 'dob'.

    Returns:
        Tuple ``(X, y, metadata)``: X is the feature DataFrame with columns
        in sorted order and NaNs filled with 0; y is 1 where fighter 1 won
        and 0 otherwise (draws and no-contests included); metadata holds
        'date', 'fighter_1', 'fighter_2' and 'winner'. All three are empty
        when no fight could be processed.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    # Load and prepare fighter data. Missing name parts are treated as empty
    # so single-name fighters still get a usable index entry.
    fighters_df = pd.read_csv(fighters_csv_path)
    fighters_df['full_name'] = (
        fighters_df['first_name'].fillna('').astype(str)
        + ' '
        + fighters_df['last_name'].fillna('').astype(str)
    ).str.strip()
    fighters_df = fighters_df[fighters_df['full_name'] != '']
    fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_df.columns:
            fighters_df[col] = _clean_numeric_column(fighters_df[col])

    # Index every fight under both participants once, instead of rescanning
    # the whole fight list for each processed fight.
    fights = list(fights_to_process)
    fights_by_fighter = defaultdict(list)
    for fight in fights:
        for name in {fight['fighter_1'], fight['fighter_2']}:
            fights_by_fighter[name].append(fight)

    processed_fights = []
    for fight in fights:
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        # Skip if either fighter is missing from the attribute table.
        if f1_name not in fighters_df.index or f2_name not in fighters_df.index:
            continue

        f1_stats = fighters_df.loc[f1_name]
        f2_stats = fighters_df.loc[f2_name]

        fight_date = pd.to_datetime(fight['event_date'])
        fight['date_obj'] = fight_date

        # Full records (any order, may include this and later fights); the
        # helpers below restrict themselves to fights before fight_date.
        f1_hist = fights_by_fighter[f1_name]
        f2_hist = fights_by_fighter[f2_name]
        for hist_fight in f1_hist + f2_hist:
            if 'date_obj' not in hist_fight:
                hist_fight['date_obj'] = pd.to_datetime(hist_fight['event_date'])

        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, fighters_df)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, fighters_df)

        # Ages are computed from the parsed date so any event_date layout works.
        f1_age = _calculate_age(f1_stats.get('dob'), fight_date)
        f2_age = _calculate_age(f2_stats.get('dob'), fight_date)
        if f1_age is None or f2_age is None:
            # Unknown rather than "the other fighter's full age"; NaN is
            # filled with 0 below like every other missing difference.
            age_diff = float('nan')
        else:
            age_diff = f1_age - f2_age

        f1_days_since_last = _get_days_since_last_fight(fight_date, f1_hist)
        if f1_days_since_last is None:
            f1_days_since_last = _DEFAULT_LAYOFF_DAYS
        f2_days_since_last = _get_days_since_last_fight(fight_date, f2_hist)
        if f2_days_since_last is None:
            f2_days_since_last = _DEFAULT_LAYOFF_DAYS

        f1_win_streak = _get_win_streak(f1_name, fight_date, f1_hist)
        f2_win_streak = _get_win_streak(f2_name, fight_date, f2_hist)

        def hist_diff(key):
            """Fighter-1 minus fighter-2 value of one history statistic."""
            return f1_hist_stats[key] - f2_hist_stats[key]

        feature_dict = {
            'winner': 1 if fight.get('winner') == f1_name else 0,
            'date': fight['event_date'],
            'fighter_1': f1_name,
            'fighter_2': f2_name,

            # Physical differences
            'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff': age_diff,
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),

            # Career momentum
            'days_since_last_fight_diff': f1_days_since_last - f2_days_since_last,
            'win_streak_diff': f1_win_streak - f2_win_streak,
            'fights_last_year_diff': hist_diff('fights_in_last_year'),

            # Performance differences
            'finish_rate_diff': hist_diff('finish_rate_last_n'),
            'ko_rate_diff': hist_diff('ko_percent_last_n'),
            'sig_str_per_min_diff': hist_diff('sig_str_landed_per_min_last_n'),
            'td_accuracy_diff': hist_diff('takedown_accuracy_last_n'),
            'sub_attempts_per_min_diff': hist_diff('sub_attempts_per_min_last_n'),
            'control_time_diff': hist_diff('avg_ctrl_time_sec_per_min_last_n'),

            # Defense differences
            'sig_str_defense_diff': hist_diff('sig_str_defense_last_n'),
            'td_defense_diff': hist_diff('takedown_defense_last_n'),
            'knockdowns_absorbed_diff': hist_diff('knockdowns_absorbed_per_fight_last_n'),
        }
        processed_fights.append(feature_dict)

    if not processed_fights:
        # Explicit dtype: a dtype-less empty Series warns on some pandas versions.
        return pd.DataFrame(), pd.Series(dtype='int64'), pd.DataFrame()

    df = pd.DataFrame(processed_fights)
    metadata = df[['date', 'fighter_1', 'fighter_2', 'winner']]

    y = df['winner']
    X = df.drop(columns=['winner', 'date', 'fighter_1', 'fighter_2'])
    X = X.reindex(sorted(X.columns), axis=1)  # Ensure consistent column order

    # Missing differences (unknown height, rating, age, ...) count as "no edge".
    X = X.fillna(0)

    return X, y, metadata