Spaces:

AlvaroMros
/

ufc-predictor

Running

File size: 11,954 Bytes

import pandas as pd
import os
import sys
from datetime import datetime

# Use absolute imports to avoid relative import issues
try:
    from src.config import FIGHTERS_CSV_PATH
except ImportError:
    # Fallback for when running directly
    from ..config import FIGHTERS_CSV_PATH

def _clean_numeric_column(series):
    """A helper to clean string columns into numbers, handling errors."""
    series_str = series.astype(str)
    return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')

def _calculate_age(dob_str, fight_date_str):
    """Calculates age in years from a date of birth string and fight date string."""
    if pd.isna(dob_str) or not dob_str:
        return None
    try:
        dob = datetime.strptime(dob_str, '%b %d, %Y')
        fight_date = datetime.strptime(fight_date_str, '%B %d, %Y')
        return (fight_date - dob).days / 365.25
    except (ValueError, TypeError):
        return None

def _parse_round_time_to_seconds(round_str, time_str):
    """Converts fight duration from round and time to total seconds."""
    try:
        rounds = int(round_str)
        minutes, seconds = map(int, time_str.split(':'))
        # Assuming 5-minute rounds for calculation simplicity
        return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
    except (ValueError, TypeError, AttributeError):
        return 0

def _parse_striking_stats(stat_str):
    """Parses striking stats string like '10 of 20' into (landed, attempted)."""
    try:
        landed, attempted = map(int, stat_str.split(' of '))
        return landed, attempted
    except (ValueError, TypeError, AttributeError):
        return 0, 0

def _to_int_safe(val):
    """Safely converts a value to an integer, returning 0 if it's invalid or empty."""
    if pd.isna(val):
        return 0
    try:
        # handle strings with whitespace or empty strings
        return int(str(val).strip() or 0)
    except (ValueError, TypeError):
        return 0

def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
    """
    Calculates performance statistics for a fighter based on their last n fights.

    Args:
        fighter_name (str): Name compared against each fight's 'fighter_1',
            'fighter_2' and 'winner' entries.
        current_fight_date: Date of the fight being evaluated; only fights whose
            'date_obj' is strictly earlier are used.
        fighter_history (list of dict): The fighter's fights. The last n entries
            that pass the date filter are taken, so the list is expected to be
            in chronological order.
        fighters_df (pd.DataFrame): Fighter table indexed by fighter name. Its
            'elo' column, when present, supplies opponent ratings.
        n (int): Number of most recent past fights to aggregate.

    Returns:
        dict: 'wins_last_n', 'avg_opp_elo_last_n', 'ko_percent_last_n',
        'sig_str_landed_per_min_last_n', 'takedown_accuracy_last_n' and
        'sub_attempts_per_min_last_n'. With no usable history every value is 0
        except the opponent ELO, which defaults to 1500.
    """
    past_fights = [f for f in fighter_history if f['date_obj'] < current_fight_date]
    # A plain [-n:] slice returns the *whole* list when n == 0, so guard explicitly.
    last_n_fights = past_fights[-n:] if n > 0 else []

    if not last_n_fights:
        # Return a default dictionary with the correct keys for a fighter with no history
        return {
            'wins_last_n': 0,
            'avg_opp_elo_last_n': 1500,  # Assume average ELO for first opponent
            'ko_percent_last_n': 0,
            'sig_str_landed_per_min_last_n': 0,
            'takedown_accuracy_last_n': 0,
            'sub_attempts_per_min_last_n': 0,
        }

    # The caller treats the 'elo' column as optional, so do the same here
    # instead of raising KeyError on lookup.
    has_elo_column = 'elo' in fighters_df.columns

    wins = 0
    ko_wins = 0
    total_time_secs = 0
    sig_str_landed = 0
    td_landed = 0
    td_attempted = 0
    sub_attempts = 0
    opponent_elos = []

    for fight in last_n_fights:
        is_fighter_1 = (fight['fighter_1'] == fighter_name)
        opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']

        # Per-fighter stat columns are prefixed by the fighter's slot in the fight.
        f_prefix = 'f1' if is_fighter_1 else 'f2'

        if fight['winner'] == fighter_name:
            wins += 1
            # A missing or non-string method (e.g. NaN) simply does not count as a KO.
            method = fight.get('method')
            if isinstance(method, str) and 'KO' in method:
                ko_wins += 1

        if has_elo_column and opponent_name in fighters_df.index:
            opp_elo = fighters_df.loc[opponent_name, 'elo']
            opponent_elos.append(opp_elo if pd.notna(opp_elo) else 1500)

        total_time_secs += _parse_round_time_to_seconds(fight['round'], fight['time'])

        landed, _ = _parse_striking_stats(fight.get(f'{f_prefix}_sig_str', '0 of 0'))
        sig_str_landed += landed

        # Takedown stats share the 'X of Y' format, so the same parser applies.
        fight_td_landed, fight_td_attempted = _parse_striking_stats(
            fight.get(f'{f_prefix}_td', '0 of 0')
        )
        td_landed += fight_td_landed
        td_attempted += fight_td_attempted

        sub_attempts += _to_int_safe(fight.get(f'{f_prefix}_sub_att'))

    # Final calculations; every ratio falls back to 0 when its denominator is 0.
    avg_opp_elo = sum(opponent_elos) / len(opponent_elos) if opponent_elos else 1500
    total_minutes = total_time_secs / 60 if total_time_secs > 0 else 0

    return {
        'wins_last_n': wins,
        'avg_opp_elo_last_n': avg_opp_elo,
        'ko_percent_last_n': (ko_wins / wins) if wins > 0 else 0,
        'sig_str_landed_per_min_last_n': (sig_str_landed / total_minutes) if total_minutes > 0 else 0,
        'takedown_accuracy_last_n': (td_landed / td_attempted) if td_attempted > 0 else 0,
        'sub_attempts_per_min_last_n': (sub_attempts / total_minutes) if total_minutes > 0 else 0,
    }

def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """
    Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
    suitable for a binary classification machine learning model.

    Each usable fight yields two rows: the fighter_1-minus-fighter_2 differences
    labelled 1, and the negated differences labelled 0. Fights where either
    fighter is missing from the fighters CSV are skipped.

    Note: every dict in ``fights_to_process`` is mutated in place; a 'date_obj'
    key holding the parsed event date is added to it.

    Args:
        fights_to_process (list of dict): The list of fights to process.
        fighters_csv_path (str): Path to the CSV file with all fighter stats.

    Returns:
        pd.DataFrame: Feature matrix X.
        pd.Series: Target vector y.
        pd.DataFrame: Metadata DataFrame.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    fighters_df = pd.read_csv(fighters_csv_path)

    # 1. Prepare fighters data for merging
    fighters_prepared = fighters_df.copy()
    # Fill missing name parts before concatenating: NaN + ' ' + 'x' is NaN, which
    # would make fighters with only one name impossible to match.
    first_names = fighters_prepared['first_name'].fillna('').astype(str)
    last_names = fighters_prepared['last_name'].fillna('').astype(str)
    fighters_prepared['full_name'] = (first_names + ' ' + last_names).str.strip()
    # Rows with no name at all cannot be matched to any fight.
    fighters_prepared = fighters_prepared[fighters_prepared['full_name'] != '']

    # Handle duplicate fighter names by keeping the first entry
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_prepared.columns:
            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])

    # 2. Pre-calculate fighter histories to speed up lookups
    # And convert date strings to datetime objects once
    for fight in fights_to_process:
        try:
            # This will work if event_date is a string
            fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
        except TypeError:
            # This will be triggered if it's already a date-like object (e.g., Timestamp)
            fight['date_obj'] = fight['event_date']

    # Group fights per fighter in a single pass instead of rescanning every
    # fight for every fighter.
    known_fighters = set(fighters_prepared.index)
    fighter_histories = {name: [] for name in known_fighters}
    for fight in fights_to_process:
        # A set keeps a fight from being added twice if both slots hold the same name.
        for name in {fight['fighter_1'], fight['fighter_2']}:
            if name in known_fighters:
                fighter_histories[name].append(fight)
    for history in fighter_histories.values():
        history.sort(key=lambda f: f['date_obj'])

    # 3. Process fights to create features and targets
    feature_list = []
    target_list = []
    metadata_list = []

    for fight in fights_to_process:
        # Per the dataset's design, fighter_1 is always the winner.
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        if f1_name not in known_fighters or f2_name not in known_fighters:
            continue

        # The index is unique after drop_duplicates, so .loc returns a single row.
        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]

        # Calculate ages for both fighters
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Get historical stats for both fighters
        f1_hist_stats = _get_fighter_history_stats(
            f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared
        )
        f2_hist_stats = _get_fighter_history_stats(
            f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared
        )

        # --- Create two training examples from each fight for a balanced dataset ---

        # 1. The "Win" case: (fighter_1 - fighter_2)
        features_win = {
            # Original diffs
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            # NOTE(review): NaN != NaN is True, so two fighters whose stance is
            # missing in the CSV are flagged as "different" -- confirm this is intended.
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            # New historical diffs
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            # Grappling features
            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
        }
        feature_list.append(features_win)
        target_list.append(1)  # 1 represents a win

        # 2. The "Loss" case: (fighter_2 - fighter_1)
        # We invert the differences for the losing case.
        features_loss = {key: -value for key, value in features_win.items()}
        # Stance difference is symmetric; it doesn't get inverted.
        features_loss['stance_is_different'] = features_win['stance_is_different']

        feature_list.append(features_loss)
        target_list.append(0)  # 0 represents a loss

        # Add metadata for both generated samples
        # The 'winner' and 'loser' are consistent with the original data structure
        metadata_list.append({
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        })
        metadata_list.append({
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        })

    # Any diff that came out as NaN (e.g. a missing height or ELO) is treated as 0.
    X = pd.DataFrame(feature_list).fillna(0)
    y = pd.Series(target_list, name='winner')
    metadata = pd.DataFrame(metadata_list)

    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
    return X, y, metadata

if __name__ == '__main__':
    # Demo entry point: load the project's fight data and print a summary of
    # the preprocessed training set.
    # Try the absolute package path first; the relative fallback only resolves
    # when this module is executed as part of its package.
    try:
        from src.predict.pipeline import PredictionPipeline
    except ImportError:
        from .pipeline import PredictionPipeline

    print("--- Running Preprocessing Example ---")

    pipeline = PredictionPipeline(models=[])
    try:
        pipeline._load_and_split_data()
        if pipeline.train_fights:
            features, targets, fight_metadata = preprocess_for_ml(
                pipeline.train_fights, FIGHTERS_CSV_PATH
            )

            print("\nTraining Data Shape:")
            shape_report = (
                ("X_train:", features),
                ("y_train:", targets),
                ("metadata_train:", fight_metadata),
            )
            for label, dataset in shape_report:
                print(label, dataset.shape)

            print("\nLast 5 rows of X_train (showing populated historical features):")
            print(features.tail())

            print("\nTarget distribution (0=Loss, 1=Win):")
            print(targets.value_counts())

            print("\nMetadata for last 5 rows:")
            print(fight_metadata.tail())

    except FileNotFoundError as missing_data_error:
        # The fighters CSV (or other scraped data) is not on disk yet.
        print(missing_data_error)
        print("Please run the scraping pipeline first ('python -m src.scrape.main').")