Spaces:

AlvaroMros
/

ufc-predictor

Running

AlvaroMros committed on Aug 2

Commit

7fcaffe

1 Parent(s): 9678fdb

(CAREFUL!!!!!!) Refactor argument parsing and prediction pipeline

Moved command-line argument parsing to a dedicated src/args.py module and updated main.py and predict/main.py to use these functions. Improved model management logic and modularized pipeline execution. Enhanced feature engineering and preprocessing in predict/preprocess.py and refactored model classes in predict/models.py for consistency and maintainability.

Files changed (8) hide show

output/model_results.json +2 -2
output/models/EloBaselineModel.joblib +2 -2
output/models/LogisticRegressionModel.joblib +2 -2
src/args.py +97 -0
src/main.py +65 -92
src/predict/main.py +23 -55
src/predict/models.py +67 -129
src/predict/preprocess.py +196 -175

output/model_results.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bf8df1ba9e26fa98e34bfb1c773e66576cbf89152087c55b70921269c84f39d5
-size 27286

 version https://git-lfs.github.com/spec/v1
+oid sha256:40c2fb9010bdae4946c2b879d4014aa671a43b586aff7faa73ea4846585e589c
+size 11671

output/models/EloBaselineModel.joblib CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bfdc684f791b598fbecfbfe9b14cca3b4d483b3d7368a16faecea31aace3be87
-size 938419

 version https://git-lfs.github.com/spec/v1
+oid sha256:40937e8b6fe9aaaa1ca92a84e3e67b5bdefcf2700d2cafb7830670a14f684858
+size 938435

output/models/LogisticRegressionModel.joblib CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a773552b7f1b166858ab1ff7bdf472e24b293279a8e24871de773b1a3de46e1
-size 5517988

 version https://git-lfs.github.com/spec/v1
+oid sha256:51c11a689c50244a6084e642a1dc35a349d515f075b40515dbd4164e7831dfdb
+size 5518484

src/args.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import argparse
+def get_pipeline_args():
+    """
+    Parse command line arguments for the main UFC data pipeline.
+    Returns:
+        argparse.Namespace: Parsed command line arguments
+    """
+    parser = argparse.ArgumentParser(description="UFC Data Pipeline")
+    # Pipeline selection
+    parser.add_argument(
+        '--pipeline',
+        type=str,
+        default='scrape',
+        choices=['scrape', 'analysis', 'predict', 'update', 'all'],
+        help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
+    )
+    # Scraping arguments
+    scraping_group = parser.add_argument_group('Scraping options')
+    scraping_group.add_argument(
+        '--scrape-mode',
+        type=str,
+        default='full',
+        choices=['full', 'update'],
+        help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
+    )
+    scraping_group.add_argument(
+        '--num-events',
+        type=int,
+        default=5,
+        help="Number of latest events to scrape in update mode (default: 5)"
+    )
+    # Model management arguments
+    model_group = parser.add_argument_group('Model management')
+    model_group.add_argument(
+        '--use-existing-models',
+        action='store_true',
+        default=True,
+        help="Use existing saved models if available and no new data (default: True)"
+    )
+    model_group.add_argument(
+        '--no-use-existing-models',
+        action='store_true',
+        default=False,
+        help="Force retrain all models from scratch, ignoring existing saved models"
+    )
+    model_group.add_argument(
+        '--force-retrain',
+        action='store_true',
+        default=False,
+        help="Force retrain all models even if no new data is available"
+    )
+    return parser.parse_args()
+def get_prediction_args():
+    """
+    Parse command line arguments specific to the prediction pipeline.
+    Returns:
+        argparse.Namespace: Parsed command line arguments
+    """
+    parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
+    parser.add_argument(
+        '--report',
+        type=str,
+        default='detailed',
+        choices=['detailed', 'summary'],
+        help="Type of report to generate: 'detailed' (file) or 'summary' (console)"
+    )
+    model_group = parser.add_argument_group('Model management')
+    model_group.add_argument(
+        '--use-existing-models',
+        action='store_true',
+        default=True,
+        help="Use existing saved models if available and no new data (default: True)"
+    )
+    model_group.add_argument(
+        '--no-use-existing-models',
+        action='store_true',
+        default=False,
+        help="Force retrain all models from scratch, ignoring existing saved models"
+    )
+    model_group.add_argument(
+        '--force-retrain',
+        action='store_true',
+        default=False,
+        help="Force retrain all models even if no new data is available"
+    )
+    return parser.parse_args()

src/main.py CHANGED Viewed

@@ -1,106 +1,79 @@
-import argparse
 import sys
 import os
-def main():
-    """
-    Main entry point for the UFC data pipeline.
-    Supports scraping, analysis, and prediction workflows.
-    """
-    parser = argparse.ArgumentParser(description="UFC Data Pipeline")
-    parser.add_argument(
-        '--pipeline',
-        type=str,
-        default='scrape',
-        choices=['scrape', 'analysis', 'predict', 'update', 'all'],
-        help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
-    )
-    parser.add_argument(
-        '--scrape-mode',
-        type=str,
-        default='full',
-        choices=['full', 'update'],
-        help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
-    )
-    parser.add_argument(
-        '--num-events',
-        type=int,
-        default=5,
-        help="Number of latest events to scrape in update mode (default: 5)"
-    )
-    # Model management arguments for prediction pipeline
-    parser.add_argument(
-        '--use-existing-models',
-        action='store_true',
-        default=True,
-        help="Use existing saved models if available and no new data (default: True)."
-    )
-    parser.add_argument(
-        '--no-use-existing-models',
-        action='store_true',
-        default=False,
-        help="Force retrain all models from scratch, ignoring existing saved models."
-    )
-    parser.add_argument(
-        '--force-retrain',
-        action='store_true',
-        default=False,
-        help="Force retrain all models even if no new data is available."
-    )
-    args = parser.parse_args()
-    if args.pipeline in ['scrape', 'all']:
-        print("=== Running Scraping Pipeline ===")
-        from src.scrape.main import main as scrape_main
-        # Override sys.argv to pass arguments to scrape.main
-        original_argv = sys.argv
-        sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
-        try:
-            scrape_main()
-        finally:
-            sys.argv = original_argv
     if args.pipeline in ['analysis', 'all']:
-        print("\n=== Running ELO Analysis ===")
-        from src.analysis.elo import main as elo_main
-        elo_main()
     if args.pipeline == 'update':
-        print("\n=== Running Model Update Pipeline ===")
-        try:
-            from src.predict.main import MODELS_TO_RUN
-            from src.predict.pipeline import PredictionPipeline
-        except ImportError:
-            print("Fatal: Could not import prediction modules.")
-            print("Please ensure your project structure and python path are correct.")
-            return
-        pipeline = PredictionPipeline(models=MODELS_TO_RUN)
-        pipeline.update_models_if_new_data()
     if args.pipeline in ['predict', 'all']:
-        print("\n=== Running Prediction Pipeline ===")
-        from src.predict.main import main as predict_main
-        # Override sys.argv to pass model management arguments to predict.main
-        original_argv = sys.argv
-        predict_args = ['predict_main']
-        if args.no_use_existing_models:
-            predict_args.append('--no-use-existing-models')
-        elif args.use_existing_models:
-            predict_args.append('--use-existing-models')
-        if args.force_retrain:
-            predict_args.append('--force-retrain')
-        sys.argv = predict_args
-        try:
-            predict_main()
-        finally:
-            sys.argv = original_argv
 if __name__ == '__main__':
     main()

 import sys
 import os
+from .args import get_pipeline_args
+def run_scraping_pipeline(args):
+    """Execute the scraping pipeline with given arguments."""
+    print("=== Running Scraping Pipeline ===")
+    from .scrape.main import main as scrape_main
+    # Pass arguments to scrape.main
+    original_argv = sys.argv
+    sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
+    try:
+        scrape_main()
+    finally:
+        sys.argv = original_argv
+def run_analysis_pipeline():
+    """Execute the ELO analysis pipeline."""
+    print("\n=== Running ELO Analysis ===")
+    from .analysis.elo import main as elo_main
+    elo_main()
+def run_prediction_pipeline(args):
+    """Execute the prediction pipeline with given arguments."""
+    print("\n=== Running Prediction Pipeline ===")
+    from .predict.main import main as predict_main
+    # Pass model management arguments to predict.main
+    original_argv = sys.argv
+    predict_args = ['predict_main']
+    if args.no_use_existing_models:
+        predict_args.append('--no-use-existing-models')
+    elif args.use_existing_models:
+        predict_args.append('--use-existing-models')
+    if args.force_retrain:
+        predict_args.append('--force-retrain')
+    sys.argv = predict_args
+    try:
+        predict_main()
+    finally:
+        sys.argv = original_argv
+def run_model_update(args):
+    """Execute the model update pipeline."""
+    print("\n=== Running Model Update Pipeline ===")
+    try:
+        from .predict.main import MODELS_TO_RUN
+        from .predict.pipeline import PredictionPipeline
+    except ImportError:
+        print("Fatal: Could not import prediction modules.")
+        print("Please ensure your project structure and python path are correct.")
+        return
+    pipeline = PredictionPipeline(models=MODELS_TO_RUN)
+    pipeline.update_models_if_new_data()
+def main():
+    """Main entry point for the UFC data pipeline."""
+    args = get_pipeline_args()
+    # Execute requested pipeline(s)
+    if args.pipeline in ['scrape', 'all']:
+        run_scraping_pipeline(args)
     if args.pipeline in ['analysis', 'all']:
+        run_analysis_pipeline()
     if args.pipeline == 'update':
+        run_model_update(args)
     if args.pipeline in ['predict', 'all']:
+        run_prediction_pipeline(args)
 if __name__ == '__main__':
     main()

src/predict/main.py CHANGED Viewed

@@ -1,5 +1,4 @@
-import argparse
 from .pipeline import PredictionPipeline
 from .models import (
     EloBaselineModel,
@@ -11,56 +10,34 @@ from .models import (
     LGBMModel
 )
-# --- Define Models to Run ---
-# Instantiate all the models you want to evaluate here.
-MODELS_TO_RUN = [
-    EloBaselineModel(),
-    LogisticRegressionModel(),
-    XGBoostModel(),
-    SVCModel(),
-    RandomForestModel(),
-    BernoulliNBModel(),
-    LGBMModel(),
-]
-# --- End of Model Definition ---
 def main():
     """
     Main entry point to run the prediction pipeline.
     You can specify which models to run and the reporting format.
     """
-    parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
-    parser.add_argument(
-        '--report',
-        type=str,
-        default='detailed',
-        choices=['detailed', 'summary'],
-        help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
-    )
-    parser.add_argument(
-        '--use-existing-models',
-        action='store_true',
-        default=True,
-        help="Use existing saved models if available and no new data (default: True)."
-    )
-    parser.add_argument(
-        '--no-use-existing-models',
-        action='store_true',
-        default=False,
-        help="Force retrain all models from scratch, ignoring existing saved models."
-    )
-    parser.add_argument(
-        '--force-retrain',
-        action='store_true',
-        default=False,
-        help="Force retrain all models even if no new data is available."
-    )
-    args = parser.parse_args()
     # Handle conflicting arguments
     use_existing_models = not args.no_use_existing_models and args.use_existing_models
     force_retrain = args.force_retrain
     if args.no_use_existing_models:
         print("No-use-existing-models flag set: All models will be retrained from scratch.")
     elif force_retrain:
@@ -68,21 +45,9 @@ def main():
     elif use_existing_models:
         print("Using existing models if available and no new data detected.")
-    # --- Define Models to Run ---
-    # Instantiate all the models you want to evaluate here.
-    models_to_run = [
-        EloBaselineModel(),
-        LogisticRegressionModel(),
-        XGBoostModel(),
-        SVCModel(),
-        RandomForestModel(),
-        BernoulliNBModel(),
-        LGBMModel(),
-    ]
-    # --- End of Model Definition ---
     pipeline = PredictionPipeline(
-        models=MODELS_TO_RUN,
         use_existing_models=use_existing_models,
         force_retrain=force_retrain
     )
@@ -92,3 +57,6 @@ def main():
     except FileNotFoundError as e:
         print(f"Error: {e}")
         print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")

+from ..args import get_prediction_args
 from .pipeline import PredictionPipeline
 from .models import (
     EloBaselineModel,
     LGBMModel
 )
+def get_available_models():
+    """Get a list of all available prediction models.
+    Returns:
+        list: List of instantiated model objects
+    """
+    return [
+        EloBaselineModel(),
+        LogisticRegressionModel(),
+        # XGBoostModel(),
+        # SVCModel(),
+        # RandomForestModel(),
+        # BernoulliNBModel(),
+        LGBMModel(),
+    ]
 def main():
     """
     Main entry point to run the prediction pipeline.
     You can specify which models to run and the reporting format.
     """
+    args = get_prediction_args()
     # Handle conflicting arguments
     use_existing_models = not args.no_use_existing_models and args.use_existing_models
     force_retrain = args.force_retrain
+    # Log model management settings
     if args.no_use_existing_models:
         print("No-use-existing-models flag set: All models will be retrained from scratch.")
     elif force_retrain:
     elif use_existing_models:
         print("Using existing models if available and no new data detected.")
+    # Initialize and run prediction pipeline
     pipeline = PredictionPipeline(
+        models=get_available_models(),
         use_existing_models=use_existing_models,
         force_retrain=force_retrain
     )
     except FileNotFoundError as e:
         print(f"Error: {e}")
         print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        raise

src/predict/models.py CHANGED Viewed

@@ -1,6 +1,4 @@
 from abc import ABC, abstractmethod
-import sys
-import os
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC
@@ -10,188 +8,128 @@ from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
 from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 from ..config import FIGHTERS_CSV_PATH
-from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
 class BaseModel(ABC):
-    """
-    Abstract base class for all prediction models.
-    Ensures that every model has a standard interface for training and prediction.
-    """
     @abstractmethod
     def train(self, train_fights):
-        """
-        Trains or prepares the model using historical fight data.
-        :param train_fights: A list of historical fight data dictionaries.
-        """
         pass
     @abstractmethod
     def predict(self, fight):
-        """
-        Predicts the winner of a single fight.
-        :param fight: A dictionary representing a single fight.
-        :return: The name of the predicted winning fighter.
-        """
         pass
-class EloBaselineModel(BaseModel):
-    """
-    A baseline prediction model that predicts the winner based on the higher ELO rating.
-    """
-    def __init__(self):
-        self.fighters_df = None
     def train(self, train_fights):
-        """
-        For the ELO baseline, 'training' simply consists of loading the fighter data
-        to access their ELO scores during prediction.
-        """
-        print("Training EloBaselineModel: Loading fighter ELO data...")
         self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
         self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
         self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
     def predict(self, fight):
-        """Predicts the winner based on ELO and calculates win probability."""
         f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
         try:
             f1_elo = self.fighters_df.loc[f1_name, 'elo']
             f2_elo = self.fighters_df.loc[f2_name, 'elo']
-            # Calculate win probability for fighter 1 using the ELO formula
             prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
-            if prob_f1_wins >= 0.5:
-                return {'winner': f1_name, 'probability': prob_f1_wins}
-            else:
-                return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
         except KeyError as e:
             print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
-            return {'winner': None, 'probability': None}
 class BaseMLModel(BaseModel):
-    """
-    An abstract base class for machine learning models that handles all common
-    data preparation, training, and prediction logic.
-    """
     def __init__(self, model):
         if model is None:
             raise ValueError("A model must be provided.")
         self.model = model
-        self.fighters_df = None
-        self.fighter_histories = {}
     def train(self, train_fights):
-        """
-        Trains the machine learning model. This involves loading fighter data,
-        pre-calculating histories, and fitting the model on the preprocessed data.
-        """
-        print(f"--- Training {self.model.__class__.__name__} ---")
-        # 1. Prepare data for prediction-time feature generation
-        self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
-        self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
-        self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
-        for col in ['height_cm', 'reach_in', 'elo']:
-            if col in self.fighters_df.columns:
-                self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
-        # 2. Pre-calculate fighter histories
-        train_fights_with_dates = []
-        for fight in train_fights:
-            fight['date_obj'] = pd.to_datetime(fight['event_date'])
-            train_fights_with_dates.append(fight)
-        for fighter_name in self.fighters_df.index:
-            history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
-            self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
-        # 3. Preprocess and fit
         X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
         print(f"Fitting model on {X_train.shape[0]} samples...")
         self.model.fit(X_train, y_train)
         print("Model training complete.")
     def predict(self, fight):
-        """
-        Predicts the outcome of a single fight, returning the winner and probability.
-        """
-        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
-        fight_date = pd.to_datetime(fight['event_date'])
-        if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
-            print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
-            return {'winner': None, 'probability': None}
-        f1_stats = self.fighters_df.loc[f1_name]
-        f2_stats = self.fighters_df.loc[f2_name]
-        if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
-        if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
-        f1_hist = self.fighter_histories.get(f1_name, [])
-        f2_hist = self.fighter_histories.get(f2_name, [])
-        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
-        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
-        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
-        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
-        features = {
-            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
-            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
-            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
-            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
-            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
-            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
-            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
-            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
-            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
-            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
-            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
-        }
-        feature_vector = pd.DataFrame([features]).fillna(0)
-        # Use predict_proba to get probabilities for each class
-        probabilities = self.model.predict_proba(feature_vector)[0]
-        prob_f1_wins = probabilities[1]  # Probability of class '1' (fighter 1 wins)
-        if prob_f1_wins >= 0.5:
-            return {'winner': f1_name, 'probability': prob_f1_wins}
-        else:
-            return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
 class LogisticRegressionModel(BaseMLModel):
-    """A thin wrapper for scikit-learn's LogisticRegression."""
     def __init__(self):
-        super().__init__(model=LogisticRegression())
-class XGBoostModel(BaseMLModel):
-    """A thin wrapper for XGBoost's XGBClassifier."""
-    def __init__(self):
-        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
-        super().__init__(model=model)
 class SVCModel(BaseMLModel):
-    """A thin wrapper for scikit-learn's Support Vector Classifier."""
     def __init__(self):
-        # Probability=True is needed for some reports, though it slows down training
-        super().__init__(model=SVC(probability=True, random_state=42))
 class RandomForestModel(BaseMLModel):
-    """A thin wrapper for scikit-learn's RandomForestClassifier."""
     def __init__(self):
-        super().__init__(model=RandomForestClassifier(random_state=42))
 class BernoulliNBModel(BaseMLModel):
-    """A thin wrapper for scikit-learn's Bernoulli Naive Bayes classifier."""
     def __init__(self):
-        super().__init__(model=BernoulliNB())
 class LGBMModel(BaseMLModel):
-    """A thin wrapper for LightGBM's LGBMClassifier."""
     def __init__(self):
-        super().__init__(model=LGBMClassifier(random_state=42))

 from abc import ABC, abstractmethod
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC
 from lightgbm import LGBMClassifier
 from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 from ..config import FIGHTERS_CSV_PATH
+from .preprocess import preprocess_for_ml
 class BaseModel(ABC):
+    """Abstract base class for all prediction models."""
+    def __init__(self):
+        self.model_name = self.__class__.__name__
     @abstractmethod
     def train(self, train_fights):
+        """Train the model using historical fight data."""
         pass
     @abstractmethod
     def predict(self, fight):
+        """Predict the winner of a single fight."""
         pass
+    def _format_prediction(self, winner, probability):
+        """Format prediction results consistently."""
+        return {'winner': winner, 'probability': probability}
+class EloBaselineModel(BaseModel):
+    """Simple ELO-based prediction model."""
     def train(self, train_fights):
+        """Process historical fights to calculate current ELO ratings."""
+        print(f"--- Training {self.model_name} ---")
+        # Load and prepare fighter data
         self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
         self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
         self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
+        # Calculate ELO ratings
+        elo_ratings = process_fights_for_elo(train_fights)
+        self.fighters_df['elo'] = pd.Series(elo_ratings)
+        self.fighters_df['elo'] = self.fighters_df['elo'].fillna(INITIAL_ELO)
+        print("ELO ratings calculated for all fighters.")
     def predict(self, fight):
+        """Predict winner based on current ELO ratings."""
         f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
         try:
             f1_elo = self.fighters_df.loc[f1_name, 'elo']
             f2_elo = self.fighters_df.loc[f2_name, 'elo']
+            # Calculate win probability using ELO formula
             prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
+            winner = f1_name if prob_f1_wins >= 0.5 else f2_name
+            probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
+            return self._format_prediction(winner, probability)
         except KeyError as e:
             print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
+            return self._format_prediction(None, None)
 class BaseMLModel(BaseModel):
+    """Base class for all machine learning models."""
     def __init__(self, model):
+        super().__init__()
         if model is None:
             raise ValueError("A model must be provided.")
         self.model = model
     def train(self, train_fights):
+        """Train the ML model on preprocessed fight data."""
+        print(f"--- Training {self.model_name} ---")
+        # Preprocess data and fit model
         X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
         print(f"Fitting model on {X_train.shape[0]} samples...")
         self.model.fit(X_train, y_train)
         print("Model training complete.")
     def predict(self, fight):
+        """Predict fight outcome using the trained ML model."""
+        # Preprocess single fight for prediction
+        X_pred, _, metadata = preprocess_for_ml([fight], FIGHTERS_CSV_PATH)
+        if X_pred.empty:
+            print(f"Warning: Could not process fight data for {fight['fighter_1']} vs {fight['fighter_2']}")
+            return self._format_prediction(None, None)
+        # Make prediction
+        try:
+            prob_f1_wins = self.model.predict_proba(X_pred)[0][1]
+            winner = fight['fighter_1'] if prob_f1_wins >= 0.5 else fight['fighter_2']
+            probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
+            return self._format_prediction(winner, probability)
+        except Exception as e:
+            print(f"Error making prediction: {e}")
+            return self._format_prediction(None, None)
+# Concrete ML model implementations
 class LogisticRegressionModel(BaseMLModel):
     def __init__(self):
+        super().__init__(LogisticRegression(random_state=42))
 class SVCModel(BaseMLModel):
     def __init__(self):
+        super().__init__(SVC(probability=True, random_state=42))
 class RandomForestModel(BaseMLModel):
     def __init__(self):
+        super().__init__(RandomForestClassifier(n_estimators=100, random_state=42))
 class BernoulliNBModel(BaseMLModel):
     def __init__(self):
+        super().__init__(BernoulliNB())
+class XGBoostModel(BaseMLModel):
+    def __init__(self):
+        super().__init__(XGBClassifier(random_state=42))
 class LGBMModel(BaseMLModel):
     def __init__(self):
+        super().__init__(LGBMClassifier(random_state=42))

src/predict/preprocess.py CHANGED Viewed

@@ -1,15 +1,14 @@
 import pandas as pd
 import os
 from datetime import datetime
-from ..config import FIGHTERS_CSV_PATH
 def _clean_numeric_column(series):
-    """A helper to clean string columns into numbers, handling errors."""
     series_str = series.astype(str)
     return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
 def _calculate_age(dob_str, fight_date_str):
-    """Calculates age in years from a date of birth string and fight date string."""
     if pd.isna(dob_str) or not dob_str:
         return None
     try:
@@ -19,213 +18,235 @@ def _calculate_age(dob_str, fight_date_str):
     except (ValueError, TypeError):
         return None
-def _parse_round_time_to_seconds(round_str, time_str):
-    """Converts fight duration from round and time to total seconds."""
-    try:
-        rounds = int(round_str)
-        minutes, seconds = map(int, time_str.split(':'))
-        # Assuming 5-minute rounds for calculation simplicity
-        return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
-    except (ValueError, TypeError, AttributeError):
-        return 0
-def _parse_striking_stats(stat_str):
-    """Parses striking stats string like '10 of 20' into (landed, attempted)."""
-    try:
-        landed, attempted = map(int, stat_str.split(' of '))
-        return landed, attempted
-    except (ValueError, TypeError, AttributeError):
-        return 0, 0
-def _to_int_safe(val):
-    """Safely converts a value to an integer, returning 0 if it's invalid or empty."""
-    if pd.isna(val):
-        return 0
     try:
-        # handle strings with whitespace or empty strings
-        return int(str(val).strip() or 0)
     except (ValueError, TypeError):
         return 0
-def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
-    """
-    Calculates performance statistics for a fighter based on their last n fights.
-    """
-    past_fights = [f for f in fighter_history if f['date_obj'] < current_fight_date]
-    last_n_fights = past_fights[-n:]
-    if not last_n_fights:
-        # Return a default dictionary with the correct keys for a fighter with no history
-        return {
-            'wins_last_n': 0,
-            'avg_opp_elo_last_n': 1500, # Assume average ELO for first opponent
-            'ko_percent_last_n': 0,
-            'sig_str_landed_per_min_last_n': 0,
-            'takedown_accuracy_last_n': 0,
-            'sub_attempts_per_min_last_n': 0,
-        }
     stats = {
-        'wins': 0, 'ko_wins': 0, 'total_time_secs': 0,
-        'sig_str_landed': 0, 'opponent_elos': [],
-        'td_landed': 0, 'td_attempted': 0, 'sub_attempts': 0
     }
     for fight in last_n_fights:
         is_fighter_1 = (fight['fighter_1'] == fighter_name)
         opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
-        f_prefix = 'f1' if is_fighter_1 else 'f2'
         if fight['winner'] == fighter_name:
-            stats['wins'] += 1
             if 'KO' in fight['method']:
                 stats['ko_wins'] += 1
-        if opponent_name in fighters_df.index:
-            opp_elo = fighters_df.loc[opponent_name, 'elo']
-            stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else 1500)
-        stats['total_time_secs'] += _parse_round_time_to_seconds(fight['round'], fight['time'])
-        sig_str_stat = fight.get(f'{f_prefix}_sig_str', '0 of 0')
-        landed, _ = _parse_striking_stats(sig_str_stat)
-        stats['sig_str_landed'] += landed
-        td_stat = fight.get(f'{f_prefix}_td', '0 of 0')
-        td_landed, td_attempted = _parse_striking_stats(td_stat) # Can reuse this parser
-        stats['td_landed'] += td_landed
-        stats['td_attempted'] += td_attempted
-        stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_att'))
-    # Final calculations
-    avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
-    total_minutes = stats['total_time_secs'] / 60 if stats['total_time_secs'] > 0 else 0
-    return {
-        'wins_last_n': stats['wins'],
-        'avg_opp_elo_last_n': avg_opp_elo,
-        'ko_percent_last_n': (stats['ko_wins'] / stats['wins']) if stats['wins'] > 0 else 0,
-        'sig_str_landed_per_min_last_n': (stats['sig_str_landed'] / total_minutes) if total_minutes > 0 else 0,
-        'takedown_accuracy_last_n': (stats['td_landed'] / stats['td_attempted']) if stats['td_attempted'] > 0 else 0,
-        'sub_attempts_per_min_last_n': (stats['sub_attempts'] / total_minutes) if total_minutes > 0 else 0,
-    }
 def preprocess_for_ml(fights_to_process, fighters_csv_path):
-    """
-    Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
-    suitable for a binary classification machine learning model.
-    Args:
-        fights_to_process (list of dict): The list of fights to process.
-        fighters_csv_path (str): Path to the CSV file with all fighter stats.
-    Returns:
-        pd.DataFrame: Feature matrix X.
-        pd.Series: Target vector y.
-        pd.DataFrame: Metadata DataFrame.
-    """
     if not os.path.exists(fighters_csv_path):
         raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
     fighters_df = pd.read_csv(fighters_csv_path)
-    # 1. Prepare fighters data for merging
-    fighters_prepared = fighters_df.copy()
-    fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']
-    # Handle duplicate fighter names by keeping the first entry
-    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
-    fighters_prepared = fighters_prepared.set_index('full_name')
     for col in ['height_cm', 'reach_in', 'elo']:
-        if col in fighters_prepared.columns:
-            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])
-    # 2. Pre-calculate fighter histories to speed up lookups
-    # And convert date strings to datetime objects once
-    for fight in fights_to_process:
-        try:
-            # This will work if event_date is a string
-            fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
-        except TypeError:
-            # This will be triggered if it's already a date-like object (e.g., Timestamp)
-            fight['date_obj'] = fight['event_date']
-    fighter_histories = {}
-    for fighter_name in fighters_prepared.index:
-        history = [f for f in fights_to_process if fighter_name in (f['fighter_1'], f['fighter_2'])]
-        fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
-    # 3. Process fights to create features and targets
-    feature_list = []
-    target_list = []
-    metadata_list = []
     for fight in fights_to_process:
-        # Per the dataset's design, fighter_1 is always the winner.
         f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
-        if f1_name not in fighters_prepared.index or f2_name not in fighters_prepared.index:
             continue
-        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]
-        if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
-        if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
-        # Calculate ages for both fighters
         f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
         f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
-        # Get historical stats for both fighters
-        f1_hist_stats = _get_fighter_history_stats(f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared)
-        f2_hist_stats = _get_fighter_history_stats(f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared)
-        # --- Create two training examples from each fight for a balanced dataset ---
-        # 1. The "Win" case: (fighter_1 - fighter_2)
-        features_win = {
-            # Original diffs
             'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
-            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
-            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
-            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
-            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
-            # New historical diffs
-            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
-            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
-            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
-            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
-            # Grappling features
-            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
-            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
         }
-        feature_list.append(features_win)
-        target_list.append(1)  # 1 represents a win
-        # 2. The "Loss" case: (fighter_2 - fighter_1)
-        # We invert the differences for the losing case.
-        features_loss = {key: -value for key, value in features_win.items()}
-        # Stance difference is symmetric; it doesn't get inverted.
-        features_loss['stance_is_different'] = features_win['stance_is_different']
-        feature_list.append(features_loss)
-        target_list.append(0)  # 0 represents a loss
-        # Add metadata for both generated samples
-        # The 'winner' and 'loser' are consistent with the original data structure
-        metadata_list.append({
-            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
-        })
-        metadata_list.append({
-            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
-        })
-    X = pd.DataFrame(feature_list).fillna(0)
-    y = pd.Series(target_list, name='winner')
-    metadata = pd.DataFrame(metadata_list)
-    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
     return X, y, metadata

 import pandas as pd
 import os
 from datetime import datetime
 def _clean_numeric_column(series):
+    """Coerce a column to numbers after stripping non-numeric characters.
+
+    Every value is stringified, every character other than a digit or ``.``
+    is removed, and whatever still cannot be parsed becomes NaN.
+    """
+    as_text = series.astype(str)
+    digits_only = as_text.str.replace(r'[^0-9.]', '', regex=True)
+    return pd.to_numeric(digits_only, errors='coerce')
 def _calculate_age(dob_str, fight_date_str):
+    """Calculate age in years from date of birth and fight date strings."""
     if pd.isna(dob_str) or not dob_str:
         return None
     try:
     except (ValueError, TypeError):
         return None
+def _get_days_since_last_fight(current_date, past_fights):
+    """Return the days elapsed between a fighter's previous bout and ``current_date``.
+
+    Args:
+        current_date: Date of the fight being evaluated.
+        past_fights: Fight dicts carrying a ``date_obj`` entry. The list may be
+            in any order and may contain the current or later fights; only
+            fights dated strictly before ``current_date`` are considered.
+
+    Returns:
+        int | None: Days since the most recent earlier fight, or ``None`` when
+        there is no earlier fight.
+    """
+    # Do not trust list order or contents: blindly taking the last element can
+    # pick the current bout (0 days) or a future one (negative days).
+    earlier_dates = [
+        fight['date_obj'] for fight in past_fights
+        if fight['date_obj'] < current_date
+    ]
+    if not earlier_dates:
+        return None
+    return (current_date - max(earlier_dates)).days
+def _get_win_streak(fighter_name, current_date, past_fights):
+    """Count the fighter's consecutive wins immediately before ``current_date``.
+
+    Args:
+        fighter_name: Name compared against each fight's ``winner`` value.
+        current_date: Cut-off; fights on or after this date are ignored.
+        past_fights: Fight dicts with ``date_obj`` and ``winner`` entries, in
+            any order.
+
+    Returns:
+        int: Number of wins in a row, walking back from the most recent earlier
+        fight until the first fight the fighter did not win.
+    """
+    # Sort explicitly so the walk is newest-to-oldest even when the caller
+    # hands over an unsorted history. The sort is stable, so an already
+    # chronological list is traversed exactly as before.
+    chronological = sorted(
+        (fight for fight in past_fights if fight['date_obj'] < current_date),
+        key=lambda fight: fight['date_obj'],
+    )
+    streak = 0
+    for fight in reversed(chronological):
+        if fight['winner'] != fighter_name:
+            break
+        streak += 1
+    return streak
+def _to_int_safe(value):
+    """Convert ``value`` to ``int``, returning 0 whenever that is not possible.
+
+    Falsy values (``None``, ``''``, ``0``), NaN, non-numeric strings and
+    infinities all map to 0. Numeric strings and floats are truncated toward
+    zero (``'3.9'`` -> ``3``).
+    """
     try:
+        return int(float(value)) if value and not pd.isna(value) else 0
+    except (ValueError, TypeError, OverflowError):
+        # OverflowError: int(float('inf')) - without it an infinite value (or
+        # the string 'inf') would escape this "safe" helper as an exception.
         return 0
+def _get_fighter_history_stats(fighter_name, current_fight_date, past_fights, fighters_df, n_fights=5):
+    """Summarise a fighter's recent form from bouts held before ``current_fight_date``.
+
+    Args:
+        fighter_name: Name matched against each fight's ``fighter_1`` /
+            ``fighter_2`` / ``winner`` values.
+        current_fight_date: Cut-off; only fights whose ``date_obj`` is strictly
+            earlier are considered.
+        past_fights: Fight dicts involving the fighter (any order, later fights
+            allowed). Each must carry ``date_obj``.
+        fighters_df: Fighter table indexed by full name with an ``elo`` column.
+        n_fights: Size of the "last N" window behind every ``*_last_n`` value.
+
+    Returns:
+        dict: Raw totals over the window, ``fights_in_last_year`` (counted over
+        all earlier fights, not only the window) and the derived rates. Rates
+        fall back to 0.0, accuracy/defence ratios to 0.5 and the average
+        opponent ELO to 1500.0 when there is nothing to divide by.
+    """
+    # Keep only earlier fights, oldest first, then take the trailing window.
+    past_fights = sorted(
+        (f for f in past_fights if f['date_obj'] < current_fight_date),
+        key=lambda x: x['date_obj'],
+    )
+    # Guard n_fights <= 0: a bare [-0:] slice would return the whole history.
+    last_n_fights = past_fights[-n_fights:] if n_fights > 0 else []
     stats = {
+        'wins_last_n': 0,
+        'ko_wins': 0,
+        'total_finishes': 0,
+        'first_round_finishes': 0,
+        'knockdowns_scored': 0,
+        'knockdowns_absorbed': 0,
+        'sig_str_landed': 0,
+        'sig_str_attempted': 0,
+        'takedowns_landed': 0,
+        'takedowns_attempted': 0,
+        'sub_attempts': 0,
+        'ctrl_time_sec': 0,
+        'total_fight_time_sec': 0,
+        'fights_in_last_year': 0,
+        'avg_opp_elo_last_n': 0
     }
+    # Activity over the trailing 365 days (all earlier fights, not just the window)
+    one_year_ago = current_fight_date - pd.Timedelta(days=365)
+    stats['fights_in_last_year'] = sum(1 for f in past_fights if f['date_obj'] >= one_year_ago)
+    # Opponent-side totals stay local so the returned dict keeps its key set.
+    total_opp_elo = 0
+    opp_elo_count = 0
+    opp_sig_str_landed = 0
+    opp_sig_str_attempted = 0
+    opp_td_landed = 0
+    opp_td_attempted = 0
     for fight in last_n_fights:
         is_fighter_1 = (fight['fighter_1'] == fighter_name)
+        f_prefix = 'f1' if is_fighter_1 else 'f2'
+        opp_prefix = 'f2' if is_fighter_1 else 'f1'
         opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
+        round_num = _to_int_safe(fight['round'])
+        # Win/Loss and Finishes
         if fight['winner'] == fighter_name:
+            stats['wins_last_n'] += 1
+            # Substring test (like the KO check below) so variants such as
+            # "Decision - Unanimous" are not counted as finishes.
+            if 'Decision' not in fight['method']:
+                stats['total_finishes'] += 1
+                # Compare numerically: the round may be stored as 1 or '1'.
+                if round_num == 1:
+                    stats['first_round_finishes'] += 1
             if 'KO' in fight['method']:
                 stats['ko_wins'] += 1
+        # Striking and Grappling Stats (fighter's own output)
+        stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{f_prefix}_kd'))
+        stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp_prefix}_kd'))
+        stats['sig_str_landed'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_landed'))
+        stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_attempted'))
+        stats['takedowns_landed'] += _to_int_safe(fight.get(f'{f_prefix}_td_landed'))
+        stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_td_attempted'))
+        stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_attempts'))
+        # Opponent output, needed for the "absorbed" and "defense" figures
+        opp_sig_str_landed += _to_int_safe(fight.get(f'{opp_prefix}_sig_str_landed'))
+        opp_sig_str_attempted += _to_int_safe(fight.get(f'{opp_prefix}_sig_str_attempted'))
+        opp_td_landed += _to_int_safe(fight.get(f'{opp_prefix}_td_landed'))
+        opp_td_attempted += _to_int_safe(fight.get(f'{opp_prefix}_td_attempted'))
+        # Control Time
+        ctrl_secs = _mmss_to_seconds(fight.get(f'{f_prefix}_ctrl_time', '0:00'))
+        if ctrl_secs is not None:
+            stats['ctrl_time_sec'] += ctrl_secs
+        # Fight Duration: completed 5-minute rounds plus the clock in the last one.
+        # NOTE(review): the pre-refactor code read the clock from fight['time'];
+        # confirm the data now provides 'round_time', otherwise the duration is
+        # 0 and every per-minute rate below collapses to 0.
+        elapsed_secs = _mmss_to_seconds(fight.get('round_time', '0:00'))
+        if elapsed_secs is not None:
+            # max() keeps an unparseable round (-> 0) from subtracting 5 minutes.
+            stats['total_fight_time_sec'] += max(round_num - 1, 0) * 300 + elapsed_secs
+        # Opponent ELO
         if opponent_name in fighters_df.index:
             opp_elo = fighters_df.loc[opponent_name, 'elo']
+            if not pd.isna(opp_elo):
+                total_opp_elo += opp_elo
+                opp_elo_count += 1
+    # Per-fight rates
+    n_actual_fights = len(last_n_fights)
+    stats['finish_rate_last_n'] = stats['total_finishes'] / n_actual_fights if n_actual_fights > 0 else 0.0
+    stats['first_round_finish_rate_last_n'] = stats['first_round_finishes'] / n_actual_fights if n_actual_fights > 0 else 0.0
+    stats['ko_percent_last_n'] = stats['ko_wins'] / n_actual_fights if n_actual_fights > 0 else 0.0
+    stats['avg_knockdowns_per_fight_last_n'] = stats['knockdowns_scored'] / n_actual_fights if n_actual_fights > 0 else 0.0
+    stats['knockdowns_absorbed_per_fight_last_n'] = stats['knockdowns_absorbed'] / n_actual_fights if n_actual_fights > 0 else 0.0
+    # Average only over opponents whose ELO is known; dividing by every fight
+    # would count unknown opponents as ELO 0 and drag the mean down.
+    stats['avg_opp_elo_last_n'] = total_opp_elo / opp_elo_count if opp_elo_count > 0 else 1500.0
+    # Per-minute stats
+    total_mins = stats['total_fight_time_sec'] / 60
+    stats['sig_str_landed_per_min_last_n'] = stats['sig_str_landed'] / total_mins if total_mins > 0 else 0.0
+    # "Absorbed" is what the opponent landed, not what the fighter attempted.
+    stats['sig_str_absorbed_per_min_last_n'] = opp_sig_str_landed / total_mins if total_mins > 0 else 0.0
+    stats['sub_attempts_per_min_last_n'] = stats['sub_attempts'] / total_mins if total_mins > 0 else 0.0
+    stats['avg_ctrl_time_sec_per_min_last_n'] = stats['ctrl_time_sec'] / total_mins if total_mins > 0 else 0.0
+    # Accuracy and defence: defence is the share of the opponent's attempts that failed.
+    stats['sig_str_defense_last_n'] = 1 - (opp_sig_str_landed / opp_sig_str_attempted) if opp_sig_str_attempted > 0 else 0.5
+    stats['takedown_accuracy_last_n'] = stats['takedowns_landed'] / stats['takedowns_attempted'] if stats['takedowns_attempted'] > 0 else 0.5
+    stats['takedown_defense_last_n'] = 1 - (opp_td_landed / opp_td_attempted) if opp_td_attempted > 0 else 0.5
+    return stats
+def _mmss_to_seconds(clock):
+    """Convert an ``'M:SS'`` clock string to seconds; return None if it cannot be parsed."""
+    if not isinstance(clock, str) or ':' not in clock:
+        return None
+    try:
+        mins, secs = map(int, clock.split(':'))
+    except ValueError:
+        # e.g. '--:--' or 'H:MM:SS' - treat as missing rather than crash.
+        return None
+    return mins * 60 + secs
 def preprocess_for_ml(fights_to_process, fighters_csv_path):
+    """Transform fight records into an ML-ready feature matrix.
+
+    Args:
+        fights_to_process: List of fight dicts. Each needs ``fighter_1``,
+            ``fighter_2`` and ``event_date``; a ``date_obj`` entry is added to
+            the dicts as a side effect.
+        fighters_csv_path: Path to the fighters CSV (``first_name``,
+            ``last_name`` and optionally ``height_cm``, ``reach_in``, ``elo``,
+            ``dob``).
+
+    Returns:
+        tuple: ``(X, y, metadata)``. ``X`` holds fighter_1-minus-fighter_2
+        feature differences with alphabetically sorted columns and NaNs
+        replaced by 0, ``y`` is 1 when fighter_1 won and 0 otherwise, and
+        ``metadata`` holds date, fighter names and winner flag per row. All
+        three are empty when no fight has both fighters in the CSV.
+
+    Raises:
+        FileNotFoundError: If ``fighters_csv_path`` does not exist.
+    """
     if not os.path.exists(fighters_csv_path):
         raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
+    # Load and prepare fighter data
     fighters_df = pd.read_csv(fighters_csv_path)
+    fighters_df['full_name'] = fighters_df['first_name'] + ' ' + fighters_df['last_name']
+    fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
     for col in ['height_cm', 'reach_in', 'elo']:
+        if col in fighters_df.columns:
+            fighters_df[col] = _clean_numeric_column(fighters_df[col])
+    # Index fights by participant once instead of rescanning the whole list
+    # for every fight (which was quadratic in the number of fights).
+    fights_by_fighter = {}
+    for fight in fights_to_process:
+        for name in {fight['fighter_1'], fight['fighter_2']}:
+            fights_by_fighter.setdefault(name, []).append(fight)
+    # Process fights and calculate features
+    processed_fights = []
     for fight in fights_to_process:
         f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
+        # Skip if either fighter is missing
+        if f1_name not in fighters_df.index or f2_name not in fighters_df.index:
             continue
+        # Get fighter stats
+        f1_stats = fighters_df.loc[f1_name]
+        f2_stats = fighters_df.loc[f2_name]
+        # Calculate fight date and ensure date_obj is available
+        fight_date = pd.to_datetime(fight['event_date'])
+        fight['date_obj'] = fight_date
+        # Only strictly earlier fights, oldest first: the history helpers must
+        # never see the current bout or later ones, and rely on this order.
+        f1_prior = _fights_before(fights_by_fighter[f1_name], fight_date)
+        f2_prior = _fights_before(fights_by_fighter[f2_name], fight_date)
+        # Calculate historical stats
+        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_prior, fighters_df)
+        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_prior, fighters_df)
+        # Calculate ages
         f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
         f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
+        # A missing age must not become a ~30-year "difference".
+        ages_known = f1_age is not None and f2_age is not None
+        # Calculate days since last fight; 547 (~1.5 years) stands in for "no previous fight"
+        f1_days = _get_days_since_last_fight(fight_date, f1_prior)
+        f2_days = _get_days_since_last_fight(fight_date, f2_prior)
+        f1_days_since_last = 547 if f1_days is None else f1_days
+        f2_days_since_last = 547 if f2_days is None else f2_days
+        # Calculate win streaks
+        f1_win_streak = _get_win_streak(f1_name, fight_date, f1_prior)
+        f2_win_streak = _get_win_streak(f2_name, fight_date, f2_prior)
+        # Compile all features
+        feature_dict = {
+            # NOTE(review): the pre-refactor code stated fighter_1 is always the
+            # winner and mirrored each row to balance classes; if that still
+            # holds for the data, this label is constant - confirm.
+            'winner': 1 if fight.get('winner') == f1_name else 0,
+            'date': fight['event_date'],
+            'fighter_1': f1_name,
+            'fighter_2': f2_name,
+            # Physical differences
+            'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
+            'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
+            'age_diff': (f1_age - f2_age) if ages_known else 0,
             'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
+            # Career momentum
+            'days_since_last_fight_diff': f1_days_since_last - f2_days_since_last,
+            'win_streak_diff': f1_win_streak - f2_win_streak,
+            'fights_last_year_diff': f1_hist_stats['fights_in_last_year'] - f2_hist_stats['fights_in_last_year'],
+            # Performance differences
+            'finish_rate_diff': f1_hist_stats['finish_rate_last_n'] - f2_hist_stats['finish_rate_last_n'],
+            'ko_rate_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
+            'sig_str_per_min_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
+            'td_accuracy_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
+            'sub_attempts_per_min_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
+            'control_time_diff': f1_hist_stats['avg_ctrl_time_sec_per_min_last_n'] - f2_hist_stats['avg_ctrl_time_sec_per_min_last_n'],
+            # Defense differences
+            'sig_str_defense_diff': f1_hist_stats['sig_str_defense_last_n'] - f2_hist_stats['sig_str_defense_last_n'],
+            'td_defense_diff': f1_hist_stats['takedown_defense_last_n'] - f2_hist_stats['takedown_defense_last_n'],
+            'knockdowns_absorbed_diff': f1_hist_stats['knockdowns_absorbed_per_fight_last_n'] - f2_hist_stats['knockdowns_absorbed_per_fight_last_n']
         }
+        processed_fights.append(feature_dict)
+    if not processed_fights:
+        # Explicit dtype: a bare pd.Series() has an ambiguous default dtype.
+        return pd.DataFrame(), pd.Series(dtype='int64'), pd.DataFrame()
+    # Create final dataframes
+    df = pd.DataFrame(processed_fights)
+    metadata = df[['date', 'fighter_1', 'fighter_2', 'winner']]
+    # Prepare X and y
+    y = df['winner']
+    X = df.drop(columns=['winner', 'date', 'fighter_1', 'fighter_2'])
+    X = X.reindex(sorted(X.columns), axis=1)  # Ensure consistent column order
+    # Handle missing values by filling NaNs with 0
+    X = X.fillna(0)
     return X, y, metadata
+def _fights_before(history, cutoff):
+    """Return the fights in ``history`` dated strictly before ``cutoff``, oldest first."""
+    for past in history:
+        # Parse lazily so each fight's date is converted at most once.
+        if 'date_obj' not in past:
+            past['date_obj'] = pd.to_datetime(past['event_date'])
+    return sorted(
+        (past for past in history if past['date_obj'] < cutoff),
+        key=lambda past: past['date_obj'],
+    )