Spaces:

AlvaroMros
/

ufc-predictor

Sleeping

Alvaro committed on Jul 3

Commit

2aed0aa

1 Parent(s): bf7e729

Add new ML models and CLI tools for prediction

Introduces several new machine learning models (XGBoost, LightGBM, SVC, RandomForest, BernoulliNB) to the prediction pipeline by refactoring model logic into a shared base class. Adds requirements for new dependencies. Implements two new CLI scripts: save_model.py for training and saving models, and predict_new.py for predicting outcomes of hypothetical fights using saved models. Improves preprocessing robustness for date parsing.

Files changed (6) hide show

requirements.txt +5 -1
src/predict/main.py +14 -1
src/predict/models.py +57 -22
src/predict/predict_new.py +53 -0
src/predict/preprocess.py +6 -1
src/predict/save_model.py +52 -0

requirements.txt CHANGED Viewed

@@ -1,4 +1,8 @@
 requests
 beautifulsoup4
 pandas
-scikit-learn

 requests
 beautifulsoup4
 pandas
+scikit-learn
+lazypredict
+tqdm
+xgboost
+lightgbm

src/predict/main.py CHANGED Viewed

@@ -1,6 +1,14 @@
 import argparse
-from .models import EloBaselineModel, LogisticRegressionModel
 from .pipeline import PredictionPipeline
 def main():
     """
@@ -22,6 +30,11 @@ def main():
     models_to_run = [
         EloBaselineModel(),
         LogisticRegressionModel(),
     ]
     # --- End of Model Definition ---

 import argparse
 from .pipeline import PredictionPipeline
+from .models import (
+    EloBaselineModel,
+    LogisticRegressionModel,
+    XGBoostModel,
+    SVCModel,
+    RandomForestModel,
+    BernoulliNBModel,
+    LGBMModel
+)
 def main():
     """
     models_to_run = [
         EloBaselineModel(),
         LogisticRegressionModel(),
+        XGBoostModel(),
+        SVCModel(),
+        RandomForestModel(),
+        BernoulliNBModel(),
+        LGBMModel(),
     ]
     # --- End of Model Definition ---

src/predict/models.py CHANGED Viewed

@@ -4,6 +4,11 @@ import os
 from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 from ..config import FIGHTERS_CSV_PATH
 from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
@@ -63,21 +68,24 @@ class EloBaselineModel(BaseModel):
             print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
             return None
-class LogisticRegressionModel(BaseModel):
     """
-    A model that uses logistic regression to predict fight outcomes based on differential features.
     """
-    def __init__(self):
-        self.model = LogisticRegression(solver='liblinear', random_state=42)
         self.fighters_df = None
         self.fighter_histories = {}
     def train(self, train_fights):
         """
-        Trains the logistic regression model by preprocessing the training data
-        and fitting the model.
         """
-        print("Training LogisticRegressionModel...")
         # 1. Prepare data for prediction-time feature generation
         self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
@@ -87,17 +95,16 @@ class LogisticRegressionModel(BaseModel):
             if col in self.fighters_df.columns:
                 self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
-        # 2. Pre-calculate fighter histories for efficient lookup during prediction
         train_fights_with_dates = []
         for fight in train_fights:
             fight['date_obj'] = pd.to_datetime(fight['event_date'])
             train_fights_with_dates.append(fight)
         for fighter_name in self.fighters_df.index:
             history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
             self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
-        # 3. Preprocess training data and fit the model
         X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
         print(f"Fitting model on {X_train.shape[0]} samples...")
         self.model.fit(X_train, y_train)
@@ -111,19 +118,19 @@ class LogisticRegressionModel(BaseModel):
         fight_date = pd.to_datetime(fight['event_date'])
         if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
-            print(f"Warning: Fighter not found in data. Skipping prediction for {f1_name} vs {f2_name}")
             return None
-        # 1. Get base stats
-        f1_stats, f2_stats = self.fighters_df.loc[f1_name], self.fighters_df.loc[f2_name]
         if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
         if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
-        # 2. Get historical stats
-        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, self.fighter_histories.get(f1_name, []), self.fighters_df)
-        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, self.fighter_histories.get(f2_name, []), self.fighters_df)
-        # 3. Create differential features
         f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
         f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
@@ -140,9 +147,37 @@ class LogisticRegressionModel(BaseModel):
         }
         feature_vector = pd.DataFrame([features]).fillna(0)
-        # 4. Predict
-        # The model predicts the probability of class '1' (a win for fighter_1)
         prediction = self.model.predict(feature_vector)[0]
-        return f1_name if prediction == 1 else f2_name

 from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.ensemble import RandomForestClassifier
+from xgboost import XGBClassifier
+from lightgbm import LGBMClassifier
 from ..config import FIGHTERS_CSV_PATH
 from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
             print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
             return None
+class BaseMLModel(BaseModel):
     """
+    An abstract base class for machine learning models that handles all common
+    data preparation, training, and prediction logic.
     """
+    def __init__(self, model):
+        if model is None:
+            raise ValueError("A model must be provided.")
+        self.model = model
         self.fighters_df = None
         self.fighter_histories = {}
     def train(self, train_fights):
         """
+        Trains the machine learning model. This involves loading fighter data,
+        pre-calculating histories, and fitting the model on the preprocessed data.
         """
+        print(f"--- Training {self.model.__class__.__name__} ---")
         # 1. Prepare data for prediction-time feature generation
         self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
             if col in self.fighters_df.columns:
                 self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
+        # 2. Pre-calculate fighter histories
         train_fights_with_dates = []
         for fight in train_fights:
             fight['date_obj'] = pd.to_datetime(fight['event_date'])
             train_fights_with_dates.append(fight)
         for fighter_name in self.fighters_df.index:
             history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
             self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
+        # 3. Preprocess and fit
         X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
         print(f"Fitting model on {X_train.shape[0]} samples...")
         self.model.fit(X_train, y_train)
         fight_date = pd.to_datetime(fight['event_date'])
         if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
+            print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
             return None
+        f1_stats = self.fighters_df.loc[f1_name]
+        f2_stats = self.fighters_df.loc[f2_name]
         if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
         if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
+        f1_hist = self.fighter_histories.get(f1_name, [])
+        f2_hist = self.fighter_histories.get(f2_name, [])
+        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
+        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
         f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
         f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
         }
         feature_vector = pd.DataFrame([features]).fillna(0)
         prediction = self.model.predict(feature_vector)[0]
+        return f1_name if prediction == 1 else f2_name
+class LogisticRegressionModel(BaseMLModel):
+    """A thin wrapper for scikit-learn's LogisticRegression."""
+    def __init__(self):
+        super().__init__(model=LogisticRegression())
+class XGBoostModel(BaseMLModel):
+    """A thin wrapper for XGBoost's XGBClassifier."""
+    def __init__(self):
+        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
+        super().__init__(model=model)
+class SVCModel(BaseMLModel):
+    """A thin wrapper for scikit-learn's Support Vector Classifier."""
+    def __init__(self):
+        # Probability=True is needed for some reports, though it slows down training
+        super().__init__(model=SVC(probability=True, random_state=42))
+class RandomForestModel(BaseMLModel):
+    """A thin wrapper for scikit-learn's RandomForestClassifier."""
+    def __init__(self):
+        super().__init__(model=RandomForestClassifier(random_state=42))
+class BernoulliNBModel(BaseMLModel):
+    """A thin wrapper for scikit-learn's Bernoulli Naive Bayes classifier."""
+    def __init__(self):
+        super().__init__(model=BernoulliNB())
+class LGBMModel(BaseMLModel):
+    """A thin wrapper for LightGBM's LGBMClassifier."""
+    def __init__(self):
+        super().__init__(model=LGBMClassifier(random_state=42))

src/predict/predict_new.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import argparse
+import os
+import joblib
+from datetime import datetime
+from ..config import OUTPUT_DIR
+def predict_new_fight(fighter1_name, fighter2_name, model_path):
+    """
+    Loads a trained model and predicts the outcome of a new, hypothetical fight.
+    """
+    print("--- Predicting New Fight ---")
+    # 1. Load the trained model
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"Model file not found at '{model_path}'. Please train and save a model first.")
+    print(f"Loading model from {model_path}...")
+    model = joblib.load(model_path)
+    print(f"Model '{model.model.__class__.__name__}' loaded.")
+    # 2. Create the fight dictionary for prediction
+    # The predict method requires a dictionary with specific keys.
+    # We use today's date as a placeholder for the event date.
+    fight = {
+        'fighter_1': fighter1_name,
+        'fighter_2': fighter2_name,
+        'event_date': datetime.now().strftime('%B %d, %Y')
+        # Other keys like 'winner', 'method', etc., are not needed for prediction.
+    }
+    # 3. Make the prediction
+    print(f"\nPredicting winner for: {fighter1_name} vs. {fighter2_name}")
+    predicted_winner = model.predict(fight)
+    if predicted_winner:
+        print(f"\n---> Predicted Winner: {predicted_winner} <---")
+    else:
+        print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Predict the outcome of a new UFC fight.")
+    parser.add_argument('fighter1', type=str, help="The full name of the first fighter (e.g., 'Jon Jones').")
+    parser.add_argument('fighter2', type=str, help="The full name of the second fighter (e.g., 'Stipe Miocic').")
+    parser.add_argument(
+        '--model_path',
+        type=str,
+        default=os.path.join(OUTPUT_DIR, 'trained_model.joblib'),
+        help="Path to the saved model file."
+    )
+    args = parser.parse_args()
+    predict_new_fight(args.fighter1, args.fighter2, args.model_path)

src/predict/preprocess.py CHANGED Viewed

@@ -122,7 +122,12 @@ def preprocess_for_ml(fights_to_process, fighters_csv_path):
     # 2. Pre-calculate fighter histories to speed up lookups
     # And convert date strings to datetime objects once
     for fight in fights_to_process:
-        fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
     fighter_histories = {}
     for fighter_name in fighters_prepared.index:

     # 2. Pre-calculate fighter histories to speed up lookups
     # And convert date strings to datetime objects once
     for fight in fights_to_process:
+        try:
+            # This will work if event_date is a string
+            fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
+        except TypeError:
+            # This will be triggered if it's already a date-like object (e.g., Timestamp)
+            fight['date_obj'] = fight['event_date']
     fighter_histories = {}
     for fighter_name in fighters_prepared.index:

src/predict/save_model.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import argparse
+import os
+import joblib
+import pandas as pd
+from ..config import FIGHTS_CSV_PATH, OUTPUT_DIR
+import src.predict.models as models
+def save_model(model_name):
+    """
+    Trains a specified model on the entire dataset and saves it to a file.
+    :param model_name: The name of the model class to train (e.g., 'XGBoostModel').
+    """
+    print(f"--- Training and Saving Model: {model_name} ---")
+    # 1. Get the model class from the models module
+    try:
+        ModelClass = getattr(models, model_name)
+    except AttributeError:
+        print(f"Error: Model '{model_name}' not found in src/predict/models.py")
+        return
+    model = ModelClass()
+    # 2. Load all available fights for training
+    if not os.path.exists(FIGHTS_CSV_PATH):
+        raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")
+    all_fights = pd.read_csv(FIGHTS_CSV_PATH).to_dict('records')
+    print(f"Training model on all {len(all_fights)} available fights...")
+    # 3. Train the model
+    model.train(all_fights)
+    # 4. Save the entire trained model object
+    save_path = os.path.join(OUTPUT_DIR, 'trained_model.joblib')
+    joblib.dump(model, save_path)
+    print(f"\nModel saved successfully to {save_path}")
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Train and save a prediction model.")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='XGBoostModel',
+        help="The name of the model class to train and save."
+    )
+    args = parser.parse_args()
+    save_model(args.model)