Alvaro committed
Commit e012a04 · 1 Parent(s): c81156a

Refactor prediction pipeline and modularize models


Replaces the monolithic predict.py with a modular prediction pipeline. Adds main.py to orchestrate model evaluation, models.py for model abstractions and the ELO baseline, and pipeline.py for data loading, splitting, evaluation, and reporting. Updates process_fights_for_elo to accept either a file path or pre-loaded data for improved flexibility.
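
A minimal sketch of how the new entry point is meant to be used (this mirrors src/predict/main.py added below; running it as a module, e.g. python -m src.predict.main, assumes src and its subdirectories are importable packages):

    from src.predict.models import EloBaselineModel
    from src.predict.pipeline import PredictionPipeline

    # Evaluate one or more BaseModel implementations on the held-out events.
    pipeline = PredictionPipeline(models=[EloBaselineModel()])
    pipeline.run(detailed_report=False)  # False prints the summary table, True the per-fight report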

src/analysis/elo.py CHANGED
@@ -30,23 +30,32 @@ def update_elo_draw(elo1, elo2):
 
     return elo1 + change1, elo2 + change2
 
-def process_fights_for_elo(fights_csv_path=FIGHTS_CSV_PATH):
+def process_fights_for_elo(fights_data=FIGHTS_CSV_PATH):
     """
-    Processes all fights chronologically to calculate final ELO scores for all fighters.
+    Processes fights chronologically to calculate ELO scores.
+    Accepts either a CSV file path or a pre-loaded list of fights.
     """
-    if not os.path.exists(fights_csv_path):
-        print(f"Error: Fights data file not found at '{fights_csv_path}'.")
-        print("Please run the scraping pipeline first using 'src/scrape/main.py'.")
-        return None
-
-    with open(fights_csv_path, 'r', encoding='utf-8') as f:
-        fights = list(csv.DictReader(f))
-
-    # Sort fights by date to process them in chronological order
+    fights = []
+    if isinstance(fights_data, str):
+        # If a string is passed, treat it as a file path
+        if not os.path.exists(fights_data):
+            print(f"Error: Fights data file not found at '{fights_data}'.")
+            return None
+        with open(fights_data, 'r', encoding='utf-8') as f:
+            fights = list(csv.DictReader(f))
+    elif isinstance(fights_data, list):
+        # If a list is passed, use it directly
+        fights = fights_data
+    else:
+        print(f"Error: Invalid data type passed to process_fights_for_elo: {type(fights_data)}")
+        return None
+
+    # Sort fights by date to process them in chronological order.
+    # This is crucial if loading from a file and a good safeguard if a list is passed.
     try:
         fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
     except (ValueError, KeyError) as e:
-        print(f"Error sorting fights by date. Make sure 'event_date' exists and is in 'Month Day, Year' format. Error: {e}")
+        print(f"Error sorting fights by date: {e}")
         return None
 
     elos = {}
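
With this change, process_fights_for_elo can be fed either the CSV on disk or rows that are already in memory (as the new PredictionPipeline does with its training split), and its return value is consumed in models.py as a fighter-name-to-ELO mapping. A small usage sketch; the in-memory rows are hypothetical and only illustrate the expected 'Month Day, Year' date format:

    from src.analysis.elo import process_fights_for_elo

    # 1) Previous behaviour: read and sort the fights CSV from its default path.
    elos_from_file = process_fights_for_elo()

    # 2) New behaviour: pass pre-loaded fight dicts (e.g. a chronological training split).
    train_rows = [
        {'event_date': 'January 20, 2024', 'fighter_1': 'A', 'fighter_2': 'B', 'winner': 'A'},
        {'event_date': 'March 09, 2024', 'fighter_1': 'B', 'fighter_2': 'C', 'winner': 'C'},
    ]
    elos_from_rows = process_fights_for_elo(train_rows)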
src/predict/main.py ADDED
@@ -0,0 +1,30 @@
+from .models import EloBaselineModel
+from .pipeline import PredictionPipeline
+
+def main():
+    """
+    Sets up the models and runs the prediction pipeline.
+    This is where you can add new models to compare them.
+    """
+    print("--- Initializing Machine Learning Prediction Pipeline ---")
+
+    # 1. Initialize the models you want to test
+    elo_model = EloBaselineModel()
+
+    # Add other models here to compare them, e.g.:
+    # logistic_model = LogisticRegressionModel()
+
+    # 2. Create a list of the models to evaluate
+    models_to_run = [
+        elo_model,
+        # logistic_model
+    ]
+
+    # 3. Initialize and run the pipeline
+    pipeline = PredictionPipeline(models=models_to_run)
+
+    # Set detailed_report=False for a summary, or True for a full detailed report
+    pipeline.run(detailed_report=False)
+
+if __name__ == '__main__':
+    main()
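
Note: because main.py, models.py, and pipeline.py use relative imports (from .models ..., from ..analysis.elo ...), the entry point presumably has to be run as a module from the project root (e.g. python -m src.predict.main) rather than as a standalone script.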
src/predict/models.py ADDED
@@ -0,0 +1,56 @@
+from abc import ABC, abstractmethod
+import sys
+import os
+from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
+
+class BaseModel(ABC):
+    """
+    Abstract base class for all prediction models.
+    Ensures that every model has a standard interface for training and prediction.
+    """
+    @abstractmethod
+    def train(self, train_fights):
+        """
+        Trains or prepares the model using historical fight data.
+
+        :param train_fights: A list of historical fight data dictionaries.
+        """
+        pass
+
+    @abstractmethod
+    def predict(self, fighter1_name, fighter2_name):
+        """
+        Predicts the winner of a single fight.
+
+        :param fighter1_name: The name of the first fighter.
+        :param fighter2_name: The name of the second fighter.
+        :return: The name of the predicted winning fighter.
+        """
+        pass
+
+class EloBaselineModel(BaseModel):
+    """
+    A baseline prediction model that predicts the winner based on the higher ELO rating.
+    """
+    def __init__(self):
+        self.historical_elos = {}
+
+    def train(self, train_fights):
+        """
+        Calculates the ELO ratings for all fighters based on historical data.
+        These ratings are then stored to be used for predictions.
+        """
+        print("Training ELO Baseline Model...")
+        self.historical_elos = process_fights_for_elo(train_fights)
+        print("ELO Model training complete.")
+
+    def predict(self, fighter1_name, fighter2_name):
+        """
+        Predicts the winner based on which fighter has the higher historical ELO.
+        If a fighter has no ELO rating, the default initial ELO is used.
+        """
+        elo1 = self.historical_elos.get(fighter1_name, INITIAL_ELO)
+        elo2 = self.historical_elos.get(fighter2_name, INITIAL_ELO)
+
+        # Return the name of the fighter with the higher ELO
+        return fighter1_name if elo1 > elo2 else fighter2_name
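
The BaseModel ABC is the extension point that the "Add other models here" comment in main.py refers to. A hypothetical sketch of a second baseline (not part of this commit) that would plug into the same pipeline:

    import random
    from src.predict.models import BaseModel

    class RandomBaselineModel(BaseModel):
        """Hypothetical sanity-check baseline: picks either fighter at random."""

        def train(self, train_fights):
            # Nothing to learn; the interface still requires the method.
            pass

        def predict(self, fighter1_name, fighter2_name):
            return random.choice([fighter1_name, fighter2_name])

Appending an instance to models_to_run in src/predict/main.py would have it trained, evaluated, and reported alongside EloBaselineModel.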
src/predict/pipeline.py ADDED
@@ -0,0 +1,111 @@
+import csv
+import os
+import sys
+from datetime import datetime
+from collections import OrderedDict
+from ..scrape.config import FIGHTS_CSV_PATH
+from .models import BaseModel
+
+class PredictionPipeline:
+    """
+    Orchestrates the model training, evaluation, and reporting pipeline.
+    """
+    def __init__(self, models):
+        if not all(isinstance(m, BaseModel) for m in models):
+            raise TypeError("All models must be instances of BaseModel.")
+        self.models = models
+        self.train_fights = []
+        self.test_fights = []
+        self.results = {}
+
+    def _load_and_split_data(self, num_test_events=10):
+        """Loads and splits the data into chronological training and testing sets."""
+        print("\n--- Loading and Splitting Data ---")
+        if not os.path.exists(FIGHTS_CSV_PATH):
+            raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")
+
+        with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
+            fights = list(csv.DictReader(f))
+
+        fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
+
+        all_events = list(OrderedDict.fromkeys(f['event_name'] for f in fights))
+        if len(all_events) < num_test_events:
+            print(f"Warning: Fewer than {num_test_events} events found. Adjusting test set size.")
+            num_test_events = len(all_events)
+
+        test_event_names = all_events[-num_test_events:]
+        self.train_fights = [f for f in fights if f['event_name'] not in test_event_names]
+        self.test_fights = [f for f in fights if f['event_name'] in test_event_names]
+        print(f"Data loaded. {len(self.train_fights)} training fights, {len(self.test_fights)} testing fights.")
+        print(f"Testing on the last {num_test_events} events.")
+
+    def run(self, detailed_report=True):
+        """Executes the full pipeline: load, train, evaluate, and report."""
+        self._load_and_split_data()
+
+        eval_fights = [f for f in self.test_fights if f['winner'] not in ["Draw", "NC", ""]]
+        if not eval_fights:
+            print("No fights with definitive outcomes in the test set. Aborting.")
+            return
+
+        for model in self.models:
+            model_name = model.__class__.__name__
+            print(f"\n--- Evaluating Model: {model_name} ---")
+
+            model.train(self.train_fights)
+
+            correct_predictions = 0
+            predictions = []
+
+            for fight in eval_fights:
+                f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
+                actual_winner = fight['winner']
+                predicted_winner = model.predict(f1_name, f2_name)
+
+                is_correct = (predicted_winner == actual_winner)
+                if is_correct:
+                    correct_predictions += 1
+
+                predictions.append({
+                    'fight': f"{f1_name} vs. {f2_name}",
+                    'predicted_winner': predicted_winner,
+                    'actual_winner': actual_winner,
+                    'is_correct': is_correct
+                })
+
+            accuracy = (correct_predictions / len(eval_fights)) * 100
+            self.results[model_name] = {
+                'accuracy': accuracy,
+                'predictions': predictions,
+                'total_fights': len(eval_fights)
+            }
+
+        if detailed_report:
+            self._report_detailed_results()
+        else:
+            self._report_summary()
+
+    def _report_summary(self):
+        """Prints a concise summary of model performance."""
+        print("\n\n--- Prediction Pipeline Summary ---")
+        print(f"{'Model':<25} | {'Accuracy':<10} | {'Fights Evaluated':<20}")
+        print("-" * 65)
+        for model_name, result in self.results.items():
+            print(f"{model_name:<25} | {result['accuracy']:<9.2f}% | {result['total_fights']:<20}")
+        print("-" * 65)
+
+    def _report_detailed_results(self):
+        """Prints a summary and detailed report of the model evaluations."""
+        print("\n\n--- Prediction Pipeline Finished: Detailed Report ---")
+        for model_name, result in self.results.items():
+            print(f"\n--- Model: {model_name} ---")
+            print(f" Overall Accuracy: {result['accuracy']:.2f}%")
+            print(" Detailed Predictions:")
+            for p in result['predictions']:
+                status = "CORRECT" if p['is_correct'] else "INCORRECT"
+                print(f" - Fight: {p['fight']}")
+                print(f"   -> Predicted: {p['predicted_winner']}")
+                print(f"   -> Actual: {p['actual_winner']}")
+                print(f"   -> Result: {status}")
+            print("------------------------" + "-" * len(model_name))
src/predict/predict.py DELETED
@@ -1,95 +0,0 @@
-import csv
-import os
-import sys
-from datetime import datetime
-from ..scrape.config import FIGHTS_CSV_PATH, FIGHTERS_CSV_PATH
-
-def load_fighters_data():
-    """Loads fighter data, including ELO scores, into a dictionary."""
-    if not os.path.exists(FIGHTERS_CSV_PATH):
-        print(f"Error: Fighter data not found at '{FIGHTERS_CSV_PATH}'.")
-        print("Please run the ELO analysis first ('python -m src.analysis.elo').")
-        return None
-
-    fighters = {}
-    with open(FIGHTERS_CSV_PATH, 'r', encoding='utf-8') as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            full_name = f"{row['first_name']} {row['last_name']}".strip()
-            fighters[full_name] = {'elo': float(row.get('elo', 1500))}  # Default ELO if missing
-    return fighters
-
-def load_fights_data():
-    """Loads fight data and sorts it chronologically."""
-    if not os.path.exists(FIGHTS_CSV_PATH):
-        print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'.")
-        return None
-
-    with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
-        fights = list(csv.DictReader(f))
-
-    # Sort fights chronologically to ensure a proper train/test split later
-    fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
-    return fights
-
-def run_elo_baseline_model(fights, fighters):
-    """
-    Runs a simple baseline prediction model where the fighter with the higher ELO is predicted to win.
-    """
-    correct_predictions = 0
-    total_predictions = 0
-
-    for fight in fights:
-        fighter1_name = fight['fighter_1']
-        fighter2_name = fight['fighter_2']
-        actual_winner = fight['winner']
-
-        # Skip fights that are draws or no contests
-        if actual_winner in ["Draw", "NC", ""]:
-            continue
-
-        fighter1 = fighters.get(fighter1_name)
-        fighter2 = fighters.get(fighter2_name)
-
-        if not fighter1 or not fighter2:
-            continue  # Skip if fighter data is missing
-
-        elo1 = fighter1.get('elo', 1500)
-        elo2 = fighter2.get('elo', 1500)
-
-        # Predict winner based on higher ELO
-        predicted_winner = fighter1_name if elo1 > elo2 else fighter2_name
-
-        if predicted_winner == actual_winner:
-            correct_predictions += 1
-
-        total_predictions += 1
-
-    accuracy = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0
-    return accuracy, total_predictions
-
-def main():
-    """
-    Main function to run the prediction pipeline.
-    """
-    print("--- Starting ML Prediction Pipeline ---")
-
-    # Load data
-    fighters_data = load_fighters_data()
-    fights_data = load_fights_data()
-
-    if not fighters_data or not fights_data:
-        print("Aborting pipeline due to missing data.")
-        return
-
-    # Run baseline model
-    print("\nRunning Baseline Model (Predicting winner by highest ELO)...")
-    accuracy, total_fights = run_elo_baseline_model(fights_data, fighters_data)
-
-    print("\n--- Baseline Model Evaluation ---")
-    print(f"Total Fights Evaluated: {total_fights}")
-    print(f"Model Accuracy: {accuracy:.2f}%")
-    print("---------------------------------")
-
-if __name__ == '__main__':
-    main()