AlvaroMros committed on
Commit
7fcaffe
·
1 Parent(s): 9678fdb

(CAREFUL!!!!!!) Refactor argument parsing and prediction pipeline

Browse files

Moved command-line argument parsing into a dedicated src/args.py module and updated main.py and predict/main.py to call its parser functions (get_pipeline_args and get_prediction_args). Improved model management logic and modularized pipeline execution. Enhanced feature engineering and preprocessing in predict/preprocess.py, and refactored the model classes in predict/models.py for consistency and maintainability.

output/model_results.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf8df1ba9e26fa98e34bfb1c773e66576cbf89152087c55b70921269c84f39d5
3
- size 27286
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40c2fb9010bdae4946c2b879d4014aa671a43b586aff7faa73ea4846585e589c
3
+ size 11671
output/models/EloBaselineModel.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfdc684f791b598fbecfbfe9b14cca3b4d483b3d7368a16faecea31aace3be87
3
- size 938419
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40937e8b6fe9aaaa1ca92a84e3e67b5bdefcf2700d2cafb7830670a14f684858
3
+ size 938435
output/models/LogisticRegressionModel.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a773552b7f1b166858ab1ff7bdf472e24b293279a8e24871de773b1a3de46e1
3
- size 5517988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c11a689c50244a6084e642a1dc35a349d515f075b40515dbd4164e7831dfdb
3
+ size 5518484
src/args.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ def get_pipeline_args():
4
+ """
5
+ Parse command line arguments for the main UFC data pipeline.
6
+
7
+ Returns:
8
+ argparse.Namespace: Parsed command line arguments
9
+ """
10
+ parser = argparse.ArgumentParser(description="UFC Data Pipeline")
11
+
12
+ # Pipeline selection
13
+ parser.add_argument(
14
+ '--pipeline',
15
+ type=str,
16
+ default='scrape',
17
+ choices=['scrape', 'analysis', 'predict', 'update', 'all'],
18
+ help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
19
+ )
20
+
21
+ # Scraping arguments
22
+ scraping_group = parser.add_argument_group('Scraping options')
23
+ scraping_group.add_argument(
24
+ '--scrape-mode',
25
+ type=str,
26
+ default='full',
27
+ choices=['full', 'update'],
28
+ help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
29
+ )
30
+ scraping_group.add_argument(
31
+ '--num-events',
32
+ type=int,
33
+ default=5,
34
+ help="Number of latest events to scrape in update mode (default: 5)"
35
+ )
36
+
37
+ # Model management arguments
38
+ model_group = parser.add_argument_group('Model management')
39
+ model_group.add_argument(
40
+ '--use-existing-models',
41
+ action='store_true',
42
+ default=True,
43
+ help="Use existing saved models if available and no new data (default: True)"
44
+ )
45
+ model_group.add_argument(
46
+ '--no-use-existing-models',
47
+ action='store_true',
48
+ default=False,
49
+ help="Force retrain all models from scratch, ignoring existing saved models"
50
+ )
51
+ model_group.add_argument(
52
+ '--force-retrain',
53
+ action='store_true',
54
+ default=False,
55
+ help="Force retrain all models even if no new data is available"
56
+ )
57
+
58
+ return parser.parse_args()
59
+
60
+ def get_prediction_args():
61
+ """
62
+ Parse command line arguments specific to the prediction pipeline.
63
+
64
+ Returns:
65
+ argparse.Namespace: Parsed command line arguments
66
+ """
67
+ parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
68
+
69
+ parser.add_argument(
70
+ '--report',
71
+ type=str,
72
+ default='detailed',
73
+ choices=['detailed', 'summary'],
74
+ help="Type of report to generate: 'detailed' (file) or 'summary' (console)"
75
+ )
76
+
77
+ model_group = parser.add_argument_group('Model management')
78
+ model_group.add_argument(
79
+ '--use-existing-models',
80
+ action='store_true',
81
+ default=True,
82
+ help="Use existing saved models if available and no new data (default: True)"
83
+ )
84
+ model_group.add_argument(
85
+ '--no-use-existing-models',
86
+ action='store_true',
87
+ default=False,
88
+ help="Force retrain all models from scratch, ignoring existing saved models"
89
+ )
90
+ model_group.add_argument(
91
+ '--force-retrain',
92
+ action='store_true',
93
+ default=False,
94
+ help="Force retrain all models even if no new data is available"
95
+ )
96
+
97
+ return parser.parse_args()
src/main.py CHANGED
@@ -1,106 +1,79 @@
1
- import argparse
2
  import sys
3
  import os
 
4
 
5
- def main():
6
- """
7
- Main entry point for the UFC data pipeline.
8
- Supports scraping, analysis, and prediction workflows.
9
- """
10
- parser = argparse.ArgumentParser(description="UFC Data Pipeline")
11
- parser.add_argument(
12
- '--pipeline',
13
- type=str,
14
- default='scrape',
15
- choices=['scrape', 'analysis', 'predict', 'update', 'all'],
16
- help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
17
- )
18
- parser.add_argument(
19
- '--scrape-mode',
20
- type=str,
21
- default='full',
22
- choices=['full', 'update'],
23
- help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
24
- )
25
- parser.add_argument(
26
- '--num-events',
27
- type=int,
28
- default=5,
29
- help="Number of latest events to scrape in update mode (default: 5)"
30
- )
31
- # Model management arguments for prediction pipeline
32
- parser.add_argument(
33
- '--use-existing-models',
34
- action='store_true',
35
- default=True,
36
- help="Use existing saved models if available and no new data (default: True)."
37
- )
38
- parser.add_argument(
39
- '--no-use-existing-models',
40
- action='store_true',
41
- default=False,
42
- help="Force retrain all models from scratch, ignoring existing saved models."
43
- )
44
- parser.add_argument(
45
- '--force-retrain',
46
- action='store_true',
47
- default=False,
48
- help="Force retrain all models even if no new data is available."
49
- )
50
 
51
- args = parser.parse_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- if args.pipeline in ['scrape', 'all']:
54
- print("=== Running Scraping Pipeline ===")
55
- from src.scrape.main import main as scrape_main
 
 
 
 
 
56
 
57
- # Override sys.argv to pass arguments to scrape.main
58
- original_argv = sys.argv
59
- sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
60
- try:
61
- scrape_main()
62
- finally:
63
- sys.argv = original_argv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  if args.pipeline in ['analysis', 'all']:
66
- print("\n=== Running ELO Analysis ===")
67
- from src.analysis.elo import main as elo_main
68
- elo_main()
69
-
70
  if args.pipeline == 'update':
71
- print("\n=== Running Model Update Pipeline ===")
72
- try:
73
- from src.predict.main import MODELS_TO_RUN
74
- from src.predict.pipeline import PredictionPipeline
75
- except ImportError:
76
- print("Fatal: Could not import prediction modules.")
77
- print("Please ensure your project structure and python path are correct.")
78
- return
79
-
80
- pipeline = PredictionPipeline(models=MODELS_TO_RUN)
81
- pipeline.update_models_if_new_data()
82
 
83
  if args.pipeline in ['predict', 'all']:
84
- print("\n=== Running Prediction Pipeline ===")
85
- from src.predict.main import main as predict_main
86
-
87
- # Override sys.argv to pass model management arguments to predict.main
88
- original_argv = sys.argv
89
- predict_args = ['predict_main']
90
-
91
- if args.no_use_existing_models:
92
- predict_args.append('--no-use-existing-models')
93
- elif args.use_existing_models:
94
- predict_args.append('--use-existing-models')
95
-
96
- if args.force_retrain:
97
- predict_args.append('--force-retrain')
98
-
99
- sys.argv = predict_args
100
- try:
101
- predict_main()
102
- finally:
103
- sys.argv = original_argv
104
 
105
  if __name__ == '__main__':
106
  main()
 
 
1
  import sys
2
  import os
3
+ from .args import get_pipeline_args
4
 
5
+ def run_scraping_pipeline(args):
6
+ """Execute the scraping pipeline with given arguments."""
7
+ print("=== Running Scraping Pipeline ===")
8
+ from .scrape.main import main as scrape_main
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ # Pass arguments to scrape.main
11
+ original_argv = sys.argv
12
+ sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
13
+ try:
14
+ scrape_main()
15
+ finally:
16
+ sys.argv = original_argv
17
+
18
+ def run_analysis_pipeline():
19
+ """Execute the ELO analysis pipeline."""
20
+ print("\n=== Running ELO Analysis ===")
21
+ from .analysis.elo import main as elo_main
22
+ elo_main()
23
+
24
+ def run_prediction_pipeline(args):
25
+ """Execute the prediction pipeline with given arguments."""
26
+ print("\n=== Running Prediction Pipeline ===")
27
+ from .predict.main import main as predict_main
28
 
29
+ # Pass model management arguments to predict.main
30
+ original_argv = sys.argv
31
+ predict_args = ['predict_main']
32
+
33
+ if args.no_use_existing_models:
34
+ predict_args.append('--no-use-existing-models')
35
+ elif args.use_existing_models:
36
+ predict_args.append('--use-existing-models')
37
 
38
+ if args.force_retrain:
39
+ predict_args.append('--force-retrain')
40
+
41
+ sys.argv = predict_args
42
+ try:
43
+ predict_main()
44
+ finally:
45
+ sys.argv = original_argv
46
+
47
+ def run_model_update(args):
48
+ """Execute the model update pipeline."""
49
+ print("\n=== Running Model Update Pipeline ===")
50
+ try:
51
+ from .predict.main import MODELS_TO_RUN
52
+ from .predict.pipeline import PredictionPipeline
53
+ except ImportError:
54
+ print("Fatal: Could not import prediction modules.")
55
+ print("Please ensure your project structure and python path are correct.")
56
+ return
57
+
58
+ pipeline = PredictionPipeline(models=MODELS_TO_RUN)
59
+ pipeline.update_models_if_new_data()
60
+
61
+ def main():
62
+ """Main entry point for the UFC data pipeline."""
63
+ args = get_pipeline_args()
64
+
65
+ # Execute requested pipeline(s)
66
+ if args.pipeline in ['scrape', 'all']:
67
+ run_scraping_pipeline(args)
68
 
69
  if args.pipeline in ['analysis', 'all']:
70
+ run_analysis_pipeline()
71
+
 
 
72
  if args.pipeline == 'update':
73
+ run_model_update(args)
 
 
 
 
 
 
 
 
 
 
74
 
75
  if args.pipeline in ['predict', 'all']:
76
+ run_prediction_pipeline(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  if __name__ == '__main__':
79
  main()
src/predict/main.py CHANGED
@@ -1,5 +1,4 @@
1
- import argparse
2
-
3
  from .pipeline import PredictionPipeline
4
  from .models import (
5
  EloBaselineModel,
@@ -11,56 +10,34 @@ from .models import (
11
  LGBMModel
12
  )
13
 
14
- # --- Define Models to Run ---
15
- # Instantiate all the models you want to evaluate here.
16
- MODELS_TO_RUN = [
17
- EloBaselineModel(),
18
- LogisticRegressionModel(),
19
- XGBoostModel(),
20
- SVCModel(),
21
- RandomForestModel(),
22
- BernoulliNBModel(),
23
- LGBMModel(),
24
- ]
25
- # --- End of Model Definition ---
 
 
 
26
 
27
  def main():
28
  """
29
  Main entry point to run the prediction pipeline.
30
  You can specify which models to run and the reporting format.
31
  """
32
- parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
33
- parser.add_argument(
34
- '--report',
35
- type=str,
36
- default='detailed',
37
- choices=['detailed', 'summary'],
38
- help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
39
- )
40
- parser.add_argument(
41
- '--use-existing-models',
42
- action='store_true',
43
- default=True,
44
- help="Use existing saved models if available and no new data (default: True)."
45
- )
46
- parser.add_argument(
47
- '--no-use-existing-models',
48
- action='store_true',
49
- default=False,
50
- help="Force retrain all models from scratch, ignoring existing saved models."
51
- )
52
- parser.add_argument(
53
- '--force-retrain',
54
- action='store_true',
55
- default=False,
56
- help="Force retrain all models even if no new data is available."
57
- )
58
- args = parser.parse_args()
59
 
60
  # Handle conflicting arguments
61
  use_existing_models = not args.no_use_existing_models and args.use_existing_models
62
  force_retrain = args.force_retrain
63
 
 
64
  if args.no_use_existing_models:
65
  print("No-use-existing-models flag set: All models will be retrained from scratch.")
66
  elif force_retrain:
@@ -68,21 +45,9 @@ def main():
68
  elif use_existing_models:
69
  print("Using existing models if available and no new data detected.")
70
 
71
- # --- Define Models to Run ---
72
- # Instantiate all the models you want to evaluate here.
73
- models_to_run = [
74
- EloBaselineModel(),
75
- LogisticRegressionModel(),
76
- XGBoostModel(),
77
- SVCModel(),
78
- RandomForestModel(),
79
- BernoulliNBModel(),
80
- LGBMModel(),
81
- ]
82
- # --- End of Model Definition ---
83
-
84
  pipeline = PredictionPipeline(
85
- models=MODELS_TO_RUN,
86
  use_existing_models=use_existing_models,
87
  force_retrain=force_retrain
88
  )
@@ -92,3 +57,6 @@ def main():
92
  except FileNotFoundError as e:
93
  print(f"Error: {e}")
94
  print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
 
 
 
 
1
+ from ..args import get_prediction_args
 
2
  from .pipeline import PredictionPipeline
3
  from .models import (
4
  EloBaselineModel,
 
10
  LGBMModel
11
  )
12
 
13
+ def get_available_models():
14
+ """Get a list of all available prediction models.
15
+
16
+ Returns:
17
+ list: List of instantiated model objects
18
+ """
19
+ return [
20
+ EloBaselineModel(),
21
+ LogisticRegressionModel(),
22
+ # XGBoostModel(),
23
+ # SVCModel(),
24
+ # RandomForestModel(),
25
+ # BernoulliNBModel(),
26
+ LGBMModel(),
27
+ ]
28
 
29
  def main():
30
  """
31
  Main entry point to run the prediction pipeline.
32
  You can specify which models to run and the reporting format.
33
  """
34
+ args = get_prediction_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Handle conflicting arguments
37
  use_existing_models = not args.no_use_existing_models and args.use_existing_models
38
  force_retrain = args.force_retrain
39
 
40
+ # Log model management settings
41
  if args.no_use_existing_models:
42
  print("No-use-existing-models flag set: All models will be retrained from scratch.")
43
  elif force_retrain:
 
45
  elif use_existing_models:
46
  print("Using existing models if available and no new data detected.")
47
 
48
+ # Initialize and run prediction pipeline
 
 
 
 
 
 
 
 
 
 
 
 
49
  pipeline = PredictionPipeline(
50
+ models=get_available_models(),
51
  use_existing_models=use_existing_models,
52
  force_retrain=force_retrain
53
  )
 
57
  except FileNotFoundError as e:
58
  print(f"Error: {e}")
59
  print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
60
+ except Exception as e:
61
+ print(f"An unexpected error occurred: {e}")
62
+ raise
src/predict/models.py CHANGED
@@ -1,6 +1,4 @@
1
  from abc import ABC, abstractmethod
2
- import sys
3
- import os
4
  import pandas as pd
5
  from sklearn.linear_model import LogisticRegression
6
  from sklearn.svm import SVC
@@ -10,188 +8,128 @@ from xgboost import XGBClassifier
10
  from lightgbm import LGBMClassifier
11
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
12
  from ..config import FIGHTERS_CSV_PATH
13
- from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
14
 
15
  class BaseModel(ABC):
16
- """
17
- Abstract base class for all prediction models.
18
- Ensures that every model has a standard interface for training and prediction.
19
- """
 
20
  @abstractmethod
21
  def train(self, train_fights):
22
- """
23
- Trains or prepares the model using historical fight data.
24
-
25
- :param train_fights: A list of historical fight data dictionaries.
26
- """
27
  pass
28
 
29
  @abstractmethod
30
  def predict(self, fight):
31
- """
32
- Predicts the winner of a single fight.
33
-
34
- :param fight: A dictionary representing a single fight.
35
- :return: The name of the predicted winning fighter.
36
- """
37
  pass
38
 
39
- class EloBaselineModel(BaseModel):
40
- """
41
- A baseline prediction model that predicts the winner based on the higher ELO rating.
42
- """
43
- def __init__(self):
44
- self.fighters_df = None
45
 
 
 
 
46
  def train(self, train_fights):
47
- """
48
- For the ELO baseline, 'training' simply consists of loading the fighter data
49
- to access their ELO scores during prediction.
50
- """
51
- print("Training EloBaselineModel: Loading fighter ELO data...")
52
  self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
53
  self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
54
  self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
 
 
 
 
 
 
 
55
 
56
  def predict(self, fight):
57
- """Predicts the winner based on ELO and calculates win probability."""
58
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
59
 
60
  try:
61
  f1_elo = self.fighters_df.loc[f1_name, 'elo']
62
  f2_elo = self.fighters_df.loc[f2_name, 'elo']
63
 
64
- # Calculate win probability for fighter 1 using the ELO formula
65
  prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
66
-
67
- if prob_f1_wins >= 0.5:
68
- return {'winner': f1_name, 'probability': prob_f1_wins}
69
- else:
70
- return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
71
-
72
  except KeyError as e:
73
  print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
74
- return {'winner': None, 'probability': None}
75
 
76
  class BaseMLModel(BaseModel):
77
- """
78
- An abstract base class for machine learning models that handles all common
79
- data preparation, training, and prediction logic.
80
- """
81
  def __init__(self, model):
 
82
  if model is None:
83
  raise ValueError("A model must be provided.")
84
  self.model = model
85
- self.fighters_df = None
86
- self.fighter_histories = {}
87
 
88
  def train(self, train_fights):
89
- """
90
- Trains the machine learning model. This involves loading fighter data,
91
- pre-calculating histories, and fitting the model on the preprocessed data.
92
- """
93
- print(f"--- Training {self.model.__class__.__name__} ---")
94
 
95
- # 1. Prepare data for prediction-time feature generation
96
- self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
97
- self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
98
- self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
99
- for col in ['height_cm', 'reach_in', 'elo']:
100
- if col in self.fighters_df.columns:
101
- self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
102
-
103
- # 2. Pre-calculate fighter histories
104
- train_fights_with_dates = []
105
- for fight in train_fights:
106
- fight['date_obj'] = pd.to_datetime(fight['event_date'])
107
- train_fights_with_dates.append(fight)
108
- for fighter_name in self.fighters_df.index:
109
- history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
110
- self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
111
-
112
- # 3. Preprocess and fit
113
  X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
114
  print(f"Fitting model on {X_train.shape[0]} samples...")
115
  self.model.fit(X_train, y_train)
116
  print("Model training complete.")
117
 
118
  def predict(self, fight):
119
- """
120
- Predicts the outcome of a single fight, returning the winner and probability.
121
- """
122
- f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
123
- fight_date = pd.to_datetime(fight['event_date'])
124
-
125
- if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
126
- print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
127
- return {'winner': None, 'probability': None}
128
-
129
- f1_stats = self.fighters_df.loc[f1_name]
130
- f2_stats = self.fighters_df.loc[f2_name]
131
- if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
132
- if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
133
 
134
- f1_hist = self.fighter_histories.get(f1_name, [])
135
- f2_hist = self.fighter_histories.get(f2_name, [])
136
- f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
137
- f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
138
 
139
- f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
140
- f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
141
-
142
- features = {
143
- 'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
144
- 'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
145
- 'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
146
- 'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
147
- 'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
148
- 'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
149
- 'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
150
- 'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
151
- 'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
152
- 'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
153
- 'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
154
- }
155
-
156
- feature_vector = pd.DataFrame([features]).fillna(0)
157
-
158
- # Use predict_proba to get probabilities for each class
159
- probabilities = self.model.predict_proba(feature_vector)[0]
160
- prob_f1_wins = probabilities[1] # Probability of class '1' (fighter 1 wins)
161
-
162
- if prob_f1_wins >= 0.5:
163
- return {'winner': f1_name, 'probability': prob_f1_wins}
164
- else:
165
- return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
166
 
 
167
  class LogisticRegressionModel(BaseMLModel):
168
- """A thin wrapper for scikit-learn's LogisticRegression."""
169
  def __init__(self):
170
- super().__init__(model=LogisticRegression())
171
-
172
- class XGBoostModel(BaseMLModel):
173
- """A thin wrapper for XGBoost's XGBClassifier."""
174
- def __init__(self):
175
- model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
176
- super().__init__(model=model)
177
 
178
  class SVCModel(BaseMLModel):
179
- """A thin wrapper for scikit-learn's Support Vector Classifier."""
180
  def __init__(self):
181
- # Probability=True is needed for some reports, though it slows down training
182
- super().__init__(model=SVC(probability=True, random_state=42))
183
 
184
  class RandomForestModel(BaseMLModel):
185
- """A thin wrapper for scikit-learn's RandomForestClassifier."""
186
  def __init__(self):
187
- super().__init__(model=RandomForestClassifier(random_state=42))
188
 
189
  class BernoulliNBModel(BaseMLModel):
190
- """A thin wrapper for scikit-learn's Bernoulli Naive Bayes classifier."""
191
  def __init__(self):
192
- super().__init__(model=BernoulliNB())
 
 
 
 
193
 
194
  class LGBMModel(BaseMLModel):
195
- """A thin wrapper for LightGBM's LGBMClassifier."""
196
  def __init__(self):
197
- super().__init__(model=LGBMClassifier(random_state=42))
 
1
  from abc import ABC, abstractmethod
 
 
2
  import pandas as pd
3
  from sklearn.linear_model import LogisticRegression
4
  from sklearn.svm import SVC
 
8
  from lightgbm import LGBMClassifier
9
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
10
  from ..config import FIGHTERS_CSV_PATH
11
+ from .preprocess import preprocess_for_ml
12
 
13
  class BaseModel(ABC):
14
+ """Abstract base class for all prediction models."""
15
+
16
+ def __init__(self):
17
+ self.model_name = self.__class__.__name__
18
+
19
  @abstractmethod
20
  def train(self, train_fights):
21
+ """Train the model using historical fight data."""
 
 
 
 
22
  pass
23
 
24
  @abstractmethod
25
  def predict(self, fight):
26
+ """Predict the winner of a single fight."""
 
 
 
 
 
27
  pass
28
 
29
+ def _format_prediction(self, winner, probability):
30
+ """Format prediction results consistently."""
31
+ return {'winner': winner, 'probability': probability}
 
 
 
32
 
33
+ class EloBaselineModel(BaseModel):
34
+ """Simple ELO-based prediction model."""
35
+
36
  def train(self, train_fights):
37
+ """Process historical fights to calculate current ELO ratings."""
38
+ print(f"--- Training {self.model_name} ---")
39
+
40
+ # Load and prepare fighter data
 
41
  self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
42
  self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
43
  self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
44
+
45
+ # Calculate ELO ratings
46
+ elo_ratings = process_fights_for_elo(train_fights)
47
+ self.fighters_df['elo'] = pd.Series(elo_ratings)
48
+ self.fighters_df['elo'] = self.fighters_df['elo'].fillna(INITIAL_ELO)
49
+
50
+ print("ELO ratings calculated for all fighters.")
51
 
52
  def predict(self, fight):
53
+ """Predict winner based on current ELO ratings."""
54
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
55
 
56
  try:
57
  f1_elo = self.fighters_df.loc[f1_name, 'elo']
58
  f2_elo = self.fighters_df.loc[f2_name, 'elo']
59
 
60
+ # Calculate win probability using ELO formula
61
  prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
62
+
63
+ winner = f1_name if prob_f1_wins >= 0.5 else f2_name
64
+ probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
65
+
66
+ return self._format_prediction(winner, probability)
67
+
68
  except KeyError as e:
69
  print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
70
+ return self._format_prediction(None, None)
71
 
72
  class BaseMLModel(BaseModel):
73
+ """Base class for all machine learning models."""
74
+
 
 
75
  def __init__(self, model):
76
+ super().__init__()
77
  if model is None:
78
  raise ValueError("A model must be provided.")
79
  self.model = model
 
 
80
 
81
  def train(self, train_fights):
82
+ """Train the ML model on preprocessed fight data."""
83
+ print(f"--- Training {self.model_name} ---")
 
 
 
84
 
85
+ # Preprocess data and fit model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
87
  print(f"Fitting model on {X_train.shape[0]} samples...")
88
  self.model.fit(X_train, y_train)
89
  print("Model training complete.")
90
 
91
  def predict(self, fight):
92
+ """Predict fight outcome using the trained ML model."""
93
+ # Preprocess single fight for prediction
94
+ X_pred, _, metadata = preprocess_for_ml([fight], FIGHTERS_CSV_PATH)
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ if X_pred.empty:
97
+ print(f"Warning: Could not process fight data for {fight['fighter_1']} vs {fight['fighter_2']}")
98
+ return self._format_prediction(None, None)
 
99
 
100
+ # Make prediction
101
+ try:
102
+ prob_f1_wins = self.model.predict_proba(X_pred)[0][1]
103
+ winner = fight['fighter_1'] if prob_f1_wins >= 0.5 else fight['fighter_2']
104
+ probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
105
+
106
+ return self._format_prediction(winner, probability)
107
+
108
+ except Exception as e:
109
+ print(f"Error making prediction: {e}")
110
+ return self._format_prediction(None, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
+ # Concrete ML model implementations
113
  class LogisticRegressionModel(BaseMLModel):
 
114
  def __init__(self):
115
+ super().__init__(LogisticRegression(random_state=42))
 
 
 
 
 
 
116
 
117
  class SVCModel(BaseMLModel):
 
118
  def __init__(self):
119
+ super().__init__(SVC(probability=True, random_state=42))
 
120
 
121
  class RandomForestModel(BaseMLModel):
 
122
  def __init__(self):
123
+ super().__init__(RandomForestClassifier(n_estimators=100, random_state=42))
124
 
125
  class BernoulliNBModel(BaseMLModel):
 
126
  def __init__(self):
127
+ super().__init__(BernoulliNB())
128
+
129
+ class XGBoostModel(BaseMLModel):
130
+ def __init__(self):
131
+ super().__init__(XGBClassifier(random_state=42))
132
 
133
  class LGBMModel(BaseMLModel):
 
134
  def __init__(self):
135
+ super().__init__(LGBMClassifier(random_state=42))
src/predict/preprocess.py CHANGED
@@ -1,15 +1,14 @@
1
  import pandas as pd
2
  import os
3
  from datetime import datetime
4
- from ..config import FIGHTERS_CSV_PATH
5
 
6
  def _clean_numeric_column(series):
7
- """A helper to clean string columns into numbers, handling errors."""
8
  series_str = series.astype(str)
9
  return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
10
 
11
  def _calculate_age(dob_str, fight_date_str):
12
- """Calculates age in years from a date of birth string and fight date string."""
13
  if pd.isna(dob_str) or not dob_str:
14
  return None
15
  try:
@@ -19,213 +18,235 @@ def _calculate_age(dob_str, fight_date_str):
19
  except (ValueError, TypeError):
20
  return None
21
 
22
def _parse_round_time_to_seconds(round_str, time_str):
    """Convert a finishing round and an 'M:SS' clock into total elapsed seconds.

    Every completed round is counted as five minutes. Returns 0 when either
    argument cannot be parsed.
    """
    seconds_per_round = 5 * 60
    try:
        completed_rounds = int(round_str) - 1
        mins, secs = [int(part) for part in time_str.split(':')]
    except (ValueError, TypeError, AttributeError):
        return 0
    return completed_rounds * seconds_per_round + mins * 60 + secs
31
-
32
def _parse_striking_stats(stat_str):
    """Split a stat string such as '10 of 20' into ``(landed, attempted)``.

    Returns ``(0, 0)`` when the value is not a string in that format.
    """
    fallback = (0, 0)
    try:
        pieces = stat_str.split(' of ')
        landed_count, attempted_count = (int(piece) for piece in pieces)
    except (ValueError, TypeError, AttributeError):
        return fallback
    return landed_count, attempted_count
 
39
 
40
def _to_int_safe(val):
    """Safely convert a value to an integer, returning 0 if it is invalid or empty.

    Whole-number text is parsed directly. Float-like input (``3.0``, ``"3.0"``)
    is truncated toward zero instead of being discarded, which matters when a
    count column has been read as floats.

    Args:
        val: Any scalar; missing values (None/NaN) yield 0.

    Returns:
        int: The parsed integer, or 0 when the value is missing, blank,
        non-numeric, or infinite.
    """
    if pd.isna(val):
        return 0
    try:
        text = str(val).strip()
        if not text:
            return 0
        try:
            # Direct int parsing keeps large whole numbers exact.
            return int(text)
        except ValueError:
            # Fall back for decimal text such as "3.0".
            return int(float(text))
    except (ValueError, TypeError, OverflowError):
        return 0
49
 
50
def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
    """
    Summarise a fighter's form over their last ``n`` fights before a given date.

    Only fights whose 'date_obj' is strictly earlier than ``current_fight_date``
    are considered, and the trailing ``n`` entries of ``fighter_history`` (in the
    order given) are used. Returns a dict with the keys 'wins_last_n',
    'avg_opp_elo_last_n', 'ko_percent_last_n', 'sig_str_landed_per_min_last_n',
    'takedown_accuracy_last_n' and 'sub_attempts_per_min_last_n'.
    """
    earlier = [bout for bout in fighter_history if bout['date_obj'] < current_fight_date]
    recent = earlier[-n:]

    if not recent:
        # No prior fights: neutral defaults, with an average ELO assumed for opponents.
        return {
            'wins_last_n': 0,
            'avg_opp_elo_last_n': 1500,
            'ko_percent_last_n': 0,
            'sig_str_landed_per_min_last_n': 0,
            'takedown_accuracy_last_n': 0,
            'sub_attempts_per_min_last_n': 0,
        }

    wins = 0
    ko_wins = 0
    fight_seconds = 0
    strikes_landed = 0
    td_hits = 0
    td_tries = 0
    sub_tries = 0
    opponent_elos = []

    for bout in recent:
        on_f1_side = (bout['fighter_1'] == fighter_name)
        rival = bout['fighter_2'] if on_f1_side else bout['fighter_1']
        side = 'f1' if on_f1_side else 'f2'

        if bout['winner'] == fighter_name:
            wins += 1
            if 'KO' in bout['method']:
                ko_wins += 1

        if rival in fighters_df.index:
            rival_elo = fighters_df.loc[rival, 'elo']
            opponent_elos.append(rival_elo if pd.notna(rival_elo) else 1500)

        fight_seconds += _parse_round_time_to_seconds(bout['round'], bout['time'])

        strikes, _ = _parse_striking_stats(bout.get(f'{side}_sig_str', '0 of 0'))
        strikes_landed += strikes

        # Takedown strings share the "X of Y" shape, so the striking parser is reused.
        landed_td, tried_td = _parse_striking_stats(bout.get(f'{side}_td', '0 of 0'))
        td_hits += landed_td
        td_tries += tried_td

        sub_tries += _to_int_safe(bout.get(f'{side}_sub_att'))

    if opponent_elos:
        mean_rival_elo = sum(opponent_elos) / len(opponent_elos)
    else:
        mean_rival_elo = 1500
    minutes = fight_seconds / 60 if fight_seconds > 0 else 0

    return {
        'wins_last_n': wins,
        'avg_opp_elo_last_n': mean_rival_elo,
        'ko_percent_last_n': (ko_wins / wins) if wins > 0 else 0,
        'sig_str_landed_per_min_last_n': (strikes_landed / minutes) if minutes > 0 else 0,
        'takedown_accuracy_last_n': (td_hits / td_tries) if td_tries > 0 else 0,
        'sub_attempts_per_min_last_n': (sub_tries / minutes) if minutes > 0 else 0,
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """
    Transform raw fight and fighter data into a feature matrix (X) and target
    vector (y) for a binary classification model.

    Each usable fight produces two rows: the fighter_1-minus-fighter_2
    differences labelled 1, and the negated differences labelled 0 (the
    symmetric 'stance_is_different' flag is not negated). Fights where either
    fighter is missing from the fighters CSV are skipped.

    Args:
        fights_to_process (list of dict): The fights to process. Every dict
            receives a 'date_obj' entry as a side effect.
        fighters_csv_path (str): Path to the CSV file with all fighter stats.

    Returns:
        pd.DataFrame: Feature matrix X (NaNs replaced by 0).
        pd.Series: Target vector y, named 'winner'.
        pd.DataFrame: Metadata with 'winner', 'loser' and 'event_date', one
            row per generated sample.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    fighters_df = pd.read_csv(fighters_csv_path)

    # 1. Prepare fighters data: one row per full name, indexed by that name.
    fighters_prepared = fighters_df.copy()
    fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_prepared.columns:
            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])

    # 2. Parse each fight date once and group fights by participant in a
    #    single pass, rather than rescanning every fight for every fighter.
    fighter_histories = {}
    for fight in fights_to_process:
        try:
            # Works when event_date is a string.
            fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
        except TypeError:
            # Already a date-like object (e.g. a Timestamp).
            fight['date_obj'] = fight['event_date']
        for name in {fight['fighter_1'], fight['fighter_2']}:
            fighter_histories.setdefault(name, []).append(fight)
    for history in fighter_histories.values():
        history.sort(key=lambda f: f['date_obj'])

    # 3. Process fights to create features and targets.
    feature_list = []
    target_list = []
    metadata_list = []

    for fight in fights_to_process:
        # Per the dataset's design, fighter_1 is always the winner.
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        if f1_name not in fighters_prepared.index or f2_name not in fighters_prepared.index:
            continue

        # The index is unique after drop_duplicates, so .loc yields one row each.
        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]

        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        f1_hist_stats = _get_fighter_history_stats(
            f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared)
        f2_hist_stats = _get_fighter_history_stats(
            f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared)

        # --- Two training examples per fight keep the classes balanced ---

        # The "win" case: (fighter_1 - fighter_2).
        features_win = {
            # Original diffs
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            # Historical diffs
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            # Grappling features
            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
        }
        feature_list.append(features_win)
        target_list.append(1)  # 1 represents a win

        # The "loss" case: (fighter_2 - fighter_1), i.e. every diff negated.
        features_loss = {key: -value for key, value in features_win.items()}
        # Stance difference is symmetric; it does not get inverted.
        features_loss['stance_is_different'] = features_win['stance_is_different']

        feature_list.append(features_loss)
        target_list.append(0)  # 0 represents a loss

        # One metadata row per generated sample; both describe the same fight.
        for _ in range(2):
            metadata_list.append({
                'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
            })

    X = pd.DataFrame(feature_list).fillna(0)
    y = pd.Series(target_list, name='winner')
    metadata = pd.DataFrame(metadata_list)

    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
    return X, y, metadata
 
1
  import pandas as pd
2
  import os
3
  from datetime import datetime
 
4
 
5
def _clean_numeric_column(series):
    """Turn a column of text-like values into numbers.

    Values are cast to strings, stripped of everything except digits and '.',
    then parsed; anything that still fails to parse is returned as NaN.
    """
    as_text = series.astype(str)
    stripped = as_text.str.replace(r'[^0-9.]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')
9
 
10
  def _calculate_age(dob_str, fight_date_str):
11
+ """Calculate age in years from date of birth and fight date strings."""
12
  if pd.isna(dob_str) or not dob_str:
13
  return None
14
  try:
 
18
  except (ValueError, TypeError):
19
  return None
20
 
21
def _get_days_since_last_fight(current_date, past_fights):
    """Return the days between ``current_date`` and the most recent earlier fight.

    The list may be unsorted and may contain the current or later fights;
    only fights whose 'date_obj' is strictly before ``current_date`` count.

    Args:
        current_date: Date of the fight being evaluated.
        past_fights: Iterable of fight dicts, each with a 'date_obj' entry.

    Returns:
        int | None: Whole days since the latest earlier fight, or None when
        there is no earlier fight.
    """
    earlier_dates = [f['date_obj'] for f in past_fights if f['date_obj'] < current_date]
    if not earlier_dates:
        return None
    return (current_date - max(earlier_dates)).days
27
+
28
def _get_win_streak(fighter_name, current_date, past_fights):
    """Count consecutive wins immediately before ``current_date``.

    Fights on or after ``current_date`` are ignored. The remaining fights are
    ordered by 'date_obj' here, so the input list does not need to be sorted.

    Args:
        fighter_name: Name compared against each fight's 'winner'.
        current_date: Date of the fight being evaluated.
        past_fights: Iterable of fight dicts with 'date_obj' and 'winner'.

    Returns:
        int: Number of wins in a row, walking back from the latest earlier
        fight until the first fight the fighter did not win.
    """
    earlier = sorted(
        (f for f in past_fights if f['date_obj'] < current_date),
        key=lambda f: f['date_obj'],
    )
    streak = 0
    for fight in reversed(earlier):
        if fight['winner'] != fighter_name:
            break
        streak += 1
    return streak
39
 
40
def _to_int_safe(value):
    """Safely convert a value to an integer, returning 0 for invalid values.

    Args:
        value: Any scalar. Falsy values (None, '', 0) and NaN yield 0.

    Returns:
        int: ``int(float(value))`` when that conversion succeeds, otherwise 0.
        Infinite values also yield 0 rather than raising OverflowError.
    """
    try:
        if not value or pd.isna(value):
            return 0
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return 0
46
 
47
def _clock_to_seconds(clock):
    """Convert an 'M:SS' string to seconds; return None if it cannot be parsed."""
    if not isinstance(clock, str) or ':' not in clock:
        return None
    try:
        mins, secs = map(int, clock.split(':'))
    except ValueError:
        return None
    return mins * 60 + secs


def _get_fighter_history_stats(fighter_name, current_fight_date, past_fights, fighters_df, n_fights=5):
    """Calculate historical performance statistics for a fighter.

    Only fights whose 'date_obj' is strictly before ``current_fight_date`` are
    used. Raw tallies and rates cover the last ``n_fights`` of those fights;
    'fights_in_last_year' covers every earlier fight in the preceding 365 days.

    Args:
        fighter_name: Fighter whose history is summarised.
        current_fight_date: Date of the fight being evaluated.
        past_fights: Fight dicts involving the fighter, in any order.
        fighters_df: Fighter table indexed by full name with an 'elo' column.
        n_fights: Number of most recent earlier fights to aggregate.

    Returns:
        dict: Raw tallies plus the derived '*_last_n' rates. Rates fall back to
        0.0 (accuracy/defense to 0.5, average opponent ELO to 1500.0) when
        there is nothing to divide by.
    """
    past_fights = sorted(
        (f for f in past_fights if f['date_obj'] < current_fight_date),
        key=lambda f: f['date_obj'],
    )
    # A plain [-n:] slice would return the whole list for n_fights == 0.
    last_n_fights = past_fights[-n_fights:] if n_fights > 0 else []

    stats = {
        'wins_last_n': 0,
        'ko_wins': 0,
        'total_finishes': 0,
        'first_round_finishes': 0,
        'knockdowns_scored': 0,
        'knockdowns_absorbed': 0,
        'sig_str_landed': 0,
        'sig_str_attempted': 0,
        'takedowns_landed': 0,
        'takedowns_attempted': 0,
        'sub_attempts': 0,
        'ctrl_time_sec': 0,
        'total_fight_time_sec': 0,
        'fights_in_last_year': 0,
        'avg_opp_elo_last_n': 0
    }

    one_year_ago = current_fight_date - pd.Timedelta(days=365)
    stats['fights_in_last_year'] = sum(1 for f in past_fights if f['date_obj'] >= one_year_ago)

    # Opponent-side tallies feed the defensive stats; opponent ELOs are
    # collected so the average is taken over known ratings only.
    opp_sig_landed = 0
    opp_sig_attempted = 0
    opp_td_landed = 0
    opp_td_attempted = 0
    opp_elos = []

    for fight in last_n_fights:
        is_fighter_1 = (fight['fighter_1'] == fighter_name)
        f_prefix, opp_prefix = ('f1', 'f2') if is_fighter_1 else ('f2', 'f1')
        opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
        round_num = _to_int_safe(fight['round'])

        # Win/Loss and Finishes
        if fight['winner'] == fighter_name:
            stats['wins_last_n'] += 1
            method = fight['method']
            # NOTE(review): exact match - confirm the data never uses variants
            # such as 'Decision - Unanimous', which would count as finishes.
            if method != 'Decision':
                stats['total_finishes'] += 1
                # Compare numerically so both '1' and 1 are recognised.
                if round_num == 1:
                    stats['first_round_finishes'] += 1
            if isinstance(method, str) and 'KO' in method:
                stats['ko_wins'] += 1

        # Striking and Grappling Stats
        stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{f_prefix}_kd'))
        stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp_prefix}_kd'))
        stats['sig_str_landed'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_landed'))
        stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_attempted'))
        stats['takedowns_landed'] += _to_int_safe(fight.get(f'{f_prefix}_td_landed'))
        stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_td_attempted'))
        stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_attempts'))

        opp_sig_landed += _to_int_safe(fight.get(f'{opp_prefix}_sig_str_landed'))
        opp_sig_attempted += _to_int_safe(fight.get(f'{opp_prefix}_sig_str_attempted'))
        opp_td_landed += _to_int_safe(fight.get(f'{opp_prefix}_td_landed'))
        opp_td_attempted += _to_int_safe(fight.get(f'{opp_prefix}_td_attempted'))

        # Control Time
        stats['ctrl_time_sec'] += _clock_to_seconds(fight.get(f'{f_prefix}_ctrl_time', '0:00')) or 0

        # Fight Duration: full 5-minute rounds before the last one plus the
        # clock in the final round. An unparseable round contributes no
        # completed rounds rather than a negative duration.
        final_round_sec = _clock_to_seconds(fight.get('round_time', '0:00'))
        if final_round_sec is not None:
            stats['total_fight_time_sec'] += max(round_num - 1, 0) * 300 + final_round_sec

        # Opponent ELO
        if opponent_name in fighters_df.index:
            opp_elo = fighters_df.loc[opponent_name, 'elo']
            if not pd.isna(opp_elo):
                opp_elos.append(opp_elo)

    # Per-fight rates
    n_actual_fights = len(last_n_fights)
    has_fights = n_actual_fights > 0
    stats['finish_rate_last_n'] = stats['total_finishes'] / n_actual_fights if has_fights else 0.0
    stats['first_round_finish_rate_last_n'] = stats['first_round_finishes'] / n_actual_fights if has_fights else 0.0
    stats['ko_percent_last_n'] = stats['ko_wins'] / n_actual_fights if has_fights else 0.0
    stats['avg_knockdowns_per_fight_last_n'] = stats['knockdowns_scored'] / n_actual_fights if has_fights else 0.0
    stats['knockdowns_absorbed_per_fight_last_n'] = stats['knockdowns_absorbed'] / n_actual_fights if has_fights else 0.0
    # Average only over opponents whose ELO is known, so missing ratings do
    # not drag the mean toward zero.
    stats['avg_opp_elo_last_n'] = sum(opp_elos) / len(opp_elos) if opp_elos else 1500.0

    # Per-minute stats
    total_mins = stats['total_fight_time_sec'] / 60
    has_time = total_mins > 0
    stats['sig_str_landed_per_min_last_n'] = stats['sig_str_landed'] / total_mins if has_time else 0.0
    # "Absorbed" means strikes the opponent landed on this fighter.
    stats['sig_str_absorbed_per_min_last_n'] = opp_sig_landed / total_mins if has_time else 0.0
    stats['sub_attempts_per_min_last_n'] = stats['sub_attempts'] / total_mins if has_time else 0.0
    stats['avg_ctrl_time_sec_per_min_last_n'] = stats['ctrl_time_sec'] / total_mins if has_time else 0.0

    # Accuracy and defense stats. Defense is the share of the opponent's
    # attempts that did NOT land, so it is independent of own accuracy.
    stats['sig_str_defense_last_n'] = (
        1 - opp_sig_landed / opp_sig_attempted if opp_sig_attempted > 0 else 0.5
    )
    stats['takedown_accuracy_last_n'] = (
        stats['takedowns_landed'] / stats['takedowns_attempted']
        if stats['takedowns_attempted'] > 0 else 0.5
    )
    stats['takedown_defense_last_n'] = (
        1 - opp_td_landed / opp_td_attempted if opp_td_attempted > 0 else 0.5
    )

    return stats
146
 
147
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """Transform fight data into ML-ready features.

    One row is produced per fight in which both fighters appear in the
    fighters CSV. Features are fighter_1-minus-fighter_2 differences; all
    history-based features use only fights strictly before the fight's date.

    Args:
        fights_to_process (list of dict): Fights to process. Every dict
            receives a 'date_obj' entry (parsed 'event_date') as a side effect.
        fighters_csv_path (str): Path to the CSV file with fighter stats.

    Returns:
        tuple: ``(X, y, metadata)`` where X is the feature DataFrame with
        alphabetically sorted columns and NaNs replaced by 0, y is the
        'winner' Series (1 when fighter_1 won, else 0), and metadata holds
        'date', 'fighter_1', 'fighter_2' and 'winner'. All three are empty
        when no fight could be processed.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    # Load and prepare fighter data
    fighters_df = pd.read_csv(fighters_csv_path)
    fighters_df['full_name'] = fighters_df['first_name'] + ' ' + fighters_df['last_name']
    fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_df.columns:
            fighters_df[col] = _clean_numeric_column(fighters_df[col])

    # Parse every fight date once and group fights by participant in a single
    # pass, instead of rescanning the whole fight list for each fight.
    histories = {}
    for fight in fights_to_process:
        fight['date_obj'] = pd.to_datetime(fight['event_date'])
        for name in {fight['fighter_1'], fight['fighter_2']}:
            histories.setdefault(name, []).append(fight)
    for history in histories.values():
        history.sort(key=lambda f: f['date_obj'])

    # Layoff assumed for a fighter with no earlier fight (~1.5 years).
    default_layoff_days = 547

    # Process fights and calculate features
    processed_fights = []
    for fight in fights_to_process:
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        # Skip if either fighter is missing
        if f1_name not in fighters_df.index or f2_name not in fighters_df.index:
            continue

        f1_stats = fighters_df.loc[f1_name]
        f2_stats = fighters_df.loc[f2_name]
        fight_date = fight['date_obj']

        # Chronological histories restricted to fights before this one, so
        # the helpers never see the current fight or any later fight.
        f1_past = [f for f in histories[f1_name] if f['date_obj'] < fight_date]
        f2_past = [f for f in histories[f2_name] if f['date_obj'] < fight_date]

        # Calculate historical stats
        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_past, fighters_df)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_past, fighters_df)

        # Calculate ages
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Days since last fight; only a missing value falls back to the
        # default, so a real gap is never replaced.
        f1_days_since_last = _get_days_since_last_fight(fight_date, f1_past)
        if f1_days_since_last is None:
            f1_days_since_last = default_layoff_days
        f2_days_since_last = _get_days_since_last_fight(fight_date, f2_past)
        if f2_days_since_last is None:
            f2_days_since_last = default_layoff_days

        # Calculate win streaks
        f1_win_streak = _get_win_streak(f1_name, fight_date, f1_past)
        f2_win_streak = _get_win_streak(f2_name, fight_date, f2_past)

        # Compile all features
        feature_dict = {
            # NOTE(review): the label is 1 only when 'winner' equals fighter_1.
            # If upstream data always lists the winner first, y will contain a
            # single class - confirm against the caller/training code.
            'winner': 1 if fight.get('winner') == f1_name else 0,
            'date': fight['event_date'],
            'fighter_1': f1_name,
            'fighter_2': f2_name,

            # Physical differences
            'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff': (f1_age or 0) - (f2_age or 0),
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),

            # Career momentum
            'days_since_last_fight_diff': f1_days_since_last - f2_days_since_last,
            'win_streak_diff': f1_win_streak - f2_win_streak,
            'fights_last_year_diff': f1_hist_stats['fights_in_last_year'] - f2_hist_stats['fights_in_last_year'],

            # Performance differences
            'finish_rate_diff': f1_hist_stats['finish_rate_last_n'] - f2_hist_stats['finish_rate_last_n'],
            'ko_rate_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_per_min_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            'td_accuracy_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
            'control_time_diff': f1_hist_stats['avg_ctrl_time_sec_per_min_last_n'] - f2_hist_stats['avg_ctrl_time_sec_per_min_last_n'],

            # Defense differences
            'sig_str_defense_diff': f1_hist_stats['sig_str_defense_last_n'] - f2_hist_stats['sig_str_defense_last_n'],
            'td_defense_diff': f1_hist_stats['takedown_defense_last_n'] - f2_hist_stats['takedown_defense_last_n'],
            'knockdowns_absorbed_diff': f1_hist_stats['knockdowns_absorbed_per_fight_last_n'] - f2_hist_stats['knockdowns_absorbed_per_fight_last_n']
        }

        processed_fights.append(feature_dict)

    if not processed_fights:
        # An explicit dtype avoids pandas' empty-Series default-dtype warning.
        return pd.DataFrame(), pd.Series(dtype='int64', name='winner'), pd.DataFrame()

    # Create final dataframes
    df = pd.DataFrame(processed_fights)
    metadata = df[['date', 'fighter_1', 'fighter_2', 'winner']]

    # Prepare X and y
    y = df['winner']
    X = df.drop(columns=['winner', 'date', 'fighter_1', 'fighter_2'])
    X = X.reindex(sorted(X.columns), axis=1)  # Ensure consistent column order

    # Handle missing values by filling NaNs with 0
    X = X.fillna(0)

    return X, y, metadata