Spaces:
Sleeping
Sleeping
Commit
·
7fcaffe
1
Parent(s):
9678fdb
(CAREFUL!!!!!!) Refactor argument parsing and prediction pipeline
Browse files
Moved command-line argument parsing to a dedicated src/args.py module and updated main.py and predict/main.py to use these functions. Improved model management logic and modularized pipeline execution. Enhanced feature engineering and preprocessing in predict/preprocess.py and refactored model classes in predict/models.py for consistency and maintainability.
- output/model_results.json +2 -2
- output/models/EloBaselineModel.joblib +2 -2
- output/models/LogisticRegressionModel.joblib +2 -2
- src/args.py +97 -0
- src/main.py +65 -92
- src/predict/main.py +23 -55
- src/predict/models.py +67 -129
- src/predict/preprocess.py +196 -175
output/model_results.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40c2fb9010bdae4946c2b879d4014aa671a43b586aff7faa73ea4846585e589c
|
3 |
+
size 11671
|
output/models/EloBaselineModel.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40937e8b6fe9aaaa1ca92a84e3e67b5bdefcf2700d2cafb7830670a14f684858
|
3 |
+
size 938435
|
output/models/LogisticRegressionModel.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51c11a689c50244a6084e642a1dc35a349d515f075b40515dbd4164e7831dfdb
|
3 |
+
size 5518484
|
src/args.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
def get_pipeline_args():
|
4 |
+
"""
|
5 |
+
Parse command line arguments for the main UFC data pipeline.
|
6 |
+
|
7 |
+
Returns:
|
8 |
+
argparse.Namespace: Parsed command line arguments
|
9 |
+
"""
|
10 |
+
parser = argparse.ArgumentParser(description="UFC Data Pipeline")
|
11 |
+
|
12 |
+
# Pipeline selection
|
13 |
+
parser.add_argument(
|
14 |
+
'--pipeline',
|
15 |
+
type=str,
|
16 |
+
default='scrape',
|
17 |
+
choices=['scrape', 'analysis', 'predict', 'update', 'all'],
|
18 |
+
help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
|
19 |
+
)
|
20 |
+
|
21 |
+
# Scraping arguments
|
22 |
+
scraping_group = parser.add_argument_group('Scraping options')
|
23 |
+
scraping_group.add_argument(
|
24 |
+
'--scrape-mode',
|
25 |
+
type=str,
|
26 |
+
default='full',
|
27 |
+
choices=['full', 'update'],
|
28 |
+
help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
|
29 |
+
)
|
30 |
+
scraping_group.add_argument(
|
31 |
+
'--num-events',
|
32 |
+
type=int,
|
33 |
+
default=5,
|
34 |
+
help="Number of latest events to scrape in update mode (default: 5)"
|
35 |
+
)
|
36 |
+
|
37 |
+
# Model management arguments
|
38 |
+
model_group = parser.add_argument_group('Model management')
|
39 |
+
model_group.add_argument(
|
40 |
+
'--use-existing-models',
|
41 |
+
action='store_true',
|
42 |
+
default=True,
|
43 |
+
help="Use existing saved models if available and no new data (default: True)"
|
44 |
+
)
|
45 |
+
model_group.add_argument(
|
46 |
+
'--no-use-existing-models',
|
47 |
+
action='store_true',
|
48 |
+
default=False,
|
49 |
+
help="Force retrain all models from scratch, ignoring existing saved models"
|
50 |
+
)
|
51 |
+
model_group.add_argument(
|
52 |
+
'--force-retrain',
|
53 |
+
action='store_true',
|
54 |
+
default=False,
|
55 |
+
help="Force retrain all models even if no new data is available"
|
56 |
+
)
|
57 |
+
|
58 |
+
return parser.parse_args()
|
59 |
+
|
60 |
+
def get_prediction_args():
|
61 |
+
"""
|
62 |
+
Parse command line arguments specific to the prediction pipeline.
|
63 |
+
|
64 |
+
Returns:
|
65 |
+
argparse.Namespace: Parsed command line arguments
|
66 |
+
"""
|
67 |
+
parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
|
68 |
+
|
69 |
+
parser.add_argument(
|
70 |
+
'--report',
|
71 |
+
type=str,
|
72 |
+
default='detailed',
|
73 |
+
choices=['detailed', 'summary'],
|
74 |
+
help="Type of report to generate: 'detailed' (file) or 'summary' (console)"
|
75 |
+
)
|
76 |
+
|
77 |
+
model_group = parser.add_argument_group('Model management')
|
78 |
+
model_group.add_argument(
|
79 |
+
'--use-existing-models',
|
80 |
+
action='store_true',
|
81 |
+
default=True,
|
82 |
+
help="Use existing saved models if available and no new data (default: True)"
|
83 |
+
)
|
84 |
+
model_group.add_argument(
|
85 |
+
'--no-use-existing-models',
|
86 |
+
action='store_true',
|
87 |
+
default=False,
|
88 |
+
help="Force retrain all models from scratch, ignoring existing saved models"
|
89 |
+
)
|
90 |
+
model_group.add_argument(
|
91 |
+
'--force-retrain',
|
92 |
+
action='store_true',
|
93 |
+
default=False,
|
94 |
+
help="Force retrain all models even if no new data is available"
|
95 |
+
)
|
96 |
+
|
97 |
+
return parser.parse_args()
|
src/main.py
CHANGED
@@ -1,106 +1,79 @@
|
|
1 |
-
import argparse
|
2 |
import sys
|
3 |
import os
|
|
|
4 |
|
5 |
-
def
|
6 |
-
"""
|
7 |
-
|
8 |
-
|
9 |
-
"""
|
10 |
-
parser = argparse.ArgumentParser(description="UFC Data Pipeline")
|
11 |
-
parser.add_argument(
|
12 |
-
'--pipeline',
|
13 |
-
type=str,
|
14 |
-
default='scrape',
|
15 |
-
choices=['scrape', 'analysis', 'predict', 'update', 'all'],
|
16 |
-
help="Pipeline to run: 'scrape', 'analysis', 'predict', 'update', or 'all'"
|
17 |
-
)
|
18 |
-
parser.add_argument(
|
19 |
-
'--scrape-mode',
|
20 |
-
type=str,
|
21 |
-
default='full',
|
22 |
-
choices=['full', 'update'],
|
23 |
-
help="Scraping mode: 'full' (complete scraping) or 'update' (latest events only)"
|
24 |
-
)
|
25 |
-
parser.add_argument(
|
26 |
-
'--num-events',
|
27 |
-
type=int,
|
28 |
-
default=5,
|
29 |
-
help="Number of latest events to scrape in update mode (default: 5)"
|
30 |
-
)
|
31 |
-
# Model management arguments for prediction pipeline
|
32 |
-
parser.add_argument(
|
33 |
-
'--use-existing-models',
|
34 |
-
action='store_true',
|
35 |
-
default=True,
|
36 |
-
help="Use existing saved models if available and no new data (default: True)."
|
37 |
-
)
|
38 |
-
parser.add_argument(
|
39 |
-
'--no-use-existing-models',
|
40 |
-
action='store_true',
|
41 |
-
default=False,
|
42 |
-
help="Force retrain all models from scratch, ignoring existing saved models."
|
43 |
-
)
|
44 |
-
parser.add_argument(
|
45 |
-
'--force-retrain',
|
46 |
-
action='store_true',
|
47 |
-
default=False,
|
48 |
-
help="Force retrain all models even if no new data is available."
|
49 |
-
)
|
50 |
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
if args.pipeline in ['analysis', 'all']:
|
66 |
-
|
67 |
-
|
68 |
-
elo_main()
|
69 |
-
|
70 |
if args.pipeline == 'update':
|
71 |
-
|
72 |
-
try:
|
73 |
-
from src.predict.main import MODELS_TO_RUN
|
74 |
-
from src.predict.pipeline import PredictionPipeline
|
75 |
-
except ImportError:
|
76 |
-
print("Fatal: Could not import prediction modules.")
|
77 |
-
print("Please ensure your project structure and python path are correct.")
|
78 |
-
return
|
79 |
-
|
80 |
-
pipeline = PredictionPipeline(models=MODELS_TO_RUN)
|
81 |
-
pipeline.update_models_if_new_data()
|
82 |
|
83 |
if args.pipeline in ['predict', 'all']:
|
84 |
-
|
85 |
-
from src.predict.main import main as predict_main
|
86 |
-
|
87 |
-
# Override sys.argv to pass model management arguments to predict.main
|
88 |
-
original_argv = sys.argv
|
89 |
-
predict_args = ['predict_main']
|
90 |
-
|
91 |
-
if args.no_use_existing_models:
|
92 |
-
predict_args.append('--no-use-existing-models')
|
93 |
-
elif args.use_existing_models:
|
94 |
-
predict_args.append('--use-existing-models')
|
95 |
-
|
96 |
-
if args.force_retrain:
|
97 |
-
predict_args.append('--force-retrain')
|
98 |
-
|
99 |
-
sys.argv = predict_args
|
100 |
-
try:
|
101 |
-
predict_main()
|
102 |
-
finally:
|
103 |
-
sys.argv = original_argv
|
104 |
|
105 |
if __name__ == '__main__':
|
106 |
main()
|
|
|
|
|
1 |
import sys
|
2 |
import os
|
3 |
+
from .args import get_pipeline_args
|
4 |
|
5 |
+
def run_scraping_pipeline(args):
|
6 |
+
"""Execute the scraping pipeline with given arguments."""
|
7 |
+
print("=== Running Scraping Pipeline ===")
|
8 |
+
from .scrape.main import main as scrape_main
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
# Pass arguments to scrape.main
|
11 |
+
original_argv = sys.argv
|
12 |
+
sys.argv = ['scrape_main', '--mode', args.scrape_mode, '--num-events', str(args.num_events)]
|
13 |
+
try:
|
14 |
+
scrape_main()
|
15 |
+
finally:
|
16 |
+
sys.argv = original_argv
|
17 |
+
|
18 |
+
def run_analysis_pipeline():
|
19 |
+
"""Execute the ELO analysis pipeline."""
|
20 |
+
print("\n=== Running ELO Analysis ===")
|
21 |
+
from .analysis.elo import main as elo_main
|
22 |
+
elo_main()
|
23 |
+
|
24 |
+
def run_prediction_pipeline(args):
|
25 |
+
"""Execute the prediction pipeline with given arguments."""
|
26 |
+
print("\n=== Running Prediction Pipeline ===")
|
27 |
+
from .predict.main import main as predict_main
|
28 |
|
29 |
+
# Pass model management arguments to predict.main
|
30 |
+
original_argv = sys.argv
|
31 |
+
predict_args = ['predict_main']
|
32 |
+
|
33 |
+
if args.no_use_existing_models:
|
34 |
+
predict_args.append('--no-use-existing-models')
|
35 |
+
elif args.use_existing_models:
|
36 |
+
predict_args.append('--use-existing-models')
|
37 |
|
38 |
+
if args.force_retrain:
|
39 |
+
predict_args.append('--force-retrain')
|
40 |
+
|
41 |
+
sys.argv = predict_args
|
42 |
+
try:
|
43 |
+
predict_main()
|
44 |
+
finally:
|
45 |
+
sys.argv = original_argv
|
46 |
+
|
47 |
+
def run_model_update(args):
|
48 |
+
"""Execute the model update pipeline."""
|
49 |
+
print("\n=== Running Model Update Pipeline ===")
|
50 |
+
try:
|
51 |
+
from .predict.main import MODELS_TO_RUN
|
52 |
+
from .predict.pipeline import PredictionPipeline
|
53 |
+
except ImportError:
|
54 |
+
print("Fatal: Could not import prediction modules.")
|
55 |
+
print("Please ensure your project structure and python path are correct.")
|
56 |
+
return
|
57 |
+
|
58 |
+
pipeline = PredictionPipeline(models=MODELS_TO_RUN)
|
59 |
+
pipeline.update_models_if_new_data()
|
60 |
+
|
61 |
+
def main():
|
62 |
+
"""Main entry point for the UFC data pipeline."""
|
63 |
+
args = get_pipeline_args()
|
64 |
+
|
65 |
+
# Execute requested pipeline(s)
|
66 |
+
if args.pipeline in ['scrape', 'all']:
|
67 |
+
run_scraping_pipeline(args)
|
68 |
|
69 |
if args.pipeline in ['analysis', 'all']:
|
70 |
+
run_analysis_pipeline()
|
71 |
+
|
|
|
|
|
72 |
if args.pipeline == 'update':
|
73 |
+
run_model_update(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
if args.pipeline in ['predict', 'all']:
|
76 |
+
run_prediction_pipeline(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
if __name__ == '__main__':
|
79 |
main()
|
src/predict/main.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
import
|
2 |
-
|
3 |
from .pipeline import PredictionPipeline
|
4 |
from .models import (
|
5 |
EloBaselineModel,
|
@@ -11,56 +10,34 @@ from .models import (
|
|
11 |
LGBMModel
|
12 |
)
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
#
|
|
|
|
|
|
|
26 |
|
27 |
def main():
|
28 |
"""
|
29 |
Main entry point to run the prediction pipeline.
|
30 |
You can specify which models to run and the reporting format.
|
31 |
"""
|
32 |
-
|
33 |
-
parser.add_argument(
|
34 |
-
'--report',
|
35 |
-
type=str,
|
36 |
-
default='detailed',
|
37 |
-
choices=['detailed', 'summary'],
|
38 |
-
help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
|
39 |
-
)
|
40 |
-
parser.add_argument(
|
41 |
-
'--use-existing-models',
|
42 |
-
action='store_true',
|
43 |
-
default=True,
|
44 |
-
help="Use existing saved models if available and no new data (default: True)."
|
45 |
-
)
|
46 |
-
parser.add_argument(
|
47 |
-
'--no-use-existing-models',
|
48 |
-
action='store_true',
|
49 |
-
default=False,
|
50 |
-
help="Force retrain all models from scratch, ignoring existing saved models."
|
51 |
-
)
|
52 |
-
parser.add_argument(
|
53 |
-
'--force-retrain',
|
54 |
-
action='store_true',
|
55 |
-
default=False,
|
56 |
-
help="Force retrain all models even if no new data is available."
|
57 |
-
)
|
58 |
-
args = parser.parse_args()
|
59 |
|
60 |
# Handle conflicting arguments
|
61 |
use_existing_models = not args.no_use_existing_models and args.use_existing_models
|
62 |
force_retrain = args.force_retrain
|
63 |
|
|
|
64 |
if args.no_use_existing_models:
|
65 |
print("No-use-existing-models flag set: All models will be retrained from scratch.")
|
66 |
elif force_retrain:
|
@@ -68,21 +45,9 @@ def main():
|
|
68 |
elif use_existing_models:
|
69 |
print("Using existing models if available and no new data detected.")
|
70 |
|
71 |
-
#
|
72 |
-
# Instantiate all the models you want to evaluate here.
|
73 |
-
models_to_run = [
|
74 |
-
EloBaselineModel(),
|
75 |
-
LogisticRegressionModel(),
|
76 |
-
XGBoostModel(),
|
77 |
-
SVCModel(),
|
78 |
-
RandomForestModel(),
|
79 |
-
BernoulliNBModel(),
|
80 |
-
LGBMModel(),
|
81 |
-
]
|
82 |
-
# --- End of Model Definition ---
|
83 |
-
|
84 |
pipeline = PredictionPipeline(
|
85 |
-
models=
|
86 |
use_existing_models=use_existing_models,
|
87 |
force_retrain=force_retrain
|
88 |
)
|
@@ -92,3 +57,6 @@ def main():
|
|
92 |
except FileNotFoundError as e:
|
93 |
print(f"Error: {e}")
|
94 |
print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
|
|
|
|
|
|
|
|
1 |
+
from ..args import get_prediction_args
|
|
|
2 |
from .pipeline import PredictionPipeline
|
3 |
from .models import (
|
4 |
EloBaselineModel,
|
|
|
10 |
LGBMModel
|
11 |
)
|
12 |
|
13 |
+
def get_available_models():
|
14 |
+
"""Get a list of all available prediction models.
|
15 |
+
|
16 |
+
Returns:
|
17 |
+
list: List of instantiated model objects
|
18 |
+
"""
|
19 |
+
return [
|
20 |
+
EloBaselineModel(),
|
21 |
+
LogisticRegressionModel(),
|
22 |
+
# XGBoostModel(),
|
23 |
+
# SVCModel(),
|
24 |
+
# RandomForestModel(),
|
25 |
+
# BernoulliNBModel(),
|
26 |
+
LGBMModel(),
|
27 |
+
]
|
28 |
|
29 |
def main():
|
30 |
"""
|
31 |
Main entry point to run the prediction pipeline.
|
32 |
You can specify which models to run and the reporting format.
|
33 |
"""
|
34 |
+
args = get_prediction_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
# Handle conflicting arguments
|
37 |
use_existing_models = not args.no_use_existing_models and args.use_existing_models
|
38 |
force_retrain = args.force_retrain
|
39 |
|
40 |
+
# Log model management settings
|
41 |
if args.no_use_existing_models:
|
42 |
print("No-use-existing-models flag set: All models will be retrained from scratch.")
|
43 |
elif force_retrain:
|
|
|
45 |
elif use_existing_models:
|
46 |
print("Using existing models if available and no new data detected.")
|
47 |
|
48 |
+
# Initialize and run prediction pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
pipeline = PredictionPipeline(
|
50 |
+
models=get_available_models(),
|
51 |
use_existing_models=use_existing_models,
|
52 |
force_retrain=force_retrain
|
53 |
)
|
|
|
57 |
except FileNotFoundError as e:
|
58 |
print(f"Error: {e}")
|
59 |
print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
|
60 |
+
except Exception as e:
|
61 |
+
print(f"An unexpected error occurred: {e}")
|
62 |
+
raise
|
src/predict/models.py
CHANGED
@@ -1,6 +1,4 @@
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
-
import sys
|
3 |
-
import os
|
4 |
import pandas as pd
|
5 |
from sklearn.linear_model import LogisticRegression
|
6 |
from sklearn.svm import SVC
|
@@ -10,188 +8,128 @@ from xgboost import XGBClassifier
|
|
10 |
from lightgbm import LGBMClassifier
|
11 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
12 |
from ..config import FIGHTERS_CSV_PATH
|
13 |
-
from .preprocess import preprocess_for_ml
|
14 |
|
15 |
class BaseModel(ABC):
|
16 |
-
"""
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
20 |
@abstractmethod
|
21 |
def train(self, train_fights):
|
22 |
-
"""
|
23 |
-
Trains or prepares the model using historical fight data.
|
24 |
-
|
25 |
-
:param train_fights: A list of historical fight data dictionaries.
|
26 |
-
"""
|
27 |
pass
|
28 |
|
29 |
@abstractmethod
|
30 |
def predict(self, fight):
|
31 |
-
"""
|
32 |
-
Predicts the winner of a single fight.
|
33 |
-
|
34 |
-
:param fight: A dictionary representing a single fight.
|
35 |
-
:return: The name of the predicted winning fighter.
|
36 |
-
"""
|
37 |
pass
|
38 |
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
"""
|
43 |
-
def __init__(self):
|
44 |
-
self.fighters_df = None
|
45 |
|
|
|
|
|
|
|
46 |
def train(self, train_fights):
|
47 |
-
"""
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
print("Training EloBaselineModel: Loading fighter ELO data...")
|
52 |
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
53 |
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
54 |
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def predict(self, fight):
|
57 |
-
"""
|
58 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
59 |
|
60 |
try:
|
61 |
f1_elo = self.fighters_df.loc[f1_name, 'elo']
|
62 |
f2_elo = self.fighters_df.loc[f2_name, 'elo']
|
63 |
|
64 |
-
# Calculate win probability
|
65 |
prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
|
66 |
-
|
67 |
-
if prob_f1_wins >= 0.5
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
except KeyError as e:
|
73 |
print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
|
74 |
-
return
|
75 |
|
76 |
class BaseMLModel(BaseModel):
|
77 |
-
"""
|
78 |
-
|
79 |
-
data preparation, training, and prediction logic.
|
80 |
-
"""
|
81 |
def __init__(self, model):
|
|
|
82 |
if model is None:
|
83 |
raise ValueError("A model must be provided.")
|
84 |
self.model = model
|
85 |
-
self.fighters_df = None
|
86 |
-
self.fighter_histories = {}
|
87 |
|
88 |
def train(self, train_fights):
|
89 |
-
"""
|
90 |
-
|
91 |
-
pre-calculating histories, and fitting the model on the preprocessed data.
|
92 |
-
"""
|
93 |
-
print(f"--- Training {self.model.__class__.__name__} ---")
|
94 |
|
95 |
-
#
|
96 |
-
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
97 |
-
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
98 |
-
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
99 |
-
for col in ['height_cm', 'reach_in', 'elo']:
|
100 |
-
if col in self.fighters_df.columns:
|
101 |
-
self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
|
102 |
-
|
103 |
-
# 2. Pre-calculate fighter histories
|
104 |
-
train_fights_with_dates = []
|
105 |
-
for fight in train_fights:
|
106 |
-
fight['date_obj'] = pd.to_datetime(fight['event_date'])
|
107 |
-
train_fights_with_dates.append(fight)
|
108 |
-
for fighter_name in self.fighters_df.index:
|
109 |
-
history = [f for f in train_fights_with_dates if fighter_name in (f['fighter_1'], f['fighter_2'])]
|
110 |
-
self.fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
|
111 |
-
|
112 |
-
# 3. Preprocess and fit
|
113 |
X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
|
114 |
print(f"Fitting model on {X_train.shape[0]} samples...")
|
115 |
self.model.fit(X_train, y_train)
|
116 |
print("Model training complete.")
|
117 |
|
118 |
def predict(self, fight):
|
119 |
-
"""
|
120 |
-
|
121 |
-
|
122 |
-
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
123 |
-
fight_date = pd.to_datetime(fight['event_date'])
|
124 |
-
|
125 |
-
if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
|
126 |
-
print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
|
127 |
-
return {'winner': None, 'probability': None}
|
128 |
-
|
129 |
-
f1_stats = self.fighters_df.loc[f1_name]
|
130 |
-
f2_stats = self.fighters_df.loc[f2_name]
|
131 |
-
if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
|
132 |
-
if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
|
151 |
-
'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
|
152 |
-
'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
|
153 |
-
'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
|
154 |
-
}
|
155 |
-
|
156 |
-
feature_vector = pd.DataFrame([features]).fillna(0)
|
157 |
-
|
158 |
-
# Use predict_proba to get probabilities for each class
|
159 |
-
probabilities = self.model.predict_proba(feature_vector)[0]
|
160 |
-
prob_f1_wins = probabilities[1] # Probability of class '1' (fighter 1 wins)
|
161 |
-
|
162 |
-
if prob_f1_wins >= 0.5:
|
163 |
-
return {'winner': f1_name, 'probability': prob_f1_wins}
|
164 |
-
else:
|
165 |
-
return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
|
166 |
|
|
|
167 |
class LogisticRegressionModel(BaseMLModel):
|
168 |
-
"""A thin wrapper for scikit-learn's LogisticRegression."""
|
169 |
def __init__(self):
|
170 |
-
super().__init__(
|
171 |
-
|
172 |
-
class XGBoostModel(BaseMLModel):
|
173 |
-
"""A thin wrapper for XGBoost's XGBClassifier."""
|
174 |
-
def __init__(self):
|
175 |
-
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
|
176 |
-
super().__init__(model=model)
|
177 |
|
178 |
class SVCModel(BaseMLModel):
|
179 |
-
"""A thin wrapper for scikit-learn's Support Vector Classifier."""
|
180 |
def __init__(self):
|
181 |
-
|
182 |
-
super().__init__(model=SVC(probability=True, random_state=42))
|
183 |
|
184 |
class RandomForestModel(BaseMLModel):
|
185 |
-
"""A thin wrapper for scikit-learn's RandomForestClassifier."""
|
186 |
def __init__(self):
|
187 |
-
super().__init__(
|
188 |
|
189 |
class BernoulliNBModel(BaseMLModel):
|
190 |
-
"""A thin wrapper for scikit-learn's Bernoulli Naive Bayes classifier."""
|
191 |
def __init__(self):
|
192 |
-
super().__init__(
|
|
|
|
|
|
|
|
|
193 |
|
194 |
class LGBMModel(BaseMLModel):
|
195 |
-
"""A thin wrapper for LightGBM's LGBMClassifier."""
|
196 |
def __init__(self):
|
197 |
-
super().__init__(
|
|
|
1 |
from abc import ABC, abstractmethod
|
|
|
|
|
2 |
import pandas as pd
|
3 |
from sklearn.linear_model import LogisticRegression
|
4 |
from sklearn.svm import SVC
|
|
|
8 |
from lightgbm import LGBMClassifier
|
9 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
10 |
from ..config import FIGHTERS_CSV_PATH
|
11 |
+
from .preprocess import preprocess_for_ml
|
12 |
|
13 |
class BaseModel(ABC):
|
14 |
+
"""Abstract base class for all prediction models."""
|
15 |
+
|
16 |
+
def __init__(self):
|
17 |
+
self.model_name = self.__class__.__name__
|
18 |
+
|
19 |
@abstractmethod
|
20 |
def train(self, train_fights):
|
21 |
+
"""Train the model using historical fight data."""
|
|
|
|
|
|
|
|
|
22 |
pass
|
23 |
|
24 |
@abstractmethod
|
25 |
def predict(self, fight):
|
26 |
+
"""Predict the winner of a single fight."""
|
|
|
|
|
|
|
|
|
|
|
27 |
pass
|
28 |
|
29 |
+
def _format_prediction(self, winner, probability):
|
30 |
+
"""Format prediction results consistently."""
|
31 |
+
return {'winner': winner, 'probability': probability}
|
|
|
|
|
|
|
32 |
|
33 |
+
class EloBaselineModel(BaseModel):
|
34 |
+
"""Simple ELO-based prediction model."""
|
35 |
+
|
36 |
def train(self, train_fights):
|
37 |
+
"""Process historical fights to calculate current ELO ratings."""
|
38 |
+
print(f"--- Training {self.model_name} ---")
|
39 |
+
|
40 |
+
# Load and prepare fighter data
|
|
|
41 |
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
42 |
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
43 |
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
44 |
+
|
45 |
+
# Calculate ELO ratings
|
46 |
+
elo_ratings = process_fights_for_elo(train_fights)
|
47 |
+
self.fighters_df['elo'] = pd.Series(elo_ratings)
|
48 |
+
self.fighters_df['elo'] = self.fighters_df['elo'].fillna(INITIAL_ELO)
|
49 |
+
|
50 |
+
print("ELO ratings calculated for all fighters.")
|
51 |
|
52 |
def predict(self, fight):
|
53 |
+
"""Predict winner based on current ELO ratings."""
|
54 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
55 |
|
56 |
try:
|
57 |
f1_elo = self.fighters_df.loc[f1_name, 'elo']
|
58 |
f2_elo = self.fighters_df.loc[f2_name, 'elo']
|
59 |
|
60 |
+
# Calculate win probability using ELO formula
|
61 |
prob_f1_wins = 1 / (1 + 10**((f2_elo - f1_elo) / 400))
|
62 |
+
|
63 |
+
winner = f1_name if prob_f1_wins >= 0.5 else f2_name
|
64 |
+
probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
|
65 |
+
|
66 |
+
return self._format_prediction(winner, probability)
|
67 |
+
|
68 |
except KeyError as e:
|
69 |
print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
|
70 |
+
return self._format_prediction(None, None)
|
71 |
|
72 |
class BaseMLModel(BaseModel):
|
73 |
+
"""Base class for all machine learning models."""
|
74 |
+
|
|
|
|
|
75 |
def __init__(self, model):
|
76 |
+
super().__init__()
|
77 |
if model is None:
|
78 |
raise ValueError("A model must be provided.")
|
79 |
self.model = model
|
|
|
|
|
80 |
|
81 |
def train(self, train_fights):
|
82 |
+
"""Train the ML model on preprocessed fight data."""
|
83 |
+
print(f"--- Training {self.model_name} ---")
|
|
|
|
|
|
|
84 |
|
85 |
+
# Preprocess data and fit model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
|
87 |
print(f"Fitting model on {X_train.shape[0]} samples...")
|
88 |
self.model.fit(X_train, y_train)
|
89 |
print("Model training complete.")
|
90 |
|
91 |
def predict(self, fight):
|
92 |
+
"""Predict fight outcome using the trained ML model."""
|
93 |
+
# Preprocess single fight for prediction
|
94 |
+
X_pred, _, metadata = preprocess_for_ml([fight], FIGHTERS_CSV_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
+
if X_pred.empty:
|
97 |
+
print(f"Warning: Could not process fight data for {fight['fighter_1']} vs {fight['fighter_2']}")
|
98 |
+
return self._format_prediction(None, None)
|
|
|
99 |
|
100 |
+
# Make prediction
|
101 |
+
try:
|
102 |
+
prob_f1_wins = self.model.predict_proba(X_pred)[0][1]
|
103 |
+
winner = fight['fighter_1'] if prob_f1_wins >= 0.5 else fight['fighter_2']
|
104 |
+
probability = prob_f1_wins if prob_f1_wins >= 0.5 else 1 - prob_f1_wins
|
105 |
+
|
106 |
+
return self._format_prediction(winner, probability)
|
107 |
+
|
108 |
+
except Exception as e:
|
109 |
+
print(f"Error making prediction: {e}")
|
110 |
+
return self._format_prediction(None, None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
+
# Concrete ML model implementations
|
113 |
class LogisticRegressionModel(BaseMLModel):
|
|
|
114 |
def __init__(self):
|
115 |
+
super().__init__(LogisticRegression(random_state=42))
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
class SVCModel(BaseMLModel):
|
|
|
118 |
def __init__(self):
|
119 |
+
super().__init__(SVC(probability=True, random_state=42))
|
|
|
120 |
|
121 |
class RandomForestModel(BaseMLModel):
|
|
|
122 |
def __init__(self):
|
123 |
+
super().__init__(RandomForestClassifier(n_estimators=100, random_state=42))
|
124 |
|
125 |
class BernoulliNBModel(BaseMLModel):
|
|
|
126 |
def __init__(self):
|
127 |
+
super().__init__(BernoulliNB())
|
128 |
+
|
129 |
+
class XGBoostModel(BaseMLModel):
|
130 |
+
def __init__(self):
|
131 |
+
super().__init__(XGBClassifier(random_state=42))
|
132 |
|
133 |
class LGBMModel(BaseMLModel):
|
|
|
134 |
def __init__(self):
|
135 |
+
super().__init__(LGBMClassifier(random_state=42))
|
src/predict/preprocess.py
CHANGED
@@ -1,15 +1,14 @@
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
from datetime import datetime
|
4 |
-
from ..config import FIGHTERS_CSV_PATH
|
5 |
|
6 |
def _clean_numeric_column(series):
|
7 |
-
"""
|
8 |
series_str = series.astype(str)
|
9 |
return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
|
10 |
|
11 |
def _calculate_age(dob_str, fight_date_str):
|
12 |
-
"""
|
13 |
if pd.isna(dob_str) or not dob_str:
|
14 |
return None
|
15 |
try:
|
@@ -19,213 +18,235 @@ def _calculate_age(dob_str, fight_date_str):
|
|
19 |
except (ValueError, TypeError):
|
20 |
return None
|
21 |
|
22 |
-
def
|
23 |
-
"""
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
39 |
|
40 |
-
def _to_int_safe(
|
41 |
-
"""Safely
|
42 |
-
if pd.isna(val):
|
43 |
-
return 0
|
44 |
try:
|
45 |
-
|
46 |
-
return int(str(val).strip() or 0)
|
47 |
except (ValueError, TypeError):
|
48 |
return 0
|
49 |
|
50 |
-
def _get_fighter_history_stats(fighter_name, current_fight_date,
|
51 |
-
"""
|
52 |
-
|
53 |
-
|
54 |
-
past_fights =
|
55 |
-
last_n_fights = past_fights[-
|
56 |
-
|
57 |
-
if not last_n_fights:
|
58 |
-
# Return a default dictionary with the correct keys for a fighter with no history
|
59 |
-
return {
|
60 |
-
'wins_last_n': 0,
|
61 |
-
'avg_opp_elo_last_n': 1500, # Assume average ELO for first opponent
|
62 |
-
'ko_percent_last_n': 0,
|
63 |
-
'sig_str_landed_per_min_last_n': 0,
|
64 |
-
'takedown_accuracy_last_n': 0,
|
65 |
-
'sub_attempts_per_min_last_n': 0,
|
66 |
-
}
|
67 |
-
|
68 |
stats = {
|
69 |
-
'
|
70 |
-
'
|
71 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
}
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
for fight in last_n_fights:
|
75 |
is_fighter_1 = (fight['fighter_1'] == fighter_name)
|
|
|
|
|
76 |
opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
|
77 |
|
78 |
-
|
79 |
-
|
80 |
if fight['winner'] == fighter_name:
|
81 |
-
stats['
|
|
|
|
|
|
|
|
|
82 |
if 'KO' in fight['method']:
|
83 |
stats['ko_wins'] += 1
|
84 |
-
|
85 |
-
if opponent_name in fighters_df.index:
|
86 |
-
opp_elo = fighters_df.loc[opponent_name, 'elo']
|
87 |
-
stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else 1500)
|
88 |
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
td_landed, td_attempted = _parse_striking_stats(td_stat) # Can reuse this parser
|
97 |
-
stats['td_landed'] += td_landed
|
98 |
-
stats['td_attempted'] += td_attempted
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
def preprocess_for_ml(fights_to_process, fighters_csv_path):
|
116 |
-
"""
|
117 |
-
Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
|
118 |
-
suitable for a binary classification machine learning model.
|
119 |
-
|
120 |
-
Args:
|
121 |
-
fights_to_process (list of dict): The list of fights to process.
|
122 |
-
fighters_csv_path (str): Path to the CSV file with all fighter stats.
|
123 |
-
|
124 |
-
Returns:
|
125 |
-
pd.DataFrame: Feature matrix X.
|
126 |
-
pd.Series: Target vector y.
|
127 |
-
pd.DataFrame: Metadata DataFrame.
|
128 |
-
"""
|
129 |
if not os.path.exists(fighters_csv_path):
|
130 |
raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
|
131 |
|
|
|
132 |
fighters_df = pd.read_csv(fighters_csv_path)
|
|
|
|
|
133 |
|
134 |
-
# 1. Prepare fighters data for merging
|
135 |
-
fighters_prepared = fighters_df.copy()
|
136 |
-
fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']
|
137 |
-
|
138 |
-
# Handle duplicate fighter names by keeping the first entry
|
139 |
-
fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
|
140 |
-
fighters_prepared = fighters_prepared.set_index('full_name')
|
141 |
-
|
142 |
for col in ['height_cm', 'reach_in', 'elo']:
|
143 |
-
if col in
|
144 |
-
|
145 |
-
|
146 |
-
# 2. Pre-calculate fighter histories to speed up lookups
|
147 |
-
# And convert date strings to datetime objects once
|
148 |
-
for fight in fights_to_process:
|
149 |
-
try:
|
150 |
-
# This will work if event_date is a string
|
151 |
-
fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')
|
152 |
-
except TypeError:
|
153 |
-
# This will be triggered if it's already a date-like object (e.g., Timestamp)
|
154 |
-
fight['date_obj'] = fight['event_date']
|
155 |
|
156 |
-
|
157 |
-
|
158 |
-
history = [f for f in fights_to_process if fighter_name in (f['fighter_1'], f['fighter_2'])]
|
159 |
-
fighter_histories[fighter_name] = sorted(history, key=lambda x: x['date_obj'])
|
160 |
-
|
161 |
-
# 3. Process fights to create features and targets
|
162 |
-
feature_list = []
|
163 |
-
target_list = []
|
164 |
-
metadata_list = []
|
165 |
-
|
166 |
for fight in fights_to_process:
|
167 |
-
# Per the dataset's design, fighter_1 is always the winner.
|
168 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
169 |
-
|
170 |
-
|
|
|
171 |
continue
|
172 |
-
|
173 |
-
|
|
|
|
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
|
180 |
f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
|
181 |
-
|
182 |
-
# Get historical stats for both fighters
|
183 |
-
f1_hist_stats = _get_fighter_history_stats(f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared)
|
184 |
-
f2_hist_stats = _get_fighter_history_stats(f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared)
|
185 |
|
186 |
-
#
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
|
192 |
-
|
193 |
-
|
194 |
-
'
|
195 |
-
'
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
'
|
200 |
-
'
|
201 |
-
|
202 |
-
'
|
203 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
}
|
205 |
-
feature_list.append(features_win)
|
206 |
-
target_list.append(1) # 1 represents a win
|
207 |
-
|
208 |
-
# 2. The "Loss" case: (fighter_2 - fighter_1)
|
209 |
-
# We invert the differences for the losing case.
|
210 |
-
features_loss = {key: -value for key, value in features_win.items()}
|
211 |
-
# Stance difference is symmetric; it doesn't get inverted.
|
212 |
-
features_loss['stance_is_different'] = features_win['stance_is_different']
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
X =
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
return X, y, metadata
|
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
from datetime import datetime
|
|
|
4 |
|
5 |
def _clean_numeric_column(series):
|
6 |
+
"""Clean string columns into numbers, handling errors."""
|
7 |
series_str = series.astype(str)
|
8 |
return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
|
9 |
|
10 |
def _calculate_age(dob_str, fight_date_str):
|
11 |
+
"""Calculate age in years from date of birth and fight date strings."""
|
12 |
if pd.isna(dob_str) or not dob_str:
|
13 |
return None
|
14 |
try:
|
|
|
18 |
except (ValueError, TypeError):
|
19 |
return None
|
20 |
|
21 |
+
def _get_days_since_last_fight(current_date, past_fights):
|
22 |
+
"""Calculate days since a fighter's last fight."""
|
23 |
+
if not past_fights:
|
24 |
+
return None
|
25 |
+
last_fight_date = past_fights[-1]['date_obj']
|
26 |
+
return (current_date - last_fight_date).days
|
27 |
+
|
28 |
+
def _get_win_streak(fighter_name, current_date, past_fights):
|
29 |
+
"""Calculate current win streak before a given date."""
|
30 |
+
streak = 0
|
31 |
+
for fight in reversed(past_fights):
|
32 |
+
if fight['date_obj'] >= current_date:
|
33 |
+
continue
|
34 |
+
if fight['winner'] == fighter_name:
|
35 |
+
streak += 1
|
36 |
+
else:
|
37 |
+
break
|
38 |
+
return streak
|
39 |
|
40 |
+
def _to_int_safe(value):
|
41 |
+
"""Safely convert a value to integer, returning 0 for invalid values."""
|
|
|
|
|
42 |
try:
|
43 |
+
return int(float(value)) if value and not pd.isna(value) else 0
|
|
|
44 |
except (ValueError, TypeError):
|
45 |
return 0
|
46 |
|
47 |
+
def _clock_to_seconds(value):
    """Convert an ``'M:SS'`` clock string to seconds, or return None if unparseable."""
    if not isinstance(value, str) or ':' not in value:
        return None
    try:
        mins, secs = (int(part) for part in value.split(':'))
    except ValueError:
        return None
    return mins * 60 + secs


def _get_fighter_history_stats(fighter_name, current_fight_date, past_fights, fighters_df, n_fights=5):
    """Aggregate a fighter's performance over their most recent earlier fights.

    Args:
        fighter_name: Name matched against ``'fighter_1'`` / ``'winner'``.
        current_fight_date: Only fights strictly before this date are used.
        past_fights: Fight dicts with a ``'date_obj'`` entry, in any order.
        fighters_df: DataFrame indexed by fighter name; its ``'elo'`` column
            (when present) supplies opponent ratings.
        n_fights: Size of the "last N fights" window.

    Returns:
        dict with the raw totals (``'wins_last_n'``, ``'ko_wins'``, ...),
        ``'fights_in_last_year'`` and the derived ``*_last_n`` rates. All
        keys are always present; rates fall back to neutral defaults when
        there is nothing to divide by.
    """
    prior = sorted(
        (f for f in past_fights if f['date_obj'] < current_fight_date),
        key=lambda f: f['date_obj'],
    )
    last_n_fights = prior[-n_fights:] if prior else []

    stats = {
        'wins_last_n': 0,
        'ko_wins': 0,
        'total_finishes': 0,
        'first_round_finishes': 0,
        'knockdowns_scored': 0,
        'knockdowns_absorbed': 0,
        'sig_str_landed': 0,
        'sig_str_attempted': 0,
        'takedowns_landed': 0,
        'takedowns_attempted': 0,
        'sub_attempts': 0,
        'ctrl_time_sec': 0,
        'total_fight_time_sec': 0,
        'fights_in_last_year': 0,
        'avg_opp_elo_last_n': 0
    }

    # Activity over the 365 days preceding the current fight.
    one_year_ago = current_fight_date - pd.Timedelta(days=365)
    stats['fights_in_last_year'] = sum(1 for f in prior if f['date_obj'] >= one_year_ago)

    # Opponent-side totals feed the defensive metrics below.
    # NOTE(review): assumes opponent columns follow the same
    # '<prefix>_sig_str_*' / '<prefix>_td_*' naming as '<prefix>_kd' - confirm.
    opp_sig_str_landed = 0
    opp_sig_str_attempted = 0
    opp_td_landed = 0
    opp_td_attempted = 0

    has_elo = 'elo' in fighters_df.columns
    opp_elo_total = 0.0
    opp_elo_count = 0

    for fight in last_n_fights:
        is_fighter_1 = (fight['fighter_1'] == fighter_name)
        f_prefix = 'f1' if is_fighter_1 else 'f2'
        opp_prefix = 'f2' if is_fighter_1 else 'f1'
        opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']

        # Win/Loss and Finishes
        if fight.get('winner') == fighter_name:
            stats['wins_last_n'] += 1
            method = fight.get('method')
            method = method if isinstance(method, str) else ''
            # Any decision variant ("Decision - Unanimous", "U-DEC", ...) is
            # not a finish; an unknown method is not counted as one either.
            if method and 'dec' not in method.lower():
                stats['total_finishes'] += 1
                # The round may be stored as text or as a number.
                if _to_int_safe(fight.get('round')) == 1:
                    stats['first_round_finishes'] += 1
            if 'KO' in method:
                stats['ko_wins'] += 1

        # Striking and Grappling Stats
        stats['knockdowns_scored'] += _to_int_safe(fight.get(f'{f_prefix}_kd'))
        stats['knockdowns_absorbed'] += _to_int_safe(fight.get(f'{opp_prefix}_kd'))
        stats['sig_str_landed'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_landed'))
        stats['sig_str_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_sig_str_attempted'))
        stats['takedowns_landed'] += _to_int_safe(fight.get(f'{f_prefix}_td_landed'))
        stats['takedowns_attempted'] += _to_int_safe(fight.get(f'{f_prefix}_td_attempted'))
        stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_attempts'))

        opp_sig_str_landed += _to_int_safe(fight.get(f'{opp_prefix}_sig_str_landed'))
        opp_sig_str_attempted += _to_int_safe(fight.get(f'{opp_prefix}_sig_str_attempted'))
        opp_td_landed += _to_int_safe(fight.get(f'{opp_prefix}_td_landed'))
        opp_td_attempted += _to_int_safe(fight.get(f'{opp_prefix}_td_attempted'))

        # Control Time
        ctrl_secs = _clock_to_seconds(fight.get(f'{f_prefix}_ctrl_time', '0:00'))
        if ctrl_secs is not None:
            stats['ctrl_time_sec'] += ctrl_secs

        # Fight Duration: completed rounds are counted as 300 s each, plus
        # the clock of the final round.
        round_num = _to_int_safe(fight.get('round'))
        round_secs = _clock_to_seconds(fight.get('round_time', '0:00'))
        if round_secs is not None:
            # max(...) keeps an unparseable round (0) from subtracting time.
            stats['total_fight_time_sec'] += max(round_num - 1, 0) * 300 + round_secs

        # Opponent ELO
        if has_elo and opponent_name in fighters_df.index:
            opp_elo = fighters_df.loc[opponent_name, 'elo']
            if not pd.isna(opp_elo):
                opp_elo_total += opp_elo
                opp_elo_count += 1

    # Per-fight rates
    n_actual_fights = len(last_n_fights)

    def per_fight(total):
        return total / n_actual_fights if n_actual_fights > 0 else 0.0

    stats['finish_rate_last_n'] = per_fight(stats['total_finishes'])
    stats['first_round_finish_rate_last_n'] = per_fight(stats['first_round_finishes'])
    stats['ko_percent_last_n'] = per_fight(stats['ko_wins'])
    stats['avg_knockdowns_per_fight_last_n'] = per_fight(stats['knockdowns_scored'])
    stats['knockdowns_absorbed_per_fight_last_n'] = per_fight(stats['knockdowns_absorbed'])
    # Average only over opponents that actually have a rating, so unknown
    # opponents do not drag the mean toward zero.
    stats['avg_opp_elo_last_n'] = opp_elo_total / opp_elo_count if opp_elo_count > 0 else 1500.0

    # Per-minute stats
    total_mins = stats['total_fight_time_sec'] / 60

    def per_minute(total):
        return total / total_mins if total_mins > 0 else 0.0

    stats['sig_str_landed_per_min_last_n'] = per_minute(stats['sig_str_landed'])
    # "Absorbed" strikes are the ones the opponent landed.
    stats['sig_str_absorbed_per_min_last_n'] = per_minute(opp_sig_str_landed)
    stats['sub_attempts_per_min_last_n'] = per_minute(stats['sub_attempts'])
    stats['avg_ctrl_time_sec_per_min_last_n'] = per_minute(stats['ctrl_time_sec'])

    # Accuracy / defense stats (0.5 is the neutral default without data).
    # Defense is the share of the opponent's attempts that did NOT land.
    stats['sig_str_defense_last_n'] = (
        1 - opp_sig_str_landed / opp_sig_str_attempted if opp_sig_str_attempted > 0 else 0.5
    )
    stats['takedown_accuracy_last_n'] = (
        stats['takedowns_landed'] / stats['takedowns_attempted']
        if stats['takedowns_attempted'] > 0 else 0.5
    )
    stats['takedown_defense_last_n'] = (
        1 - opp_td_landed / opp_td_attempted if opp_td_attempted > 0 else 0.5
    )

    return stats
|
146 |
|
147 |
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """Transform fight records into ML-ready features.

    Each fight whose two fighters both appear in the fighters CSV yields one
    row of "fighter_1 minus fighter_2" difference features. Historical
    features use only fights dated strictly before the fight in question.

    Side effect: a ``'date_obj'`` entry (parsed ``'event_date'``) is stored
    on the fight dicts that are processed or appear in a processed fighter's
    history.

    Args:
        fights_to_process: List of fight dicts (``'fighter_1'``,
            ``'fighter_2'``, ``'event_date'``, ``'winner'``, ...).
        fighters_csv_path: Path to the fighters CSV; must contain
            ``'first_name'`` and ``'last_name'`` columns.

    Returns:
        Tuple ``(X, y, metadata)``: feature DataFrame with columns sorted by
        name and NaNs filled with 0, target Series (1 when fighter_1 won,
        else 0), and a DataFrame with date, fighter names and winner. All
        three are empty when no fight could be processed.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
    """
    from collections import defaultdict

    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    # Load and prepare fighter data. Missing name parts are treated as empty
    # so single-name fighters keep a usable (non-NaN) full name.
    fighters_df = pd.read_csv(fighters_csv_path)
    first = fighters_df['first_name'].fillna('').astype(str)
    last = fighters_df['last_name'].fillna('').astype(str)
    fighters_df['full_name'] = (first + ' ' + last).str.strip()
    fighters_df = fighters_df[fighters_df['full_name'] != '']
    fighters_df = fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_df.columns:
            fighters_df[col] = _clean_numeric_column(fighters_df[col])

    # Index fights by participant once, instead of rescanning the whole list
    # for every fight.
    fights_by_fighter = defaultdict(list)
    for bout in fights_to_process:
        for name in {bout['fighter_1'], bout['fighter_2']}:
            fights_by_fighter[name].append(bout)

    # Layoff assumed for a fighter with no earlier fight (~1.5 years).
    default_layoff_days = 547

    def fights_before(history, cutoff):
        """Chronologically sorted fights strictly earlier than ``cutoff``."""
        return sorted(
            (f for f in history if f['date_obj'] < cutoff),
            key=lambda f: f['date_obj'],
        )

    # Process fights and calculate features
    processed_fights = []
    for fight in fights_to_process:
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        # Skip if either fighter is missing
        if f1_name not in fighters_df.index or f2_name not in fighters_df.index:
            continue

        # Get fighter stats
        f1_stats = fighters_df.loc[f1_name]
        f2_stats = fighters_df.loc[f2_name]

        # Calculate fight date and ensure date_obj is available
        fight_date = pd.to_datetime(fight['event_date'])
        fight['date_obj'] = fight_date

        f1_hist = fights_by_fighter[f1_name]
        f2_hist = fights_by_fighter[f2_name]

        # Ensure date_obj is available for all historical fights
        for hist_fight in f1_hist + f2_hist:
            if 'date_obj' not in hist_fight:
                hist_fight['date_obj'] = pd.to_datetime(hist_fight['event_date'])

        # Only earlier fights may inform the features; passing the current
        # or later fights would leak the outcome being predicted.
        f1_prior = fights_before(f1_hist, fight_date)
        f2_prior = fights_before(f2_hist, fight_date)

        # Calculate historical stats
        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_prior, fighters_df)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_prior, fighters_df)

        # Calculate ages
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Calculate days since last fight (explicit None check so a genuine
        # 0 is never mistaken for "no previous fight").
        f1_days_since_last = _get_days_since_last_fight(fight_date, f1_prior)
        if f1_days_since_last is None:
            f1_days_since_last = default_layoff_days
        f2_days_since_last = _get_days_since_last_fight(fight_date, f2_prior)
        if f2_days_since_last is None:
            f2_days_since_last = default_layoff_days

        # Calculate win streaks
        f1_win_streak = _get_win_streak(f1_name, fight_date, f1_prior)
        f2_win_streak = _get_win_streak(f2_name, fight_date, f2_prior)

        # Compile all features
        feature_dict = {
            'winner': 1 if fight.get('winner') == f1_name else 0,
            'date': fight['event_date'],
            'fighter_1': f1_name,
            'fighter_2': f2_name,

            # Physical differences
            'height_diff': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff': (f1_age or 0) - (f2_age or 0),
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),

            # Career momentum
            'days_since_last_fight_diff': f1_days_since_last - f2_days_since_last,
            'win_streak_diff': f1_win_streak - f2_win_streak,
            'fights_last_year_diff': f1_hist_stats['fights_in_last_year'] - f2_hist_stats['fights_in_last_year'],

            # Performance differences
            'finish_rate_diff': f1_hist_stats['finish_rate_last_n'] - f2_hist_stats['finish_rate_last_n'],
            'ko_rate_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_per_min_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            'td_accuracy_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
            'control_time_diff': f1_hist_stats['avg_ctrl_time_sec_per_min_last_n'] - f2_hist_stats['avg_ctrl_time_sec_per_min_last_n'],

            # Defense differences
            'sig_str_defense_diff': f1_hist_stats['sig_str_defense_last_n'] - f2_hist_stats['sig_str_defense_last_n'],
            'td_defense_diff': f1_hist_stats['takedown_defense_last_n'] - f2_hist_stats['takedown_defense_last_n'],
            'knockdowns_absorbed_diff': f1_hist_stats['knockdowns_absorbed_per_fight_last_n'] - f2_hist_stats['knockdowns_absorbed_per_fight_last_n']
        }

        processed_fights.append(feature_dict)

    if not processed_fights:
        # Explicit dtype avoids the empty-Series default-dtype warning.
        return pd.DataFrame(), pd.Series(dtype='int64'), pd.DataFrame()

    # Create final dataframes
    df = pd.DataFrame(processed_fights)
    metadata = df[['date', 'fighter_1', 'fighter_2', 'winner']]

    # Prepare X and y
    y = df['winner']
    X = df.drop(columns=['winner', 'date', 'fighter_1', 'fighter_2'])
    X = X.reindex(sorted(X.columns), axis=1)  # Ensure consistent column order

    # Handle missing values by filling NaNs with 0
    X = X.fillna(0)

    return X, y, metadata
|