Spaces:
Running
Running
Commit
·
f972c61
1
Parent(s):
ffd453e
Startup model check: 2025-08-03 13:02:33
Browse files- logs/startup_update.log +0 -0
- src/predict/config.py +40 -0
- src/predict/main.py +1 -12
- src/predict/models.py +9 -12
- src/predict/pipeline.py +12 -7
- src/predict/preprocess.py +28 -69
- src/predict/utils.py +70 -0
logs/startup_update.log
CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
|
|
src/predict/config.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Configuration module for UFC prediction models."""

# Model settings
# NOTE(review): src/predict/utils.py declares DEFAULT_ELO, N_FIGHTS_HISTORY and
# DEFAULT_ROUNDS_DURATION with the same values; keep both in sync or import
# them from a single place.
DEFAULT_ELO = 1500
N_FIGHTS_HISTORY = 5
DEFAULT_ROUNDS_DURATION = 5 * 60  # 5 minutes per round, expressed in seconds

# Date formats (strptime/strftime patterns)
DATE_FORMAT_EVENT = '%B %d, %Y'  # full month name
DATE_FORMAT_DOB = '%b %d, %Y'    # abbreviated month name

# Feature settings
FEATURE_COLUMNS = [
    'height_cm',
    'reach_in',
    'elo',
    'stance',
    'dob',
]

# Model hyperparameters, keyed by estimator class name.
MODEL_DEFAULTS = {
    'LogisticRegression': {},
    'XGBClassifier': {
        # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
        # releases - confirm it is still accepted by the installed version.
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'random_state': 42,
    },
    'SVC': {
        'probability': True,
        'random_state': 42,
    },
    'RandomForestClassifier': {
        'random_state': 42,
    },
    'BernoulliNB': {},
    'LGBMClassifier': {
        'random_state': 42,
    },
}
|
src/predict/main.py
CHANGED
@@ -68,18 +68,7 @@ def main():
|
|
68 |
elif use_existing_models:
|
69 |
print("Using existing models if available and no new data detected.")
|
70 |
|
71 |
-
#
|
72 |
-
# Instantiate all the models you want to evaluate here.
|
73 |
-
models_to_run = [
|
74 |
-
EloBaselineModel(),
|
75 |
-
LogisticRegressionModel(),
|
76 |
-
XGBoostModel(),
|
77 |
-
SVCModel(),
|
78 |
-
RandomForestModel(),
|
79 |
-
BernoulliNBModel(),
|
80 |
-
LGBMModel(),
|
81 |
-
]
|
82 |
-
# --- End of Model Definition ---
|
83 |
|
84 |
pipeline = PredictionPipeline(
|
85 |
models=MODELS_TO_RUN,
|
|
|
68 |
elif use_existing_models:
|
69 |
print("Using existing models if available and no new data detected.")
|
70 |
|
71 |
+
# Use the already defined MODELS_TO_RUN from the top of the file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
pipeline = PredictionPipeline(
|
74 |
models=MODELS_TO_RUN,
|
src/predict/models.py
CHANGED
@@ -2,6 +2,7 @@ from abc import ABC, abstractmethod
|
|
2 |
import sys
|
3 |
import os
|
4 |
import pandas as pd
|
|
|
5 |
from sklearn.linear_model import LogisticRegression
|
6 |
from sklearn.svm import SVC
|
7 |
from sklearn.naive_bayes import BernoulliNB
|
@@ -10,7 +11,8 @@ from xgboost import XGBClassifier
|
|
10 |
from lightgbm import LGBMClassifier
|
11 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
12 |
from ..config import FIGHTERS_CSV_PATH
|
13 |
-
from .preprocess import preprocess_for_ml, _get_fighter_history_stats
|
|
|
14 |
|
15 |
class BaseModel(ABC):
|
16 |
"""
|
@@ -53,7 +55,7 @@ class EloBaselineModel(BaseModel):
|
|
53 |
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
54 |
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
55 |
|
56 |
-
def predict(self, fight):
|
57 |
"""Predicts the winner based on ELO and calculates win probability."""
|
58 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
59 |
|
@@ -85,7 +87,7 @@ class BaseMLModel(BaseModel):
|
|
85 |
self.fighters_df = None
|
86 |
self.fighter_histories = {}
|
87 |
|
88 |
-
def train(self, train_fights):
|
89 |
"""
|
90 |
Trains the machine learning model. This involves loading fighter data,
|
91 |
pre-calculating histories, and fitting the model on the preprocessed data.
|
@@ -93,12 +95,7 @@ class BaseMLModel(BaseModel):
|
|
93 |
print(f"--- Training {self.model.__class__.__name__} ---")
|
94 |
|
95 |
# 1. Prepare data for prediction-time feature generation
|
96 |
-
self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
|
97 |
-
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
98 |
-
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
99 |
-
for col in ['height_cm', 'reach_in', 'elo']:
|
100 |
-
if col in self.fighters_df.columns:
|
101 |
-
self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
|
102 |
|
103 |
# 2. Pre-calculate fighter histories
|
104 |
train_fights_with_dates = []
|
@@ -136,8 +133,8 @@ class BaseMLModel(BaseModel):
|
|
136 |
f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
|
137 |
f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
|
138 |
|
139 |
-
f1_age =
|
140 |
-
f2_age =
|
141 |
|
142 |
features = {
|
143 |
'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
|
@@ -194,4 +191,4 @@ class BernoulliNBModel(BaseMLModel):
|
|
194 |
class LGBMModel(BaseMLModel):
|
195 |
"""A thin wrapper for LightGBM's LGBMClassifier."""
|
196 |
def __init__(self):
|
197 |
-
super().__init__(model=LGBMClassifier(random_state=42))
|
|
|
2 |
import sys
|
3 |
import os
|
4 |
import pandas as pd
|
5 |
+
from typing import Dict, Any, List, Optional
|
6 |
from sklearn.linear_model import LogisticRegression
|
7 |
from sklearn.svm import SVC
|
8 |
from sklearn.naive_bayes import BernoulliNB
|
|
|
11 |
from lightgbm import LGBMClassifier
|
12 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
13 |
from ..config import FIGHTERS_CSV_PATH
|
14 |
+
from .preprocess import preprocess_for_ml, _get_fighter_history_stats
|
15 |
+
from .utils import calculate_age, prepare_fighters_data, DEFAULT_ELO
|
16 |
|
17 |
class BaseModel(ABC):
|
18 |
"""
|
|
|
55 |
self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
|
56 |
self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
|
57 |
|
58 |
+
def predict(self, fight: Dict[str, Any]) -> Dict[str, Optional[float]]:
|
59 |
"""Predicts the winner based on ELO and calculates win probability."""
|
60 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
61 |
|
|
|
87 |
self.fighters_df = None
|
88 |
self.fighter_histories = {}
|
89 |
|
90 |
+
def train(self, train_fights: List[Dict[str, Any]]) -> None:
|
91 |
"""
|
92 |
Trains the machine learning model. This involves loading fighter data,
|
93 |
pre-calculating histories, and fitting the model on the preprocessed data.
|
|
|
95 |
print(f"--- Training {self.model.__class__.__name__} ---")
|
96 |
|
97 |
# 1. Prepare data for prediction-time feature generation
|
98 |
+
self.fighters_df = prepare_fighters_data(pd.read_csv(FIGHTERS_CSV_PATH))
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
# 2. Pre-calculate fighter histories
|
101 |
train_fights_with_dates = []
|
|
|
133 |
f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
|
134 |
f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
|
135 |
|
136 |
+
f1_age = calculate_age(f1_stats.get('dob'), fight['event_date'])
|
137 |
+
f2_age = calculate_age(f2_stats.get('dob'), fight['event_date'])
|
138 |
|
139 |
features = {
|
140 |
'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
|
|
|
191 |
class LGBMModel(BaseMLModel):
|
192 |
"""A thin wrapper for LightGBM's LGBMClassifier."""
|
193 |
def __init__(self):
|
194 |
+
super().__init__(model=LGBMClassifier(random_state=42))
|
src/predict/pipeline.py
CHANGED
@@ -149,16 +149,13 @@ class PredictionPipeline:
|
|
149 |
print("No new data detected and all model files exist. Using existing models.")
|
150 |
return False
|
151 |
|
152 |
-
def _load_and_split_data(self, num_test_events=1):
|
153 |
"""Loads and splits the data into chronological training and testing sets."""
|
154 |
print("\n--- Loading and Splitting Data ---")
|
155 |
if not os.path.exists(FIGHTS_CSV_PATH):
|
156 |
raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")
|
157 |
|
158 |
-
|
159 |
-
fights = list(csv.DictReader(f))
|
160 |
-
|
161 |
-
fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
|
162 |
|
163 |
all_events = list(OrderedDict.fromkeys(f['event_name'] for f in fights))
|
164 |
if len(all_events) < num_test_events:
|
@@ -171,7 +168,15 @@ class PredictionPipeline:
|
|
171 |
print(f"Data loaded. {len(self.train_fights)} training fights, {len(self.test_fights)} testing fights.")
|
172 |
print(f"Testing on the last {num_test_events} event(s): {', '.join(test_event_names)}")
|
173 |
|
174 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
"""Executes the full pipeline: load, train, evaluate, report and save models."""
|
176 |
self._load_and_split_data()
|
177 |
|
@@ -349,4 +354,4 @@ class PredictionPipeline:
|
|
349 |
# A summary is printed to the console for convenience.
|
350 |
self._report_summary()
|
351 |
# The detailed report is now saved to a JSON file.
|
352 |
-
self._save_report_to_json()
|
|
|
149 |
print("No new data detected and all model files exist. Using existing models.")
|
150 |
return False
|
151 |
|
152 |
+
def _load_and_split_data(self, num_test_events: int = 1) -> None:
|
153 |
"""Loads and splits the data into chronological training and testing sets."""
|
154 |
print("\n--- Loading and Splitting Data ---")
|
155 |
if not os.path.exists(FIGHTS_CSV_PATH):
|
156 |
raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")
|
157 |
|
158 |
+
fights = self._load_fights()
|
|
|
|
|
|
|
159 |
|
160 |
all_events = list(OrderedDict.fromkeys(f['event_name'] for f in fights))
|
161 |
if len(all_events) < num_test_events:
|
|
|
168 |
print(f"Data loaded. {len(self.train_fights)} training fights, {len(self.test_fights)} testing fights.")
|
169 |
print(f"Testing on the last {num_test_events} event(s): {', '.join(test_event_names)}")
|
170 |
|
171 |
+
def _load_fights(self) -> list:
    """Load every fight row from FIGHTS_CSV_PATH, sorted by event date.

    Returns:
        A list with one dict per CSV row (as produced by ``csv.DictReader``),
        ordered ascending by the row's ``event_date`` column.

    Raises:
        ValueError: if a row's ``event_date`` does not match '%B %d, %Y'.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8', newline='') as f:
        fights = list(csv.DictReader(f))

    fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
    return fights
|
178 |
+
|
179 |
+
def run(self, detailed_report: bool = True) -> None:
|
180 |
"""Executes the full pipeline: load, train, evaluate, report and save models."""
|
181 |
self._load_and_split_data()
|
182 |
|
|
|
354 |
# A summary is printed to the console for convenience.
|
355 |
self._report_summary()
|
356 |
# The detailed report is now saved to a JSON file.
|
357 |
+
self._save_report_to_json()
|
src/predict/preprocess.py
CHANGED
@@ -1,53 +1,22 @@
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
from datetime import datetime
|
|
|
4 |
from ..config import FIGHTERS_CSV_PATH
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
try:
|
16 |
-
dob = datetime.strptime(dob_str, '%b %d, %Y')
|
17 |
-
fight_date = datetime.strptime(fight_date_str, '%B %d, %Y')
|
18 |
-
return (fight_date - dob).days / 365.25
|
19 |
-
except (ValueError, TypeError):
|
20 |
-
return None
|
21 |
-
|
22 |
-
def _parse_round_time_to_seconds(round_str, time_str):
|
23 |
-
"""Converts fight duration from round and time to total seconds."""
|
24 |
-
try:
|
25 |
-
rounds = int(round_str)
|
26 |
-
minutes, seconds = map(int, time_str.split(':'))
|
27 |
-
# Assuming 5-minute rounds for calculation simplicity
|
28 |
-
return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
|
29 |
-
except (ValueError, TypeError, AttributeError):
|
30 |
-
return 0
|
31 |
-
|
32 |
-
def _parse_striking_stats(stat_str):
|
33 |
-
"""Parses striking stats string like '10 of 20' into (landed, attempted)."""
|
34 |
-
try:
|
35 |
-
landed, attempted = map(int, stat_str.split(' of '))
|
36 |
-
return landed, attempted
|
37 |
-
except (ValueError, TypeError, AttributeError):
|
38 |
-
return 0, 0
|
39 |
-
|
40 |
-
def _to_int_safe(val):
|
41 |
-
"""Safely converts a value to an integer, returning 0 if it's invalid or empty."""
|
42 |
-
if pd.isna(val):
|
43 |
-
return 0
|
44 |
-
try:
|
45 |
-
# handle strings with whitespace or empty strings
|
46 |
-
return int(str(val).strip() or 0)
|
47 |
-
except (ValueError, TypeError):
|
48 |
-
return 0
|
49 |
-
|
50 |
-
def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
|
51 |
"""
|
52 |
Calculates performance statistics for a fighter based on their last n fights.
|
53 |
"""
|
@@ -58,7 +27,7 @@ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history
|
|
58 |
# Return a default dictionary with the correct keys for a fighter with no history
|
59 |
return {
|
60 |
'wins_last_n': 0,
|
61 |
-
'avg_opp_elo_last_n':
|
62 |
'ko_percent_last_n': 0,
|
63 |
'sig_str_landed_per_min_last_n': 0,
|
64 |
'takedown_accuracy_last_n': 0,
|
@@ -84,20 +53,20 @@ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history
|
|
84 |
|
85 |
if opponent_name in fighters_df.index:
|
86 |
opp_elo = fighters_df.loc[opponent_name, 'elo']
|
87 |
-
stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else
|
88 |
|
89 |
-
stats['total_time_secs'] +=
|
90 |
|
91 |
sig_str_stat = fight.get(f'{f_prefix}_sig_str', '0 of 0')
|
92 |
-
landed, _ =
|
93 |
stats['sig_str_landed'] += landed
|
94 |
|
95 |
td_stat = fight.get(f'{f_prefix}_td', '0 of 0')
|
96 |
-
td_landed, td_attempted =
|
97 |
stats['td_landed'] += td_landed
|
98 |
stats['td_attempted'] += td_attempted
|
99 |
|
100 |
-
stats['sub_attempts'] +=
|
101 |
|
102 |
# Final calculations
|
103 |
avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
|
@@ -112,36 +81,26 @@ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history
|
|
112 |
'sub_attempts_per_min_last_n': (stats['sub_attempts'] / total_minutes) if total_minutes > 0 else 0,
|
113 |
}
|
114 |
|
115 |
-
def preprocess_for_ml(
|
|
|
|
|
|
|
116 |
"""
|
117 |
Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
|
118 |
suitable for a binary classification machine learning model.
|
119 |
|
120 |
Args:
|
121 |
-
fights_to_process
|
122 |
-
fighters_csv_path
|
123 |
|
124 |
Returns:
|
125 |
-
|
126 |
-
pd.Series: Target vector y.
|
127 |
-
pd.DataFrame: Metadata DataFrame.
|
128 |
"""
|
129 |
if not os.path.exists(fighters_csv_path):
|
130 |
raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
|
131 |
|
132 |
fighters_df = pd.read_csv(fighters_csv_path)
|
133 |
-
|
134 |
-
# 1. Prepare fighters data for merging
|
135 |
-
fighters_prepared = fighters_df.copy()
|
136 |
-
fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']
|
137 |
-
|
138 |
-
# Handle duplicate fighter names by keeping the first entry
|
139 |
-
fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
|
140 |
-
fighters_prepared = fighters_prepared.set_index('full_name')
|
141 |
-
|
142 |
-
for col in ['height_cm', 'reach_in', 'elo']:
|
143 |
-
if col in fighters_prepared.columns:
|
144 |
-
fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])
|
145 |
|
146 |
# 2. Pre-calculate fighter histories to speed up lookups
|
147 |
# And convert date strings to datetime objects once
|
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
from datetime import datetime
|
4 |
+
from typing import Dict, List, Tuple, Any, Optional
|
5 |
from ..config import FIGHTERS_CSV_PATH
|
6 |
+
from .utils import (
|
7 |
+
parse_round_time_to_seconds, parse_striking_stats, to_int_safe,
|
8 |
+
calculate_age, prepare_fighters_data, DEFAULT_ELO, N_FIGHTS_HISTORY
|
9 |
+
)
|
10 |
|
11 |
+
|
12 |
+
|
13 |
+
def _get_fighter_history_stats(
|
14 |
+
fighter_name: str,
|
15 |
+
current_fight_date: datetime,
|
16 |
+
fighter_history: List[Dict[str, Any]],
|
17 |
+
fighters_df: pd.DataFrame,
|
18 |
+
n: int = N_FIGHTS_HISTORY
|
19 |
+
) -> Dict[str, float]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
"""
|
21 |
Calculates performance statistics for a fighter based on their last n fights.
|
22 |
"""
|
|
|
27 |
# Return a default dictionary with the correct keys for a fighter with no history
|
28 |
return {
|
29 |
'wins_last_n': 0,
|
30 |
+
'avg_opp_elo_last_n': DEFAULT_ELO,
|
31 |
'ko_percent_last_n': 0,
|
32 |
'sig_str_landed_per_min_last_n': 0,
|
33 |
'takedown_accuracy_last_n': 0,
|
|
|
53 |
|
54 |
if opponent_name in fighters_df.index:
|
55 |
opp_elo = fighters_df.loc[opponent_name, 'elo']
|
56 |
+
stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else DEFAULT_ELO)
|
57 |
|
58 |
+
stats['total_time_secs'] += parse_round_time_to_seconds(fight['round'], fight['time'])
|
59 |
|
60 |
sig_str_stat = fight.get(f'{f_prefix}_sig_str', '0 of 0')
|
61 |
+
landed, _ = parse_striking_stats(sig_str_stat)
|
62 |
stats['sig_str_landed'] += landed
|
63 |
|
64 |
td_stat = fight.get(f'{f_prefix}_td', '0 of 0')
|
65 |
+
td_landed, td_attempted = parse_striking_stats(td_stat)
|
66 |
stats['td_landed'] += td_landed
|
67 |
stats['td_attempted'] += td_attempted
|
68 |
|
69 |
+
stats['sub_attempts'] += to_int_safe(fight.get(f'{f_prefix}_sub_att'))
|
70 |
|
71 |
# Final calculations
|
72 |
avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
|
|
|
81 |
'sub_attempts_per_min_last_n': (stats['sub_attempts'] / total_minutes) if total_minutes > 0 else 0,
|
82 |
}
|
83 |
|
84 |
+
def preprocess_for_ml(
|
85 |
+
fights_to_process: List[Dict[str, Any]],
|
86 |
+
fighters_csv_path: str
|
87 |
+
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
|
88 |
"""
|
89 |
Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
|
90 |
suitable for a binary classification machine learning model.
|
91 |
|
92 |
Args:
|
93 |
+
fights_to_process: The list of fights to process.
|
94 |
+
fighters_csv_path: Path to the CSV file with all fighter stats.
|
95 |
|
96 |
Returns:
|
97 |
+
Feature matrix X, target vector y, and metadata DataFrame.
|
|
|
|
|
98 |
"""
|
99 |
if not os.path.exists(fighters_csv_path):
|
100 |
raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
|
101 |
|
102 |
fighters_df = pd.read_csv(fighters_csv_path)
|
103 |
+
fighters_prepared = prepare_fighters_data(fighters_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
# 2. Pre-calculate fighter histories to speed up lookups
|
106 |
# And convert date strings to datetime objects once
|
src/predict/utils.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Optional, Dict, Any
|
5 |
+
|
6 |
+
# Constants
# NOTE(review): DEFAULT_ELO, DEFAULT_ROUNDS_DURATION and N_FIGHTS_HISTORY are
# also declared with the same values in src/predict/config.py; keep both in
# sync or import them from a single place.
DEFAULT_ELO = 1500
DEFAULT_AGE = 0
DEFAULT_FIGHT_TIME = 0
DEFAULT_ROUNDS_DURATION = 5 * 60  # 5 minutes per round, expressed in seconds
N_FIGHTS_HISTORY = 5
|
12 |
+
|
13 |
+
def clean_numeric_column(series: pd.Series) -> pd.Series:
    """Coerce a column of number-like values into a numeric Series.

    Each value is converted to a string, every character other than an ASCII
    digit or ``.`` is removed, and the remainder is parsed with
    ``pd.to_numeric``. Values that still cannot be parsed (for example text
    with no digits at all) become ``NaN``.

    Note that the filter also removes ``-`` and exponent letters, so
    ``'-5'`` parses as ``5`` and ``'1e3'`` as ``13``.

    Args:
        series: The column to clean.

    Returns:
        A numeric Series with the same index as ``series``.
    """
    digits_only = series.astype(str).str.replace(r'[^0-9.]', '', regex=True)
    return pd.to_numeric(digits_only, errors='coerce')
|
17 |
+
|
18 |
+
def calculate_age(dob_str: str, fight_date_str: str) -> Optional[float]:
    """Return a fighter's age in years on the date of a fight.

    Args:
        dob_str: Date of birth, either a string in ``'%b %d, %Y'`` form
            (e.g. ``'Jul 19, 1987'``) or an already-parsed ``datetime``.
        fight_date_str: Fight date, either a string in ``'%B %d, %Y'`` form
            (e.g. ``'August 03, 2025'``) or an already-parsed ``datetime``.

    Returns:
        The number of days between the two dates divided by 365.25, or
        ``None`` when either date is missing or cannot be parsed.
    """
    def _as_datetime(value: Any, fmt: str) -> datetime:
        # Already-parsed dates (datetime, pandas Timestamp) are used as-is
        # instead of being rejected by strptime with a TypeError.
        if isinstance(value, datetime):
            return value
        return datetime.strptime(value, fmt)

    if dob_str is None or pd.isna(dob_str) or not dob_str:
        return None
    if fight_date_str is None or pd.isna(fight_date_str):
        return None
    try:
        dob = _as_datetime(dob_str, '%b %d, %Y')
        fight_date = _as_datetime(fight_date_str, '%B %d, %Y')
        return (fight_date - dob).days / 365.25
    except (ValueError, TypeError):
        return None
|
28 |
+
|
29 |
+
def parse_round_time_to_seconds(round_str: str, time_str: str) -> int:
    """Convert the round and clock time at which a fight ended to seconds.

    Args:
        round_str: The round number in which the fight ended (1-based).
        time_str: Elapsed time within that round, formatted ``'M:SS'``.

    Returns:
        Total elapsed seconds, counting every completed round as
        ``DEFAULT_ROUNDS_DURATION`` seconds. Returns 0 when either argument
        cannot be parsed or the round number is below 1.
    """
    try:
        rounds = int(round_str)
        minutes, seconds = map(int, time_str.split(':'))
    except (ValueError, TypeError, AttributeError):
        return 0

    if rounds < 1:
        # A round number below 1 would otherwise yield a negative duration.
        return 0

    # Assuming 5-minute rounds for calculation simplicity
    return ((rounds - 1) * DEFAULT_ROUNDS_DURATION) + (minutes * 60) + seconds
|
38 |
+
|
39 |
+
def parse_striking_stats(stat_str: str) -> tuple[int, int]:
    """Parse a ``'<landed> of <attempted>'`` stat string.

    Args:
        stat_str: A string such as ``'10 of 20'``.

    Returns:
        ``(landed, attempted)`` as integers, or ``(0, 0)`` when the value is
        not a string in that form.
    """
    try:
        landed, attempted = map(int, stat_str.split(' of '))
    except (ValueError, TypeError, AttributeError):
        return 0, 0
    return landed, attempted
|
46 |
+
|
47 |
+
def to_int_safe(val: Any) -> int:
    """Convert a value to an integer, returning 0 if it is invalid or empty.

    Accepts ints, integer strings (surrounding whitespace is ignored) and
    whole-number floats or float strings such as ``3.0`` / ``'3.0'``.
    Missing values, empty strings, non-numeric text and non-integral numbers
    all yield 0.

    Args:
        val: The value to convert.

    Returns:
        The integer value, or 0 when ``val`` cannot be interpreted as one.
    """
    try:
        if pd.isna(val):
            return 0
        # handle strings with whitespace or empty strings
        text = str(val).strip()
        if not text:
            return 0
        try:
            return int(text)
        except ValueError:
            # Whole-number floats (e.g. 3.0) stringify as '3.0', which int()
            # rejects; accept them, but keep rejecting 2.5, nan and inf.
            number = float(text)
            return int(number) if number.is_integer() else 0
    except (ValueError, TypeError):
        return 0
|
56 |
+
|
57 |
+
def prepare_fighters_data(fighters_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the fighters table indexed by full name.

    The input frame is not modified. A ``full_name`` value is built from
    ``first_name`` and ``last_name``, duplicate names are dropped (first
    occurrence wins), ``full_name`` becomes the index, and the
    ``height_cm`` / ``reach_in`` / ``elo`` columns, when present, are passed
    through ``clean_numeric_column``.

    Args:
        fighters_df: Raw fighters table; must contain ``first_name`` and
            ``last_name`` columns.

    Returns:
        A new DataFrame indexed by ``full_name``.
    """
    def _name_part(column: pd.Series) -> pd.Series:
        # A missing name part becomes '' so that it does not turn the whole
        # concatenated name into NaN.
        return column.map(lambda v: '' if pd.isna(v) else str(v)).astype(str)

    fighters_prepared = fighters_df.copy()
    first = _name_part(fighters_prepared['first_name'])
    last = _name_part(fighters_prepared['last_name'])
    # strip() removes the stray separator left when one part is missing.
    fighters_prepared['full_name'] = (first + ' ' + last).str.strip()

    # Handle duplicate fighter names by keeping the first entry
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ('height_cm', 'reach_in', 'elo'):
        if col in fighters_prepared.columns:
            fighters_prepared[col] = clean_numeric_column(fighters_prepared[col])

    return fighters_prepared
|