AlvaroMros committed on
Commit
f972c61
·
1 Parent(s): ffd453e

Startup model check: 2025-08-03 13:02:33

Browse files
logs/startup_update.log CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
 
src/predict/config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration module for UFC prediction models."""
2
+
3
+ # Model settings
4
+ DEFAULT_ELO = 1500
5
+ N_FIGHTS_HISTORY = 5
6
+ DEFAULT_ROUNDS_DURATION = 5 * 60 # 5 minutes per round
7
+
8
+ # Date formats
9
+ DATE_FORMAT_EVENT = '%B %d, %Y'
10
+ DATE_FORMAT_DOB = '%b %d, %Y'
11
+
12
+ # Feature settings
13
+ FEATURE_COLUMNS = [
14
+ 'height_cm',
15
+ 'reach_in',
16
+ 'elo',
17
+ 'stance',
18
+ 'dob'
19
+ ]
20
+
21
+ # Model hyperparameters
22
+ MODEL_DEFAULTS = {
23
+ 'LogisticRegression': {},
24
+ 'XGBClassifier': {
25
+ 'use_label_encoder': False,
26
+ 'eval_metric': 'logloss',
27
+ 'random_state': 42
28
+ },
29
+ 'SVC': {
30
+ 'probability': True,
31
+ 'random_state': 42
32
+ },
33
+ 'RandomForestClassifier': {
34
+ 'random_state': 42
35
+ },
36
+ 'BernoulliNB': {},
37
+ 'LGBMClassifier': {
38
+ 'random_state': 42
39
+ }
40
+ }
src/predict/main.py CHANGED
@@ -68,18 +68,7 @@ def main():
68
  elif use_existing_models:
69
  print("Using existing models if available and no new data detected.")
70
 
71
- # --- Define Models to Run ---
72
- # Instantiate all the models you want to evaluate here.
73
- models_to_run = [
74
- EloBaselineModel(),
75
- LogisticRegressionModel(),
76
- XGBoostModel(),
77
- SVCModel(),
78
- RandomForestModel(),
79
- BernoulliNBModel(),
80
- LGBMModel(),
81
- ]
82
- # --- End of Model Definition ---
83
 
84
  pipeline = PredictionPipeline(
85
  models=MODELS_TO_RUN,
 
68
  elif use_existing_models:
69
  print("Using existing models if available and no new data detected.")
70
 
71
+ # Use the already defined MODELS_TO_RUN from the top of the file
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  pipeline = PredictionPipeline(
74
  models=MODELS_TO_RUN,
src/predict/models.py CHANGED
@@ -2,6 +2,7 @@ from abc import ABC, abstractmethod
2
  import sys
3
  import os
4
  import pandas as pd
 
5
  from sklearn.linear_model import LogisticRegression
6
  from sklearn.svm import SVC
7
  from sklearn.naive_bayes import BernoulliNB
@@ -10,7 +11,8 @@ from xgboost import XGBClassifier
10
  from lightgbm import LGBMClassifier
11
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
12
  from ..config import FIGHTERS_CSV_PATH
13
- from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
 
14
 
15
  class BaseModel(ABC):
16
  """
@@ -53,7 +55,7 @@ class EloBaselineModel(BaseModel):
53
  self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
54
  self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
55
 
56
- def predict(self, fight):
57
  """Predicts the winner based on ELO and calculates win probability."""
58
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
59
 
@@ -85,7 +87,7 @@ class BaseMLModel(BaseModel):
85
  self.fighters_df = None
86
  self.fighter_histories = {}
87
 
88
- def train(self, train_fights):
89
  """
90
  Trains the machine learning model. This involves loading fighter data,
91
  pre-calculating histories, and fitting the model on the preprocessed data.
@@ -93,12 +95,7 @@ class BaseMLModel(BaseModel):
93
  print(f"--- Training {self.model.__class__.__name__} ---")
94
 
95
  # 1. Prepare data for prediction-time feature generation
96
- self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
97
- self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
98
- self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
99
- for col in ['height_cm', 'reach_in', 'elo']:
100
- if col in self.fighters_df.columns:
101
- self.fighters_df[col] = pd.to_numeric(self.fighters_df[col], errors='coerce')
102
 
103
  # 2. Pre-calculate fighter histories
104
  train_fights_with_dates = []
@@ -136,8 +133,8 @@ class BaseMLModel(BaseModel):
136
  f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
137
  f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
138
 
139
- f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
140
- f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
141
 
142
  features = {
143
  'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
@@ -194,4 +191,4 @@ class BernoulliNBModel(BaseMLModel):
194
  class LGBMModel(BaseMLModel):
195
  """A thin wrapper for LightGBM's LGBMClassifier."""
196
  def __init__(self):
197
- super().__init__(model=LGBMClassifier(random_state=42))
 
2
  import sys
3
  import os
4
  import pandas as pd
5
+ from typing import Dict, Any, List, Optional
6
  from sklearn.linear_model import LogisticRegression
7
  from sklearn.svm import SVC
8
  from sklearn.naive_bayes import BernoulliNB
 
11
  from lightgbm import LGBMClassifier
12
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
13
  from ..config import FIGHTERS_CSV_PATH
14
+ from .preprocess import preprocess_for_ml, _get_fighter_history_stats
15
+ from .utils import calculate_age, prepare_fighters_data, DEFAULT_ELO
16
 
17
  class BaseModel(ABC):
18
  """
 
55
  self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
56
  self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
57
 
58
+ def predict(self, fight: Dict[str, Any]) -> Dict[str, Optional[float]]:
59
  """Predicts the winner based on ELO and calculates win probability."""
60
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
61
 
 
87
  self.fighters_df = None
88
  self.fighter_histories = {}
89
 
90
+ def train(self, train_fights: List[Dict[str, Any]]) -> None:
91
  """
92
  Trains the machine learning model. This involves loading fighter data,
93
  pre-calculating histories, and fitting the model on the preprocessed data.
 
95
  print(f"--- Training {self.model.__class__.__name__} ---")
96
 
97
  # 1. Prepare data for prediction-time feature generation
98
+ self.fighters_df = prepare_fighters_data(pd.read_csv(FIGHTERS_CSV_PATH))
 
 
 
 
 
99
 
100
  # 2. Pre-calculate fighter histories
101
  train_fights_with_dates = []
 
133
  f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
134
  f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
135
 
136
+ f1_age = calculate_age(f1_stats.get('dob'), fight['event_date'])
137
+ f2_age = calculate_age(f2_stats.get('dob'), fight['event_date'])
138
 
139
  features = {
140
  'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
 
191
  class LGBMModel(BaseMLModel):
192
  """A thin wrapper for LightGBM's LGBMClassifier."""
193
  def __init__(self):
194
+ super().__init__(model=LGBMClassifier(random_state=42))
src/predict/pipeline.py CHANGED
@@ -149,16 +149,13 @@ class PredictionPipeline:
149
  print("No new data detected and all model files exist. Using existing models.")
150
  return False
151
 
152
- def _load_and_split_data(self, num_test_events=1):
153
  """Loads and splits the data into chronological training and testing sets."""
154
  print("\n--- Loading and Splitting Data ---")
155
  if not os.path.exists(FIGHTS_CSV_PATH):
156
  raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")
157
 
158
- with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
159
- fights = list(csv.DictReader(f))
160
-
161
- fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
162
 
163
  all_events = list(OrderedDict.fromkeys(f['event_name'] for f in fights))
164
  if len(all_events) < num_test_events:
@@ -171,7 +168,15 @@ class PredictionPipeline:
171
  print(f"Data loaded. {len(self.train_fights)} training fights, {len(self.test_fights)} testing fights.")
172
  print(f"Testing on the last {num_test_events} event(s): {', '.join(test_event_names)}")
173
 
174
- def run(self, detailed_report=True):
 
 
 
 
 
 
 
 
175
  """Executes the full pipeline: load, train, evaluate, report and save models."""
176
  self._load_and_split_data()
177
 
@@ -349,4 +354,4 @@ class PredictionPipeline:
349
  # A summary is printed to the console for convenience.
350
  self._report_summary()
351
  # The detailed report is now saved to a JSON file.
352
- self._save_report_to_json()
 
149
  print("No new data detected and all model files exist. Using existing models.")
150
  return False
151
 
152
+ def _load_and_split_data(self, num_test_events: int = 1) -> None:
153
  """Loads and splits the data into chronological training and testing sets."""
154
  print("\n--- Loading and Splitting Data ---")
155
  if not os.path.exists(FIGHTS_CSV_PATH):
156
  raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")
157
 
158
+ fights = self._load_fights()
 
 
 
159
 
160
  all_events = list(OrderedDict.fromkeys(f['event_name'] for f in fights))
161
  if len(all_events) < num_test_events:
 
168
  print(f"Data loaded. {len(self.train_fights)} training fights, {len(self.test_fights)} testing fights.")
169
  print(f"Testing on the last {num_test_events} event(s): {', '.join(test_event_names)}")
170
 
171
+ def _load_fights(self) -> list:
172
+ """Helper method to load and sort fights from CSV."""
173
+ with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
174
+ fights = list(csv.DictReader(f))
175
+
176
+ fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
177
+ return fights
178
+
179
+ def run(self, detailed_report: bool = True) -> None:
180
  """Executes the full pipeline: load, train, evaluate, report and save models."""
181
  self._load_and_split_data()
182
 
 
354
  # A summary is printed to the console for convenience.
355
  self._report_summary()
356
  # The detailed report is now saved to a JSON file.
357
+ self._save_report_to_json()
src/predict/preprocess.py CHANGED
@@ -1,53 +1,22 @@
1
  import pandas as pd
2
  import os
3
  from datetime import datetime
 
4
  from ..config import FIGHTERS_CSV_PATH
 
 
 
 
5
 
6
- def _clean_numeric_column(series):
7
- """A helper to clean string columns into numbers, handling errors."""
8
- series_str = series.astype(str)
9
- return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
10
-
11
- def _calculate_age(dob_str, fight_date_str):
12
- """Calculates age in years from a date of birth string and fight date string."""
13
- if pd.isna(dob_str) or not dob_str:
14
- return None
15
- try:
16
- dob = datetime.strptime(dob_str, '%b %d, %Y')
17
- fight_date = datetime.strptime(fight_date_str, '%B %d, %Y')
18
- return (fight_date - dob).days / 365.25
19
- except (ValueError, TypeError):
20
- return None
21
-
22
- def _parse_round_time_to_seconds(round_str, time_str):
23
- """Converts fight duration from round and time to total seconds."""
24
- try:
25
- rounds = int(round_str)
26
- minutes, seconds = map(int, time_str.split(':'))
27
- # Assuming 5-minute rounds for calculation simplicity
28
- return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
29
- except (ValueError, TypeError, AttributeError):
30
- return 0
31
-
32
- def _parse_striking_stats(stat_str):
33
- """Parses striking stats string like '10 of 20' into (landed, attempted)."""
34
- try:
35
- landed, attempted = map(int, stat_str.split(' of '))
36
- return landed, attempted
37
- except (ValueError, TypeError, AttributeError):
38
- return 0, 0
39
-
40
- def _to_int_safe(val):
41
- """Safely converts a value to an integer, returning 0 if it's invalid or empty."""
42
- if pd.isna(val):
43
- return 0
44
- try:
45
- # handle strings with whitespace or empty strings
46
- return int(str(val).strip() or 0)
47
- except (ValueError, TypeError):
48
- return 0
49
-
50
- def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
51
  """
52
  Calculates performance statistics for a fighter based on their last n fights.
53
  """
@@ -58,7 +27,7 @@ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history
58
  # Return a default dictionary with the correct keys for a fighter with no history
59
  return {
60
  'wins_last_n': 0,
61
- 'avg_opp_elo_last_n': 1500, # Assume average ELO for first opponent
62
  'ko_percent_last_n': 0,
63
  'sig_str_landed_per_min_last_n': 0,
64
  'takedown_accuracy_last_n': 0,
@@ -84,20 +53,20 @@ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history
84
 
85
  if opponent_name in fighters_df.index:
86
  opp_elo = fighters_df.loc[opponent_name, 'elo']
87
- stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else 1500)
88
 
89
- stats['total_time_secs'] += _parse_round_time_to_seconds(fight['round'], fight['time'])
90
 
91
  sig_str_stat = fight.get(f'{f_prefix}_sig_str', '0 of 0')
92
- landed, _ = _parse_striking_stats(sig_str_stat)
93
  stats['sig_str_landed'] += landed
94
 
95
  td_stat = fight.get(f'{f_prefix}_td', '0 of 0')
96
- td_landed, td_attempted = _parse_striking_stats(td_stat) # Can reuse this parser
97
  stats['td_landed'] += td_landed
98
  stats['td_attempted'] += td_attempted
99
 
100
- stats['sub_attempts'] += _to_int_safe(fight.get(f'{f_prefix}_sub_att'))
101
 
102
  # Final calculations
103
  avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
@@ -112,36 +81,26 @@ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history
112
  'sub_attempts_per_min_last_n': (stats['sub_attempts'] / total_minutes) if total_minutes > 0 else 0,
113
  }
114
 
115
- def preprocess_for_ml(fights_to_process, fighters_csv_path):
 
 
 
116
  """
117
  Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
118
  suitable for a binary classification machine learning model.
119
 
120
  Args:
121
- fights_to_process (list of dict): The list of fights to process.
122
- fighters_csv_path (str): Path to the CSV file with all fighter stats.
123
 
124
  Returns:
125
- pd.DataFrame: Feature matrix X.
126
- pd.Series: Target vector y.
127
- pd.DataFrame: Metadata DataFrame.
128
  """
129
  if not os.path.exists(fighters_csv_path):
130
  raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
131
 
132
  fighters_df = pd.read_csv(fighters_csv_path)
133
-
134
- # 1. Prepare fighters data for merging
135
- fighters_prepared = fighters_df.copy()
136
- fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']
137
-
138
- # Handle duplicate fighter names by keeping the first entry
139
- fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
140
- fighters_prepared = fighters_prepared.set_index('full_name')
141
-
142
- for col in ['height_cm', 'reach_in', 'elo']:
143
- if col in fighters_prepared.columns:
144
- fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])
145
 
146
  # 2. Pre-calculate fighter histories to speed up lookups
147
  # And convert date strings to datetime objects once
 
1
  import pandas as pd
2
  import os
3
  from datetime import datetime
4
+ from typing import Dict, List, Tuple, Any, Optional
5
  from ..config import FIGHTERS_CSV_PATH
6
+ from .utils import (
7
+ parse_round_time_to_seconds, parse_striking_stats, to_int_safe,
8
+ calculate_age, prepare_fighters_data, DEFAULT_ELO, N_FIGHTS_HISTORY
9
+ )
10
 
11
+
12
+
13
+ def _get_fighter_history_stats(
14
+ fighter_name: str,
15
+ current_fight_date: datetime,
16
+ fighter_history: List[Dict[str, Any]],
17
+ fighters_df: pd.DataFrame,
18
+ n: int = N_FIGHTS_HISTORY
19
+ ) -> Dict[str, float]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  """
21
  Calculates performance statistics for a fighter based on their last n fights.
22
  """
 
27
  # Return a default dictionary with the correct keys for a fighter with no history
28
  return {
29
  'wins_last_n': 0,
30
+ 'avg_opp_elo_last_n': DEFAULT_ELO,
31
  'ko_percent_last_n': 0,
32
  'sig_str_landed_per_min_last_n': 0,
33
  'takedown_accuracy_last_n': 0,
 
53
 
54
  if opponent_name in fighters_df.index:
55
  opp_elo = fighters_df.loc[opponent_name, 'elo']
56
+ stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else DEFAULT_ELO)
57
 
58
+ stats['total_time_secs'] += parse_round_time_to_seconds(fight['round'], fight['time'])
59
 
60
  sig_str_stat = fight.get(f'{f_prefix}_sig_str', '0 of 0')
61
+ landed, _ = parse_striking_stats(sig_str_stat)
62
  stats['sig_str_landed'] += landed
63
 
64
  td_stat = fight.get(f'{f_prefix}_td', '0 of 0')
65
+ td_landed, td_attempted = parse_striking_stats(td_stat)
66
  stats['td_landed'] += td_landed
67
  stats['td_attempted'] += td_attempted
68
 
69
+ stats['sub_attempts'] += to_int_safe(fight.get(f'{f_prefix}_sub_att'))
70
 
71
  # Final calculations
72
  avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
 
81
  'sub_attempts_per_min_last_n': (stats['sub_attempts'] / total_minutes) if total_minutes > 0 else 0,
82
  }
83
 
84
+ def preprocess_for_ml(
85
+ fights_to_process: List[Dict[str, Any]],
86
+ fighters_csv_path: str
87
+ ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
88
  """
89
  Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
90
  suitable for a binary classification machine learning model.
91
 
92
  Args:
93
+ fights_to_process: The list of fights to process.
94
+ fighters_csv_path: Path to the CSV file with all fighter stats.
95
 
96
  Returns:
97
+ Feature matrix X, target vector y, and metadata DataFrame.
 
 
98
  """
99
  if not os.path.exists(fighters_csv_path):
100
  raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")
101
 
102
  fighters_df = pd.read_csv(fighters_csv_path)
103
+ fighters_prepared = prepare_fighters_data(fighters_df)
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  # 2. Pre-calculate fighter histories to speed up lookups
106
  # And convert date strings to datetime objects once
src/predict/utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ from datetime import datetime
4
+ from typing import Optional, Dict, Any
5
+
6
+ # Constants
7
+ DEFAULT_ELO = 1500
8
+ DEFAULT_AGE = 0
9
+ DEFAULT_FIGHT_TIME = 0
10
+ DEFAULT_ROUNDS_DURATION = 5 * 60 # 5 minutes per round
11
+ N_FIGHTS_HISTORY = 5
12
+
13
+ def clean_numeric_column(series: pd.Series) -> pd.Series:
14
+ """A helper to clean string columns into numbers, handling errors."""
15
+ series_str = series.astype(str)
16
+ return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
17
+
18
+ def calculate_age(dob_str: str, fight_date_str: str) -> Optional[float]:
19
+ """Calculates age in years from a date of birth string and fight date string."""
20
+ if pd.isna(dob_str) or not dob_str:
21
+ return None
22
+ try:
23
+ dob = datetime.strptime(dob_str, '%b %d, %Y')
24
+ fight_date = datetime.strptime(fight_date_str, '%B %d, %Y')
25
+ return (fight_date - dob).days / 365.25
26
+ except (ValueError, TypeError):
27
+ return None
28
+
29
+ def parse_round_time_to_seconds(round_str: str, time_str: str) -> int:
30
+ """Converts fight duration from round and time to total seconds."""
31
+ try:
32
+ rounds = int(round_str)
33
+ minutes, seconds = map(int, time_str.split(':'))
34
+ # Assuming 5-minute rounds for calculation simplicity
35
+ return ((rounds - 1) * DEFAULT_ROUNDS_DURATION) + (minutes * 60) + seconds
36
+ except (ValueError, TypeError, AttributeError):
37
+ return 0
38
+
39
+ def parse_striking_stats(stat_str: str) -> tuple[int, int]:
40
+ """Parses striking stats string like '10 of 20' into (landed, attempted)."""
41
+ try:
42
+ landed, attempted = map(int, stat_str.split(' of '))
43
+ return landed, attempted
44
+ except (ValueError, TypeError, AttributeError):
45
+ return 0, 0
46
+
47
+ def to_int_safe(val: Any) -> int:
48
+ """Safely converts a value to an integer, returning 0 if it's invalid or empty."""
49
+ if pd.isna(val):
50
+ return 0
51
+ try:
52
+ # handle strings with whitespace or empty strings
53
+ return int(str(val).strip() or 0)
54
+ except (ValueError, TypeError):
55
+ return 0
56
+
57
+ def prepare_fighters_data(fighters_df: pd.DataFrame) -> pd.DataFrame:
58
+ """Prepares fighter data for analysis by cleaning and standardizing."""
59
+ fighters_prepared = fighters_df.copy()
60
+ fighters_prepared['full_name'] = fighters_prepared['first_name'] + ' ' + fighters_prepared['last_name']
61
+
62
+ # Handle duplicate fighter names by keeping the first entry
63
+ fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
64
+ fighters_prepared = fighters_prepared.set_index('full_name')
65
+
66
+ for col in ['height_cm', 'reach_in', 'elo']:
67
+ if col in fighters_prepared.columns:
68
+ fighters_prepared[col] = clean_numeric_column(fighters_prepared[col])
69
+
70
+ return fighters_prepared