AlvaroMros committed
Commit 9678fdb · 1 Parent(s): 53745df

Refactor imports to use absolute paths and clean up scripts

Standardized internal imports for better maintainability and to avoid import errors: the entry points (app.py, src/main.py) now use absolute src.* imports, while modules inside the packages use explicit relative imports. Removed the sys.path manipulations, try/except import fallbacks, and direct script-execution blocks from modules, making the codebase more modular and suitable for package usage.
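
For reference, this is the pattern being removed, shown next to its replacement (a minimal sketch; both forms are taken verbatim from the diffs below):

    # Old pattern (removed): absolute import with a relative-import
    # fallback so the module could also be run as a standalone script.
    try:
        from src.config import MODELS_DIR
    except ImportError:
        # Fallback for when running directly
        from ..config import MODELS_DIR

    # New pattern: a single explicit import. This works because modules
    # are now always loaded as part of the package, e.g. via
    # 'python -m src.main' rather than 'python src/predict/predict_new.py'.
    from ..config import MODELS_DIR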

.gitignore CHANGED
@@ -1,3 +1,3 @@
  *__pycache__/
  example_event.html
- web/
+ web/
app.py CHANGED
@@ -2,25 +2,17 @@ import gradio as gr
  import joblib
  from datetime import datetime
  import os
- import sys

- # --- Path and Module Setup ---
- # Add the 'src' directory to the system path so we can import our custom modules.
- sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
-
- # Although these models are not called directly, they MUST be imported here.
- # joblib.load() needs these class definitions in scope to deserialize the model files correctly.
  from src.predict.models import (
- BaseMLModel,
- EloBaselineModel,
- LogisticRegressionModel,
+ BaseMLModel,
+ EloBaselineModel,
+ LogisticRegressionModel,
  XGBoostModel,
  SVCModel,
  RandomForestModel,
  BernoulliNBModel,
  LGBMModel
  )
- # Import the configuration variable for the models directory for consistency.
  from src.config import MODELS_DIR

  # --- Model Cache ---
@@ -53,9 +45,9 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
  model_path = os.path.join(MODELS_DIR, model_name)
  MODEL_CACHE[model_name] = joblib.load(model_path)
  print("...model cached.")
-
+
  model = MODEL_CACHE[model_name]
-
+
  fight = {
  'fighter_1': fighter1_name,
  'fighter_2': fighter2_name,
@@ -63,14 +55,14 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
  }

  prediction_result = model.predict(fight)
-
+
  if prediction_result and prediction_result.get('winner'):
  winner = prediction_result['winner']
  prob = prediction_result['probability']
  return winner, f"{prob:.1%}"
  else:
  return "Could not make a prediction.", ""
-
+
  except FileNotFoundError:
  return f"Error: Model file '{model_name}' not found.", ""
  except Exception as e:
@@ -81,19 +73,19 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  gr.Markdown("# 🥋 UFC Fight Predictor 🥊")
  gr.Markdown("Select a prediction model and enter two fighter names to predict the outcome.")
-
+
  with gr.Column():
  model_dropdown = gr.Dropdown(
- label="Select Model",
+ label="Select Model",
  choices=available_models,
  value=available_models[0] if available_models else None
  )
  with gr.Row():
  fighter1_input = gr.Textbox(label="Fighter 1", placeholder="e.g., Jon Jones")
  fighter2_input = gr.Textbox(label="Fighter 2", placeholder="e.g., Stipe Miocic")
-
+
  predict_button = gr.Button("Predict Winner")
-
+
  with gr.Column():
  winner_output = gr.Textbox(label="Predicted Winner", interactive=False)
  prob_output = gr.Textbox(label="Confidence", interactive=False)
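
One removed comment in app.py is worth preserving in spirit: the model classes are still imported even though they are never referenced directly, because joblib.load() needs their definitions importable to deserialize the saved models. A standalone sketch of the mechanism (not repo code; names are placeholders):

    import joblib

    class DummyModel:  # stands in for e.g. XGBoostModel
        def predict(self, fight):
            return {'winner': fight['fighter_1'], 'probability': 0.5}

    joblib.dump(DummyModel(), 'model.joblib')
    # Pickle stores only a reference (module path + class name), so in a
    # fresh process joblib.load() raises AttributeError unless DummyModel
    # is importable under the same module path it was saved from.
    model = joblib.load('model.joblib')
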
logs/startup_update.log CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
 
src/analysis/elo.py CHANGED
@@ -127,10 +127,4 @@ def main():
  print("\n--- Top 10 Fighters by ELO Rating ---")
  for i, (fighter, elo) in enumerate(sorted_fighters[:10]):
  print(f"{i+1}. {fighter}: {round(elo)}")
- print("------------------------------------")
-
- if __name__ == '__main__':
- # Create the directory if it doesn't exist to avoid confusion
- if not os.path.exists('src/analysis'):
- os.makedirs('src/analysis')
- main()
+ print("------------------------------------")
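
The ELO computation itself (process_fights_for_elo, INITIAL_ELO) is untouched; only the final print line and the direct-execution block change here. For orientation, the standard Elo update such a function is typically built around (a sketch only; the repo's K-factor and scaling are not shown in this diff):

    def elo_update(winner_elo, loser_elo, k=32):
        # Expected score of the winner under the logistic Elo model.
        expected = 1 / (1 + 10 ** ((loser_elo - winner_elo) / 400))
        # Winner gains and loser loses in proportion to the surprise.
        winner_elo += k * (1 - expected)
        loser_elo -= k * (1 - expected)
        return winner_elo, loser_elo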
 
src/main.py CHANGED
@@ -2,9 +2,6 @@ import argparse
  import sys
  import os

- # Add the current directory to Python path for imports
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
  def main():
  """
  Main entry point for the UFC data pipeline.
@@ -55,7 +52,7 @@ def main():

  if args.pipeline in ['scrape', 'all']:
  print("=== Running Scraping Pipeline ===")
- from scrape.main import main as scrape_main
+ from src.scrape.main import main as scrape_main

  # Override sys.argv to pass arguments to scrape.main
  original_argv = sys.argv
@@ -67,7 +64,7 @@ def main():

  if args.pipeline in ['analysis', 'all']:
  print("\n=== Running ELO Analysis ===")
- from analysis.elo import main as elo_main
+ from src.analysis.elo import main as elo_main
  elo_main()

  if args.pipeline == 'update':
@@ -85,7 +82,7 @@ def main():

  if args.pipeline in ['predict', 'all']:
  print("\n=== Running Prediction Pipeline ===")
- from predict.main import main as predict_main
+ from src.predict.main import main as predict_main

  # Override sys.argv to pass model management arguments to predict.main
  original_argv = sys.argv
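
The sys.argv save/override/restore dance visible in the context lines is what lets main() delegate to each sub-pipeline's own argparse parser. Its shape, sketched with assumed argument details (only the save and restore lines appear in the diff):

    import sys

    def run_with_args(sub_main, argv):
        original_argv = sys.argv
        try:
            sys.argv = argv  # what the sub-pipeline's argparse will see
            sub_main()
        finally:
            sys.argv = original_argv  # always restore the caller's argv
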
src/predict/main.py CHANGED
@@ -1,8 +1,7 @@
  import argparse

- # Use absolute imports to avoid relative import issues
- from src.predict.pipeline import PredictionPipeline
- from src.predict.models import (
+ from .pipeline import PredictionPipeline
+ from .models import (
  EloBaselineModel,
  LogisticRegressionModel,
  XGBoostModel,
@@ -93,6 +92,3 @@ def main():
  except FileNotFoundError as e:
  print(f"Error: {e}")
  print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
-
- if __name__ == '__main__':
- main()
 
 
 
src/predict/models.py CHANGED
@@ -8,17 +8,9 @@ from sklearn.naive_bayes import BernoulliNB
  from sklearn.ensemble import RandomForestClassifier
  from xgboost import XGBClassifier
  from lightgbm import LGBMClassifier
-
- # Use absolute imports to avoid relative import issues
- try:
- from src.analysis.elo import process_fights_for_elo, INITIAL_ELO
- from src.config import FIGHTERS_CSV_PATH
- from src.predict.preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
- except ImportError:
- # Fallback for when running directly
- from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
- from ..config import FIGHTERS_CSV_PATH
- from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
+ from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
+ from ..config import FIGHTERS_CSV_PATH
+ from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age

  class BaseModel(ABC):
  """
src/predict/pipeline.py CHANGED
@@ -1,4 +1,3 @@
- #!/usr/bin/env python3
  """
  UFC Fight Prediction Pipeline

@@ -20,19 +19,11 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.

  import csv
  import os
- import sys
  from datetime import datetime
  from collections import OrderedDict
  import json
  import joblib
-
- # Use absolute imports to avoid relative import issues
- try:
- from src.config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
- except ImportError:
- # Fallback for when running directly
- from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
-
+ from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
  from .models import BaseModel

  class PredictionPipeline:
src/predict/predict_new.py CHANGED
@@ -3,12 +3,7 @@ import os
  import joblib
  from datetime import datetime

- # Use absolute imports to avoid relative import issues
- try:
- from src.config import MODELS_DIR
- except ImportError:
- # Fallback for when running directly
- from ..config import MODELS_DIR
+ from ..config import MODELS_DIR

  def predict_new_fight(fighter1_name, fighter2_name, model_path):
  """
@@ -43,18 +38,4 @@ def predict_new_fight(fighter1_name, fighter2_name, model_path):
  prob = prediction_result['probability']
  print(f"\n---> Predicted Winner: {winner} ({prob:.1%}) <---")
  else:
- print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
-
- if __name__ == '__main__':
- parser = argparse.ArgumentParser(description="Predict the outcome of a new UFC fight.")
- parser.add_argument('fighter1', type=str, help="The full name of the first fighter (e.g., 'Jon Jones').")
- parser.add_argument('fighter2', type=str, help="The full name of the second fighter (e.g., 'Stipe Miocic').")
- parser.add_argument(
- '--model_path',
- type=str,
- default=os.path.join(MODELS_DIR, 'XGBoostModel.joblib'),
- help="Path to the saved model file."
- )
- args = parser.parse_args()
-
- predict_new_fight(args.fighter1, args.fighter2, args.model_path)
+ print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
 
src/predict/preprocess.py CHANGED
@@ -1,14 +1,7 @@
  import pandas as pd
  import os
- import sys
  from datetime import datetime
-
- # Use absolute imports to avoid relative import issues
- try:
- from src.config import FIGHTERS_CSV_PATH
- except ImportError:
- # Fallback for when running directly
- from ..config import FIGHTERS_CSV_PATH
+ from ..config import FIGHTERS_CSV_PATH

  def _clean_numeric_column(series):
  """A helper to clean string columns into numbers, handling errors."""
@@ -236,36 +229,3 @@ def preprocess_for_ml(fights_to_process, fighters_csv_path):

  print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
  return X, y, metadata
-
- if __name__ == '__main__':
- # Use absolute imports to avoid relative import issues
- try:
- from src.predict.pipeline import PredictionPipeline
- except ImportError:
- # Fallback for when running directly
- from .pipeline import PredictionPipeline
-
- print("--- Running Preprocessing Example ---")
-
- pipeline = PredictionPipeline(models=[])
- try:
- pipeline._load_and_split_data()
- if pipeline.train_fights:
- X_train, y_train, metadata_train = preprocess_for_ml(pipeline.train_fights, FIGHTERS_CSV_PATH)
- print("\nTraining Data Shape:")
- print("X_train:", X_train.shape)
- print("y_train:", y_train.shape)
- print("metadata_train:", metadata_train.shape)
-
- print("\nLast 5 rows of X_train (showing populated historical features):")
- print(X_train.tail())
-
- print("\nTarget distribution (0=Loss, 1=Win):")
- print(y_train.value_counts())
-
- print("\nMetadata for last 5 rows:")
- print(metadata_train.tail())
-
- except FileNotFoundError as e:
- print(e)
- print("Please run the scraping pipeline first ('python -m src.scrape.main').")
 
src/scrape/main.py CHANGED
@@ -6,7 +6,13 @@ from .scrape_fights import scrape_all_events, scrape_latest_events
  from .scrape_fighters import scrape_all_fighters
  from .to_csv import json_to_csv, fighters_json_to_csv
  from .preprocess import preprocess_fighters_csv
- from .. import config
+ from ..config import (
+ OUTPUT_DIR,
+ FIGHTERS_JSON_PATH,
+ EVENTS_JSON_PATH,
+ FIGHTS_CSV_PATH,
+ LAST_EVENT_JSON_PATH
+ )

  def main():
  """
@@ -31,9 +37,9 @@ def main():
  args = parser.parse_args()

  # Ensure the output directory exists
- if not os.path.exists(config.OUTPUT_DIR):
- os.makedirs(config.OUTPUT_DIR)
- print(f"Created directory: {config.OUTPUT_DIR}")
+ if not os.path.exists(OUTPUT_DIR):
+ os.makedirs(OUTPUT_DIR)
+ print(f"Created directory: {OUTPUT_DIR}")

  if args.mode == 'full':
  run_full_pipeline()
@@ -48,13 +54,13 @@ def run_full_pipeline():

  # --- Step 1: Scrape all data from the website ---
  # This will generate fighters.json and events.json
- scrape_all_fighters(config.FIGHTERS_JSON_PATH)
- scrape_all_events(config.EVENTS_JSON_PATH)
+ scrape_all_fighters(FIGHTERS_JSON_PATH)
+ scrape_all_events(EVENTS_JSON_PATH)

  # --- Step 2: Convert the scraped JSON data to CSV format ---
  # This will generate fighters.csv and fights.csv
- json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
- fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
+ json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
+ fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)

  # --- Step 3: Run post-processing on the generated CSV files ---
  # This cleans names, converts height, etc.
@@ -64,12 +70,12 @@ def run_full_pipeline():
  # --- Step 4: Clean up temporary JSON files ---
  print("\n--- Deleting temporary JSON files ---")
  try:
- if os.path.exists(config.EVENTS_JSON_PATH):
- os.remove(config.EVENTS_JSON_PATH)
- print(f"Deleted: {config.EVENTS_JSON_PATH}")
- if os.path.exists(config.FIGHTERS_JSON_PATH):
- os.remove(config.FIGHTERS_JSON_PATH)
- print(f"Deleted: {config.FIGHTERS_JSON_PATH}")
+ if os.path.exists(EVENTS_JSON_PATH):
+ os.remove(EVENTS_JSON_PATH)
+ print(f"Deleted: {EVENTS_JSON_PATH}")
+ if os.path.exists(FIGHTERS_JSON_PATH):
+ os.remove(FIGHTERS_JSON_PATH)
+ print(f"Deleted: {FIGHTERS_JSON_PATH}")
  except OSError as e:
  print(f"Error deleting JSON files: {e}")

@@ -86,13 +92,13 @@ def run_update_pipeline(num_events=5):
  print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")

  # --- Step 1: Scrape latest events only ---
- latest_events = scrape_latest_events(config.LAST_EVENT_JSON_PATH, num_events)
+ latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)

  # --- Step 2: Save latest events to last_event.json (even if empty) ---
  if latest_events:
- with open(config.LAST_EVENT_JSON_PATH, 'w') as f:
+ with open(LAST_EVENT_JSON_PATH, 'w') as f:
  json.dump(latest_events, f, indent=4)
- print(f"Latest {len(latest_events)} events saved to {config.LAST_EVENT_JSON_PATH}")
+ print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")

  # --- Step 3: Always check and update from last_event.json ---
  update_fights_csv_from_last_event()
@@ -105,13 +111,13 @@ def update_fights_csv_from_last_event():
  Ensures latest events are on top and preserves data types.
  """
  # Check if last_event.json exists
- if not os.path.exists(config.LAST_EVENT_JSON_PATH):
- print(f"No {config.LAST_EVENT_JSON_PATH} found. Nothing to update.")
+ if not os.path.exists(LAST_EVENT_JSON_PATH):
+ print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
  return

  # Load events from last_event.json
  try:
- with open(config.LAST_EVENT_JSON_PATH, 'r') as f:
+ with open(LAST_EVENT_JSON_PATH, 'r') as f:
  events_from_json = json.load(f)

  if not events_from_json:
@@ -126,17 +132,17 @@ def update_fights_csv_from_last_event():

  try:
  # Check if main CSV exists
- if os.path.exists(config.FIGHTS_CSV_PATH):
- existing_df = pd.read_csv(config.FIGHTS_CSV_PATH)
+ if os.path.exists(FIGHTS_CSV_PATH):
+ existing_df = pd.read_csv(FIGHTS_CSV_PATH)
  existing_event_names = set(existing_df['event_name'].unique())
  else:
- print(f"Main fights CSV ({config.FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
- json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
+ print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
+ json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
  return

  # Create temporary CSV from events in last_event.json
- temp_json_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.json')
- temp_csv_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.csv')
+ temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
+ temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')

  with open(temp_json_path, 'w') as f:
  json.dump(events_from_json, f, indent=4)
@@ -165,8 +171,8 @@ def update_fights_csv_from_last_event():
  # Fix data types to remove .0 from numbers
  fix_data_types(combined_df)

- combined_df.to_csv(config.FIGHTS_CSV_PATH, index=False)
- print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {config.FIGHTS_CSV_PATH}")
+ combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
+ print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
  else:
  print("No new events found that aren't already in the existing CSV.")

@@ -179,7 +185,7 @@ def update_fights_csv_from_last_event():
  except Exception as e:
  print(f"Error updating fights CSV: {e}")
  print("Falling back to creating new CSV from last_event.json only.")
- json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
+ json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)

  def fix_data_types(df):
  """
@@ -200,6 +206,3 @@ def fix_data_types(df):
  df[col] = df[col].str.replace(r'\.0$', '', regex=True)
  # Convert empty strings back to original empty values
  df[col] = df[col].replace('', '')
-
- if __name__ == '__main__':
- main()
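
The fix_data_types() helper above works around a pandas round-trip artifact: integer columns that contain blanks are read back as floats, so whole numbers would be rewritten as '25.0'. A standalone sketch of the idea behind that regex (not repo code):

    import pandas as pd

    df = pd.DataFrame({'round': [1.0, 3.0, None]})    # ints became floats
    col = df['round'].astype(str).replace('nan', '')  # blank out missing
    col = col.str.replace(r'\.0$', '', regex=True)    # '1.0' -> '1'
    print(col.tolist())  # ['1', '3', '']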
 
 
 
src/scrape/preprocess.py CHANGED
@@ -1,6 +1,6 @@
  import csv
  import os
- from .. import config
+ from ..config import FIGHTERS_CSV_PATH

  def convert_height_to_cm(height_str):
  """
@@ -24,7 +24,7 @@ def convert_height_to_cm(height_str):
  # Return original value if parsing fails
  return height_str

- def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
+ def preprocess_fighters_csv(file_path=FIGHTERS_CSV_PATH):
  """
  Reads the fighters CSV, cleans names, converts height to cm,
  and saves the changes back to the same file.
@@ -78,7 +78,4 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
  print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")

  except Exception as e:
- print(f"An error occurred: {e}")
-
- if __name__ == '__main__':
- preprocess_fighters_csv()
+ print(f"An error occurred: {e}")
 
 
 
src/scrape/scrape_fighters.py CHANGED
@@ -5,7 +5,7 @@ import time
  import string
  import concurrent.futures
  import os
- from .. import config
+ from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR

  # --- Configuration ---
  # The number of parallel threads to use for scraping fighter details.
@@ -133,14 +133,4 @@ def scrape_all_fighters(json_path):
  json.dump(fighters_with_details, f, indent=4)

  fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
- return fighters_with_details
-
- if __name__ == "__main__":
- all_fighters_data = scrape_all_fighters(config.FIGHTERS_JSON_PATH)
- if not os.path.exists(config.OUTPUT_DIR):
- os.makedirs(config.OUTPUT_DIR)
-
- with open(config.FIGHTERS_JSON_PATH, 'w') as f:
- json.dump(all_fighters_data, f, indent=4)
-
- print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to {config.FIGHTERS_JSON_PATH}")
+ return fighters_with_details
 
src/scrape/scrape_fights.py CHANGED
@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
  import json
  import time
  import concurrent.futures
- from .. import config
+ from ..config import EVENTS_JSON_PATH

  # --- Configuration ---
  # The number of parallel threads to use for scraping fight details.
@@ -255,9 +255,3 @@ def scrape_latest_events(json_path, num_events=5):
  print(f"Could not process event {event_url}. Error: {e}")

  return events
-
- if __name__ == "__main__":
- all_events_data = scrape_all_events(config.EVENTS_JSON_PATH)
- with open(config.EVENTS_JSON_PATH, 'w') as f:
- json.dump(all_events_data, f, indent=4)
- print(f"\nScraping complete. Final data saved to {config.EVENTS_JSON_PATH}")
 
src/scrape/to_csv.py CHANGED
@@ -1,6 +1,6 @@
  import json
  import csv
- from .. import config
+ from ..config import EVENTS_JSON_PATH, FIGHTS_CSV_PATH, FIGHTERS_JSON_PATH

  def json_to_csv(json_file_path, csv_file_path):
  try:
@@ -137,8 +137,4 @@ def fighters_json_to_csv(json_file_path, csv_file_path):
  cleaned_row = {key: clean_value(fighter_data.get(key, '')) for key in headers}
  writer.writerow(cleaned_row)

- print(f"Successfully converted {json_file_path} to {csv_file_path}")
-
- if __name__ == '__main__':
- json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
- fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
+ print(f"Successfully converted {json_file_path} to {csv_file_path}")