Commit 9678fdb · Parent: 53745df
Refactor imports to use absolute paths and clean up scripts
Standardized all internal imports to use absolute paths for better maintainability and to avoid import errors. Removed unnecessary sys.path manipulations and direct script execution blocks from modules, making the codebase more modular and suitable for package usage.
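For context, a minimal sketch of the pattern this commit removes and the one it adopts (both import lines are taken from the src/main.py hunks below; the rest is illustrative and not runnable outside this repo):

# Old pattern (removed): patch sys.path at import time so that bare module
# names resolve when a file is executed directly as a script.
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from scrape.main import main as scrape_main  # resolves only thanks to the path hack

# New pattern: package-qualified imports that resolve when the code runs as
# a package, e.g. "python -m src.main", with no path manipulation.
from src.scrape.main import main as scrape_main  # noqa: F811 -- shown for contrast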
- .gitignore +1 -1
- app.py +11 -19
- logs/startup_update.log +0 -0
- src/analysis/elo.py +1 -7
- src/main.py +3 -6
- src/predict/main.py +2 -6
- src/predict/models.py +3 -11
- src/predict/pipeline.py +1 -10
- src/predict/predict_new.py +2 -21
- src/predict/preprocess.py +1 -41
- src/scrape/main.py +35 -32
- src/scrape/preprocess.py +3 -6
- src/scrape/scrape_fighters.py +2 -12
- src/scrape/scrape_fights.py +1 -7
- src/scrape/to_csv.py +2 -6
.gitignore
CHANGED
@@ -1,3 +1,3 @@
 *__pycache__/
 example_event.html
-web/
+web/
app.py
CHANGED
@@ -2,25 +2,17 @@ import gradio as gr
 import joblib
 from datetime import datetime
 import os
-import sys
 
-# --- Path and Module Setup ---
-# Add the 'src' directory to the system path so we can import our custom modules.
-sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
-
-# Although these models are not called directly, they MUST be imported here.
-# joblib.load() needs these class definitions in scope to deserialize the model files correctly.
 from src.predict.models import (
-    BaseMLModel,
-    EloBaselineModel,
-    LogisticRegressionModel,
+    BaseMLModel,
+    EloBaselineModel,
+    LogisticRegressionModel,
     XGBoostModel,
     SVCModel,
     RandomForestModel,
     BernoulliNBModel,
     LGBMModel
 )
-# Import the configuration variable for the models directory for consistency.
 from src.config import MODELS_DIR
 
 # --- Model Cache ---
@@ -53,9 +45,9 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
         model_path = os.path.join(MODELS_DIR, model_name)
         MODEL_CACHE[model_name] = joblib.load(model_path)
         print("...model cached.")
-
+
         model = MODEL_CACHE[model_name]
-
+
         fight = {
             'fighter_1': fighter1_name,
             'fighter_2': fighter2_name,
@@ -63,14 +55,14 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
         }
 
         prediction_result = model.predict(fight)
-
+
         if prediction_result and prediction_result.get('winner'):
             winner = prediction_result['winner']
             prob = prediction_result['probability']
             return winner, f"{prob:.1%}"
         else:
             return "Could not make a prediction.", ""
-
+
     except FileNotFoundError:
         return f"Error: Model file '{model_name}' not found.", ""
     except Exception as e:
@@ -81,19 +73,19 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🥋 UFC Fight Predictor 🥊")
     gr.Markdown("Select a prediction model and enter two fighter names to predict the outcome.")
-
+
     with gr.Column():
         model_dropdown = gr.Dropdown(
-            label="Select Model",
+            label="Select Model",
             choices=available_models,
             value=available_models[0] if available_models else None
         )
         with gr.Row():
             fighter1_input = gr.Textbox(label="Fighter 1", placeholder="e.g., Jon Jones")
             fighter2_input = gr.Textbox(label="Fighter 2", placeholder="e.g., Stipe Miocic")
-
+
         predict_button = gr.Button("Predict Winner")
-
+
     with gr.Column():
         winner_output = gr.Textbox(label="Predicted Winner", interactive=False)
         prob_output = gr.Textbox(label="Confidence", interactive=False)
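One deleted comment above deserves a gloss: joblib serializes with pickle, and pickle records each object's class by its module path, so that path must be importable when the file is loaded. A minimal sketch of the constraint (the model filename is borrowed from the default used elsewhere in this commit):

import os

import joblib

# pickle stored "src.predict.models.XGBoostModel" inside the model file, so
# that module path must be importable here; importing the class up front, as
# app.py does, is what the deleted comment was guarding.
from src.predict.models import XGBoostModel  # noqa: F401 -- needed for unpickling
from src.config import MODELS_DIR

model = joblib.load(os.path.join(MODELS_DIR, "XGBoostModel.joblib"))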
logs/startup_update.log
CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
src/analysis/elo.py
CHANGED
@@ -127,10 +127,4 @@ def main():
     print("\n--- Top 10 Fighters by ELO Rating ---")
     for i, (fighter, elo) in enumerate(sorted_fighters[:10]):
         print(f"{i+1}. {fighter}: {round(elo)}")
-    print("------------------------------------")
-
-if __name__ == '__main__':
-    # Create the directory if it doesn't exist to avoid confusion
-    if not os.path.exists('src/analysis'):
-        os.makedirs('src/analysis')
-    main()
+    print("------------------------------------")
src/main.py
CHANGED
@@ -2,9 +2,6 @@ import argparse
 import sys
 import os
 
-# Add the current directory to Python path for imports
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
 def main():
     """
     Main entry point for the UFC data pipeline.
@@ -55,7 +52,7 @@ def main():
 
     if args.pipeline in ['scrape', 'all']:
         print("=== Running Scraping Pipeline ===")
-        from scrape.main import main as scrape_main
+        from src.scrape.main import main as scrape_main
 
         # Override sys.argv to pass arguments to scrape.main
         original_argv = sys.argv
@@ -67,7 +64,7 @@ def main():
 
     if args.pipeline in ['analysis', 'all']:
         print("\n=== Running ELO Analysis ===")
-        from analysis.elo import main as elo_main
+        from src.analysis.elo import main as elo_main
         elo_main()
 
     if args.pipeline == 'update':
@@ -85,7 +82,7 @@ def main():
 
     if args.pipeline in ['predict', 'all']:
         print("\n=== Running Prediction Pipeline ===")
-        from predict.main import main as predict_main
+        from src.predict.main import main as predict_main
 
         # Override sys.argv to pass model management arguments to predict.main
         original_argv = sys.argv
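The hunks above also show how the pipeline dispatches to sub-pipelines: each sub-main parses its own arguments with argparse, so src/main.py temporarily swaps sys.argv before calling it. A minimal sketch of that pattern (the helper name and flag values are illustrative):

import sys

def call_with_args(sub_main, argv):
    """Run a sub-pipeline's main() as though it were invoked with argv."""
    original_argv = sys.argv
    try:
        sys.argv = [sys.argv[0]] + argv  # argparse reads sys.argv[1:]
        sub_main()
    finally:
        sys.argv = original_argv  # restore even if the sub-pipeline raises

# e.g. call_with_args(scrape_main, ["--mode", "update"])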
src/predict/main.py
CHANGED
@@ -1,8 +1,7 @@
 import argparse
 
-
-from src.predict.pipeline import PredictionPipeline
-from src.predict.models import (
+from .pipeline import PredictionPipeline
+from .models import (
     EloBaselineModel,
     LogisticRegressionModel,
     XGBoostModel,
@@ -93,6 +92,3 @@ def main():
     except FileNotFoundError as e:
         print(f"Error: {e}")
         print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
-
-if __name__ == '__main__':
-    main()
src/predict/models.py
CHANGED
@@ -8,17 +8,9 @@ from sklearn.naive_bayes import BernoulliNB
 from sklearn.ensemble import RandomForestClassifier
 from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
-
-
-try:
-    from src.analysis.elo import process_fights_for_elo, INITIAL_ELO
-    from src.config import FIGHTERS_CSV_PATH
-    from src.predict.preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
-except ImportError:
-    # Fallback for when running directly
-    from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
-    from ..config import FIGHTERS_CSV_PATH
-    from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
+from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
+from ..config import FIGHTERS_CSV_PATH
+from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
 
 class BaseModel(ABC):
     """
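Dropping the try/except ImportError fallback means these modules now assume package context: a relative import such as from ..config import FIGHTERS_CSV_PATH only resolves when the module is imported as part of the src package. An illustrative guard (not part of the commit) makes the failure mode concrete:

# Run directly ("python src/predict/models.py"), __package__ is empty and a
# relative import raises "attempted relative import with no known parent
# package"; imported as part of the package, it resolves normally.
if __package__ in (None, ""):
    raise SystemExit("Run via the package, e.g.: python -m src.scrape.main")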
src/predict/pipeline.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
 UFC Fight Prediction Pipeline
 
@@ -20,19 +19,11 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import csv
 import os
-import sys
 from datetime import datetime
 from collections import OrderedDict
 import json
 import joblib
-
-# Use absolute imports to avoid relative import issues
-try:
-    from src.config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
-except ImportError:
-    # Fallback for when running directly
-    from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
-
+from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
 from .models import BaseModel
 
 class PredictionPipeline:
src/predict/predict_new.py
CHANGED
@@ -3,12 +3,7 @@ import os
 import joblib
 from datetime import datetime
 
-
-try:
-    from src.config import MODELS_DIR
-except ImportError:
-    # Fallback for when running directly
-    from ..config import MODELS_DIR
+from ..config import MODELS_DIR
 
 def predict_new_fight(fighter1_name, fighter2_name, model_path):
     """
@@ -43,18 +38,4 @@ def predict_new_fight(fighter1_name, fighter2_name, model_path):
         prob = prediction_result['probability']
         print(f"\n---> Predicted Winner: {winner} ({prob:.1%}) <---")
     else:
-        print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Predict the outcome of a new UFC fight.")
-    parser.add_argument('fighter1', type=str, help="The full name of the first fighter (e.g., 'Jon Jones').")
-    parser.add_argument('fighter2', type=str, help="The full name of the second fighter (e.g., 'Stipe Miocic').")
-    parser.add_argument(
-        '--model_path',
-        type=str,
-        default=os.path.join(MODELS_DIR, 'XGBoostModel.joblib'),
-        help="Path to the saved model file."
-    )
-    args = parser.parse_args()
-
-    predict_new_fight(args.fighter1, args.fighter2, args.model_path)
+        print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
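With its __main__ block gone, predict_new.py is now import-only. A sketch of the equivalent call (fighter names taken from the old argparse help strings, model file from the old --model_path default):

import os

from src.config import MODELS_DIR
from src.predict.predict_new import predict_new_fight

# Mirrors what the removed CLI did with its defaults.
predict_new_fight("Jon Jones", "Stipe Miocic",
                  os.path.join(MODELS_DIR, "XGBoostModel.joblib"))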
src/predict/preprocess.py
CHANGED
@@ -1,14 +1,7 @@
 import pandas as pd
 import os
-import sys
 from datetime import datetime
-
-# Use absolute imports to avoid relative import issues
-try:
-    from src.config import FIGHTERS_CSV_PATH
-except ImportError:
-    # Fallback for when running directly
-    from ..config import FIGHTERS_CSV_PATH
+from ..config import FIGHTERS_CSV_PATH
 
 def _clean_numeric_column(series):
     """A helper to clean string columns into numbers, handling errors."""
@@ -236,36 +229,3 @@ def preprocess_for_ml(fights_to_process, fighters_csv_path):
 
     print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
     return X, y, metadata
-
-if __name__ == '__main__':
-    # Use absolute imports to avoid relative import issues
-    try:
-        from src.predict.pipeline import PredictionPipeline
-    except ImportError:
-        # Fallback for when running directly
-        from .pipeline import PredictionPipeline
-
-    print("--- Running Preprocessing Example ---")
-
-    pipeline = PredictionPipeline(models=[])
-    try:
-        pipeline._load_and_split_data()
-        if pipeline.train_fights:
-            X_train, y_train, metadata_train = preprocess_for_ml(pipeline.train_fights, FIGHTERS_CSV_PATH)
-            print("\nTraining Data Shape:")
-            print("X_train:", X_train.shape)
-            print("y_train:", y_train.shape)
-            print("metadata_train:", metadata_train.shape)
-
-            print("\nLast 5 rows of X_train (showing populated historical features):")
-            print(X_train.tail())
-
-            print("\nTarget distribution (0=Loss, 1=Win):")
-            print(y_train.value_counts())
-
-            print("\nMetadata for last 5 rows:")
-            print(metadata_train.tail())
-
-    except FileNotFoundError as e:
-        print(e)
-        print("Please run the scraping pipeline first ('python -m src.scrape.main').")
src/scrape/main.py
CHANGED
@@ -6,7 +6,13 @@ from .scrape_fights import scrape_all_events, scrape_latest_events
 from .scrape_fighters import scrape_all_fighters
 from .to_csv import json_to_csv, fighters_json_to_csv
 from .preprocess import preprocess_fighters_csv
-from .. import config
+from ..config import (
+    OUTPUT_DIR,
+    FIGHTERS_JSON_PATH,
+    EVENTS_JSON_PATH,
+    FIGHTS_CSV_PATH,
+    LAST_EVENT_JSON_PATH
+)
 
 def main():
     """
@@ -31,9 +37,9 @@ def main():
     args = parser.parse_args()
 
     # Ensure the output directory exists
-    if not os.path.exists(config.OUTPUT_DIR):
-        os.makedirs(config.OUTPUT_DIR)
-        print(f"Created directory: {config.OUTPUT_DIR}")
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
+        print(f"Created directory: {OUTPUT_DIR}")
 
     if args.mode == 'full':
         run_full_pipeline()
@@ -48,13 +54,13 @@ def run_full_pipeline():
 
     # --- Step 1: Scrape all data from the website ---
    # This will generate fighters.json and events.json
-    scrape_all_fighters(config.FIGHTERS_JSON_PATH)
-    scrape_all_events(config.EVENTS_JSON_PATH)
+    scrape_all_fighters(FIGHTERS_JSON_PATH)
+    scrape_all_events(EVENTS_JSON_PATH)
 
     # --- Step 2: Convert the scraped JSON data to CSV format ---
     # This will generate fighters.csv and fights.csv
-    json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
-    fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
+    json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
+    fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)
 
     # --- Step 3: Run post-processing on the generated CSV files ---
     # This cleans names, converts height, etc.
@@ -64,12 +70,12 @@ def run_full_pipeline():
     # --- Step 4: Clean up temporary JSON files ---
     print("\n--- Deleting temporary JSON files ---")
     try:
-        if os.path.exists(config.EVENTS_JSON_PATH):
-            os.remove(config.EVENTS_JSON_PATH)
-            print(f"Deleted: {config.EVENTS_JSON_PATH}")
-        if os.path.exists(config.FIGHTERS_JSON_PATH):
-            os.remove(config.FIGHTERS_JSON_PATH)
-            print(f"Deleted: {config.FIGHTERS_JSON_PATH}")
+        if os.path.exists(EVENTS_JSON_PATH):
+            os.remove(EVENTS_JSON_PATH)
+            print(f"Deleted: {EVENTS_JSON_PATH}")
+        if os.path.exists(FIGHTERS_JSON_PATH):
+            os.remove(FIGHTERS_JSON_PATH)
+            print(f"Deleted: {FIGHTERS_JSON_PATH}")
     except OSError as e:
         print(f"Error deleting JSON files: {e}")
 
@@ -86,13 +92,13 @@ def run_update_pipeline(num_events=5):
     print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")
 
     # --- Step 1: Scrape latest events only ---
-    latest_events = scrape_latest_events(config.LAST_EVENT_JSON_PATH, num_events)
+    latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)
 
     # --- Step 2: Save latest events to last_event.json (even if empty) ---
     if latest_events:
-        with open(config.LAST_EVENT_JSON_PATH, 'w') as f:
+        with open(LAST_EVENT_JSON_PATH, 'w') as f:
             json.dump(latest_events, f, indent=4)
-        print(f"Latest {len(latest_events)} events saved to {config.LAST_EVENT_JSON_PATH}")
+        print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")
 
     # --- Step 3: Always check and update from last_event.json ---
     update_fights_csv_from_last_event()
@@ -105,13 +111,13 @@ def update_fights_csv_from_last_event():
     Ensures latest events are on top and preserves data types.
     """
     # Check if last_event.json exists
-    if not os.path.exists(config.LAST_EVENT_JSON_PATH):
-        print(f"No {config.LAST_EVENT_JSON_PATH} found. Nothing to update.")
+    if not os.path.exists(LAST_EVENT_JSON_PATH):
+        print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
         return
 
     # Load events from last_event.json
     try:
-        with open(config.LAST_EVENT_JSON_PATH, 'r') as f:
+        with open(LAST_EVENT_JSON_PATH, 'r') as f:
             events_from_json = json.load(f)
 
         if not events_from_json:
@@ -126,17 +132,17 @@ def update_fights_csv_from_last_event():
 
     try:
         # Check if main CSV exists
-        if os.path.exists(config.FIGHTS_CSV_PATH):
-            existing_df = pd.read_csv(config.FIGHTS_CSV_PATH)
+        if os.path.exists(FIGHTS_CSV_PATH):
+            existing_df = pd.read_csv(FIGHTS_CSV_PATH)
             existing_event_names = set(existing_df['event_name'].unique())
         else:
-            print(f"Main fights CSV ({config.FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
-            json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
+            print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
+            json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
             return
 
         # Create temporary CSV from events in last_event.json
-        temp_json_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.json')
-        temp_csv_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.csv')
+        temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
+        temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')
 
         with open(temp_json_path, 'w') as f:
             json.dump(events_from_json, f, indent=4)
@@ -165,8 +171,8 @@ def update_fights_csv_from_last_event():
             # Fix data types to remove .0 from numbers
             fix_data_types(combined_df)
 
-            combined_df.to_csv(config.FIGHTS_CSV_PATH, index=False)
-            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {config.FIGHTS_CSV_PATH}")
+            combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
+            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
         else:
             print("No new events found that aren't already in the existing CSV.")
 
@@ -179,7 +185,7 @@ def update_fights_csv_from_last_event():
     except Exception as e:
         print(f"Error updating fights CSV: {e}")
         print("Falling back to creating new CSV from last_event.json only.")
-        json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
+        json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
 
 def fix_data_types(df):
     """
@@ -200,6 +206,3 @@ def fix_data_types(df):
             df[col] = df[col].str.replace(r'\.0$', '', regex=True)
             # Convert empty strings back to original empty values
             df[col] = df[col].replace('', '')
-
-if __name__ == '__main__':
-    main()
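Two notes on the scrape/main.py hunks. First, Step 2 of run_full_pipeline now calls fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH), but FIGHTERS_CSV_PATH does not appear in the new from ..config import list, which looks like it would raise a NameError on a full run unless it is imported elsewhere. Second, the ".0" stripping in fix_data_types is easy to see in isolation: pandas upcasts integer columns containing missing values to float, so whole numbers come back as "155.0" once stringified, and the regex trims only that trailing artifact:

import pandas as pd

# Sketch of the fix_data_types cleanup: strip the trailing ".0" that the
# int -> float upcast leaves behind, without touching genuine decimals.
s = pd.Series(["155.0", "155.5", "", "3.0"])
print(s.str.replace(r"\.0$", "", regex=True).tolist())
# -> ['155', '155.5', '', '3']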
src/scrape/preprocess.py
CHANGED
@@ -1,6 +1,6 @@
 import csv
 import os
-from .. import config
+from ..config import FIGHTERS_CSV_PATH
 
 def convert_height_to_cm(height_str):
     """
@@ -24,7 +24,7 @@ def convert_height_to_cm(height_str):
     # Return original value if parsing fails
     return height_str
 
-def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
+def preprocess_fighters_csv(file_path=FIGHTERS_CSV_PATH):
     """
     Reads the fighters CSV, cleans names, converts height to cm,
     and saves the changes back to the same file.
@@ -78,7 +78,4 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
         print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")
 
     except Exception as e:
-        print(f"An error occurred: {e}")
-
-if __name__ == '__main__':
-    preprocess_fighters_csv()
+        print(f"An error occurred: {e}")
src/scrape/scrape_fighters.py
CHANGED
@@ -5,7 +5,7 @@ import time
 import string
 import concurrent.futures
 import os
-from .. import config
+from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR
 
 # --- Configuration ---
 # The number of parallel threads to use for scraping fighter details.
@@ -133,14 +133,4 @@ def scrape_all_fighters(json_path):
         json.dump(fighters_with_details, f, indent=4)
 
     fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
-    return fighters_with_details
-
-if __name__ == "__main__":
-    all_fighters_data = scrape_all_fighters(config.FIGHTERS_JSON_PATH)
-    if not os.path.exists(config.OUTPUT_DIR):
-        os.makedirs(config.OUTPUT_DIR)
-
-    with open(config.FIGHTERS_JSON_PATH, 'w') as f:
-        json.dump(all_fighters_data, f, indent=4)
-
-    print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to {config.FIGHTERS_JSON_PATH}")
+    return fighters_with_details
src/scrape/scrape_fights.py
CHANGED
@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
 import json
 import time
 import concurrent.futures
-from .. import config
+from ..config import EVENTS_JSON_PATH
 
 # --- Configuration ---
 # The number of parallel threads to use for scraping fight details.
@@ -255,9 +255,3 @@ def scrape_latest_events(json_path, num_events=5):
             print(f"Could not process event {event_url}. Error: {e}")
 
     return events
-
-if __name__ == "__main__":
-    all_events_data = scrape_all_events(config.EVENTS_JSON_PATH)
-    with open(config.EVENTS_JSON_PATH, 'w') as f:
-        json.dump(all_events_data, f, indent=4)
-    print(f"\nScraping complete. Final data saved to {config.EVENTS_JSON_PATH}")
src/scrape/to_csv.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 import csv
-from .. import config
+from ..config import EVENTS_JSON_PATH, FIGHTS_CSV_PATH, FIGHTERS_JSON_PATH
 
 def json_to_csv(json_file_path, csv_file_path):
     try:
@@ -137,8 +137,4 @@ def fighters_json_to_csv(json_file_path, csv_file_path):
             cleaned_row = {key: clean_value(fighter_data.get(key, '')) for key in headers}
             writer.writerow(cleaned_row)
 
-    print(f"Successfully converted {json_file_path} to {csv_file_path}")
-
-if __name__ == '__main__':
-    json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
-    fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
+    print(f"Successfully converted {json_file_path} to {csv_file_path}")