Commit 9678fdb · Parent: 53745df
Refactor imports to use absolute paths and clean up scripts
Standardized all internal imports to use absolute paths for better maintainability and to avoid import errors. Removed unnecessary sys.path manipulations and direct script execution blocks from modules, making the codebase more modular and suitable for package usage.
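For context, a minimal sketch of the pattern this commit removes and the one it adopts (both import lines are taken from the src/main.py hunks below; the rest is illustrative and not runnable outside this repo):

# Old pattern (removed): patch sys.path at import time so that bare module
# names resolve when a file is executed directly as a script.
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from scrape.main import main as scrape_main  # resolves only thanks to the path hack

# New pattern: package-qualified imports that resolve when the code runs as
# a package, e.g. "python -m src.main", with no path manipulation.
from src.scrape.main import main as scrape_main  # noqa: F811 -- shown for contrast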
- .gitignore +1 -1
- app.py +11 -19
- logs/startup_update.log +0 -0
- src/analysis/elo.py +1 -7
- src/main.py +3 -6
- src/predict/main.py +2 -6
- src/predict/models.py +3 -11
- src/predict/pipeline.py +1 -10
- src/predict/predict_new.py +2 -21
- src/predict/preprocess.py +1 -41
- src/scrape/main.py +35 -32
- src/scrape/preprocess.py +3 -6
- src/scrape/scrape_fighters.py +2 -12
- src/scrape/scrape_fights.py +1 -7
- src/scrape/to_csv.py +2 -6
.gitignore
CHANGED
@@ -1,3 +1,3 @@
 *__pycache__/
 example_event.html
-web/
+web/
app.py
CHANGED
@@ -2,25 +2,17 @@ import gradio as gr
 import joblib
 from datetime import datetime
 import os
-import sys
 
-# --- Path and Module Setup ---
-# Add the 'src' directory to the system path so we can import our custom modules.
-sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
-
-# Although these models are not called directly, they MUST be imported here.
-# joblib.load() needs these class definitions in scope to deserialize the model files correctly.
 from src.predict.models import (
-    BaseMLModel,
-    EloBaselineModel,
-    LogisticRegressionModel,
+    BaseMLModel,
+    EloBaselineModel,
+    LogisticRegressionModel,
     XGBoostModel,
     SVCModel,
     RandomForestModel,
     BernoulliNBModel,
     LGBMModel
 )
-# Import the configuration variable for the models directory for consistency.
 from src.config import MODELS_DIR
 
 # --- Model Cache ---
@@ -53,9 +45,9 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
         model_path = os.path.join(MODELS_DIR, model_name)
         MODEL_CACHE[model_name] = joblib.load(model_path)
         print("...model cached.")
-
+
         model = MODEL_CACHE[model_name]
-
+
         fight = {
             'fighter_1': fighter1_name,
             'fighter_2': fighter2_name,
@@ -63,14 +55,14 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
         }
 
         prediction_result = model.predict(fight)
-
+
         if prediction_result and prediction_result.get('winner'):
             winner = prediction_result['winner']
             prob = prediction_result['probability']
             return winner, f"{prob:.1%}"
         else:
             return "Could not make a prediction.", ""
-
+
     except FileNotFoundError:
         return f"Error: Model file '{model_name}' not found.", ""
     except Exception as e:
@@ -81,19 +73,19 @@ def predict_fight(model_name, fighter1_name, fighter2_name):
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🥋 UFC Fight Predictor 🥊")
     gr.Markdown("Select a prediction model and enter two fighter names to predict the outcome.")
-
+
     with gr.Column():
         model_dropdown = gr.Dropdown(
-            label="Select Model",
+            label="Select Model",
             choices=available_models,
             value=available_models[0] if available_models else None
         )
         with gr.Row():
             fighter1_input = gr.Textbox(label="Fighter 1", placeholder="e.g., Jon Jones")
             fighter2_input = gr.Textbox(label="Fighter 2", placeholder="e.g., Stipe Miocic")
-
+
         predict_button = gr.Button("Predict Winner")
-
+
     with gr.Column():
         winner_output = gr.Textbox(label="Predicted Winner", interactive=False)
         prob_output = gr.Textbox(label="Confidence", interactive=False)
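One deleted comment above deserves a gloss: joblib serializes with pickle, and pickle records each object's class by its module path, so that path must be importable when the file is loaded. A minimal sketch of the constraint (the model filename is borrowed from the default used elsewhere in this commit):

import os

import joblib

# pickle stored "src.predict.models.XGBoostModel" inside the model file, so
# that module path must be importable here; importing the class up front, as
# app.py does, is what the deleted comment was guarding.
from src.predict.models import XGBoostModel  # noqa: F401 -- needed for unpickling
from src.config import MODELS_DIR

model = joblib.load(os.path.join(MODELS_DIR, "XGBoostModel.joblib"))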
logs/startup_update.log
CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
src/analysis/elo.py
CHANGED
@@ -127,10 +127,4 @@ def main():
     print("\n--- Top 10 Fighters by ELO Rating ---")
     for i, (fighter, elo) in enumerate(sorted_fighters[:10]):
         print(f"{i+1}. {fighter}: {round(elo)}")
-    print("------------------------------------")
-
-if __name__ == '__main__':
-    # Create the directory if it doesn't exist to avoid confusion
-    if not os.path.exists('src/analysis'):
-        os.makedirs('src/analysis')
-    main()
+    print("------------------------------------")
src/main.py
CHANGED
@@ -2,9 +2,6 @@ import argparse
 import sys
 import os
 
-# Add the current directory to Python path for imports
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
 def main():
     """
     Main entry point for the UFC data pipeline.
@@ -55,7 +52,7 @@ def main():
 
     if args.pipeline in ['scrape', 'all']:
         print("=== Running Scraping Pipeline ===")
-        from scrape.main import main as scrape_main
+        from src.scrape.main import main as scrape_main
 
         # Override sys.argv to pass arguments to scrape.main
         original_argv = sys.argv
@@ -67,7 +64,7 @@ def main():
 
     if args.pipeline in ['analysis', 'all']:
         print("\n=== Running ELO Analysis ===")
-        from analysis.elo import main as elo_main
+        from src.analysis.elo import main as elo_main
         elo_main()
 
     if args.pipeline == 'update':
@@ -85,7 +82,7 @@ def main():
 
     if args.pipeline in ['predict', 'all']:
         print("\n=== Running Prediction Pipeline ===")
-        from predict.main import main as predict_main
+        from src.predict.main import main as predict_main
 
         # Override sys.argv to pass model management arguments to predict.main
         original_argv = sys.argv
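The hunks above also show how the pipeline dispatches to sub-pipelines: each sub-main parses its own arguments with argparse, so src/main.py temporarily swaps sys.argv before calling it. A minimal sketch of that pattern (the helper name and flag values are illustrative):

import sys

def call_with_args(sub_main, argv):
    """Run a sub-pipeline's main() as though it were invoked with argv."""
    original_argv = sys.argv
    try:
        sys.argv = [sys.argv[0]] + argv  # argparse reads sys.argv[1:]
        sub_main()
    finally:
        sys.argv = original_argv  # restore even if the sub-pipeline raises

# e.g. call_with_args(scrape_main, ["--mode", "update"])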
src/predict/main.py
CHANGED
@@ -1,8 +1,7 @@
 import argparse
 
-
-from src.predict.pipeline import PredictionPipeline
-from src.predict.models import (
+from .pipeline import PredictionPipeline
+from .models import (
     EloBaselineModel,
     LogisticRegressionModel,
     XGBoostModel,
@@ -93,6 +92,3 @@ def main():
     except FileNotFoundError as e:
         print(f"Error: {e}")
         print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
-
-if __name__ == '__main__':
-    main()
src/predict/models.py
CHANGED
@@ -8,17 +8,9 @@ from sklearn.naive_bayes import BernoulliNB
 from sklearn.ensemble import RandomForestClassifier
 from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
-
-
-try:
-    from src.analysis.elo import process_fights_for_elo, INITIAL_ELO
-    from src.config import FIGHTERS_CSV_PATH
-    from src.predict.preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
-except ImportError:
-    # Fallback for when running directly
-    from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
-    from ..config import FIGHTERS_CSV_PATH
-    from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
+from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
+from ..config import FIGHTERS_CSV_PATH
+from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
 
 class BaseModel(ABC):
     """
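Dropping the try/except ImportError fallback means these modules now assume package context: a relative import such as from ..config import FIGHTERS_CSV_PATH only resolves when the module is imported as part of the src package. An illustrative guard (not part of the commit) makes the failure mode concrete:

# Run directly ("python src/predict/models.py"), __package__ is empty and a
# relative import raises "attempted relative import with no known parent
# package"; imported as part of the package, it resolves normally.
if __package__ in (None, ""):
    raise SystemExit("Run via the package, e.g.: python -m src.scrape.main")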
src/predict/pipeline.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
 UFC Fight Prediction Pipeline
 
@@ -20,19 +19,11 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import csv
 import os
-import sys
 from datetime import datetime
 from collections import OrderedDict
 import json
 import joblib
-
-# Use absolute imports to avoid relative import issues
-try:
-    from src.config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
-except ImportError:
-    # Fallback for when running directly
-    from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
-
+from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
 from .models import BaseModel
 
 class PredictionPipeline:
src/predict/predict_new.py
CHANGED
@@ -3,12 +3,7 @@ import os
 import joblib
 from datetime import datetime
 
-
-try:
-    from src.config import MODELS_DIR
-except ImportError:
-    # Fallback for when running directly
-    from ..config import MODELS_DIR
+from ..config import MODELS_DIR
 
 def predict_new_fight(fighter1_name, fighter2_name, model_path):
     """
@@ -43,18 +38,4 @@ def predict_new_fight(fighter1_name, fighter2_name, model_path):
         prob = prediction_result['probability']
         print(f"\n---> Predicted Winner: {winner} ({prob:.1%}) <---")
     else:
-        print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Predict the outcome of a new UFC fight.")
-    parser.add_argument('fighter1', type=str, help="The full name of the first fighter (e.g., 'Jon Jones').")
-    parser.add_argument('fighter2', type=str, help="The full name of the second fighter (e.g., 'Stipe Miocic').")
-    parser.add_argument(
-        '--model_path',
-        type=str,
-        default=os.path.join(MODELS_DIR, 'XGBoostModel.joblib'),
-        help="Path to the saved model file."
-    )
-    args = parser.parse_args()
-
-    predict_new_fight(args.fighter1, args.fighter2, args.model_path)
+        print("\nCould not make a prediction. One of the fighters may not be in the dataset.")
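With its __main__ block gone, predict_new.py is now import-only. A sketch of the equivalent call (fighter names taken from the old argparse help strings, model file from the old --model_path default):

import os

from src.config import MODELS_DIR
from src.predict.predict_new import predict_new_fight

# Mirrors what the removed CLI did with its defaults.
predict_new_fight("Jon Jones", "Stipe Miocic",
                  os.path.join(MODELS_DIR, "XGBoostModel.joblib"))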
src/predict/preprocess.py
CHANGED
@@ -1,14 +1,7 @@
 import pandas as pd
 import os
-import sys
 from datetime import datetime
-
-# Use absolute imports to avoid relative import issues
-try:
-    from src.config import FIGHTERS_CSV_PATH
-except ImportError:
-    # Fallback for when running directly
-    from ..config import FIGHTERS_CSV_PATH
+from ..config import FIGHTERS_CSV_PATH
 
 def _clean_numeric_column(series):
     """A helper to clean string columns into numbers, handling errors."""
@@ -236,36 +229,3 @@ def preprocess_for_ml(fights_to_process, fighters_csv_path):
 
     print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
     return X, y, metadata
-
-if __name__ == '__main__':
-    # Use absolute imports to avoid relative import issues
-    try:
-        from src.predict.pipeline import PredictionPipeline
-    except ImportError:
-        # Fallback for when running directly
-        from .pipeline import PredictionPipeline
-
-    print("--- Running Preprocessing Example ---")
-
-    pipeline = PredictionPipeline(models=[])
-    try:
-        pipeline._load_and_split_data()
-        if pipeline.train_fights:
-            X_train, y_train, metadata_train = preprocess_for_ml(pipeline.train_fights, FIGHTERS_CSV_PATH)
-            print("\nTraining Data Shape:")
-            print("X_train:", X_train.shape)
-            print("y_train:", y_train.shape)
-            print("metadata_train:", metadata_train.shape)
-
-            print("\nLast 5 rows of X_train (showing populated historical features):")
-            print(X_train.tail())
-
-            print("\nTarget distribution (0=Loss, 1=Win):")
-            print(y_train.value_counts())
-
-            print("\nMetadata for last 5 rows:")
-            print(metadata_train.tail())
-
-    except FileNotFoundError as e:
-        print(e)
-        print("Please run the scraping pipeline first ('python -m src.scrape.main').")
src/scrape/main.py
CHANGED
@@ -6,7 +6,13 @@ from .scrape_fights import scrape_all_events, scrape_latest_events
 from .scrape_fighters import scrape_all_fighters
 from .to_csv import json_to_csv, fighters_json_to_csv
 from .preprocess import preprocess_fighters_csv
-from .. import config
+from ..config import (
+    OUTPUT_DIR,
+    FIGHTERS_JSON_PATH,
+    EVENTS_JSON_PATH,
+    FIGHTS_CSV_PATH,
+    LAST_EVENT_JSON_PATH
+)
 
 def main():
     """
@@ -31,9 +37,9 @@ def main():
     args = parser.parse_args()
 
     # Ensure the output directory exists
-    if not os.path.exists(config.OUTPUT_DIR):
-        os.makedirs(config.OUTPUT_DIR)
-        print(f"Created directory: {config.OUTPUT_DIR}")
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
+        print(f"Created directory: {OUTPUT_DIR}")
 
     if args.mode == 'full':
         run_full_pipeline()
@@ -48,13 +54,13 @@ def run_full_pipeline():
 
     # --- Step 1: Scrape all data from the website ---
    # This will generate fighters.json and events.json
-    scrape_all_fighters(config.FIGHTERS_JSON_PATH)
-    scrape_all_events(config.EVENTS_JSON_PATH)
+    scrape_all_fighters(FIGHTERS_JSON_PATH)
+    scrape_all_events(EVENTS_JSON_PATH)
 
     # --- Step 2: Convert the scraped JSON data to CSV format ---
     # This will generate fighters.csv and fights.csv
-    json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
-    fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
+    json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
+    fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)
 
     # --- Step 3: Run post-processing on the generated CSV files ---
     # This cleans names, converts height, etc.
@@ -64,12 +70,12 @@ def run_full_pipeline():
     # --- Step 4: Clean up temporary JSON files ---
     print("\n--- Deleting temporary JSON files ---")
     try:
-        if os.path.exists(config.EVENTS_JSON_PATH):
-            os.remove(config.EVENTS_JSON_PATH)
-            print(f"Deleted: {config.EVENTS_JSON_PATH}")
-        if os.path.exists(config.FIGHTERS_JSON_PATH):
-            os.remove(config.FIGHTERS_JSON_PATH)
-            print(f"Deleted: {config.FIGHTERS_JSON_PATH}")
+        if os.path.exists(EVENTS_JSON_PATH):
+            os.remove(EVENTS_JSON_PATH)
+            print(f"Deleted: {EVENTS_JSON_PATH}")
+        if os.path.exists(FIGHTERS_JSON_PATH):
+            os.remove(FIGHTERS_JSON_PATH)
+            print(f"Deleted: {FIGHTERS_JSON_PATH}")
     except OSError as e:
         print(f"Error deleting JSON files: {e}")
 
@@ -86,13 +92,13 @@ def run_update_pipeline(num_events=5):
     print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")
 
     # --- Step 1: Scrape latest events only ---
-    latest_events = scrape_latest_events(config.LAST_EVENT_JSON_PATH, num_events)
+    latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)
 
     # --- Step 2: Save latest events to last_event.json (even if empty) ---
     if latest_events:
-        with open(config.LAST_EVENT_JSON_PATH, 'w') as f:
+        with open(LAST_EVENT_JSON_PATH, 'w') as f:
             json.dump(latest_events, f, indent=4)
-        print(f"Latest {len(latest_events)} events saved to {config.LAST_EVENT_JSON_PATH}")
+        print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")
 
     # --- Step 3: Always check and update from last_event.json ---
     update_fights_csv_from_last_event()
@@ -105,13 +111,13 @@ def update_fights_csv_from_last_event():
     Ensures latest events are on top and preserves data types.
     """
     # Check if last_event.json exists
-    if not os.path.exists(config.LAST_EVENT_JSON_PATH):
-        print(f"No {config.LAST_EVENT_JSON_PATH} found. Nothing to update.")
+    if not os.path.exists(LAST_EVENT_JSON_PATH):
+        print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
         return
 
     # Load events from last_event.json
     try:
-        with open(config.LAST_EVENT_JSON_PATH, 'r') as f:
+        with open(LAST_EVENT_JSON_PATH, 'r') as f:
             events_from_json = json.load(f)
 
         if not events_from_json:
@@ -126,17 +132,17 @@ def update_fights_csv_from_last_event():
 
     try:
         # Check if main CSV exists
-        if os.path.exists(config.FIGHTS_CSV_PATH):
-            existing_df = pd.read_csv(config.FIGHTS_CSV_PATH)
+        if os.path.exists(FIGHTS_CSV_PATH):
+            existing_df = pd.read_csv(FIGHTS_CSV_PATH)
             existing_event_names = set(existing_df['event_name'].unique())
         else:
-            print(f"Main fights CSV ({config.FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
-            json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
+            print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
+            json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
             return
 
         # Create temporary CSV from events in last_event.json
-        temp_json_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.json')
-        temp_csv_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.csv')
+        temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
+        temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')
 
         with open(temp_json_path, 'w') as f:
             json.dump(events_from_json, f, indent=4)
@@ -165,8 +171,8 @@ def update_fights_csv_from_last_event():
             # Fix data types to remove .0 from numbers
             fix_data_types(combined_df)
 
-            combined_df.to_csv(config.FIGHTS_CSV_PATH, index=False)
-            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {config.FIGHTS_CSV_PATH}")
+            combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
+            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
         else:
             print("No new events found that aren't already in the existing CSV.")
 
@@ -179,7 +185,7 @@ def update_fights_csv_from_last_event():
     except Exception as e:
         print(f"Error updating fights CSV: {e}")
         print("Falling back to creating new CSV from last_event.json only.")
-        json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
+        json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
 
 def fix_data_types(df):
     """
@@ -200,6 +206,3 @@ def fix_data_types(df):
             df[col] = df[col].str.replace(r'\.0$', '', regex=True)
             # Convert empty strings back to original empty values
             df[col] = df[col].replace('', '')
-
-if __name__ == '__main__':
-    main()
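Two notes on the scrape/main.py hunks. First, Step 2 of run_full_pipeline now calls fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH), but FIGHTERS_CSV_PATH does not appear in the new from ..config import list, which looks like it would raise a NameError on a full run unless it is imported elsewhere. Second, the ".0" stripping in fix_data_types is easy to see in isolation: pandas upcasts integer columns containing missing values to float, so whole numbers come back as "155.0" once stringified, and the regex trims only that trailing artifact:

import pandas as pd

# Sketch of the fix_data_types cleanup: strip the trailing ".0" that the
# int -> float upcast leaves behind, without touching genuine decimals.
s = pd.Series(["155.0", "155.5", "", "3.0"])
print(s.str.replace(r"\.0$", "", regex=True).tolist())
# -> ['155', '155.5', '', '3']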
src/scrape/preprocess.py
CHANGED
@@ -1,6 +1,6 @@
 import csv
 import os
-from .. import config
+from ..config import FIGHTERS_CSV_PATH
 
 def convert_height_to_cm(height_str):
     """
@@ -24,7 +24,7 @@ def convert_height_to_cm(height_str):
     # Return original value if parsing fails
     return height_str
 
-def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
+def preprocess_fighters_csv(file_path=FIGHTERS_CSV_PATH):
     """
     Reads the fighters CSV, cleans names, converts height to cm,
     and saves the changes back to the same file.
@@ -78,7 +78,4 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
         print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")
 
     except Exception as e:
-        print(f"An error occurred: {e}")
-
-if __name__ == '__main__':
-    preprocess_fighters_csv()
+        print(f"An error occurred: {e}")
src/scrape/scrape_fighters.py
CHANGED
@@ -5,7 +5,7 @@ import time
 import string
 import concurrent.futures
 import os
-from .. import config
+from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR
 
 # --- Configuration ---
 # The number of parallel threads to use for scraping fighter details.
@@ -133,14 +133,4 @@ def scrape_all_fighters(json_path):
         json.dump(fighters_with_details, f, indent=4)
 
     fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
-    return fighters_with_details
-
-if __name__ == "__main__":
-    all_fighters_data = scrape_all_fighters(config.FIGHTERS_JSON_PATH)
-    if not os.path.exists(config.OUTPUT_DIR):
-        os.makedirs(config.OUTPUT_DIR)
-
-    with open(config.FIGHTERS_JSON_PATH, 'w') as f:
-        json.dump(all_fighters_data, f, indent=4)
-
-    print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to {config.FIGHTERS_JSON_PATH}")
+    return fighters_with_details
src/scrape/scrape_fights.py
CHANGED
@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
 import json
 import time
 import concurrent.futures
-from .. import config
+from ..config import EVENTS_JSON_PATH
 
 # --- Configuration ---
 # The number of parallel threads to use for scraping fight details.
@@ -255,9 +255,3 @@ def scrape_latest_events(json_path, num_events=5):
             print(f"Could not process event {event_url}. Error: {e}")
 
     return events
-
-if __name__ == "__main__":
-    all_events_data = scrape_all_events(config.EVENTS_JSON_PATH)
-    with open(config.EVENTS_JSON_PATH, 'w') as f:
-        json.dump(all_events_data, f, indent=4)
-    print(f"\nScraping complete. Final data saved to {config.EVENTS_JSON_PATH}")
src/scrape/to_csv.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 import csv
-from .. import config
+from ..config import EVENTS_JSON_PATH, FIGHTS_CSV_PATH, FIGHTERS_JSON_PATH
 
 def json_to_csv(json_file_path, csv_file_path):
     try:
@@ -137,8 +137,4 @@ def fighters_json_to_csv(json_file_path, csv_file_path):
             cleaned_row = {key: clean_value(fighter_data.get(key, '')) for key in headers}
             writer.writerow(cleaned_row)
 
-    print(f"Successfully converted {json_file_path} to {csv_file_path}")
-
-if __name__ == '__main__':
-    json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
-    fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
+    print(f"Successfully converted {json_file_path} to {csv_file_path}")