Spaces:

AlvaroMros
/

ufc-predictor

Sleeping

File size: 7,933 Bytes

import os
import json
import argparse
import pandas as pd
from .scrape_fights import scrape_all_events, scrape_latest_events
from .scrape_fighters import scrape_all_fighters
from .to_csv import json_to_csv, fighters_json_to_csv
from .preprocess import preprocess_fighters_csv
from ..config import (
    OUTPUT_DIR, 
    FIGHTERS_JSON_PATH, 
    EVENTS_JSON_PATH, 
    FIGHTS_CSV_PATH, 
    LAST_EVENT_JSON_PATH
)

def main():
    """
    Main function to run the scraping and preprocessing pipeline.
    Supports both full scraping and incremental updates.
    """
    parser = argparse.ArgumentParser(description="UFC Data Scraping Pipeline")
    parser.add_argument(
        '--mode', 
        type=str, 
        default='full', 
        choices=['full', 'update'],
        help="Scraping mode: 'full' (complete scraping) or 'update' (latest events + sync from last_event.json)"
    )
    parser.add_argument(
        '--num-events', 
        type=int, 
        default=5,
        help="Number of latest events to scrape in update mode (default: 5)"
    )
    
    args = parser.parse_args()
    
    # Ensure the output directory exists
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created directory: {OUTPUT_DIR}")

    if args.mode == 'full':
        run_full_pipeline()
    elif args.mode == 'update':
        run_update_pipeline(args.num_events)

def run_full_pipeline():
    """
    Runs the complete scraping and preprocessing pipeline.
    """
    print("\n=== Running FULL scraping pipeline ===")
    
    # --- Step 1: Scrape all data from the website ---
    # This will generate fighters.json and events.json
    scrape_all_fighters(FIGHTERS_JSON_PATH)
    scrape_all_events(EVENTS_JSON_PATH)

    # --- Step 2: Convert the scraped JSON data to CSV format ---
    # This will generate fighters.csv and fights.csv
    json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
    fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)

    # --- Step 3: Run post-processing on the generated CSV files ---
    # This cleans names, converts height, etc.
    print("\n--- Running post-scraping preprocessing ---")
    preprocess_fighters_csv()

    # --- Step 4: Clean up temporary JSON files ---
    print("\n--- Deleting temporary JSON files ---")
    try:
        if os.path.exists(EVENTS_JSON_PATH):
            os.remove(EVENTS_JSON_PATH)
            print(f"Deleted: {EVENTS_JSON_PATH}")
        if os.path.exists(FIGHTERS_JSON_PATH):
            os.remove(FIGHTERS_JSON_PATH)
            print(f"Deleted: {FIGHTERS_JSON_PATH}")
    except OSError as e:
        print(f"Error deleting JSON files: {e}")

    print("\n\n--- Full Scraping and Preprocessing Pipeline Finished ---")

def run_update_pipeline(num_events=5):
    """
    Runs the incremental update pipeline to scrape only the latest events.
    Also adds any events from last_event.json that aren't already in the CSV.
    
    Args:
        num_events (int): Number of latest events to scrape
    """
    print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")
    
    # --- Step 1: Scrape latest events only ---
    latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)
    
    # --- Step 2: Save latest events to last_event.json (even if empty) ---
    if latest_events:
        with open(LAST_EVENT_JSON_PATH, 'w') as f:
            json.dump(latest_events, f, indent=4)
        print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")
    
    # --- Step 3: Always check and update from last_event.json ---
    update_fights_csv_from_last_event()
    
    print(f"\n--- Update Pipeline Finished ---")

def update_fights_csv_from_last_event():
    """
    Updates the existing fights CSV with any events from last_event.json that aren't already present.
    Ensures latest events are on top and preserves data types.
    """
    # Check if last_event.json exists
    if not os.path.exists(LAST_EVENT_JSON_PATH):
        print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
        return
    
    # Load events from last_event.json
    try:
        with open(LAST_EVENT_JSON_PATH, 'r') as f:
            events_from_json = json.load(f)
        
        if not events_from_json:
            print("No events found in last_event.json.")
            return
            
        print(f"Found {len(events_from_json)} events in last_event.json")
        
    except Exception as e:
        print(f"Error reading last_event.json: {e}")
        return
    
    try:
        # Check if main CSV exists
        if os.path.exists(FIGHTS_CSV_PATH):
            existing_df = pd.read_csv(FIGHTS_CSV_PATH)
            existing_event_names = set(existing_df['event_name'].unique())
        else:
            print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
            json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
            return
        
        # Create temporary CSV from events in last_event.json
        temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
        temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')
        
        with open(temp_json_path, 'w') as f:
            json.dump(events_from_json, f, indent=4)
        
        json_to_csv(temp_json_path, temp_csv_path)
        
        # Read the new CSV
        new_df = pd.read_csv(temp_csv_path)
        
        # Filter out events that already exist
        new_events_df = new_df[~new_df['event_name'].isin(existing_event_names)]
        
        if len(new_events_df) > 0:
            # Add new events to the TOP of the CSV (latest first)
            combined_df = pd.concat([new_events_df, existing_df], ignore_index=True)
            
            # Convert date column to datetime for proper sorting
            combined_df['event_date_parsed'] = pd.to_datetime(combined_df['event_date'])
            
            # Sort by date descending (latest first)
            combined_df = combined_df.sort_values('event_date_parsed', ascending=False)
            
            # Drop the temporary date column
            combined_df = combined_df.drop('event_date_parsed', axis=1)
            
            # Fix data types to remove .0 from numbers
            fix_data_types(combined_df)
            
            combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
        else:
            print("No new events found that aren't already in the existing CSV.")
        
        # Clean up temporary files
        if os.path.exists(temp_json_path):
            os.remove(temp_json_path)
        if os.path.exists(temp_csv_path):
            os.remove(temp_csv_path)
            
    except Exception as e:
        print(f"Error updating fights CSV: {e}")
        print("Falling back to creating new CSV from last_event.json only.")
        json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)

def fix_data_types(df):
    """
    Fix data types in the dataframe to remove .0 from numbers and preserve original format.
    
    Args:
        df (pandas.DataFrame): DataFrame to fix
    """
    for col in df.columns:
        if df[col].dtype == 'float64':
            # Check if the column contains only whole numbers (no actual decimals)
            if df[col].notna().all() and (df[col] % 1 == 0).all():
                df[col] = df[col].astype('int64')
            elif df[col].isna().any():
                # Handle columns with missing values - keep as string to avoid .0
                df[col] = df[col].fillna('').astype(str)
                # Remove .0 from string representations
                df[col] = df[col].str.replace(r'\.0$', '', regex=True)
                # Convert empty strings back to original empty values
                df[col] = df[col].replace('', '')