import os
import json
import argparse

import pandas as pd

from .scrape_fights import scrape_all_events, scrape_latest_events
from .scrape_fighters import scrape_all_fighters
from .to_csv import json_to_csv, fighters_json_to_csv
from .preprocess import preprocess_fighters_csv
from .. import config


def main():
    """
    Main function to run the scraping and preprocessing pipeline.
    Supports both full scraping and incremental updates.
    """
    parser = argparse.ArgumentParser(description="UFC Data Scraping Pipeline")
    parser.add_argument(
        '--mode',
        type=str,
        default='full',
        choices=['full', 'update'],
        help="Scraping mode: 'full' (complete scraping) or 'update' (latest events + sync from last_event.json)"
    )
    parser.add_argument(
        '--num-events',
        type=int,
        default=5,
        help="Number of latest events to scrape in update mode (default: 5)"
    )
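    # Example (illustrative): passing "--mode update --num-events 3" re-scrapes
    # only the three most recent events and merges any new fights into the
    # existing fights CSV; with no flags, the full pipeline runs.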
    args = parser.parse_args()

    # Ensure the output directory exists
    if not os.path.exists(config.OUTPUT_DIR):
        os.makedirs(config.OUTPUT_DIR)
        print(f"Created directory: {config.OUTPUT_DIR}")

    if args.mode == 'full':
        run_full_pipeline()
    elif args.mode == 'update':
        run_update_pipeline(args.num_events)


def run_full_pipeline():
    """
    Runs the complete scraping and preprocessing pipeline.
    """
    print("\n=== Running FULL scraping pipeline ===")

    # --- Step 1: Scrape all data from the website ---
    # This will generate fighters.json and events.json
    scrape_all_fighters(config.FIGHTERS_JSON_PATH)
    scrape_all_events(config.EVENTS_JSON_PATH)

    # --- Step 2: Convert the scraped JSON data to CSV format ---
    # This will generate fighters.csv and fights.csv
    json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
    fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)

    # --- Step 3: Run post-processing on the generated CSV files ---
    # This cleans names, converts height, etc.
    print("\n--- Running post-scraping preprocessing ---")
    preprocess_fighters_csv()

    # --- Step 4: Clean up temporary JSON files ---
    print("\n--- Deleting temporary JSON files ---")
    try:
        if os.path.exists(config.EVENTS_JSON_PATH):
            os.remove(config.EVENTS_JSON_PATH)
            print(f"Deleted: {config.EVENTS_JSON_PATH}")
        if os.path.exists(config.FIGHTERS_JSON_PATH):
            os.remove(config.FIGHTERS_JSON_PATH)
            print(f"Deleted: {config.FIGHTERS_JSON_PATH}")
    except OSError as e:
        print(f"Error deleting JSON files: {e}")

    print("\n\n--- Full Scraping and Preprocessing Pipeline Finished ---")


def run_update_pipeline(num_events=5):
    """
    Runs the incremental update pipeline to scrape only the latest events.
    Also adds any events from last_event.json that aren't already in the CSV.

    Args:
        num_events (int): Number of latest events to scrape
    """
    print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")

    # --- Step 1: Scrape latest events only ---
    latest_events = scrape_latest_events(config.LAST_EVENT_JSON_PATH, num_events)

    # --- Step 2: Save the latest events to last_event.json (skipped if none
    # were scraped, so any previous snapshot is preserved) ---
    if latest_events:
        with open(config.LAST_EVENT_JSON_PATH, 'w') as f:
            json.dump(latest_events, f, indent=4)
        print(f"Latest {len(latest_events)} events saved to {config.LAST_EVENT_JSON_PATH}")

    # --- Step 3: Always check and update from last_event.json ---
    update_fights_csv_from_last_event()

    print("\n--- Update Pipeline Finished ---")


def update_fights_csv_from_last_event():
    """
    Updates the existing fights CSV with any events from last_event.json that aren't already present.
    Ensures latest events are on top and preserves data types.
    """
    # Check if last_event.json exists
    if not os.path.exists(config.LAST_EVENT_JSON_PATH):
        print(f"No {config.LAST_EVENT_JSON_PATH} found. Nothing to update.")
        return

    # Load events from last_event.json
    try:
        with open(config.LAST_EVENT_JSON_PATH, 'r') as f:
            events_from_json = json.load(f)
        if not events_from_json:
            print("No events found in last_event.json.")
            return
        print(f"Found {len(events_from_json)} events in last_event.json")
    except Exception as e:
        print(f"Error reading last_event.json: {e}")
        return

    try:
        # Check if the main CSV exists
        if os.path.exists(config.FIGHTS_CSV_PATH):
            existing_df = pd.read_csv(config.FIGHTS_CSV_PATH)
            existing_event_names = set(existing_df['event_name'].unique())
        else:
            print(f"Main fights CSV ({config.FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
            json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)
            return
        # Create a temporary CSV from the events in last_event.json
        temp_json_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.json')
        temp_csv_path = os.path.join(config.OUTPUT_DIR, 'temp_latest.csv')
        with open(temp_json_path, 'w') as f:
            json.dump(events_from_json, f, indent=4)
        json_to_csv(temp_json_path, temp_csv_path)

        # Read the new CSV
        new_df = pd.read_csv(temp_csv_path)

        # Filter out events that already exist
        new_events_df = new_df[~new_df['event_name'].isin(existing_event_names)]
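        # e.g. (hypothetical names) if existing_event_names is {'UFC 300'} and
        # new_df holds rows for 'UFC 300' and 'UFC 301', only the 'UFC 301'
        # rows survive this filter.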
        if len(new_events_df) > 0:
            # Add new events to the TOP of the CSV (latest first)
            combined_df = pd.concat([new_events_df, existing_df], ignore_index=True)

            # Convert date column to datetime for proper sorting
            combined_df['event_date_parsed'] = pd.to_datetime(combined_df['event_date'])

            # Sort by date descending (latest first)
            combined_df = combined_df.sort_values('event_date_parsed', ascending=False)

            # Drop the temporary date column
            combined_df = combined_df.drop('event_date_parsed', axis=1)

            # Fix data types in place to remove .0 from numbers
            fix_data_types(combined_df)

            combined_df.to_csv(config.FIGHTS_CSV_PATH, index=False)
            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {config.FIGHTS_CSV_PATH}")
        else:
            print("No new events found that aren't already in the existing CSV.")

        # Clean up temporary files
        if os.path.exists(temp_json_path):
            os.remove(temp_json_path)
        if os.path.exists(temp_csv_path):
            os.remove(temp_csv_path)
    except Exception as e:
        print(f"Error updating fights CSV: {e}")
        print("Falling back to creating a new CSV from last_event.json only.")
        json_to_csv(config.LAST_EVENT_JSON_PATH, config.FIGHTS_CSV_PATH)


def fix_data_types(df):
    """
    Fix data types in the DataFrame, in place, to remove .0 from whole numbers
    and preserve the original format.

    Args:
        df (pandas.DataFrame): DataFrame to fix (modified in place)
    """
    for col in df.columns:
        if df[col].dtype == 'float64':
            # Column contains only whole numbers (no NaNs, no actual decimals)
            if df[col].notna().all() and (df[col] % 1 == 0).all():
                df[col] = df[col].astype('int64')
            elif df[col].isna().any():
                # Columns with missing values are kept as strings to avoid .0;
                # NaNs become empty strings
                df[col] = df[col].fillna('').astype(str)
                # Remove a trailing .0 from whole-number string representations
                df[col] = df[col].str.replace(r'\.0$', '', regex=True)
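

# Note: because this module uses relative imports ("from .. import config"),
# it cannot be run directly as a script; run it as a module from the project
# root. The package path below is an assumption; substitute your real layout:
#   python -m <package>.scraper --mode update --num-events 3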
if __name__ == '__main__':
    main()