import os
import json
import argparse
import pandas as pd

from .scrape_fights import scrape_all_events, scrape_latest_events
from .scrape_fighters import scrape_all_fighters
from .to_csv import json_to_csv, fighters_json_to_csv
from .preprocess import preprocess_fighters_csv
from ..config import (
    OUTPUT_DIR,
    FIGHTERS_JSON_PATH,
    FIGHTERS_CSV_PATH,  # needed by run_full_pipeline(); assumed to live in config alongside the other paths
    EVENTS_JSON_PATH,
    FIGHTS_CSV_PATH,
    LAST_EVENT_JSON_PATH
)


def main():
    """
    Main function to run the scraping and preprocessing pipeline.
    Supports both full scraping and incremental updates.
    """
    parser = argparse.ArgumentParser(description="UFC Data Scraping Pipeline")
    parser.add_argument(
        '--mode',
        type=str,
        default='full',
        choices=['full', 'update'],
        help="Scraping mode: 'full' (complete scraping) or 'update' (latest events + sync from last_event.json)"
    )
    parser.add_argument(
        '--num-events',
        type=int,
        default=5,
        help="Number of latest events to scrape in update mode (default: 5)"
    )
    args = parser.parse_args()

    # Ensure the output directory exists
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created directory: {OUTPUT_DIR}")

    if args.mode == 'full':
        run_full_pipeline()
    elif args.mode == 'update':
        run_update_pipeline(args.num_events)


def run_full_pipeline():
    """
    Runs the complete scraping and preprocessing pipeline.
    """
    print("\n=== Running FULL scraping pipeline ===")

    # --- Step 1: Scrape all data from the website ---
    # This will generate fighters.json and events.json
    scrape_all_fighters(FIGHTERS_JSON_PATH)
    scrape_all_events(EVENTS_JSON_PATH)

    # --- Step 2: Convert the scraped JSON data to CSV format ---
    # This will generate fighters.csv and fights.csv
    json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
    fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)

    # --- Step 3: Run post-processing on the generated CSV files ---
    # This cleans names, converts height, etc.
    print("\n--- Running post-scraping preprocessing ---")
    preprocess_fighters_csv()

    # --- Step 4: Clean up temporary JSON files ---
    print("\n--- Deleting temporary JSON files ---")
    try:
        if os.path.exists(EVENTS_JSON_PATH):
            os.remove(EVENTS_JSON_PATH)
            print(f"Deleted: {EVENTS_JSON_PATH}")
        if os.path.exists(FIGHTERS_JSON_PATH):
            os.remove(FIGHTERS_JSON_PATH)
            print(f"Deleted: {FIGHTERS_JSON_PATH}")
    except OSError as e:
        print(f"Error deleting JSON files: {e}")

    print("\n\n--- Full Scraping and Preprocessing Pipeline Finished ---")


def run_update_pipeline(num_events=5):
    """
    Runs the incremental update pipeline to scrape only the latest events.
    Also adds any events from last_event.json that aren't already in the CSV.

    Args:
        num_events (int): Number of latest events to scrape
    """
print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")
# --- Step 1: Scrape latest events only ---
latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)
# --- Step 2: Save latest events to last_event.json (even if empty) ---
if latest_events:
with open(LAST_EVENT_JSON_PATH, 'w') as f:
json.dump(latest_events, f, indent=4)
print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")
# --- Step 3: Always check and update from last_event.json ---
update_fights_csv_from_last_event()
print(f"\n--- Update Pipeline Finished ---")


def update_fights_csv_from_last_event():
    """
    Updates the existing fights CSV with any events from last_event.json that aren't already present.
    Ensures latest events are on top and preserves data types.
    """
    # Check if last_event.json exists
    if not os.path.exists(LAST_EVENT_JSON_PATH):
        print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
        return

    # Load events from last_event.json
    try:
        with open(LAST_EVENT_JSON_PATH, 'r') as f:
            events_from_json = json.load(f)
        if not events_from_json:
            print("No events found in last_event.json.")
            return
        print(f"Found {len(events_from_json)} events in last_event.json")
    except Exception as e:
        print(f"Error reading last_event.json: {e}")
        return

    try:
        # Check if main CSV exists
        if os.path.exists(FIGHTS_CSV_PATH):
            existing_df = pd.read_csv(FIGHTS_CSV_PATH)
            existing_event_names = set(existing_df['event_name'].unique())
        else:
            print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
            json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
            return

        # Create temporary CSV from events in last_event.json
        temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
        temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')
        with open(temp_json_path, 'w') as f:
            json.dump(events_from_json, f, indent=4)
        json_to_csv(temp_json_path, temp_csv_path)

        # Read the new CSV
        new_df = pd.read_csv(temp_csv_path)

        # Filter out events that already exist
        new_events_df = new_df[~new_df['event_name'].isin(existing_event_names)]

        if len(new_events_df) > 0:
            # Add new events to the TOP of the CSV (latest first)
            combined_df = pd.concat([new_events_df, existing_df], ignore_index=True)

            # Convert date column to datetime for proper sorting
            combined_df['event_date_parsed'] = pd.to_datetime(combined_df['event_date'])

            # Sort by date descending (latest first)
            combined_df = combined_df.sort_values('event_date_parsed', ascending=False)

            # Drop the temporary date column
            combined_df = combined_df.drop('event_date_parsed', axis=1)

            # Fix data types to remove .0 from numbers
            fix_data_types(combined_df)

            combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
        else:
            print("No new events found that aren't already in the existing CSV.")

        # Clean up temporary files
        if os.path.exists(temp_json_path):
            os.remove(temp_json_path)
        if os.path.exists(temp_csv_path):
            os.remove(temp_csv_path)

    except Exception as e:
        print(f"Error updating fights CSV: {e}")
        print("Falling back to creating new CSV from last_event.json only.")
        json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)


def fix_data_types(df):
    """
    Fix data types in the dataframe to remove .0 from numbers and preserve original format.

    Args:
        df (pandas.DataFrame): DataFrame to fix
    """
    for col in df.columns:
        if df[col].dtype == 'float64':
            # Column has no missing values and only whole numbers: cast to integer
            if df[col].notna().all() and (df[col] % 1 == 0).all():
                df[col] = df[col].astype('int64')
            elif df[col].isna().any():
                # Columns with missing values: keep as strings so whole numbers don't render with ".0"
                df[col] = df[col].fillna('').astype(str)
                # Strip the trailing ".0" from whole-number strings; missing values stay as empty strings
                df[col] = df[col].str.replace(r'\.0$', '', regex=True)