"""Chart builders (bar, radar, grouped bar) for the game leaderboard.

All figures are rendered with the non-interactive Agg backend and returned as
``matplotlib.figure.Figure`` objects; nothing is shown on screen.
"""
import matplotlib
matplotlib.use('Agg')  # Use Agg backend for thread safety
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import os
from leaderboard_utils import (
    get_organization,
    get_mario_leaderboard,
    get_sokoban_leaderboard,
    get_2048_leaderboard,
    get_candy_leaderboard,
    get_tetris_leaderboard,
    get_tetris_planning_leaderboard,
    get_combined_leaderboard,
    GAME_ORDER
)

# Load model colors
with open('assets/model_color.json', 'r') as f:
    MODEL_COLORS = json.load(f)

# Define game score columns mapping
GAME_SCORE_COLUMNS = {
    "Super Mario Bros": "Score",
    "Sokoban": "Levels Cracked",
    "2048": "Score",
    "Candy Crash": "Average Score",
    "Tetris (complete)": "Score",
    "Tetris (planning only)": "Score"
}

# Players plotted on the "Reasoning Models" radar; everyone else goes on the
# "Non-Reasoning Models" radar.
_REASONING_MODELS = [
    'claude-3-7-sonnet-20250219(thinking)',
    'o1-2024-12-17',
    'gemini-2.0-flash-thinking-exp-1219',
    'o3-mini-2025-01-31(medium)',
    'gemini-2.5-pro-exp-03-25',
    'o1-mini-2024-09-12',
    'deepseek-r1'
]

# Two-line axis labels used on the radar charts for long game names.
_RADAR_AXIS_LABELS = {
    "Tetris (planning only)": "Tetris\n(planning)",
    "Tetris (complete)": "Tetris\n(complete)",
    "Super Mario Bros": "Super\nMario",
    "Candy Crash": "Candy\nCrash",
}

# Line/fill colors cycled through by the radar plots.
_RADAR_COLORS = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']


def _coerce_score(value):
    """
    Convert a leaderboard cell to a float.

    The '_' placeholder and anything ``float()`` cannot parse count as 0.
    """
    if isinstance(value, str) and value == '_':
        return 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _max_sokoban_level(levels_str):
    """
    Return the highest level in a ';'-separated string such as "1; 2; 3".

    Empty strings, non-string cells and non-integer entries yield 0.
    """
    try:
        levels = [int(part.strip()) for part in levels_str.split(";") if part.strip()]
    except (AttributeError, TypeError, ValueError):
        return 0
    return max(levels) if levels else 0


def _format_player_label(player, org, max_length=40):
    """
    Build a y-axis label "<player> [<org>]", shortening over-long player names.

    Names longer than ``max_length`` keep their first two and last hyphen
    separated parts when there are more than three parts; otherwise they are
    cut to ``max_length`` characters including a trailing "...".
    """
    if len(player) <= max_length:
        shortened = player
    else:
        parts = player.split('-')
        if len(parts) > 3:
            shortened = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
        else:
            shortened = player[:max_length-3] + "..."
    return f"{shortened} [{org}]"


def _first_positive_score(df, model, score_col):
    """
    Return the model's score in ``score_col`` as a float, or None.

    Only the first row whose 'Player' equals ``model`` is consulted. None is
    returned when there is no such row or the score is missing, unparseable
    or not greater than zero.
    """
    cells = df.loc[df['Player'] == model, score_col]
    if cells.empty:
        return None
    score = _coerce_score(cells.iloc[0])
    return score if score > 0 else None


def normalize_values(values, mean, std):
    """
    Normalize values using z-score and scale to 0-100 range

    Args:
        values (list): List of values to normalize
        mean (float): Mean value for normalization
        std (float): Standard deviation for normalization

    Returns:
        list: Normalized values scaled to 0-100 range
    """
    if std == 0:
        # No spread to normalize against: positive values sit at the midpoint.
        return [50 if v > 0 else 0 for v in values]
    # One z-score unit is worth 30 points around a mean of 50, clamped to [0, 100].
    return [max(0, min(100, ((v - mean) / std) * 30 + 50)) for v in values]


def simplify_model_name(model_name):
    """
    Shorten a model name for use in legends.

    Names with at least three hyphen-separated parts keep their first three
    parts; all other names are cut to their first 11 characters.
    """
    hyphen_parts = model_name.split('-')
    if len(hyphen_parts) >= 3:
        return '-'.join(hyphen_parts[:3])
    return model_name[:11]


def create_horizontal_bar_chart(df, game_name):
    """
    Create horizontal bar chart for detailed game view

    Args:
        df (pd.DataFrame): DataFrame containing game data. It must have
            'Player' and 'Organization' columns plus the score column listed
            for the game in GAME_SCORE_COLUMNS. The frame is not modified.
        game_name (str): Name of the game to display

    Returns:
        matplotlib.figure.Figure: The generated bar chart figure, or None
        when ``game_name`` is not a key of GAME_SCORE_COLUMNS.
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Set style
    plt.style.use('default')

    # Reject unknown games before a figure is created so none is left orphaned.
    source_col = GAME_SCORE_COLUMNS.get(game_name)
    if source_col is None:
        return None

    if game_name == "Sokoban":
        # Sokoban stores a ';'-separated list of levels; rank by the highest one.
        # assign() returns a new frame, so the caller's DataFrame stays untouched.
        score_col = 'Max Level'
        df = df.assign(**{score_col: df[source_col].apply(_max_sokoban_level)})
    else:
        score_col = source_col
    df_sorted = df.sort_values(by=score_col, ascending=True)

    # Increase figure width to accommodate long model names
    fig, ax = plt.subplots(figsize=(20, 11))

    # Create color gradient
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))

    # Create horizontal bars
    positions = range(len(df_sorted))
    bars = ax.barh(positions, df_sorted[score_col], color=colors)

    # Add more space for labels on the left
    fig.subplots_adjust(left=0.3)

    # Label each bar with the (possibly shortened) player name and organization
    ax.set_yticks(positions)
    player_labels = [
        _format_player_label(player, org)
        for player, org in zip(df_sorted['Player'], df_sorted['Organization'])
    ]
    ax.set_yticklabels(player_labels, fontsize=9)

    # Add value labels on the bars
    value_format = '{:.1f}' if game_name == "Candy Crash" else '{:.0f}'
    for bar in bars:
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height()/2,
                value_format.format(width),
                ha='left', va='center',
                fontsize=10,
                fontweight='bold',
                color='white',
                bbox=dict(facecolor=(0, 0, 0, 0.3),
                          edgecolor='none',
                          alpha=0.5,
                          pad=2))

    # Set title and labels
    ax.set_title(f"{game_name} Performance",
                 pad=20,
                 fontsize=14,
                 fontweight='bold',
                 color='#2c3e50')
    x_label = "Maximum Level Reached" if game_name == "Sokoban" else score_col
    ax.set_xlabel(x_label,
                  fontsize=12,
                  fontweight='bold',
                  color='#2c3e50',
                  labelpad=10)

    # Add grid lines
    ax.grid(True, axis='x', linestyle='--', alpha=0.3)

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Adjust layout
    fig.tight_layout()

    return fig


def create_radar_charts(df):
    """
    Create two radar charts with improved normalization using z-scores

    The left chart shows the players listed in _REASONING_MODELS, the right
    chart shows all other players. Each axis is one ``"<game> Score"`` column,
    normalized with the mean and standard deviation of that column over the
    whole of ``df``.

    Args:
        df (pd.DataFrame): Frame with a 'Player' column and zero or more
            columns whose names end in ' Score'. Cells equal to '_' or not
            parseable as numbers count as 0.

    Returns:
        matplotlib.figure.Figure: Figure holding both polar subplots. When
        ``df`` has no ' Score' columns the subplots only carry their titles.
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Split dataframe into reasoning and non-reasoning models
    is_reasoning = df['Player'].isin(_REASONING_MODELS)
    df_reasoning = df[is_reasoning]
    df_others = df[~is_reasoning]

    # Get game columns
    game_columns = [col for col in df.columns if col.endswith(' Score')]
    categories = [col.replace(' Score', '') for col in game_columns]

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6),
                                   subplot_kw=dict(projection='polar'))
    fig.patch.set_facecolor('white')

    title_style = dict(pad=20, fontsize=11, color='#202020', fontweight='bold')
    panels = (
        (ax1, df_reasoning, "Reasoning Models"),
        (ax2, df_others, "Non-Reasoning Models"),
    )

    if not game_columns:
        # Nothing to plot: a radar needs at least one axis.
        for ax, _, title in panels:
            ax.set_facecolor('white')
            ax.set_title(title, **title_style)
        return fig

    # Mean/std per game over ALL players, computed once and shared by both
    # panels so the two charts are on the same scale.
    game_stats = {}
    for col in game_columns:
        column_values = [_coerce_score(cell) for cell in df[col]]
        game_stats[col] = (np.mean(column_values), np.std(column_values))

    # One angle per game, with the first angle repeated to close the polygon.
    angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))
    axis_labels = [_RADAR_AXIS_LABELS.get(game, game) for game in categories]
    grid_values = [10, 30, 50, 70, 90]

    def setup_radar_plot(ax, data, title):
        ax.set_facecolor('white')

        # Radial grid and its labels
        ax.set_rgrids(grid_values,
                      labels=grid_values,
                      angle=45,
                      fontsize=6,
                      alpha=0.7,
                      color='#404040')
        ax.grid(True, color='#404040', alpha=0.3)

        # One closed polygon per player
        for idx, (_, row) in enumerate(data.iterrows()):
            normalized = [
                normalize_values([_coerce_score(row[col])], *game_stats[col])[0]
                for col in game_columns
            ]
            normalized.append(normalized[0])  # Complete the circular plot

            color = _RADAR_COLORS[idx % len(_RADAR_COLORS)]
            ax.plot(angles, normalized, 'o-',
                    linewidth=2.0,
                    label=simplify_model_name(row['Player']),
                    color=color,
                    markersize=4)
            ax.fill(angles, normalized,
                    alpha=0.3,
                    color=color)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(axis_labels,
                           fontsize=8,
                           color='#202020',
                           fontweight='bold')
        ax.tick_params(pad=10, colors='#202020')

        ax.set_title(title, **title_style)

        ax.legend(loc='upper right',
                  bbox_to_anchor=(1.3, 1.1),
                  fontsize=7,
                  framealpha=0.9,
                  edgecolor='#404040',
                  ncol=1)

        # Slightly above 100 so points at the maximum are not clipped.
        ax.set_ylim(0, 105)
        ax.spines['polar'].set_color('#404040')
        ax.spines['polar'].set_alpha(0.5)

    # Setup both plots
    for ax, data, title in panels:
        setup_radar_plot(ax, data, title)

    fig.subplots_adjust(right=0.85, wspace=0.3)
    return fig


def get_combined_leaderboard_with_radar(rank_data, selected_games):
    """
    Get combined leaderboard and create radar charts

    Args:
        rank_data: Rank data passed through to get_combined_leaderboard
        selected_games: Game selection passed through to get_combined_leaderboard

    Returns:
        tuple: (combined leaderboard DataFrame, radar chart figure)
    """
    df = get_combined_leaderboard(rank_data, selected_games)
    return df, create_radar_charts(df)


def create_organization_radar_chart(rank_data):
    """
    Create radar chart comparing organizations

    Every organization becomes one radar polygon whose value per game is the
    mean of its players' scores ('_' and unparseable scores count as 0).

    NOTE(review): the result is drawn by create_radar_charts, which splits its
    rows by reasoning-model name, so organizations normally all appear on the
    "Non-Reasoning Models" panel.

    Args:
        rank_data: Rank data passed through to get_combined_leaderboard

    Returns:
        matplotlib.figure.Figure: The radar chart figure
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    score_columns = [f"{game} Score" for game in GAME_ORDER]

    # One row per organization, shaped like the frame create_radar_charts
    # expects: a 'Player' label column plus one '<game> Score' column per game.
    rows = []
    for org in df["Organization"].unique():
        org_df = df[df["Organization"] == org]
        row = {"Player": str(org)}
        for col in score_columns:
            row[col] = org_df[col].apply(_coerce_score).mean()
        rows.append(row)

    org_frame = pd.DataFrame(rows, columns=["Player"] + score_columns)
    return create_radar_charts(org_frame)


def create_top_players_radar_chart(rank_data, n=5):
    """
    Create radar chart for top N players

    "Top" means the first ``n`` rows of the combined leaderboard in the order
    get_combined_leaderboard returns them.

    Args:
        rank_data: Rank data passed through to get_combined_leaderboard
        n (int): Number of leading players to include

    Returns:
        matplotlib.figure.Figure: The radar chart figure
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Get top N players
    top_players = df["Player"].head(n).tolist()

    # Create radar chart for top players
    return create_radar_charts(df[df["Player"].isin(top_players)])


def create_player_radar_chart(rank_data, player_name):
    """
    Create radar chart for a specific player

    Args:
        rank_data: Rank data passed through to get_combined_leaderboard
        player_name (str): Exact 'Player' value to look up

    Returns:
        matplotlib.figure.Figure: The radar chart figure, or None when the
        player is not in the combined leaderboard.
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Get player's data
    player_df = df[df["Player"] == player_name]
    if player_df.empty:
        return None

    # Create radar chart for the player
    return create_radar_charts(player_df)


def create_group_bar_chart(df):
    """
    Create a grouped bar chart comparing AI model performance across different games

    For each game in GAME_ORDER that has a ``"<game> Score"`` column, the
    models with a positive score are z-score normalized (see normalize_values)
    and drawn as one group of bars, highest score first.

    Args:
        df (pd.DataFrame): DataFrame containing the combined leaderboard data

    Returns:
        matplotlib.figure.Figure: The generated group bar chart figure
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Create figure and axis with better styling
    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(20, 11))
    ax = fig.add_subplot(111)

    # Manual margins: extra room at the top, bottom and on the right for the legend
    fig.subplots_adjust(top=0.90,
                        bottom=0.15,
                        right=0.85,
                        left=0.05)

    # Get unique models
    models = df['Player'].unique()

    # Get active games (those that have score columns in the DataFrame)
    active_games = [game for game in GAME_ORDER if f"{game} Score" in df.columns]
    n_games = len(active_games)
    if n_games == 0:
        return fig  # Return empty figure if no games are selected

    # Fallback color per model (not per bar position) so a model without an
    # entry in MODEL_COLORS keeps one color in every game and in the legend.
    fallback_colors = {model: f"C{i % 10}" for i, model in enumerate(models)}

    # model -> handle of its first bar. Insertion order is the order of first
    # appearance (game order, then leaderboard order) and drives the legend.
    legend_handles = {}

    for game_idx, game in enumerate(active_games):
        score_col = f"{game} Score"

        # Collect (model, score) for models with a positive score in this game
        game_scores = []
        for model in models:
            score = _first_positive_score(df, model, score_col)
            if score is None:
                continue
            game_scores.append((model, score))
            legend_handles.setdefault(model, None)

        if not game_scores:  # Skip if no valid scores for this game
            continue

        # Sort scores from highest to lowest
        game_scores.sort(key=lambda pair: pair[1], reverse=True)
        sorted_models = [pair[0] for pair in game_scores]
        scores = [pair[1] for pair in game_scores]

        # Normalize scores against this game's own mean and std
        normalized_scores = normalize_values(scores, np.mean(scores), np.std(scores))

        # The whole group occupies 0.8 of the slot between two ticks
        bar_width = 0.8 / len(sorted_models)

        # Plot bars for each model
        for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
            first_appearance = legend_handles[model] is None
            color = MODEL_COLORS.get(model, fallback_colors[model])
            bar = ax.bar(game_idx + i*bar_width, score,
                         width=bar_width,
                         label=model if first_appearance else "",
                         color=color,
                         alpha=0.8)
            if first_appearance:
                legend_handles[model] = bar

    # Customize the plot
    ax.set_xticks(np.arange(n_games))
    ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
    ax.set_ylabel('Normalized Performance Score', fontsize=12)
    ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
                 fontsize=14, pad=20)

    # Add grid lines
    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

    # Legend with one entry per model, anchored just outside the right edge
    ax.legend(list(legend_handles.values()), list(legend_handles),
              bbox_to_anchor=(1.00, 1),
              loc='upper left',
              fontsize=9,
              title='AI Models',
              title_fontsize=10)

    # No need for tight_layout() as we're manually controlling the spacing

    return fig


def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
    """
    Get combined leaderboard and create group bar chart

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status

    Returns:
        tuple: (DataFrame, matplotlib.figure.Figure) containing the leaderboard data and group bar chart
    """
    df = get_combined_leaderboard(rank_data, selected_games)
    return df, create_group_bar_chart(df)


def save_visualization(fig, filename):
    """
    Save visualization to file

    Args:
        fig (matplotlib.figure.Figure): Figure to write
        filename (str): Destination path; written at 300 dpi with a tight bounding box
    """
    fig.savefig(filename, bbox_inches='tight', dpi=300)