Spaces:
Running
Running
import matplotlib | |
matplotlib.use('Agg') # Use Agg backend for thread safety | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import json | |
import os | |
from leaderboard_utils import ( | |
get_organization, | |
get_mario_leaderboard, | |
get_sokoban_leaderboard, | |
get_2048_leaderboard, | |
get_candy_leaderboard, | |
get_tetris_leaderboard, | |
get_tetris_planning_leaderboard, | |
get_combined_leaderboard, | |
GAME_ORDER | |
) | |
# Load the model -> color mapping that create_group_bar_chart looks bar colors
# up in. The path is relative to the process working directory. The encoding
# is pinned because JSON is UTF-8 while open() otherwise falls back to the
# platform default.
with open('assets/model_color.json', 'r', encoding='utf-8') as f:
    MODEL_COLORS = json.load(f)

# Map each game's display name to a leaderboard column name for that game.
GAME_SCORE_COLUMNS = {
    "Super Mario Bros": "Score",
    "Sokoban": "Levels Cracked",
    "2048": "Score",
    "Candy Crash": "Average Score",
    "Tetris (complete)": "Score",
    "Tetris (planning only)": "Score"
}
def normalize_values(values, mean, std):
    """
    Map raw values onto a 0-100 scale via their z-scores.

    Each value is converted to a z-score with the supplied mean and standard
    deviation, then scaled so the mean lands on 50 and one standard deviation
    spans 30 points; results are clamped to [0, 100].

    Args:
        values (list): Values to normalize
        mean (float): Mean used for the z-score
        std (float): Standard deviation used for the z-score

    Returns:
        list: One normalized value per input value. When std is 0, positive
            inputs map to 50 and all other inputs map to 0.
    """
    if std == 0:
        # No spread to measure against: positives sit at the midpoint.
        flat = []
        for value in values:
            flat.append(50 if value > 0 else 0)
        return flat

    def to_scale(value):
        z = (value - mean) / std
        return max(0, min(100, (z * 30) + 50))

    return [to_scale(value) for value in values]
def simplify_model_name(model_name):
    """
    Return a short display label for a model name.

    Names with at least three hyphen-separated segments are reduced to their
    first three segments; any other name is cut to its first 11 characters.
    """
    segments = model_name.split('-')
    if len(segments) < 3:
        return model_name[:11]
    return '-'.join(segments[:3])
def _max_sokoban_level(levels_str):
    """Return the highest level in a ';'-separated string, or 0 if unparsable."""
    try:
        levels = [int(part.strip()) for part in levels_str.split(";") if part.strip()]
    except (AttributeError, TypeError, ValueError):
        # Non-string cells (no .split) and non-integer entries count as level 0.
        return 0
    return max(levels) if levels else 0


def _format_player_label(player, org, max_length=40):
    """Build the y-axis label "<player> [<org>]", shortening long player names."""
    if len(player) <= max_length:
        formatted = player
    else:
        parts = player.split('-')
        if len(parts) > 3:
            # Keep the first two segments and the last one.
            formatted = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
        else:
            formatted = player[:max_length - 3] + "..."
    return f"{formatted} [{org}]"


def create_horizontal_bar_chart(df, game_name):
    """
    Create horizontal bar chart for detailed game view

    Rows are sorted by score in ascending order and drawn as one bar per row.
    For Sokoban the plotted value is the highest level found in the
    ';'-separated "Levels Cracked" string; for every other supported game it
    is the game's score column.

    Args:
        df (pd.DataFrame): DataFrame containing game data. Must provide the
            'Player' and 'Organization' columns plus the game's score column
            ('Score', 'Average Score' or 'Levels Cracked'). It is not modified.
        game_name (str): Name of the game to display

    Returns:
        matplotlib.figure.Figure: The generated bar chart figure, or None when
            game_name is not a supported game.
    """
    # Score column per game; Sokoban is handled separately because its value
    # is derived rather than read directly.
    direct_score_columns = {
        "Super Mario Bros": "Score",
        "2048": "Score",
        "Candy Crash": "Average Score",
        "Tetris (complete)": "Score",
        "Tetris (planning only)": "Score",
    }
    # Reject unknown games before touching pyplot so no figure is created
    # (and left open) for a request that cannot be drawn.
    if game_name != "Sokoban" and game_name not in direct_score_columns:
        return None

    # Close any existing figures to prevent memory leaks
    plt.close('all')
    plt.style.use('default')

    if game_name == "Sokoban":
        score_col = 'Max Level'
        # Work on a copy so the derived column never leaks into the caller's frame.
        df = df.copy()
        df[score_col] = df['Levels Cracked'].apply(_max_sokoban_level)
    else:
        score_col = direct_score_columns[game_name]
    df_sorted = df.sort_values(by=score_col, ascending=True)

    # Wide figure to accommodate long model names
    fig, ax = plt.subplots(figsize=(20, 11))

    positions = range(len(df_sorted))
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))
    bars = ax.barh(positions, df_sorted[score_col], color=colors)

    # Add more space for labels on the left
    fig.subplots_adjust(left=0.3)

    ax.set_yticks(positions)
    player_labels = [
        _format_player_label(player, org)
        for player, org in zip(df_sorted['Player'], df_sorted['Organization'])
    ]
    ax.set_yticklabels(player_labels, fontsize=9)

    # Value labels at the end of each bar; Candy Crash scores are averages,
    # so they keep one decimal place.
    score_format = '.1f' if game_name == "Candy Crash" else '.0f'
    for bar in bars:
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height()/2,
                format(width, score_format),
                ha='left', va='center',
                fontsize=10,
                fontweight='bold',
                color='white',
                bbox=dict(facecolor=(0, 0, 0, 0.3),
                          edgecolor='none',
                          alpha=0.5,
                          pad=2))

    ax.set_title(f"{game_name} Performance",
                 pad=20,
                 fontsize=14,
                 fontweight='bold',
                 color='#2c3e50')
    x_label = "Maximum Level Reached" if game_name == "Sokoban" else score_col
    ax.set_xlabel(x_label,
                  fontsize=12,
                  fontweight='bold',
                  color='#2c3e50',
                  labelpad=10)

    ax.grid(True, axis='x', linestyle='--', alpha=0.3)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    return fig
def _score_to_float(value):
    """Convert a leaderboard cell to float, treating unparsable cells as 0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # Covers the '_' placeholder as well as None and other non-numeric cells.
        return 0.0


def _plot_radar(ax, data, title, game_columns, game_stats):
    """Draw one radar panel: a closed polygon per row of data."""
    ax.set_facecolor('white')
    categories = [col.replace(' Score', '') for col in game_columns]

    # One spoke per game; repeat the first angle so each polygon closes.
    angles = np.linspace(0, 2 * np.pi, len(game_columns), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    grid_values = [10, 30, 50, 70, 90]
    ax.set_rgrids(grid_values,
                  labels=grid_values,
                  angle=45,
                  fontsize=6,
                  alpha=0.7,
                  color='#404040')
    ax.grid(True, color='#404040', alpha=0.3)

    colors = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']
    for idx, (_, row) in enumerate(data.iterrows()):
        normalized_values = [
            normalize_values([_score_to_float(row[col])], *game_stats[col])[0]
            for col in game_columns
        ]
        normalized_values = np.concatenate((normalized_values, [normalized_values[0]]))
        color = colors[idx % len(colors)]  # palette repeats after six rows
        ax.plot(angles, normalized_values, 'o-', linewidth=2.0,
                label=simplify_model_name(row['Player']),
                color=color,
                markersize=4)
        ax.fill(angles, normalized_values, alpha=0.3, color=color)

    # Break long game names over two lines so they fit around the circle.
    label_overrides = {
        "Tetris (planning only)": "Tetris\n(planning)",
        "Tetris (complete)": "Tetris\n(complete)",
        "Super Mario Bros": "Super\nMario",
        "Candy Crash": "Candy\nCrash",
    }
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([label_overrides.get(game, game) for game in categories],
                       fontsize=8,
                       color='#202020',
                       fontweight='bold')
    ax.tick_params(pad=10, colors='#202020')
    ax.set_title(title,
                 pad=20,
                 fontsize=11,
                 color='#202020',
                 fontweight='bold')
    if len(data) > 0:
        # Without plotted rows there are no handles to put in a legend.
        ax.legend(loc='upper right',
                  bbox_to_anchor=(1.3, 1.1),
                  fontsize=7,
                  framealpha=0.9,
                  edgecolor='#404040',
                  ncol=1)
    ax.set_ylim(0, 105)
    ax.spines['polar'].set_color('#404040')
    ax.spines['polar'].set_alpha(0.5)


def create_radar_charts(df):
    """
    Create two radar charts with improved normalization using z-scores

    The rows of df are split into a "Reasoning Models" panel (players named in
    a fixed list) and a "Non-Reasoning Models" panel (everyone else). Every
    column whose name ends in ' Score' becomes one spoke. Scores are
    normalized to 0-100 with the mean and standard deviation of the whole
    frame, so both panels share one scale. Cells that cannot be parsed as a
    number (such as '_') count as 0.

    Args:
        df (pd.DataFrame): Must provide a 'Player' column; score columns are
            detected by the ' Score' suffix.

    Returns:
        matplotlib.figure.Figure: Figure holding the two polar axes. When df
            has no ' Score' columns the axes are left empty.
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    reasoning_models = [
        'claude-3-7-sonnet-20250219(thinking)',
        'o1-2024-12-17',
        'gemini-2.0-flash-thinking-exp-1219',
        'o3-mini-2025-01-31(medium)',
        'gemini-2.5-pro-exp-03-25',
        'o1-mini-2024-09-12',
        'deepseek-r1'
    ]
    is_reasoning = df['Player'].isin(reasoning_models)
    df_reasoning = df[is_reasoning]
    df_others = df[~is_reasoning]

    game_columns = [col for col in df.columns if col.endswith(' Score')]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6),
                                   subplot_kw=dict(projection='polar'))
    fig.patch.set_facecolor('white')

    if not game_columns:
        # Nothing to plot; a radar with zero spokes cannot be closed.
        return fig

    # Mean/std per game over all rows, computed once and shared by both panels.
    game_stats = {}
    for col in game_columns:
        column_values = [_score_to_float(val) for val in df[col]]
        game_stats[col] = (np.mean(column_values), np.std(column_values))

    _plot_radar(ax1, df_reasoning, "Reasoning Models", game_columns, game_stats)
    _plot_radar(ax2, df_others, "Non-Reasoning Models", game_columns, game_stats)

    fig.subplots_adjust(right=0.85, wspace=0.3)
    return fig
def get_combined_leaderboard_with_radar(rank_data, selected_games):
    """
    Build the combined leaderboard and its radar charts in one call

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
        selected_games: Game selection forwarded to get_combined_leaderboard

    Returns:
        tuple: (leaderboard DataFrame, radar chart figure drawn from it)
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games)
    return leaderboard, create_radar_charts(leaderboard)
def create_organization_radar_chart(rank_data):
    """
    Create radar chart comparing organizations

    Each organization's score per game is the mean over its players, with the
    '_' placeholder counted as 0. The averages are handed to
    create_radar_charts as one row per organization, with the organization
    name in the 'Player' column and one '<game> Score' column per game,
    because that is the layout create_radar_charts reads.

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard

    Returns:
        matplotlib.figure.Figure: The figure produced by create_radar_charts
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    score_columns = [f"{game} Score" for game in GAME_ORDER]
    rows = []
    for org in df["Organization"].unique():
        org_df = df[df["Organization"] == org]
        # str() keeps the label usable even if an organization cell is not text.
        row = {"Player": str(org)}
        for col in score_columns:
            game_scores = org_df[col].apply(lambda x: float(x) if x != "_" else 0)
            row[col] = game_scores.mean()
        rows.append(row)

    # Explicit columns keep the expected layout even when there are no rows.
    org_scores = pd.DataFrame(rows, columns=["Player", *score_columns])
    return create_radar_charts(org_scores)
def create_top_players_radar_chart(rank_data, n=5):
    """
    Create radar chart for the players in the first n leaderboard rows

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
        n (int): Number of leading rows whose players are charted

    Returns:
        matplotlib.figure.Figure: The figure produced by create_radar_charts
    """
    # Combined leaderboard with every game enabled
    all_games = dict.fromkeys(GAME_ORDER, True)
    leaderboard = get_combined_leaderboard(rank_data, all_games)

    # Names taken from the first n rows, in leaderboard order
    leading_names = leaderboard["Player"].iloc[:n].tolist()
    in_top = leaderboard["Player"].isin(leading_names)
    return create_radar_charts(leaderboard[in_top])
def create_player_radar_chart(rank_data, player_name):
    """
    Create radar chart for a single player

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
        player_name (str): Value to match against the 'Player' column

    Returns:
        matplotlib.figure.Figure: The figure produced by create_radar_charts,
            or None when no row matches player_name
    """
    # Combined leaderboard with every game enabled
    all_games = dict.fromkeys(GAME_ORDER, True)
    leaderboard = get_combined_leaderboard(rank_data, all_games)

    matches = leaderboard.loc[leaderboard["Player"] == player_name]
    return None if matches.empty else create_radar_charts(matches)
def _parse_positive_score(value):
    """Return value as a float when it is a number greater than 0, else None."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Covers the '_' placeholder as well as None and other non-numeric cells.
        return None
    return number if number > 0 else None


def create_group_bar_chart(df):
    """
    Create a grouped bar chart comparing AI model performance across different games

    For each game in GAME_ORDER that has a '<game> Score' column in df, the
    models with a positive score are ranked from highest to lowest, their
    scores are normalized to 0-100 within that game, and one bar per model is
    drawn. Models without a positive score for a game get no bar in that game.

    Args:
        df (pd.DataFrame): DataFrame containing the combined leaderboard data.
            Must provide a 'Player' column; when a player appears in several
            rows only the first row is used.

    Returns:
        matplotlib.figure.Figure: The generated group bar chart figure. It is
            left empty when df contains none of the game score columns.
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(20, 11))
    ax = fig.add_subplot(111)
    # Manual margins: extra room on the right for the legend.
    fig.subplots_adjust(top=0.90,
                        bottom=0.15,
                        right=0.85,
                        left=0.05)

    active_games = [game for game in GAME_ORDER if f"{game} Score" in df.columns]
    n_games = len(active_games)
    if n_games == 0:
        return fig  # Return empty figure if no games are selected

    # One row per player (the first), in order of first appearance.
    first_rows = df.drop_duplicates(subset='Player', keep='first')
    first_rows = first_rows[first_rows['Player'].notna()]

    # A stable per-model index keeps the fallback color of a model that is
    # missing from MODEL_COLORS identical in every game.
    fallback_index = {model: idx for idx, model in enumerate(first_rows['Player'])}

    legend_order = []  # models in order of first appearance with a valid score
    labelled = set()   # models that already own a legend entry

    for game_idx, game in enumerate(active_games):
        score_col = f"{game} Score"

        game_scores = []
        for model, raw_score in zip(first_rows['Player'], first_rows[score_col]):
            score = _parse_positive_score(raw_score)
            if score is None:
                continue
            game_scores.append((model, score))
            if model not in legend_order:
                legend_order.append(model)

        if not game_scores:  # Skip if no valid scores for this game
            continue

        # Sort scores from highest to lowest
        game_scores.sort(key=lambda pair: pair[1], reverse=True)
        sorted_models = [model for model, _ in game_scores]
        scores = [score for _, score in game_scores]

        # Normalize within the game using its own mean and std
        normalized_scores = normalize_values(scores, np.mean(scores), np.std(scores))

        # The whole group spans 0.8 x-units, starting at the game's tick.
        bar_width = 0.8 / len(sorted_models)

        for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
            color = MODEL_COLORS.get(model, f"C{fallback_index[model] % 10}")
            # Label only the first bar of each model so the legend has one entry.
            label = "" if model in labelled else model
            labelled.add(model)
            ax.bar(game_idx + i*bar_width, score,
                   width=bar_width,
                   label=label,
                   color=color,
                   alpha=0.8)

    # Customize the plot
    ax.set_xticks(np.arange(n_games))
    ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
    ax.set_ylabel('Normalized Performance Score', fontsize=12)
    ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
                 fontsize=14, pad=20)
    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

    # Legend entries follow each model's first appearance across active games.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    sorted_labels = [model for model in legend_order if model in by_label]
    sorted_handles = [by_label[model] for model in sorted_labels]
    ax.legend(sorted_handles, sorted_labels,
              bbox_to_anchor=(1.00, 1),  # anchored at the right edge of the axes
              loc='upper left',
              fontsize=9,
              title='AI Models',
              title_fontsize=10)

    # No need for tight_layout() as we're manually controlling the spacing
    return fig
def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
    """
    Build the combined leaderboard and its group bar chart in one call

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status

    Returns:
        tuple: (DataFrame, matplotlib.figure.Figure) containing the leaderboard
            data and the group bar chart drawn from it
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games)
    return leaderboard, create_group_bar_chart(leaderboard)
def save_visualization(fig, filename):
    """
    Write a figure to disk

    Args:
        fig: Figure to save; its savefig method is called
        filename: Destination passed straight to savefig
    """
    # Tight bounding box at 300 dpi
    save_options = {'bbox_inches': 'tight', 'dpi': 300}
    fig.savefig(filename, **save_options)