Yuxuan-Zhang-Dexter committed · Commit 6ebb0fb · 1 Parent(s): 72bd46b
update gradip app
Browse files
- assets/game_video_link.json +6 -0
- assets/model_color.json +17 -0
- assets/news.json +34 -0
- data_visualization.py +550 -0
- leaderboard_utils.py +288 -0
- rank_data_03_25_2025.json +324 -0
- requirements.txt +4 -1
- src/about.py +0 -72
- src/display/css_html_js.py +0 -105
- src/display/formatting.py +0 -27
- src/display/utils.py +0 -110
- src/envs.py +0 -25
- src/leaderboard/read_evals.py +0 -196
- src/populate.py +0 -58
- src/submission/check_validity.py +0 -99
- src/submission/submit.py +0 -119
assets/game_video_link.json
ADDED
@@ -0,0 +1,6 @@
{
    "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
    "super_mario": "https://www.youtube.com/watch?v=nixMIJZYAgg",
    "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
    "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg"
}
assets/model_color.json
ADDED
@@ -0,0 +1,17 @@
{
    "claude-3-7-sonnet-20250219": "#4A90E2",
    "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
    "claude-3-5-haiku-20241022": "#7FB5E6",
    "claude-3-5-sonnet-20241022": "#1A4C7C",
    "gemini-2.0-flash": "#FF4081",
    "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
    "gemini-2.5-pro-exp-03-25": "#FF80AB",
    "gpt-4o-2024-11-20": "#00BFA5",
    "gpt-4.5-preview-2025-02-27": "#00796B",
    "o1-2024-12-17": "#4DB6AC",
    "o1-mini-2024-09-12": "#26A69A",
    "o3-mini-2025-01-31(medium)": "#80CBC4",
    "deepseek-v3": "#FFC107",
    "deepseek-r1": "#FFA000",
    "Llama-4-Maverick-17B-128E-Instruct-FP8": "#8E24AA"
}
assets/news.json
ADDED
@@ -0,0 +1,34 @@
{
    "news": [
        {
            "date": "2025-04-01",
            "video_link": "https://www.youtube.com/watch?v=uFVpNor7l_E",
            "twitter_text": "Google's Gemini 2.5 Pro redefines AI gameplay: its multi-modal edge outperforms o1 & Claude 3.7 in Sokoban.",
            "twitter_link": "https://x.com/haoailab/status/1907140718650704204"
        },
        {
            "date": "2025-03-18",
            "video_link": "https://www.youtube.com/watch?v=b-Uyz3W4yIg",
            "twitter_text": "Candy Crush Saga's Hidden Complexity: Top AI Models Take the Challenge",
            "twitter_link": "https://x.com/haoailab/status/1902095369808601551"
        },
        {
            "date": "2025-03-14",
            "video_link": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
            "twitter_text": "2048 Mastery: Only Two AI Models Crack the Code to Surpass Random Play",
            "twitter_link": "https://x.com/haoailab/status/1900645722095317255"
        },
        {
            "date": "2025-03-06",
            "video_link": "https://www.youtube.com/watch?v=59enV32MBUE",
            "twitter_text": "Sokoban Showdown: o3-mini Dominates by Reaching Level 4",
            "twitter_link": "https://x.com/haoailab/status/1897792946646421514"
        },
        {
            "date": "2025-02-28",
            "video_link": "https://www.youtube.com/watch?v=nixMIJZYAgg",
            "twitter_text": "Super Mario AI Revolution: Claude-3.7 Sets Unprecedented Gameplay Benchmarks",
            "twitter_link": "https://x.com/haoailab/status/1895557913621795076"
        }
    ]
}
data_visualization.py
ADDED
@@ -0,0 +1,550 @@
import matplotlib
matplotlib.use('Agg')  # Use Agg backend for thread safety
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import os
from leaderboard_utils import (
    get_organization,
    get_mario_leaderboard,
    get_sokoban_leaderboard,
    get_2048_leaderboard,
    get_candy_leaderboard,
    get_tetris_leaderboard,
    get_tetris_planning_leaderboard,
    get_combined_leaderboard,
    GAME_ORDER
)

# Load model colors
with open('assets/model_color.json', 'r') as f:
    MODEL_COLORS = json.load(f)

# Define game score columns mapping
GAME_SCORE_COLUMNS = {
    "Super Mario Bros": "Score",
    "Sokoban": "Levels Cracked",
    "2048": "Score",
    "Candy Crash": "Average Score",
    "Tetris (complete)": "Score",
    "Tetris (planning only)": "Score"
}

def normalize_values(values, mean, std):
    """
    Normalize values using z-score and scale to 0-100 range

    Args:
        values (list): List of values to normalize
        mean (float): Mean value for normalization
        std (float): Standard deviation for normalization

    Returns:
        list: Normalized values scaled to 0-100 range
    """
    if std == 0:
        return [50 if v > 0 else 0 for v in values]  # Handle zero std case
    z_scores = [(v - mean) / std for v in values]
    # Scale z-scores to 0-100 range, with mean at 50
    scaled_values = [max(0, min(100, (z * 30) + 50)) for z in z_scores]
    return scaled_values

def simplify_model_name(model_name):
    """
    Simplify model name by either taking first 11 chars or string before third '-'
    """
    hyphen_parts = model_name.split('-')
    return '-'.join(hyphen_parts[:3]) if len(hyphen_parts) >= 3 else model_name[:11]

def create_horizontal_bar_chart(df, game_name):
    """
    Create horizontal bar chart for detailed game view

    Args:
        df (pd.DataFrame): DataFrame containing game data
        game_name (str): Name of the game to display

    Returns:
        matplotlib.figure.Figure: The generated bar chart figure
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Set style
    plt.style.use('default')
    # Increase figure width to accommodate long model names
    fig, ax = plt.subplots(figsize=(20, 11))

    # Sort by score
    if game_name == "Super Mario Bros":
        score_col = "Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    elif game_name == "Sokoban":
        # Process Sokoban scores by splitting and getting max level
        def get_max_level(levels_str):
            try:
                # Split by semicolon, strip whitespace, filter empty strings, convert to integers
                levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
                return max(levels) if levels else 0
            except:
                return 0

        # Create a temporary column with max levels
        df['Max Level'] = df['Levels Cracked'].apply(get_max_level)
        df_sorted = df.sort_values(by='Max Level', ascending=True)
        score_col = 'Max Level'
    elif game_name == "2048":
        score_col = "Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    elif game_name == "Candy Crash":
        score_col = "Average Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
        score_col = "Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    else:
        return None

    # Create color gradient
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))

    # Create horizontal bars
    bars = ax.barh(range(len(df_sorted)), df_sorted[score_col], color=colors)

    # Add more space for labels on the left
    plt.subplots_adjust(left=0.3)

    # Customize the chart
    ax.set_yticks(range(len(df_sorted)))

    # Format player names: keep organization info and truncate the rest if too long
    def format_player_name(player, org):
        max_length = 40  # Maximum length for player name
        if len(player) > max_length:
            # Keep the first part and last part of the name
            parts = player.split('-')
            if len(parts) > 3:
                formatted = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
            else:
                formatted = player[:max_length-3] + "..."
        else:
            formatted = player
        return f"{formatted} [{org}]"

    player_labels = [format_player_name(row['Player'], row['Organization'])
                     for _, row in df_sorted.iterrows()]
    ax.set_yticklabels(player_labels, fontsize=9)

    # Add value labels on the bars
    for i, bar in enumerate(bars):
        width = bar.get_width()
        if game_name == "Candy Crash":
            score_text = f'{width:.1f}'
        else:
            score_text = f'{width:.0f}'

        ax.text(width, bar.get_y() + bar.get_height()/2,
                score_text,
                ha='left', va='center',
                fontsize=10,
                fontweight='bold',
                color='white',
                bbox=dict(facecolor=(0, 0, 0, 0.3),
                          edgecolor='none',
                          alpha=0.5,
                          pad=2))

    # Set title and labels
    ax.set_title(f"{game_name} Performance",
                 pad=20,
                 fontsize=14,
                 fontweight='bold',
                 color='#2c3e50')

    if game_name == "Sokoban":
        ax.set_xlabel("Maximum Level Reached",
                      fontsize=12,
                      fontweight='bold',
                      color='#2c3e50',
                      labelpad=10)
    else:
        ax.set_xlabel(score_col,
                      fontsize=12,
                      fontweight='bold',
                      color='#2c3e50',
                      labelpad=10)

    # Add grid lines
    ax.grid(True, axis='x', linestyle='--', alpha=0.3)

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Adjust layout
    plt.tight_layout()

    return fig

def create_radar_charts(df):
    """
    Create two radar charts with improved normalization using z-scores
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Define reasoning models
    reasoning_models = [
        'claude-3-7-sonnet-20250219(thinking)',
        'o1-2024-12-17',
        'gemini-2.0-flash-thinking-exp-1219',
        'o3-mini-2025-01-31(medium)',
        'gemini-2.5-pro-exp-03-25',
        'o1-mini-2024-09-12',
        'deepseek-r1'
    ]

    # Split dataframe into reasoning and non-reasoning models
    df_reasoning = df[df['Player'].isin(reasoning_models)]
    df_others = df[~df['Player'].isin(reasoning_models)]

    # Get game columns
    game_columns = [col for col in df.columns if col.endswith(' Score')]
    categories = [col.replace(' Score', '') for col in game_columns]

    # Create figure with two subplots - adjusted size for new layout
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6), subplot_kw=dict(projection='polar'))
    fig.patch.set_facecolor('white')  # Set figure background to white

    def get_game_stats(df, game_col):
        """
        Get mean and std for a game column, handling missing values
        """
        values = []
        for val in df[game_col]:
            if isinstance(val, str) and val == '_':
                values.append(0)
            else:
                try:
                    values.append(float(val))
                except:
                    values.append(0)
        return np.mean(values), np.std(values)

    def setup_radar_plot(ax, data, title):
        ax.set_facecolor('white')  # Set subplot background to white

        num_vars = len(categories)
        angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
        angles = np.concatenate((angles, [angles[0]]))

        # Plot grid lines with darker color
        grid_values = [10, 30, 50, 70, 90]
        ax.set_rgrids(grid_values,
                      labels=grid_values,
                      angle=45,
                      fontsize=6,
                      alpha=0.7,  # Increased alpha for better visibility
                      color='#404040')  # Darker color for grid labels

        # Make grid lines darker but still subtle
        ax.grid(True, color='#404040', alpha=0.3)  # Darker grid lines

        # Define darker, more vibrant colors for the radar plots
        colors = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']

        # Calculate game statistics once
        game_stats = {col: get_game_stats(df, col) for col in game_columns}

        # Plot data with darker lines and higher opacity for fills
        for idx, (_, row) in enumerate(data.iterrows()):
            values = []
            for col in game_columns:
                val = row[col]
                if isinstance(val, str) and val == '_':
                    values.append(0)
                else:
                    try:
                        values.append(float(val))
                    except:
                        values.append(0)

            # Normalize values using game statistics
            normalized_values = []
            for i, v in enumerate(values):
                mean, std = game_stats[game_columns[i]]
                normalized_value = normalize_values([v], mean, std)[0]
                normalized_values.append(normalized_value)

            # Complete the circular plot
            normalized_values = np.concatenate((normalized_values, [normalized_values[0]]))

            model_name = simplify_model_name(row['Player'])
            ax.plot(angles, normalized_values, 'o-', linewidth=2.0,  # Increased line width
                    label=model_name,
                    color=colors[idx % len(colors)],
                    markersize=4)  # Increased marker size
            ax.fill(angles, normalized_values,
                    alpha=0.3,  # Increased fill opacity
                    color=colors[idx % len(colors)])

        # Format categories
        formatted_categories = []
        for game in categories:
            if game == "Tetris (planning only)":
                game = "Tetris\n(planning)"
            elif game == "Tetris (complete)":
                game = "Tetris\n(complete)"
            elif game == "Super Mario Bros":
                game = "Super\nMario"
            elif game == "Candy Crash":
                game = "Candy\nCrash"
            formatted_categories.append(game)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(formatted_categories,
                           fontsize=8,  # Slightly larger font
                           color='#202020',  # Darker text
                           fontweight='bold')  # Bold text
        ax.tick_params(pad=10, colors='#202020')  # Darker tick colors

        ax.set_title(title,
                     pad=20,
                     fontsize=11,  # Slightly larger title
                     color='#202020',  # Darker title
                     fontweight='bold')  # Bold title

        legend = ax.legend(loc='upper right',
                           bbox_to_anchor=(1.3, 1.1),
                           fontsize=7,  # Slightly larger legend
                           framealpha=0.9,  # More opaque legend
                           edgecolor='#404040',  # Darker edge
                           ncol=1)

        ax.set_ylim(0, 105)
        ax.spines['polar'].set_color('#404040')  # Darker spine
        ax.spines['polar'].set_alpha(0.5)  # More visible spine

    # Setup both plots
    setup_radar_plot(ax1, df_reasoning, "Reasoning Models")
    setup_radar_plot(ax2, df_others, "Non-Reasoning Models")

    plt.subplots_adjust(right=0.85, wspace=0.3)

    return fig

def get_combined_leaderboard_with_radar(rank_data, selected_games):
    """
    Get combined leaderboard and create radar charts
    """
    df = get_combined_leaderboard(rank_data, selected_games)
    radar_fig = create_radar_charts(df)
    return df, radar_fig

def create_organization_radar_chart(rank_data):
    """
    Create radar chart comparing organizations
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Group by organization and calculate average scores
    org_performance = {}
    for org in df["Organization"].unique():
        org_df = df[df["Organization"] == org]
        scores = {}
        for game in GAME_ORDER:
            game_scores = org_df[f"{game} Score"].apply(lambda x: float(x) if x != "_" else 0)
            scores[game] = game_scores.mean()
        org_performance[org] = scores

    # Create radar chart
    return create_radar_charts(pd.DataFrame([org_performance]))

def create_top_players_radar_chart(rank_data, n=5):
    """
    Create radar chart for top N players
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Get top N players
    top_players = df["Player"].head(n).tolist()

    # Create radar chart for top players
    return create_radar_charts(df[df["Player"].isin(top_players)])

def create_player_radar_chart(rank_data, player_name):
    """
    Create radar chart for a specific player
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Get player's data
    player_df = df[df["Player"] == player_name]

    if player_df.empty:
        return None

    # Create radar chart for the player
    return create_radar_charts(player_df)

def create_group_bar_chart(df):
    """
    Create a grouped bar chart comparing AI model performance across different games

    Args:
        df (pd.DataFrame): DataFrame containing the combined leaderboard data

    Returns:
        matplotlib.figure.Figure: The generated group bar chart figure
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Create figure and axis with better styling
    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(20, 11))

    # Create subplot with specific spacing
    ax = plt.subplot(111)

    # Adjust the subplot parameters
    plt.subplots_adjust(top=0.90,    # Add more space at the top
                        bottom=0.15, # Add more space at the bottom
                        right=0.85,  # Add more space for legend
                        left=0.05)   # Add space on the left

    # Get unique models
    models = df['Player'].unique()

    # Get active games (those that have score columns in the DataFrame)
    active_games = []
    for game in GAME_ORDER:
        score_col = f"{game} Score"  # Use the same column name for all games
        if score_col in df.columns:
            active_games.append(game)

    n_games = len(active_games)
    if n_games == 0:
        return fig  # Return empty figure if no games are selected

    # Keep track of which models have data in any game
    models_with_data = set()

    # Calculate normalized scores for each game
    for game_idx, game in enumerate(active_games):
        # Get all scores for this game
        game_scores = []

        # Use the same score column name for all games
        score_col = f"{game} Score"

        for model in models:
            try:
                score = df[df['Player'] == model][score_col].values[0]
                if score != '_' and float(score) > 0:  # Only include non-zero scores
                    game_scores.append((model, float(score)))
                    models_with_data.add(model)  # Add model to set if it has valid data
            except (IndexError, ValueError):
                continue

        if not game_scores:  # Skip if no valid scores for this game
            continue

        # Sort scores from highest to lowest
        game_scores.sort(key=lambda x: x[1], reverse=True)

        # Extract sorted models and scores
        sorted_models = [x[0] for x in game_scores]
        scores = [x[1] for x in game_scores]

        # Calculate mean and std for normalization
        mean = np.mean(scores)
        std = np.std(scores)

        # Normalize scores
        normalized_scores = normalize_values(scores, mean, std)

        # Calculate bar width based on number of models in this game
        n_models_in_game = len(sorted_models)
        bar_width = 0.8 / n_models_in_game if n_models_in_game > 0 else 0.8

        # Plot bars for each model
        for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
            # Only add to legend if first appearance and model has data
            should_label = model in models_with_data and model not in [l.get_text() for l in ax.get_legend().get_texts()] if ax.get_legend() else True

            # Get color from MODEL_COLORS, use a default if not found
            color = MODEL_COLORS.get(model, f"C{i % 10}")  # Use matplotlib default colors as fallback

            ax.bar(game_idx + i*bar_width, score,
                   width=bar_width,
                   label=model if should_label else "",
                   color=color,
                   alpha=0.8)

    # Customize the plot
    ax.set_xticks(np.arange(n_games))
    ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
    ax.set_ylabel('Normalized Performance Score', fontsize=12)
    ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
                 fontsize=14, pad=20)

    # Add grid lines
    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

    # Create legend with unique entries
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))

    # Sort models by their first appearance in active games
    model_order = []
    for game in active_games:
        score_col = f"{game} Score"  # Use the same column name for all games
        for model in models:
            try:
                score = df[df['Player'] == model][score_col].values[0]
                if score != '_' and float(score) > 0 and model not in model_order:
                    model_order.append(model)
            except (IndexError, ValueError):
                continue

    # Create legend with sorted models
    sorted_handles = [by_label[model] for model in model_order if model in by_label]
    sorted_labels = [model for model in model_order if model in by_label]

    ax.legend(sorted_handles, sorted_labels,
              bbox_to_anchor=(1.00, 1),  # Moved from (1.15, 1) to (1.05, 1) to shift left
              loc='upper left',
              fontsize=9,
              title='AI Models',
              title_fontsize=10)

    # No need for tight_layout() as we're manually controlling the spacing

    return fig

def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
    """
    Get combined leaderboard and create group bar chart

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status

    Returns:
        tuple: (DataFrame, matplotlib.figure.Figure) containing the leaderboard data and group bar chart
    """
    df = get_combined_leaderboard(rank_data, selected_games)
    group_bar_fig = create_group_bar_chart(df)
    return df, group_bar_fig

def save_visualization(fig, filename):
    """
    Save visualization to file
    """
    fig.savefig(filename, bbox_inches='tight', dpi=300)
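For context, a minimal sketch of driving these plotting entry points outside the Gradio app. It assumes the script runs from the Space's root directory (the module loads assets/model_color.json at import time) and uses the rank_data_03_25_2025.json snapshot added in this commit; the output file names are illustrative only.

import json

from data_visualization import (
    get_combined_leaderboard_with_radar,
    get_combined_leaderboard_with_group_bar,
    save_visualization,
)
from leaderboard_utils import GAME_ORDER

# Load the leaderboard snapshot added in this commit.
with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Include every game; the dict maps game name -> selected flag.
selected_games = {game: True for game in GAME_ORDER}

# Radar view: reasoning vs. non-reasoning models, z-score normalized per game.
df, radar_fig = get_combined_leaderboard_with_radar(rank_data, selected_games)
save_visualization(radar_fig, "radar.png")  # illustrative output path

# Grouped bar view across all selected games.
df, bar_fig = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
save_visualization(bar_fig, "group_bar.png")  # illustrative output path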
leaderboard_utils.py
ADDED
@@ -0,0 +1,288 @@
import pandas as pd
import json
import numpy as np

# Define game order
GAME_ORDER = [
    "Super Mario Bros",
    "Sokoban",
    "2048",
    "Candy Crash",
    "Tetris (complete)",
    "Tetris (planning only)"
]

def get_organization(model_name):
    m = model_name.lower()
    if "claude" in m:
        return "anthropic"
    elif "gemini" in m:
        return "google"
    elif "o1" in m or "gpt" in m or "o3" in m:
        return "openai"
    elif "deepseek" in m:
        return "deepseek"
    else:
        return "unknown"

def get_mario_leaderboard(rank_data):
    data = rank_data.get("Super Mario Bros", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "progress": "Progress (current/total)",
        "score": "Score",
        "time_s": "Time (s)"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
    return df

def get_sokoban_leaderboard(rank_data):
    data = rank_data.get("Sokoban", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "levels_cracked": "Levels Cracked",
        "steps": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Levels Cracked", "Steps"]]
    return df

def get_2048_leaderboard(rank_data):
    data = rank_data.get("2048", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score": "Score",
        "steps": "Steps",
        "time": "Time"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score", "Steps", "Time"]]
    return df

def get_candy_leaderboard(rank_data):
    data = rank_data.get("Candy Crash", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score_runs": "Score Runs",
        "average_score": "Average Score",
        "steps": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score Runs", "Average Score", "Steps"]]
    return df

def get_tetris_leaderboard(rank_data):
    data = rank_data.get("Tetris (complete)", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score": "Score",
        "steps_blocks": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score", "Steps"]]
    return df

def get_tetris_planning_leaderboard(rank_data):
    data = rank_data.get("Tetris (planning only)", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score": "Score",
        "steps_blocks": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score", "Steps"]]
    return df

def calculate_rank_and_completeness(rank_data, selected_games):
    # Dictionary to store DataFrames for each game
    game_dfs = {}

    # Get DataFrames for selected games
    if selected_games.get("Super Mario Bros"):
        game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
    if selected_games.get("Sokoban"):
        game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
    if selected_games.get("2048"):
        game_dfs["2048"] = get_2048_leaderboard(rank_data)
    if selected_games.get("Candy Crash"):
        game_dfs["Candy Crash"] = get_candy_leaderboard(rank_data)
    if selected_games.get("Tetris (complete)"):
        game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
    if selected_games.get("Tetris (planning only)"):
        game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)

    # Get all unique players
    all_players = set()
    for df in game_dfs.values():
        all_players.update(df["Player"].unique())
    all_players = sorted(list(all_players))

    # Create results DataFrame
    results = []
    for player in all_players:
        player_data = {
            "Player": player,
            "Organization": get_organization(player)
        }
        ranks = []
        games_played = 0

        # Calculate rank and completeness for each game
        for game in GAME_ORDER:
            if game in game_dfs:
                df = game_dfs[game]
                if player in df["Player"].values:
                    games_played += 1
                    # Get player's score based on game type
                    if game == "Super Mario Bros":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1
                    elif game == "Sokoban":
                        # Parse Sokoban score string and get maximum level
                        levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
                        try:
                            # Split by semicolon, strip whitespace, filter empty strings, convert to integers
                            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
                            player_score = max(levels) if levels else 0
                        except:
                            player_score = 0
                        # Calculate rank based on maximum level
                        rank = len(df[df["Levels Cracked"].apply(
                            lambda x: max([int(y.strip()) for y in x.split(";") if y.strip()]) > player_score
                        )]) + 1
                    elif game == "2048":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1
                    elif game == "Candy Crash":
                        player_score = df[df["Player"] == player]["Average Score"].iloc[0]
                        rank = len(df[df["Average Score"] > player_score]) + 1
                    elif game == "Tetris (complete)":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1
                    elif game == "Tetris (planning only)":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1

                    ranks.append(rank)
                    player_data[f"{game} Score"] = player_score
                else:
                    player_data[f"{game} Score"] = "_"

        # Calculate average rank and completeness for sorting only
        if ranks:
            player_data["Sort Rank"] = round(np.mean(ranks), 2)
            player_data["Games Played"] = games_played
        else:
            player_data["Sort Rank"] = float('inf')
            player_data["Games Played"] = 0

        results.append(player_data)

    # Create DataFrame and sort by average rank and completeness
    df_results = pd.DataFrame(results)
    if not df_results.empty:
        # Sort by average rank (ascending) and completeness (descending)
        df_results = df_results.sort_values(
            by=["Sort Rank", "Games Played"],
            ascending=[True, False]
        )
        # Drop the sorting columns
        df_results = df_results.drop(["Sort Rank", "Games Played"], axis=1)

    return df_results

def get_combined_leaderboard(rank_data, selected_games):
    """
    Get combined leaderboard for selected games

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status

    Returns:
        pd.DataFrame: Combined leaderboard DataFrame
    """
    # Dictionary to store DataFrames for each game
    game_dfs = {}

    # Get DataFrames for selected games
    if selected_games.get("Super Mario Bros"):
        game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
    if selected_games.get("Sokoban"):
        game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
    if selected_games.get("2048"):
        game_dfs["2048"] = get_2048_leaderboard(rank_data)
    if selected_games.get("Candy Crash"):
        game_dfs["Candy Crash"] = get_candy_leaderboard(rank_data)
    if selected_games.get("Tetris (complete)"):
        game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
    if selected_games.get("Tetris (planning only)"):
        game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)

    # Get all unique players
    all_players = set()
    for df in game_dfs.values():
        all_players.update(df["Player"].unique())
    all_players = sorted(list(all_players))

    # Create results DataFrame
    results = []
    for player in all_players:
        player_data = {
            "Player": player,
            "Organization": get_organization(player)
        }

        # Add scores for each game
        for game in GAME_ORDER:
            if game in game_dfs:
                df = game_dfs[game]
                if player in df["Player"].values:
                    if game == "Super Mario Bros":
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                    elif game == "Sokoban":
                        # Parse Sokoban score string and get maximum level
                        levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
                        try:
                            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
                            player_data[f"{game} Score"] = max(levels) if levels else 0
                        except:
                            player_data[f"{game} Score"] = 0
                    elif game == "2048":
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                    elif game == "Candy Crash":
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
                    elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                else:
                    player_data[f"{game} Score"] = "_"

        results.append(player_data)

    # Create DataFrame
    df_results = pd.DataFrame(results)

    # Sort by total score across all games
    if not df_results.empty:
        # Calculate total score for each player
        df_results["Total Score"] = 0
        for game in GAME_ORDER:
            if f"{game} Score" in df_results.columns:
                df_results["Total Score"] += df_results[f"{game} Score"].apply(
                    lambda x: float(x) if x != "_" else 0
                )

        # Sort by total score in descending order
        df_results = df_results.sort_values("Total Score", ascending=False)

        # Drop the temporary total score column
        df_results = df_results.drop("Total Score", axis=1)

    return df_results
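As a rough usage sketch (not part of the committed code): the combined leaderboard is built from a selected_games mapping of game name to a boolean flag, and the data path below is the snapshot added in this same commit.

import json

from leaderboard_utils import GAME_ORDER, get_combined_leaderboard, get_organization

with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Organization is inferred from the model-name string alone.
assert get_organization("claude-3-7-sonnet-20250219") == "anthropic"
assert get_organization("o3-mini-2025-01-31(medium)") == "openai"

# Select every game; missing entries behave as unselected because .get() is used.
selected_games = {game: True for game in GAME_ORDER}
combined = get_combined_leaderboard(rank_data, selected_games)
print(combined[["Player", "Organization"]].head())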
rank_data_03_25_2025.json
ADDED
@@ -0,0 +1,324 @@
{
    "Super Mario Bros": {
        "runs": 5,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 710,
                "progress": "1-1",
                "time_s": 64.2,
                "rank": 1
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 560,
                "progress": "1-1",
                "time_s": 58.6,
                "rank": 2
            },
            {
                "model": "gemini-2.0-flash",
                "score": 320,
                "progress": "1-1",
                "time_s": 51.8,
                "rank": 3
            },
            {
                "model": "claude-3-5-haiku-20241022",
                "score": 140,
                "progress": "1-1",
                "time_s": 76.4,
                "rank": 4
            },
            {
                "model": "gpt-4.5-preview-2025-02-27",
                "score": 160,
                "progress": "1-1",
                "time_s": 62.8,
                "rank": 5
            }
        ]
    },
    "2048": {
        "runs": 1,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219(thinking)",
                "score": 256,
                "steps": 114,
                "time": ">200",
                "rank": 1
            },
            {
                "model": "o1-2024-12-17",
                "score": 256,
                "steps": 116,
                "time": ">200",
                "rank": 2
            },
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 256,
                "steps": 130,
                "time": "20:36",
                "rank": 3
            },
            {
                "model": "deepseek-v3",
                "score": 256,
                "steps": 216,
                "time": "54.02",
                "rank": 4
            },
            {
                "model": "gemini-2.0-flash",
                "score": 128,
                "steps": 111,
                "time": "18:43",
                "rank": 5
            },
            {
                "model": "gemini-2.0-flash-thinking-exp-1219",
                "score": 128,
                "steps": 132,
                "time": ">100",
                "rank": 6
            },
            {
                "model": "gemini-2.5-pro-exp-03-25",
                "score": 128,
                "steps": 138,
                "time": "169",
                "rank": 7
            },
            {
                "model": "claude-3-5-sonnet-20241022",
                "score": 64,
                "steps": 92,
                "time": "9:2",
                "rank": 9
            },
            {
                "model": "gpt-4.5-preview-2025-02-27",
                "score": 34,
                "steps": 34,
                "time": "8:25",
                "rank": 10
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 16,
                "steps": 21,
                "time": "1:17",
                "rank": 11
            },
            {
                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                "score": 128,
                "steps": 145,
                "time": ">100",
                "rank": 8
            }
        ]
    },
    "Tetris (complete)": {
        "runs": 3,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 95,
                "steps_blocks": 27,
                "rank": 1
            },
            {
                "model": "claude-3-5-haiku-20241022",
                "score": 90,
                "steps_blocks": 25,
                "rank": 2
            },
            {
                "model": "gemini-2.0-flash",
                "score": 82,
                "steps_blocks": 23,
                "rank": 3
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 54,
                "steps_blocks": 19,
                "rank": 4
            }
        ]
    },
    "Tetris (planning only)": {
        "runs": 3,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 110,
                "steps_blocks": 29,
                "rank": 1
            },
            {
                "model": "claude-3-5-haiku-20241022",
                "score": 92,
                "steps_blocks": 25,
                "rank": 2
            },
            {
                "model": "gemini-2.0-flash",
                "score": 87,
                "steps_blocks": 24,
                "rank": 3
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 56,
                "steps_blocks": 20,
                "rank": 4
            }
        ]
    },
    "Candy Crash": {
        "runs": 3,
        "results": [
            {
                "model": "o3-mini-2025-01-31(medium)",
                "score_runs": "90;109;120",
                "average_score": 106.33,
                "steps": 25,
                "rank": 1
            },
            {
                "model": "o1-2024-12-17",
                "score_runs": "96;114;83",
                "average_score": 97.67,
                "steps": 25,
                "rank": 2
            },
            {
                "model": "deepseek-r1",
                "score_runs": "62;108;105",
                "average_score": 91.67,
                "steps": 25,
                "rank": 3
            },
            {
                "model": "gemini-2.5-pro-exp-03-25",
                "score_runs": "50;36;68",
                "average_score": 51.33,
                "steps": 25,
                "rank": 4
            },
            {
                "model": "claude-3-7-sonnet-20250219(thinking)",
                "score_runs": "36;46;24",
                "average_score": 35.33,
                "steps": 25,
                "rank": 5
            },
            {
                "model": "gemini-2.0-flash-thinking-exp-1219",
                "score_runs": "0;15;39",
                "average_score": 18,
                "steps": 25,
                "rank": 6
            },
            {
                "model": "claude-3-5-sonnet-20241022",
                "score_runs": "3;0;0",
                "average_score": 1,
                "steps": 25,
                "rank": 7
            },
            {
                "model": "deepseek-v3",
                "score_runs": "0;0;0",
                "average_score": 0,
                "steps": 25,
                "rank": 9
            },
            {
                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                "score_runs": "6;0;0",
                "average_score": 2,
                "steps": 25,
                "rank": 8
            }
        ]
    },
    "Sokoban": {
        "runs": 3,
        "results": [
            {
                "model": "o3-mini-2025-01-31(medium)",
                "levels_cracked": "2; 3; 2",
                "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
                "rank": 1
            },
            {
                "model": "gemini-2.5-pro-exp-03-25",
                "levels_cracked": "2;2;3",
                "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
                "rank": 2
            },
            {
                "model": "claude-3-7-sonnet-20250219(thinking)",
                "levels_cracked": "1; 2; 0",
                "steps": "[17,35];[15,40,43];[4]",
                "rank": 3
            },
            {
                "model": "o1-2024-12-17",
                "levels_cracked": "1; 1; 1",
                "steps": null,
                "rank": 4
            },
            {
                "model": "deepseek-r1",
                "levels_cracked": "1; 0; 1",
                "steps": "[19,42];[13];[19,36]",
                "note": "stuck",
                "rank": 5
            },
            {
                "model": "o1-mini-2024-09-12",
                "levels_cracked": "0;1;0",
                "steps": null,
                "rank": 6
            },
            {
                "model": "gemini-2.0-flash-thinking-exp-1219",
                "levels_cracked": "0; 0; 0",
                "steps": "[23]; [14]; [14]",
                "rank": 7
            },
            {
                "model": "gpt-4o-2024-11-20",
                "levels_cracked": "0; 0; 0",
                "steps": "[68];[105];[168]",
                "note": "stuck in a loop",
                "rank": 8
            },
            {
                "model": "claude-3-5-sonnet-20241022",
                "levels_cracked": "0; 0; 0",
                "steps": "[21]; [30]; [51]",
                "note": "stuck in a loop",
                "rank": 9
            },
            {
                "model": "deepseek-v3",
                "levels_cracked": "0; 0; 0",
                "steps": "[9]; [47]; [64]",
                "rank": 10
            },
            {
                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                "levels_cracked": "0;0;0",
                "steps": "[5]",
                "rank": 11
            }
        ]
    }
}
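To illustrate the schema of this snapshot, a small sketch (not part of the commit) of feeding it to the per-game helpers from leaderboard_utils.py; the column selections in the print calls are for display only.

import json

from leaderboard_utils import get_mario_leaderboard, get_sokoban_leaderboard

with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Each helper renames the raw keys ("model", "score", "levels_cracked", ...)
# into display columns ("Player", "Score", "Levels Cracked", ...) and derives Organization.
mario_df = get_mario_leaderboard(rank_data)
sokoban_df = get_sokoban_leaderboard(rank_data)

print(mario_df[["Player", "Organization", "Score", "Time (s)"]])
print(sokoban_df[["Player", "Levels Cracked"]])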
requirements.txt
CHANGED
@@ -13,4 +13,7 @@ python-dateutil
 tqdm
 transformers
 tokenizers>=0.15.0
-sentencepiece
+sentencepiece
+seaborn>=0.12.0
+Pillow>=10.0.0
+plotly>=5.15.0
src/about.py
DELETED
@@ -1,72 +0,0 @@
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

NUM_FEWSHOT = 0 # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Intro text
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, here is the commands you can run:

"""

EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""
src/display/css_html_js.py
DELETED
@@ -1,105 +0,0 @@
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type{
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span{
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap{
    width: 103px;
}
#filter_type label > .wrap .wrap-inner{
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input{
    width: 1px
}
#filter-columns-type{
    border:0;
    padding:0.5;
}
#filter-columns-size{
    border:0;
    padding:0.5;
}
#box-filter > .form{
    border: 0
}
"""

get_window_url_params = """
    function(url_params) {
        const params = new URLSearchParams(window.location.search);
        url_params = Object.fromEntries(params);
        return url_params;
    }
    """
src/display/formatting.py
DELETED
@@ -1,27 +0,0 @@
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
src/display/utils.py
DELETED
@@ -1,110 +0,0 @@
-from dataclasses import dataclass, make_dataclass
-from enum import Enum
-
-import pandas as pd
-
-from src.about import Tasks
-
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    hidden: bool = False
-    never_hidden: bool = False
-
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = "" # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
-# Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/envs.py
DELETED
@@ -1,25 +0,0 @@
-import os
-
-from huggingface_hub import HfApi
-
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
-# ----------------------------------
-
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
-
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
-# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
-API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
DELETED
@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
-    model: str
-    revision: str # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
-
-    return results
src/populate.py
DELETED
@@ -1,58 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py
DELETED
@@ -1,99 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-
-                    # Select organisation
-                    if info["model"].count("/") == 0 or "submitted_time" not in info:
-                        continue
-                    organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
src/submission/submit.py
DELETED
@@ -1,119 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
-
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
-    # Seems good, creating the eval
-    print("Adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-
-    # Remove the local file
-    os.remove(out_path)
-
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )