Yuxuan-Zhang-Dexter committed · Commit 6ebb0fb · 1 Parent(s): 72bd46b
update gradip app
Browse files
- assets/game_video_link.json +6 -0
- assets/model_color.json +17 -0
- assets/news.json +34 -0
- data_visualization.py +550 -0
- leaderboard_utils.py +288 -0
- rank_data_03_25_2025.json +324 -0
- requirements.txt +4 -1
- src/about.py +0 -72
- src/display/css_html_js.py +0 -105
- src/display/formatting.py +0 -27
- src/display/utils.py +0 -110
- src/envs.py +0 -25
- src/leaderboard/read_evals.py +0 -196
- src/populate.py +0 -58
- src/submission/check_validity.py +0 -99
- src/submission/submit.py +0 -119
assets/game_video_link.json
ADDED
@@ -0,0 +1,6 @@
{
    "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
    "super_mario": "https://www.youtube.com/watch?v=nixMIJZYAgg",
    "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
    "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg"
}
assets/model_color.json
ADDED
@@ -0,0 +1,17 @@
{
    "claude-3-7-sonnet-20250219": "#4A90E2",
    "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
    "claude-3-5-haiku-20241022": "#7FB5E6",
    "claude-3-5-sonnet-20241022": "#1A4C7C",
    "gemini-2.0-flash": "#FF4081",
    "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
    "gemini-2.5-pro-exp-03-25": "#FF80AB",
    "gpt-4o-2024-11-20": "#00BFA5",
    "gpt-4.5-preview-2025-02-27": "#00796B",
    "o1-2024-12-17": "#4DB6AC",
    "o1-mini-2024-09-12": "#26A69A",
    "o3-mini-2025-01-31(medium)": "#80CBC4",
    "deepseek-v3": "#FFC107",
    "deepseek-r1": "#FFA000",
    "Llama-4-Maverick-17B-128E-Instruct-FP8": "#8E24AA"
}
assets/news.json
ADDED
@@ -0,0 +1,34 @@
{
    "news": [
        {
            "date": "2025-04-01",
            "video_link": "https://www.youtube.com/watch?v=uFVpNor7l_E",
            "twitter_text": "Google's Gemini 2.5 Pro redefines AI gameplay: its multi-modal edge outperforms o1 & Claude 3.7 in Sokoban.",
            "twitter_link": "https://x.com/haoailab/status/1907140718650704204"
        },
        {
            "date": "2025-03-18",
            "video_link": "https://www.youtube.com/watch?v=b-Uyz3W4yIg",
            "twitter_text": "Candy Crush Saga's Hidden Complexity: Top AI Models Take the Challenge",
            "twitter_link": "https://x.com/haoailab/status/1902095369808601551"
        },
        {
            "date": "2025-03-14",
            "video_link": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
            "twitter_text": "2048 Mastery: Only Two AI Models Crack the Code to Surpass Random Play",
            "twitter_link": "https://x.com/haoailab/status/1900645722095317255"
        },
        {
            "date": "2025-03-06",
            "video_link": "https://www.youtube.com/watch?v=59enV32MBUE",
            "twitter_text": "Sokoban Showdown: o3-mini Dominates by Reaching Level 4",
            "twitter_link": "https://x.com/haoailab/status/1897792946646421514"
        },
        {
            "date": "2025-02-28",
            "video_link": "https://www.youtube.com/watch?v=nixMIJZYAgg",
            "twitter_text": "Super Mario AI Revolution: Claude-3.7 Sets Unprecedented Gameplay Benchmarks",
            "twitter_link": "https://x.com/haoailab/status/1895557913621795076"
        }
    ]
}
data_visualization.py
ADDED
@@ -0,0 +1,550 @@
import matplotlib
matplotlib.use('Agg')  # Use Agg backend for thread safety
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import os
from leaderboard_utils import (
    get_organization,
    get_mario_leaderboard,
    get_sokoban_leaderboard,
    get_2048_leaderboard,
    get_candy_leaderboard,
    get_tetris_leaderboard,
    get_tetris_planning_leaderboard,
    get_combined_leaderboard,
    GAME_ORDER
)

# Load model colors
with open('assets/model_color.json', 'r') as f:
    MODEL_COLORS = json.load(f)

# Define game score columns mapping
GAME_SCORE_COLUMNS = {
    "Super Mario Bros": "Score",
    "Sokoban": "Levels Cracked",
    "2048": "Score",
    "Candy Crash": "Average Score",
    "Tetris (complete)": "Score",
    "Tetris (planning only)": "Score"
}

def normalize_values(values, mean, std):
    """
    Normalize values using z-score and scale to 0-100 range

    Args:
        values (list): List of values to normalize
        mean (float): Mean value for normalization
        std (float): Standard deviation for normalization

    Returns:
        list: Normalized values scaled to 0-100 range
    """
    if std == 0:
        return [50 if v > 0 else 0 for v in values]  # Handle zero std case
    z_scores = [(v - mean) / std for v in values]
    # Scale z-scores to 0-100 range, with mean at 50
    scaled_values = [max(0, min(100, (z * 30) + 50)) for z in z_scores]
    return scaled_values

def simplify_model_name(model_name):
    """
    Simplify model name by either taking first 11 chars or string before third '-'
    """
    hyphen_parts = model_name.split('-')
    return '-'.join(hyphen_parts[:3]) if len(hyphen_parts) >= 3 else model_name[:11]

def create_horizontal_bar_chart(df, game_name):
    """
    Create horizontal bar chart for detailed game view

    Args:
        df (pd.DataFrame): DataFrame containing game data
        game_name (str): Name of the game to display

    Returns:
        matplotlib.figure.Figure: The generated bar chart figure
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Set style
    plt.style.use('default')
    # Increase figure width to accommodate long model names
    fig, ax = plt.subplots(figsize=(20, 11))

    # Sort by score
    if game_name == "Super Mario Bros":
        score_col = "Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    elif game_name == "Sokoban":
        # Process Sokoban scores by splitting and getting max level
        def get_max_level(levels_str):
            try:
                # Split by semicolon, strip whitespace, filter empty strings, convert to integers
                levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
                return max(levels) if levels else 0
            except:
                return 0

        # Create a temporary column with max levels
        df['Max Level'] = df['Levels Cracked'].apply(get_max_level)
        df_sorted = df.sort_values(by='Max Level', ascending=True)
        score_col = 'Max Level'
    elif game_name == "2048":
        score_col = "Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    elif game_name == "Candy Crash":
        score_col = "Average Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
        score_col = "Score"
        df_sorted = df.sort_values(by=score_col, ascending=True)
    else:
        return None

    # Create color gradient
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))

    # Create horizontal bars
    bars = ax.barh(range(len(df_sorted)), df_sorted[score_col], color=colors)

    # Add more space for labels on the left
    plt.subplots_adjust(left=0.3)

    # Customize the chart
    ax.set_yticks(range(len(df_sorted)))

    # Format player names: keep organization info and truncate the rest if too long
    def format_player_name(player, org):
        max_length = 40  # Maximum length for player name
        if len(player) > max_length:
            # Keep the first part and last part of the name
            parts = player.split('-')
            if len(parts) > 3:
                formatted = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
            else:
                formatted = player[:max_length-3] + "..."
        else:
            formatted = player
        return f"{formatted} [{org}]"

    player_labels = [format_player_name(row['Player'], row['Organization'])
                     for _, row in df_sorted.iterrows()]
    ax.set_yticklabels(player_labels, fontsize=9)

    # Add value labels on the bars
    for i, bar in enumerate(bars):
        width = bar.get_width()
        if game_name == "Candy Crash":
            score_text = f'{width:.1f}'
        else:
            score_text = f'{width:.0f}'

        ax.text(width, bar.get_y() + bar.get_height()/2,
                score_text,
                ha='left', va='center',
                fontsize=10,
                fontweight='bold',
                color='white',
                bbox=dict(facecolor=(0, 0, 0, 0.3),
                          edgecolor='none',
                          alpha=0.5,
                          pad=2))

    # Set title and labels
    ax.set_title(f"{game_name} Performance",
                 pad=20,
                 fontsize=14,
                 fontweight='bold',
                 color='#2c3e50')

    if game_name == "Sokoban":
        ax.set_xlabel("Maximum Level Reached",
                      fontsize=12,
                      fontweight='bold',
                      color='#2c3e50',
                      labelpad=10)
    else:
        ax.set_xlabel(score_col,
                      fontsize=12,
                      fontweight='bold',
                      color='#2c3e50',
                      labelpad=10)

    # Add grid lines
    ax.grid(True, axis='x', linestyle='--', alpha=0.3)

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Adjust layout
    plt.tight_layout()

    return fig

def create_radar_charts(df):
    """
    Create two radar charts with improved normalization using z-scores
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Define reasoning models
    reasoning_models = [
        'claude-3-7-sonnet-20250219(thinking)',
        'o1-2024-12-17',
        'gemini-2.0-flash-thinking-exp-1219',
        'o3-mini-2025-01-31(medium)',
        'gemini-2.5-pro-exp-03-25',
        'o1-mini-2024-09-12',
        'deepseek-r1'
    ]

    # Split dataframe into reasoning and non-reasoning models
    df_reasoning = df[df['Player'].isin(reasoning_models)]
    df_others = df[~df['Player'].isin(reasoning_models)]

    # Get game columns
    game_columns = [col for col in df.columns if col.endswith(' Score')]
    categories = [col.replace(' Score', '') for col in game_columns]

    # Create figure with two subplots - adjusted size for new layout
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6), subplot_kw=dict(projection='polar'))
    fig.patch.set_facecolor('white')  # Set figure background to white

    def get_game_stats(df, game_col):
        """
        Get mean and std for a game column, handling missing values
        """
        values = []
        for val in df[game_col]:
            if isinstance(val, str) and val == '_':
                values.append(0)
            else:
                try:
                    values.append(float(val))
                except:
                    values.append(0)
        return np.mean(values), np.std(values)

    def setup_radar_plot(ax, data, title):
        ax.set_facecolor('white')  # Set subplot background to white

        num_vars = len(categories)
        angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
        angles = np.concatenate((angles, [angles[0]]))

        # Plot grid lines with darker color
        grid_values = [10, 30, 50, 70, 90]
        ax.set_rgrids(grid_values,
                      labels=grid_values,
                      angle=45,
                      fontsize=6,
                      alpha=0.7,  # Increased alpha for better visibility
                      color='#404040')  # Darker color for grid labels

        # Make grid lines darker but still subtle
        ax.grid(True, color='#404040', alpha=0.3)  # Darker grid lines

        # Define darker, more vibrant colors for the radar plots
        colors = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']

        # Calculate game statistics once
        game_stats = {col: get_game_stats(df, col) for col in game_columns}

        # Plot data with darker lines and higher opacity for fills
        for idx, (_, row) in enumerate(data.iterrows()):
            values = []
            for col in game_columns:
                val = row[col]
                if isinstance(val, str) and val == '_':
                    values.append(0)
                else:
                    try:
                        values.append(float(val))
                    except:
                        values.append(0)

            # Normalize values using game statistics
            normalized_values = []
            for i, v in enumerate(values):
                mean, std = game_stats[game_columns[i]]
                normalized_value = normalize_values([v], mean, std)[0]
                normalized_values.append(normalized_value)

            # Complete the circular plot
            normalized_values = np.concatenate((normalized_values, [normalized_values[0]]))

            model_name = simplify_model_name(row['Player'])
            ax.plot(angles, normalized_values, 'o-', linewidth=2.0,  # Increased line width
                    label=model_name,
                    color=colors[idx % len(colors)],
                    markersize=4)  # Increased marker size
            ax.fill(angles, normalized_values,
                    alpha=0.3,  # Increased fill opacity
                    color=colors[idx % len(colors)])

        # Format categories
        formatted_categories = []
        for game in categories:
            if game == "Tetris (planning only)":
                game = "Tetris\n(planning)"
            elif game == "Tetris (complete)":
                game = "Tetris\n(complete)"
            elif game == "Super Mario Bros":
                game = "Super\nMario"
            elif game == "Candy Crash":
                game = "Candy\nCrash"
            formatted_categories.append(game)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(formatted_categories,
                           fontsize=8,  # Slightly larger font
                           color='#202020',  # Darker text
                           fontweight='bold')  # Bold text
        ax.tick_params(pad=10, colors='#202020')  # Darker tick colors

        ax.set_title(title,
                     pad=20,
                     fontsize=11,  # Slightly larger title
                     color='#202020',  # Darker title
                     fontweight='bold')  # Bold title

        legend = ax.legend(loc='upper right',
                           bbox_to_anchor=(1.3, 1.1),
                           fontsize=7,  # Slightly larger legend
                           framealpha=0.9,  # More opaque legend
                           edgecolor='#404040',  # Darker edge
                           ncol=1)

        ax.set_ylim(0, 105)
        ax.spines['polar'].set_color('#404040')  # Darker spine
        ax.spines['polar'].set_alpha(0.5)  # More visible spine

    # Setup both plots
    setup_radar_plot(ax1, df_reasoning, "Reasoning Models")
    setup_radar_plot(ax2, df_others, "Non-Reasoning Models")

    plt.subplots_adjust(right=0.85, wspace=0.3)

    return fig

def get_combined_leaderboard_with_radar(rank_data, selected_games):
    """
    Get combined leaderboard and create radar charts
    """
    df = get_combined_leaderboard(rank_data, selected_games)
    radar_fig = create_radar_charts(df)
    return df, radar_fig

def create_organization_radar_chart(rank_data):
    """
    Create radar chart comparing organizations
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Group by organization and calculate average scores
    org_performance = {}
    for org in df["Organization"].unique():
        org_df = df[df["Organization"] == org]
        scores = {}
        for game in GAME_ORDER:
            game_scores = org_df[f"{game} Score"].apply(lambda x: float(x) if x != "_" else 0)
            scores[game] = game_scores.mean()
        org_performance[org] = scores

    # Create radar chart
    return create_radar_charts(pd.DataFrame([org_performance]))

def create_top_players_radar_chart(rank_data, n=5):
    """
    Create radar chart for top N players
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Get top N players
    top_players = df["Player"].head(n).tolist()

    # Create radar chart for top players
    return create_radar_charts(df[df["Player"].isin(top_players)])

def create_player_radar_chart(rank_data, player_name):
    """
    Create radar chart for a specific player
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

    # Get player's data
    player_df = df[df["Player"] == player_name]

    if player_df.empty:
        return None

    # Create radar chart for the player
    return create_radar_charts(player_df)

def create_group_bar_chart(df):
    """
    Create a grouped bar chart comparing AI model performance across different games

    Args:
        df (pd.DataFrame): DataFrame containing the combined leaderboard data

    Returns:
        matplotlib.figure.Figure: The generated group bar chart figure
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Create figure and axis with better styling
    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(20, 11))

    # Create subplot with specific spacing
    ax = plt.subplot(111)

    # Adjust the subplot parameters
    plt.subplots_adjust(top=0.90,    # Add more space at the top
                        bottom=0.15, # Add more space at the bottom
                        right=0.85,  # Add more space for legend
                        left=0.05)   # Add space on the left

    # Get unique models
    models = df['Player'].unique()

    # Get active games (those that have score columns in the DataFrame)
    active_games = []
    for game in GAME_ORDER:
        score_col = f"{game} Score"  # Use the same column name for all games
        if score_col in df.columns:
            active_games.append(game)

    n_games = len(active_games)
    if n_games == 0:
        return fig  # Return empty figure if no games are selected

    # Keep track of which models have data in any game
    models_with_data = set()

    # Calculate normalized scores for each game
    for game_idx, game in enumerate(active_games):
        # Get all scores for this game
        game_scores = []

        # Use the same score column name for all games
        score_col = f"{game} Score"

        for model in models:
            try:
                score = df[df['Player'] == model][score_col].values[0]
                if score != '_' and float(score) > 0:  # Only include non-zero scores
                    game_scores.append((model, float(score)))
                    models_with_data.add(model)  # Add model to set if it has valid data
            except (IndexError, ValueError):
                continue

        if not game_scores:  # Skip if no valid scores for this game
            continue

        # Sort scores from highest to lowest
        game_scores.sort(key=lambda x: x[1], reverse=True)

        # Extract sorted models and scores
        sorted_models = [x[0] for x in game_scores]
        scores = [x[1] for x in game_scores]

        # Calculate mean and std for normalization
        mean = np.mean(scores)
        std = np.std(scores)

        # Normalize scores
        normalized_scores = normalize_values(scores, mean, std)

        # Calculate bar width based on number of models in this game
        n_models_in_game = len(sorted_models)
        bar_width = 0.8 / n_models_in_game if n_models_in_game > 0 else 0.8

        # Plot bars for each model
        for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
            # Only add to legend if first appearance and model has data
            should_label = model in models_with_data and model not in [l.get_text() for l in ax.get_legend().get_texts()] if ax.get_legend() else True

            # Get color from MODEL_COLORS, use a default if not found
            color = MODEL_COLORS.get(model, f"C{i % 10}")  # Use matplotlib default colors as fallback

            ax.bar(game_idx + i*bar_width, score,
                   width=bar_width,
                   label=model if should_label else "",
                   color=color,
                   alpha=0.8)

    # Customize the plot
    ax.set_xticks(np.arange(n_games))
    ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
    ax.set_ylabel('Normalized Performance Score', fontsize=12)
    ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
                 fontsize=14, pad=20)

    # Add grid lines
    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

    # Create legend with unique entries
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))

    # Sort models by their first appearance in active games
    model_order = []
    for game in active_games:
        score_col = f"{game} Score"  # Use the same column name for all games
        for model in models:
            try:
                score = df[df['Player'] == model][score_col].values[0]
                if score != '_' and float(score) > 0 and model not in model_order:
                    model_order.append(model)
            except (IndexError, ValueError):
                continue

    # Create legend with sorted models
    sorted_handles = [by_label[model] for model in model_order if model in by_label]
    sorted_labels = [model for model in model_order if model in by_label]

    ax.legend(sorted_handles, sorted_labels,
              bbox_to_anchor=(1.00, 1),  # Moved from (1.15, 1) to (1.05, 1) to shift left
              loc='upper left',
              fontsize=9,
              title='AI Models',
              title_fontsize=10)

    # No need for tight_layout() as we're manually controlling the spacing

    return fig

def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
    """
    Get combined leaderboard and create group bar chart

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status

    Returns:
        tuple: (DataFrame, matplotlib.figure.Figure) containing the leaderboard data and group bar chart
    """
    df = get_combined_leaderboard(rank_data, selected_games)
    group_bar_fig = create_group_bar_chart(df)
    return df, group_bar_fig

def save_visualization(fig, filename):
    """
    Save visualization to file
    """
    fig.savefig(filename, bbox_inches='tight', dpi=300)
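For context, a minimal sketch of driving these plotting entry points outside the Gradio app. It assumes the script runs from the Space's root directory (the module loads assets/model_color.json at import time) and uses the rank_data_03_25_2025.json snapshot added in this commit; the output file names are illustrative only.

import json

from data_visualization import (
    get_combined_leaderboard_with_radar,
    get_combined_leaderboard_with_group_bar,
    save_visualization,
)
from leaderboard_utils import GAME_ORDER

# Load the leaderboard snapshot added in this commit.
with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Include every game; the dict maps game name -> selected flag.
selected_games = {game: True for game in GAME_ORDER}

# Radar view: reasoning vs. non-reasoning models, z-score normalized per game.
df, radar_fig = get_combined_leaderboard_with_radar(rank_data, selected_games)
save_visualization(radar_fig, "radar.png")  # illustrative output path

# Grouped bar view across all selected games.
df, bar_fig = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
save_visualization(bar_fig, "group_bar.png")  # illustrative output path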
leaderboard_utils.py
ADDED
@@ -0,0 +1,288 @@
import pandas as pd
import json
import numpy as np

# Define game order
GAME_ORDER = [
    "Super Mario Bros",
    "Sokoban",
    "2048",
    "Candy Crash",
    "Tetris (complete)",
    "Tetris (planning only)"
]

def get_organization(model_name):
    m = model_name.lower()
    if "claude" in m:
        return "anthropic"
    elif "gemini" in m:
        return "google"
    elif "o1" in m or "gpt" in m or "o3" in m:
        return "openai"
    elif "deepseek" in m:
        return "deepseek"
    else:
        return "unknown"

def get_mario_leaderboard(rank_data):
    data = rank_data.get("Super Mario Bros", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "progress": "Progress (current/total)",
        "score": "Score",
        "time_s": "Time (s)"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
    return df

def get_sokoban_leaderboard(rank_data):
    data = rank_data.get("Sokoban", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "levels_cracked": "Levels Cracked",
        "steps": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Levels Cracked", "Steps"]]
    return df

def get_2048_leaderboard(rank_data):
    data = rank_data.get("2048", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score": "Score",
        "steps": "Steps",
        "time": "Time"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score", "Steps", "Time"]]
    return df

def get_candy_leaderboard(rank_data):
    data = rank_data.get("Candy Crash", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score_runs": "Score Runs",
        "average_score": "Average Score",
        "steps": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score Runs", "Average Score", "Steps"]]
    return df

def get_tetris_leaderboard(rank_data):
    data = rank_data.get("Tetris (complete)", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score": "Score",
        "steps_blocks": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score", "Steps"]]
    return df

def get_tetris_planning_leaderboard(rank_data):
    data = rank_data.get("Tetris (planning only)", {}).get("results", [])
    df = pd.DataFrame(data)
    df = df.rename(columns={
        "model": "Player",
        "score": "Score",
        "steps_blocks": "Steps"
    })
    df["Organization"] = df["Player"].apply(get_organization)
    df = df[["Player", "Organization", "Score", "Steps"]]
    return df

def calculate_rank_and_completeness(rank_data, selected_games):
    # Dictionary to store DataFrames for each game
    game_dfs = {}

    # Get DataFrames for selected games
    if selected_games.get("Super Mario Bros"):
        game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
    if selected_games.get("Sokoban"):
        game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
    if selected_games.get("2048"):
        game_dfs["2048"] = get_2048_leaderboard(rank_data)
    if selected_games.get("Candy Crash"):
        game_dfs["Candy Crash"] = get_candy_leaderboard(rank_data)
    if selected_games.get("Tetris (complete)"):
        game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
    if selected_games.get("Tetris (planning only)"):
        game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)

    # Get all unique players
    all_players = set()
    for df in game_dfs.values():
        all_players.update(df["Player"].unique())
    all_players = sorted(list(all_players))

    # Create results DataFrame
    results = []
    for player in all_players:
        player_data = {
            "Player": player,
            "Organization": get_organization(player)
        }
        ranks = []
        games_played = 0

        # Calculate rank and completeness for each game
        for game in GAME_ORDER:
            if game in game_dfs:
                df = game_dfs[game]
                if player in df["Player"].values:
                    games_played += 1
                    # Get player's score based on game type
                    if game == "Super Mario Bros":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1
                    elif game == "Sokoban":
                        # Parse Sokoban score string and get maximum level
                        levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
                        try:
                            # Split by semicolon, strip whitespace, filter empty strings, convert to integers
                            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
                            player_score = max(levels) if levels else 0
                        except:
                            player_score = 0
                        # Calculate rank based on maximum level
                        rank = len(df[df["Levels Cracked"].apply(
                            lambda x: max([int(y.strip()) for y in x.split(";") if y.strip()]) > player_score
                        )]) + 1
                    elif game == "2048":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1
                    elif game == "Candy Crash":
                        player_score = df[df["Player"] == player]["Average Score"].iloc[0]
                        rank = len(df[df["Average Score"] > player_score]) + 1
                    elif game == "Tetris (complete)":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1
                    elif game == "Tetris (planning only)":
                        player_score = df[df["Player"] == player]["Score"].iloc[0]
                        rank = len(df[df["Score"] > player_score]) + 1

                    ranks.append(rank)
                    player_data[f"{game} Score"] = player_score
                else:
                    player_data[f"{game} Score"] = "_"

        # Calculate average rank and completeness for sorting only
        if ranks:
            player_data["Sort Rank"] = round(np.mean(ranks), 2)
            player_data["Games Played"] = games_played
        else:
            player_data["Sort Rank"] = float('inf')
            player_data["Games Played"] = 0

        results.append(player_data)

    # Create DataFrame and sort by average rank and completeness
    df_results = pd.DataFrame(results)
    if not df_results.empty:
        # Sort by average rank (ascending) and completeness (descending)
        df_results = df_results.sort_values(
            by=["Sort Rank", "Games Played"],
            ascending=[True, False]
        )
        # Drop the sorting columns
        df_results = df_results.drop(["Sort Rank", "Games Played"], axis=1)

    return df_results

def get_combined_leaderboard(rank_data, selected_games):
    """
    Get combined leaderboard for selected games

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status

    Returns:
        pd.DataFrame: Combined leaderboard DataFrame
    """
    # Dictionary to store DataFrames for each game
    game_dfs = {}

    # Get DataFrames for selected games
    if selected_games.get("Super Mario Bros"):
        game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
    if selected_games.get("Sokoban"):
        game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
    if selected_games.get("2048"):
        game_dfs["2048"] = get_2048_leaderboard(rank_data)
    if selected_games.get("Candy Crash"):
        game_dfs["Candy Crash"] = get_candy_leaderboard(rank_data)
    if selected_games.get("Tetris (complete)"):
        game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
    if selected_games.get("Tetris (planning only)"):
        game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)

    # Get all unique players
    all_players = set()
    for df in game_dfs.values():
        all_players.update(df["Player"].unique())
    all_players = sorted(list(all_players))

    # Create results DataFrame
    results = []
    for player in all_players:
        player_data = {
            "Player": player,
            "Organization": get_organization(player)
        }

        # Add scores for each game
        for game in GAME_ORDER:
            if game in game_dfs:
                df = game_dfs[game]
                if player in df["Player"].values:
                    if game == "Super Mario Bros":
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                    elif game == "Sokoban":
                        # Parse Sokoban score string and get maximum level
                        levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
                        try:
                            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
                            player_data[f"{game} Score"] = max(levels) if levels else 0
                        except:
                            player_data[f"{game} Score"] = 0
                    elif game == "2048":
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                    elif game == "Candy Crash":
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
                    elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                else:
                    player_data[f"{game} Score"] = "_"

        results.append(player_data)

    # Create DataFrame
    df_results = pd.DataFrame(results)

    # Sort by total score across all games
    if not df_results.empty:
        # Calculate total score for each player
        df_results["Total Score"] = 0
        for game in GAME_ORDER:
            if f"{game} Score" in df_results.columns:
                df_results["Total Score"] += df_results[f"{game} Score"].apply(
                    lambda x: float(x) if x != "_" else 0
                )

        # Sort by total score in descending order
        df_results = df_results.sort_values("Total Score", ascending=False)

        # Drop the temporary total score column
        df_results = df_results.drop("Total Score", axis=1)

    return df_results
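As a rough usage sketch (not part of the committed code): the combined leaderboard is built from a selected_games mapping of game name to a boolean flag, and the data path below is the snapshot added in this same commit.

import json

from leaderboard_utils import GAME_ORDER, get_combined_leaderboard, get_organization

with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Organization is inferred from the model-name string alone.
assert get_organization("claude-3-7-sonnet-20250219") == "anthropic"
assert get_organization("o3-mini-2025-01-31(medium)") == "openai"

# Select every game; missing entries behave as unselected because .get() is used.
selected_games = {game: True for game in GAME_ORDER}
combined = get_combined_leaderboard(rank_data, selected_games)
print(combined[["Player", "Organization"]].head())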
rank_data_03_25_2025.json
ADDED
@@ -0,0 +1,324 @@
{
    "Super Mario Bros": {
        "runs": 5,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 710,
                "progress": "1-1",
                "time_s": 64.2,
                "rank": 1
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 560,
                "progress": "1-1",
                "time_s": 58.6,
                "rank": 2
            },
            {
                "model": "gemini-2.0-flash",
                "score": 320,
                "progress": "1-1",
                "time_s": 51.8,
                "rank": 3
            },
            {
                "model": "claude-3-5-haiku-20241022",
                "score": 140,
                "progress": "1-1",
                "time_s": 76.4,
                "rank": 4
            },
            {
                "model": "gpt-4.5-preview-2025-02-27",
                "score": 160,
                "progress": "1-1",
                "time_s": 62.8,
                "rank": 5
            }
        ]
    },
    "2048": {
        "runs": 1,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219(thinking)",
                "score": 256,
                "steps": 114,
                "time": ">200",
                "rank": 1
            },
            {
                "model": "o1-2024-12-17",
                "score": 256,
                "steps": 116,
                "time": ">200",
                "rank": 2
            },
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 256,
                "steps": 130,
                "time": "20:36",
                "rank": 3
            },
            {
                "model": "deepseek-v3",
                "score": 256,
                "steps": 216,
                "time": "54.02",
                "rank": 4
            },
            {
                "model": "gemini-2.0-flash",
                "score": 128,
                "steps": 111,
                "time": "18:43",
                "rank": 5
            },
            {
                "model": "gemini-2.0-flash-thinking-exp-1219",
                "score": 128,
                "steps": 132,
                "time": ">100",
                "rank": 6
            },
            {
                "model": "gemini-2.5-pro-exp-03-25",
                "score": 128,
                "steps": 138,
                "time": "169",
                "rank": 7
            },
            {
                "model": "claude-3-5-sonnet-20241022",
                "score": 64,
                "steps": 92,
                "time": "9:2",
                "rank": 9
            },
            {
                "model": "gpt-4.5-preview-2025-02-27",
                "score": 34,
                "steps": 34,
                "time": "8:25",
                "rank": 10
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 16,
                "steps": 21,
                "time": "1:17",
                "rank": 11
            },
            {
                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                "score": 128,
                "steps": 145,
                "time": ">100",
                "rank": 8
            }
        ]
    },
    "Tetris (complete)": {
        "runs": 3,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 95,
                "steps_blocks": 27,
                "rank": 1
            },
            {
                "model": "claude-3-5-haiku-20241022",
                "score": 90,
                "steps_blocks": 25,
                "rank": 2
            },
            {
                "model": "gemini-2.0-flash",
                "score": 82,
                "steps_blocks": 23,
                "rank": 3
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 54,
                "steps_blocks": 19,
                "rank": 4
            }
        ]
    },
    "Tetris (planning only)": {
        "runs": 3,
        "results": [
            {
                "model": "claude-3-7-sonnet-20250219",
                "score": 110,
                "steps_blocks": 29,
                "rank": 1
            },
            {
                "model": "claude-3-5-haiku-20241022",
                "score": 92,
                "steps_blocks": 25,
                "rank": 2
            },
            {
                "model": "gemini-2.0-flash",
                "score": 87,
                "steps_blocks": 24,
                "rank": 3
            },
            {
                "model": "gpt-4o-2024-11-20",
                "score": 56,
                "steps_blocks": 20,
                "rank": 4
            }
        ]
    },
    "Candy Crash": {
        "runs": 3,
        "results": [
            {
                "model": "o3-mini-2025-01-31(medium)",
                "score_runs": "90;109;120",
                "average_score": 106.33,
                "steps": 25,
                "rank": 1
            },
            {
                "model": "o1-2024-12-17",
                "score_runs": "96;114;83",
                "average_score": 97.67,
                "steps": 25,
                "rank": 2
            },
            {
                "model": "deepseek-r1",
                "score_runs": "62;108;105",
                "average_score": 91.67,
                "steps": 25,
                "rank": 3
            },
            {
                "model": "gemini-2.5-pro-exp-03-25",
                "score_runs": "50;36;68",
                "average_score": 51.33,
                "steps": 25,
                "rank": 4
            },
            {
                "model": "claude-3-7-sonnet-20250219(thinking)",
                "score_runs": "36;46;24",
                "average_score": 35.33,
                "steps": 25,
                "rank": 5
            },
            {
                "model": "gemini-2.0-flash-thinking-exp-1219",
                "score_runs": "0;15;39",
                "average_score": 18,
                "steps": 25,
                "rank": 6
            },
            {
                "model": "claude-3-5-sonnet-20241022",
                "score_runs": "3;0;0",
                "average_score": 1,
                "steps": 25,
                "rank": 7
            },
            {
                "model": "deepseek-v3",
                "score_runs": "0;0;0",
                "average_score": 0,
                "steps": 25,
                "rank": 9
            },
            {
                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                "score_runs": "6;0;0",
                "average_score": 2,
                "steps": 25,
                "rank": 8
            }
        ]
    },
    "Sokoban": {
        "runs": 3,
        "results": [
            {
                "model": "o3-mini-2025-01-31(medium)",
                "levels_cracked": "2; 3; 2",
                "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
                "rank": 1
            },
            {
                "model": "gemini-2.5-pro-exp-03-25",
                "levels_cracked": "2;2;3",
                "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
                "rank": 2
            },
            {
                "model": "claude-3-7-sonnet-20250219(thinking)",
                "levels_cracked": "1; 2; 0",
                "steps": "[17,35];[15,40,43];[4]",
                "rank": 3
            },
            {
                "model": "o1-2024-12-17",
                "levels_cracked": "1; 1; 1",
                "steps": null,
                "rank": 4
            },
            {
                "model": "deepseek-r1",
                "levels_cracked": "1; 0; 1",
                "steps": "[19,42];[13];[19,36]",
                "note": "stuck",
                "rank": 5
            },
            {
                "model": "o1-mini-2024-09-12",
                "levels_cracked": "0;1;0",
                "steps": null,
                "rank": 6
            },
            {
                "model": "gemini-2.0-flash-thinking-exp-1219",
                "levels_cracked": "0; 0; 0",
                "steps": "[23]; [14]; [14]",
                "rank": 7
            },
            {
                "model": "gpt-4o-2024-11-20",
                "levels_cracked": "0; 0; 0",
                "steps": "[68];[105];[168]",
                "note": "stuck in a loop",
                "rank": 8
            },
            {
                "model": "claude-3-5-sonnet-20241022",
                "levels_cracked": "0; 0; 0",
                "steps": "[21]; [30]; [51]",
                "note": "stuck in a loop",
                "rank": 9
            },
            {
                "model": "deepseek-v3",
                "levels_cracked": "0; 0; 0",
                "steps": "[9]; [47]; [64]",
                "rank": 10
            },
            {
                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                "levels_cracked": "0;0;0",
                "steps": "[5]",
                "rank": 11
            }
        ]
    }
}
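To illustrate the schema of this snapshot, a small sketch (not part of the commit) of feeding it to the per-game helpers from leaderboard_utils.py; the column selections in the print calls are for display only.

import json

from leaderboard_utils import get_mario_leaderboard, get_sokoban_leaderboard

with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Each helper renames the raw keys ("model", "score", "levels_cracked", ...)
# into display columns ("Player", "Score", "Levels Cracked", ...) and derives Organization.
mario_df = get_mario_leaderboard(rank_data)
sokoban_df = get_sokoban_leaderboard(rank_data)

print(mario_df[["Player", "Organization", "Score", "Time (s)"]])
print(sokoban_df[["Player", "Levels Cracked"]])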
requirements.txt
CHANGED
@@ -13,4 +13,7 @@ python-dateutil
 tqdm
 transformers
 tokenizers>=0.15.0
-sentencepiece
+sentencepiece
+seaborn>=0.12.0
+Pillow>=10.0.0
+plotly>=5.15.0
src/about.py
DELETED
@@ -1,72 +0,0 @@
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

NUM_FEWSHOT = 0 # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Intro text
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, here is the commands you can run:

"""

EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""
src/display/css_html_js.py
DELETED
@@ -1,105 +0,0 @@
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type{
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span{
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap{
    width: 103px;
}
#filter_type label > .wrap .wrap-inner{
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input{
    width: 1px
}
#filter-columns-type{
    border:0;
    padding:0.5;
}
#filter-columns-size{
    border:0;
    padding:0.5;
}
#box-filter > .form{
    border: 0
}
"""

get_window_url_params = """
    function(url_params) {
        const params = new URLSearchParams(window.location.search);
        url_params = Object.fromEntries(params);
        return url_params;
    }
    """
src/display/formatting.py
DELETED
@@ -1,27 +0,0 @@
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
src/display/utils.py
DELETED
@@ -1,110 +0,0 @@
-from dataclasses import dataclass, make_dataclass
-from enum import Enum
-
-import pandas as pd
-
-from src.about import Tasks
-
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    hidden: bool = False
-    never_hidden: bool = False
-
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = "" # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
-# Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/envs.py
DELETED
@@ -1,25 +0,0 @@
-import os
-
-from huggingface_hub import HfApi
-
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
-# ----------------------------------
-
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
-
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-
-# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
-API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
DELETED
@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
-    model: str
-    revision: str # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
-
-    return results
src/populate.py
DELETED
@@ -1,58 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py
DELETED
@@ -1,99 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-
-                    # Select organisation
-                    if info["model"].count("/") == 0 or "submitted_time" not in info:
-                        continue
-                    organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
src/submission/submit.py
DELETED
@@ -1,119 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
-
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
-    # Seems good, creating the eval
-    print("Adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-
-    # Remove the local file
-    os.remove(out_path)
-
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )