Yuxuan-Zhang-Dexter committed on
Commit 6ebb0fb · 1 Parent(s): 72bd46b

update gradio app

assets/game_video_link.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
+   "super_mario": "https://www.youtube.com/watch?v=nixMIJZYAgg",
+   "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
+   "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg"
+ }
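For orientation, here is a minimal sketch of how the updated Gradio app might consume `assets/game_video_link.json`; the dropdown/iframe wiring and helper names are illustrative assumptions, not code from this commit:

```python
import json
import gradio as gr

# Load the game -> YouTube link mapping added in this commit (assumed relative path).
with open("assets/game_video_link.json", "r") as f:
    GAME_VIDEO_LINKS = json.load(f)

def video_embed(game_key: str) -> str:
    """Return an <iframe> embed for the selected game's demo video."""
    url = GAME_VIDEO_LINKS.get(game_key, "")
    # YouTube watch URLs must be rewritten to embed URLs before they work in an iframe.
    embed_url = url.replace("watch?v=", "embed/")
    return f'<iframe width="640" height="360" src="{embed_url}"></iframe>'

with gr.Blocks() as demo:
    choice = gr.Dropdown(choices=list(GAME_VIDEO_LINKS), value="sokoban", label="Game")
    html = gr.HTML(video_embed("sokoban"))
    choice.change(video_embed, inputs=choice, outputs=html)
```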
assets/model_color.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "claude-3-7-sonnet-20250219": "#4A90E2",
+   "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
+   "claude-3-5-haiku-20241022": "#7FB5E6",
+   "claude-3-5-sonnet-20241022": "#1A4C7C",
+   "gemini-2.0-flash": "#FF4081",
+   "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
+   "gemini-2.5-pro-exp-03-25": "#FF80AB",
+   "gpt-4o-2024-11-20": "#00BFA5",
+   "gpt-4.5-preview-2025-02-27": "#00796B",
+   "o1-2024-12-17": "#4DB6AC",
+   "o1-mini-2024-09-12": "#26A69A",
+   "o3-mini-2025-01-31(medium)": "#80CBC4",
+   "deepseek-v3": "#FFC107",
+   "deepseek-r1": "#FFA000",
+   "Llama-4-Maverick-17B-128E-Instruct-FP8": "#8E24AA"
+ }
assets/news.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "news": [
+     {
+       "date": "2025-04-01",
+       "video_link": "https://www.youtube.com/watch?v=uFVpNor7l_E",
+       "twitter_text": "Google's Gemini 2.5 Pro redefines AI gameplay: its multi-modal edge outperforms o1 & Claude 3.7 in Sokoban.",
+       "twitter_link": "https://x.com/haoailab/status/1907140718650704204"
+     },
+     {
+       "date": "2025-03-18",
+       "video_link": "https://www.youtube.com/watch?v=b-Uyz3W4yIg",
+       "twitter_text": "Candy Crush Saga's Hidden Complexity: Top AI Models Take the Challenge",
+       "twitter_link": "https://x.com/haoailab/status/1902095369808601551"
+     },
+     {
+       "date": "2025-03-14",
+       "video_link": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
+       "twitter_text": "2048 Mastery: Only Two AI Models Crack the Code to Surpass Random Play",
+       "twitter_link": "https://x.com/haoailab/status/1900645722095317255"
+     },
+     {
+       "date": "2025-03-06",
+       "video_link": "https://www.youtube.com/watch?v=59enV32MBUE",
+       "twitter_text": "Sokoban Showdown: o3-mini Dominates by Reaching Level 4",
+       "twitter_link": "https://x.com/haoailab/status/1897792946646421514"
+     },
+     {
+       "date": "2025-02-28",
+       "video_link": "https://www.youtube.com/watch?v=nixMIJZYAgg",
+       "twitter_text": "Super Mario AI Revolution: Claude-3.7 Sets Unprecedented Gameplay Benchmarks",
+       "twitter_link": "https://x.com/haoailab/status/1895557913621795076"
+     }
+   ]
+ }
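Each entry pairs a date with a demo video and the announcement tweet. A hedged sketch of rendering one entry as markdown in the app (the helper name is hypothetical):

```python
def news_entry_to_markdown(entry: dict) -> str:
    """Render one assets/news.json entry as a markdown bullet."""
    return (
        f"- **{entry['date']}**: [{entry['twitter_text']}]({entry['twitter_link']}) "
        f"([video]({entry['video_link']}))"
    )
```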
data_visualization.py ADDED
@@ -0,0 +1,550 @@
+ import matplotlib
+ matplotlib.use('Agg')  # Use Agg backend for thread safety
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ import json
+ import os
+ from leaderboard_utils import (
+     get_organization,
+     get_mario_leaderboard,
+     get_sokoban_leaderboard,
+     get_2048_leaderboard,
+     get_candy_leaderboard,
+     get_tetris_leaderboard,
+     get_tetris_planning_leaderboard,
+     get_combined_leaderboard,
+     GAME_ORDER
+ )
+
+ # Load model colors
+ with open('assets/model_color.json', 'r') as f:
+     MODEL_COLORS = json.load(f)
+
+ # Define game score columns mapping
+ GAME_SCORE_COLUMNS = {
+     "Super Mario Bros": "Score",
+     "Sokoban": "Levels Cracked",
+     "2048": "Score",
+     "Candy Crash": "Average Score",
+     "Tetris (complete)": "Score",
+     "Tetris (planning only)": "Score"
+ }
+
+ def normalize_values(values, mean, std):
+     """
+     Normalize values using z-score and scale to 0-100 range
+
+     Args:
+         values (list): List of values to normalize
+         mean (float): Mean value for normalization
+         std (float): Standard deviation for normalization
+
+     Returns:
+         list: Normalized values scaled to 0-100 range
+     """
+     if std == 0:
+         return [50 if v > 0 else 0 for v in values]  # Handle zero std case
+     z_scores = [(v - mean) / std for v in values]
+     # Scale z-scores to 0-100 range, with mean at 50
+     scaled_values = [max(0, min(100, (z * 30) + 50)) for z in z_scores]
+     return scaled_values
+
+ def simplify_model_name(model_name):
+     """
+     Simplify model name by either taking first 11 chars or string before third '-'
+     """
+     hyphen_parts = model_name.split('-')
+     return '-'.join(hyphen_parts[:3]) if len(hyphen_parts) >= 3 else model_name[:11]
+
+ def create_horizontal_bar_chart(df, game_name):
+     """
+     Create horizontal bar chart for detailed game view
+
+     Args:
+         df (pd.DataFrame): DataFrame containing game data
+         game_name (str): Name of the game to display
+
+     Returns:
+         matplotlib.figure.Figure: The generated bar chart figure
+     """
+     # Close any existing figures to prevent memory leaks
+     plt.close('all')
+
+     # Set style
+     plt.style.use('default')
+     # Increase figure width to accommodate long model names
+     fig, ax = plt.subplots(figsize=(20, 11))
+
+     # Sort by score
+     if game_name == "Super Mario Bros":
+         score_col = "Score"
+         df_sorted = df.sort_values(by=score_col, ascending=True)
+     elif game_name == "Sokoban":
+         # Process Sokoban scores by splitting and getting max level
+         def get_max_level(levels_str):
+             try:
+                 # Split by semicolon, strip whitespace, filter empty strings, convert to integers
+                 levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
+                 return max(levels) if levels else 0
+             except:
+                 return 0
+
+         # Create a temporary column with max levels
+         df['Max Level'] = df['Levels Cracked'].apply(get_max_level)
+         df_sorted = df.sort_values(by='Max Level', ascending=True)
+         score_col = 'Max Level'
+     elif game_name == "2048":
+         score_col = "Score"
+         df_sorted = df.sort_values(by=score_col, ascending=True)
+     elif game_name == "Candy Crash":
+         score_col = "Average Score"
+         df_sorted = df.sort_values(by=score_col, ascending=True)
+     elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
+         score_col = "Score"
+         df_sorted = df.sort_values(by=score_col, ascending=True)
+     else:
+         return None
+
+     # Create color gradient
+     colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))
+
+     # Create horizontal bars
+     bars = ax.barh(range(len(df_sorted)), df_sorted[score_col], color=colors)
+
+     # Add more space for labels on the left
+     plt.subplots_adjust(left=0.3)
+
+     # Customize the chart
+     ax.set_yticks(range(len(df_sorted)))
+
+     # Format player names: keep organization info and truncate the rest if too long
+     def format_player_name(player, org):
+         max_length = 40  # Maximum length for player name
+         if len(player) > max_length:
+             # Keep the first part and last part of the name
+             parts = player.split('-')
+             if len(parts) > 3:
+                 formatted = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
+             else:
+                 formatted = player[:max_length-3] + "..."
+         else:
+             formatted = player
+         return f"{formatted} [{org}]"
+
+     player_labels = [format_player_name(row['Player'], row['Organization'])
+                      for _, row in df_sorted.iterrows()]
+     ax.set_yticklabels(player_labels, fontsize=9)
+
+     # Add value labels on the bars
+     for i, bar in enumerate(bars):
+         width = bar.get_width()
+         if game_name == "Candy Crash":
+             score_text = f'{width:.1f}'
+         else:
+             score_text = f'{width:.0f}'
+
+         ax.text(width, bar.get_y() + bar.get_height()/2,
+                 score_text,
+                 ha='left', va='center',
+                 fontsize=10,
+                 fontweight='bold',
+                 color='white',
+                 bbox=dict(facecolor=(0, 0, 0, 0.3),
+                           edgecolor='none',
+                           alpha=0.5,
+                           pad=2))
+
+     # Set title and labels
+     ax.set_title(f"{game_name} Performance",
+                  pad=20,
+                  fontsize=14,
+                  fontweight='bold',
+                  color='#2c3e50')
+
+     if game_name == "Sokoban":
+         ax.set_xlabel("Maximum Level Reached",
+                       fontsize=12,
+                       fontweight='bold',
+                       color='#2c3e50',
+                       labelpad=10)
+     else:
+         ax.set_xlabel(score_col,
+                       fontsize=12,
+                       fontweight='bold',
+                       color='#2c3e50',
+                       labelpad=10)
+
+     # Add grid lines
+     ax.grid(True, axis='x', linestyle='--', alpha=0.3)
+
+     # Remove top and right spines
+     ax.spines['top'].set_visible(False)
+     ax.spines['right'].set_visible(False)
+
+     # Adjust layout
+     plt.tight_layout()
+
+     return fig
+
+ def create_radar_charts(df):
+     """
+     Create two radar charts with improved normalization using z-scores
+     """
+     # Close any existing figures to prevent memory leaks
+     plt.close('all')
+
+     # Define reasoning models
+     reasoning_models = [
+         'claude-3-7-sonnet-20250219(thinking)',
+         'o1-2024-12-17',
+         'gemini-2.0-flash-thinking-exp-1219',
+         'o3-mini-2025-01-31(medium)',
+         'gemini-2.5-pro-exp-03-25',
+         'o1-mini-2024-09-12',
+         'deepseek-r1'
+     ]
+
+     # Split dataframe into reasoning and non-reasoning models
+     df_reasoning = df[df['Player'].isin(reasoning_models)]
+     df_others = df[~df['Player'].isin(reasoning_models)]
+
+     # Get game columns
+     game_columns = [col for col in df.columns if col.endswith(' Score')]
+     categories = [col.replace(' Score', '') for col in game_columns]
+
+     # Create figure with two subplots - adjusted size for new layout
+     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6), subplot_kw=dict(projection='polar'))
+     fig.patch.set_facecolor('white')  # Set figure background to white
+
+     def get_game_stats(df, game_col):
+         """
+         Get mean and std for a game column, handling missing values
+         """
+         values = []
+         for val in df[game_col]:
+             if isinstance(val, str) and val == '_':
+                 values.append(0)
+             else:
+                 try:
+                     values.append(float(val))
+                 except:
+                     values.append(0)
+         return np.mean(values), np.std(values)
+
+     def setup_radar_plot(ax, data, title):
+         ax.set_facecolor('white')  # Set subplot background to white
+
+         num_vars = len(categories)
+         angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
+         angles = np.concatenate((angles, [angles[0]]))
+
+         # Plot grid lines with darker color
+         grid_values = [10, 30, 50, 70, 90]
+         ax.set_rgrids(grid_values,
+                       labels=grid_values,
+                       angle=45,
+                       fontsize=6,
+                       alpha=0.7,  # Increased alpha for better visibility
+                       color='#404040')  # Darker color for grid labels
+
+         # Make grid lines darker but still subtle
+         ax.grid(True, color='#404040', alpha=0.3)  # Darker grid lines
+
+         # Define darker, more vibrant colors for the radar plots
+         colors = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']
+
+         # Calculate game statistics once
+         game_stats = {col: get_game_stats(df, col) for col in game_columns}
+
+         # Plot data with darker lines and higher opacity for fills
+         for idx, (_, row) in enumerate(data.iterrows()):
+             values = []
+             for col in game_columns:
+                 val = row[col]
+                 if isinstance(val, str) and val == '_':
+                     values.append(0)
+                 else:
+                     try:
+                         values.append(float(val))
+                     except:
+                         values.append(0)
+
+             # Normalize values using game statistics
+             normalized_values = []
+             for i, v in enumerate(values):
+                 mean, std = game_stats[game_columns[i]]
+                 normalized_value = normalize_values([v], mean, std)[0]
+                 normalized_values.append(normalized_value)
+
+             # Complete the circular plot
+             normalized_values = np.concatenate((normalized_values, [normalized_values[0]]))
+
+             model_name = simplify_model_name(row['Player'])
+             ax.plot(angles, normalized_values, 'o-', linewidth=2.0,  # Increased line width
+                     label=model_name,
+                     color=colors[idx % len(colors)],
+                     markersize=4)  # Increased marker size
+             ax.fill(angles, normalized_values,
+                     alpha=0.3,  # Increased fill opacity
+                     color=colors[idx % len(colors)])
+
+         # Format categories
+         formatted_categories = []
+         for game in categories:
+             if game == "Tetris (planning only)":
+                 game = "Tetris\n(planning)"
+             elif game == "Tetris (complete)":
+                 game = "Tetris\n(complete)"
+             elif game == "Super Mario Bros":
+                 game = "Super\nMario"
+             elif game == "Candy Crash":
+                 game = "Candy\nCrash"
+             formatted_categories.append(game)
+
+         ax.set_xticks(angles[:-1])
+         ax.set_xticklabels(formatted_categories,
+                            fontsize=8,  # Slightly larger font
+                            color='#202020',  # Darker text
+                            fontweight='bold')  # Bold text
+         ax.tick_params(pad=10, colors='#202020')  # Darker tick colors
+
+         ax.set_title(title,
+                      pad=20,
+                      fontsize=11,  # Slightly larger title
+                      color='#202020',  # Darker title
+                      fontweight='bold')  # Bold title
+
+         legend = ax.legend(loc='upper right',
+                            bbox_to_anchor=(1.3, 1.1),
+                            fontsize=7,  # Slightly larger legend
+                            framealpha=0.9,  # More opaque legend
+                            edgecolor='#404040',  # Darker edge
+                            ncol=1)
+
+         ax.set_ylim(0, 105)
+         ax.spines['polar'].set_color('#404040')  # Darker spine
+         ax.spines['polar'].set_alpha(0.5)  # More visible spine
+
+     # Setup both plots
+     setup_radar_plot(ax1, df_reasoning, "Reasoning Models")
+     setup_radar_plot(ax2, df_others, "Non-Reasoning Models")
+
+     plt.subplots_adjust(right=0.85, wspace=0.3)
+
+     return fig
+
+ def get_combined_leaderboard_with_radar(rank_data, selected_games):
+     """
+     Get combined leaderboard and create radar charts
+     """
+     df = get_combined_leaderboard(rank_data, selected_games)
+     radar_fig = create_radar_charts(df)
+     return df, radar_fig
+
+ def create_organization_radar_chart(rank_data):
+     """
+     Create radar chart comparing organizations
+     """
+     # Get combined leaderboard with all games
+     df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})
+
+     # Group by organization and calculate average scores
+     org_performance = {}
+     for org in df["Organization"].unique():
+         org_df = df[df["Organization"] == org]
+         scores = {}
+         for game in GAME_ORDER:
+             game_scores = org_df[f"{game} Score"].apply(lambda x: float(x) if x != "_" else 0)
+             scores[game] = game_scores.mean()
+         org_performance[org] = scores
+
+     # Create radar chart
+     return create_radar_charts(pd.DataFrame([org_performance]))
+
+ def create_top_players_radar_chart(rank_data, n=5):
+     """
+     Create radar chart for top N players
+     """
+     # Get combined leaderboard with all games
+     df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})
+
+     # Get top N players
+     top_players = df["Player"].head(n).tolist()
+
+     # Create radar chart for top players
+     return create_radar_charts(df[df["Player"].isin(top_players)])
+
+ def create_player_radar_chart(rank_data, player_name):
+     """
+     Create radar chart for a specific player
+     """
+     # Get combined leaderboard with all games
+     df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})
+
+     # Get player's data
+     player_df = df[df["Player"] == player_name]
+
+     if player_df.empty:
+         return None
+
+     # Create radar chart for the player
+     return create_radar_charts(player_df)
+
+ def create_group_bar_chart(df):
+     """
+     Create a grouped bar chart comparing AI model performance across different games
+
+     Args:
+         df (pd.DataFrame): DataFrame containing the combined leaderboard data
+
+     Returns:
+         matplotlib.figure.Figure: The generated group bar chart figure
+     """
+     # Close any existing figures to prevent memory leaks
+     plt.close('all')
+
+     # Create figure and axis with better styling
+     sns.set_style("whitegrid")
+     fig = plt.figure(figsize=(20, 11))
+
+     # Create subplot with specific spacing
+     ax = plt.subplot(111)
+
+     # Adjust the subplot parameters
+     plt.subplots_adjust(top=0.90,     # Add more space at the top
+                         bottom=0.15,  # Add more space at the bottom
+                         right=0.85,   # Add more space for legend
+                         left=0.05)    # Add space on the left
+
+     # Get unique models
+     models = df['Player'].unique()
+
+     # Get active games (those that have score columns in the DataFrame)
+     active_games = []
+     for game in GAME_ORDER:
+         score_col = f"{game} Score"  # Use the same column name for all games
+         if score_col in df.columns:
+             active_games.append(game)
+
+     n_games = len(active_games)
+     if n_games == 0:
+         return fig  # Return empty figure if no games are selected
+
+     # Keep track of which models have data in any game
+     models_with_data = set()
+
+     # Calculate normalized scores for each game
+     for game_idx, game in enumerate(active_games):
+         # Get all scores for this game
+         game_scores = []
+
+         # Use the same score column name for all games
+         score_col = f"{game} Score"
+
+         for model in models:
+             try:
+                 score = df[df['Player'] == model][score_col].values[0]
+                 if score != '_' and float(score) > 0:  # Only include non-zero scores
+                     game_scores.append((model, float(score)))
+                     models_with_data.add(model)  # Add model to set if it has valid data
+             except (IndexError, ValueError):
+                 continue
+
+         if not game_scores:  # Skip if no valid scores for this game
+             continue
+
+         # Sort scores from highest to lowest
+         game_scores.sort(key=lambda x: x[1], reverse=True)
+
+         # Extract sorted models and scores
+         sorted_models = [x[0] for x in game_scores]
+         scores = [x[1] for x in game_scores]
+
+         # Calculate mean and std for normalization
+         mean = np.mean(scores)
+         std = np.std(scores)
+
+         # Normalize scores
+         normalized_scores = normalize_values(scores, mean, std)
+
+         # Calculate bar width based on number of models in this game
+         n_models_in_game = len(sorted_models)
+         bar_width = 0.8 / n_models_in_game if n_models_in_game > 0 else 0.8
+
+         # Plot bars for each model
+         for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
+             # Only add to legend if first appearance and model has data
+             should_label = model in models_with_data and model not in [l.get_text() for l in ax.get_legend().get_texts()] if ax.get_legend() else True
+
+             # Get color from MODEL_COLORS, use a default if not found
+             color = MODEL_COLORS.get(model, f"C{i % 10}")  # Use matplotlib default colors as fallback
+
+             ax.bar(game_idx + i*bar_width, score,
+                    width=bar_width,
+                    label=model if should_label else "",
+                    color=color,
+                    alpha=0.8)
+
+     # Customize the plot
+     ax.set_xticks(np.arange(n_games))
+     ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
+     ax.set_ylabel('Normalized Performance Score', fontsize=12)
+     ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
+                  fontsize=14, pad=20)
+
+     # Add grid lines
+     ax.grid(True, axis='y', linestyle='--', alpha=0.3)
+
+     # Create legend with unique entries
+     handles, labels = ax.get_legend_handles_labels()
+     by_label = dict(zip(labels, handles))
+
+     # Sort models by their first appearance in active games
+     model_order = []
+     for game in active_games:
+         score_col = f"{game} Score"  # Use the same column name for all games
+         for model in models:
+             try:
+                 score = df[df['Player'] == model][score_col].values[0]
+                 if score != '_' and float(score) > 0 and model not in model_order:
+                     model_order.append(model)
+             except (IndexError, ValueError):
+                 continue
+
+     # Create legend with sorted models
+     sorted_handles = [by_label[model] for model in model_order if model in by_label]
+     sorted_labels = [model for model in model_order if model in by_label]
+
+     ax.legend(sorted_handles, sorted_labels,
+               bbox_to_anchor=(1.00, 1),  # Shifted left from (1.15, 1) so the legend stays inside the figure
+               loc='upper left',
+               fontsize=9,
+               title='AI Models',
+               title_fontsize=10)
+
+     # No need for tight_layout() as we're manually controlling the spacing
+
+     return fig
+
+ def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
+     """
+     Get combined leaderboard and create group bar chart
+
+     Args:
+         rank_data (dict): Dictionary containing rank data
+         selected_games (dict): Dictionary of game names and their selection status
+
+     Returns:
+         tuple: (DataFrame, matplotlib.figure.Figure) containing the leaderboard data and group bar chart
+     """
+     df = get_combined_leaderboard(rank_data, selected_games)
+     group_bar_fig = create_group_bar_chart(df)
+     return df, group_bar_fig
+
+ def save_visualization(fig, filename):
+     """
+     Save visualization to file
+     """
+     fig.savefig(filename, bbox_inches='tight', dpi=300)
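A usage sketch (not part of the commit) showing how these plotting entry points could be driven end to end from the rank-data file added below; the file name matches `rank_data_03_25_2025.json`, everything else here is an assumption:

```python
import json
from data_visualization import (
    get_combined_leaderboard_with_group_bar,
    create_horizontal_bar_chart,
    save_visualization,
)
from leaderboard_utils import get_sokoban_leaderboard

with open("rank_data_03_25_2025.json", "r") as f:
    rank_data = json.load(f)

# Combined leaderboard plus the grouped bar chart for every game.
selected = {game: True for game in [
    "Super Mario Bros", "Sokoban", "2048", "Candy Crash",
    "Tetris (complete)", "Tetris (planning only)",
]}
df, fig = get_combined_leaderboard_with_group_bar(rank_data, selected)
save_visualization(fig, "group_bar.png")

# Detailed per-game view, e.g. Sokoban (uses the max level parsed from "Levels Cracked").
sokoban_fig = create_horizontal_bar_chart(get_sokoban_leaderboard(rank_data), "Sokoban")
save_visualization(sokoban_fig, "sokoban.png")
```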
leaderboard_utils.py ADDED
@@ -0,0 +1,288 @@
+ import pandas as pd
+ import json
+ import numpy as np
+
+ # Define game order
+ GAME_ORDER = [
+     "Super Mario Bros",
+     "Sokoban",
+     "2048",
+     "Candy Crash",
+     "Tetris (complete)",
+     "Tetris (planning only)"
+ ]
+
+ def get_organization(model_name):
+     m = model_name.lower()
+     if "claude" in m:
+         return "anthropic"
+     elif "gemini" in m:
+         return "google"
+     elif "o1" in m or "gpt" in m or "o3" in m:
+         return "openai"
+     elif "deepseek" in m:
+         return "deepseek"
+     else:
+         return "unknown"
+
+ def get_mario_leaderboard(rank_data):
+     data = rank_data.get("Super Mario Bros", {}).get("results", [])
+     df = pd.DataFrame(data)
+     df = df.rename(columns={
+         "model": "Player",
+         "progress": "Progress (current/total)",
+         "score": "Score",
+         "time_s": "Time (s)"
+     })
+     df["Organization"] = df["Player"].apply(get_organization)
+     df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
+     return df
+
+ def get_sokoban_leaderboard(rank_data):
+     data = rank_data.get("Sokoban", {}).get("results", [])
+     df = pd.DataFrame(data)
+     df = df.rename(columns={
+         "model": "Player",
+         "levels_cracked": "Levels Cracked",
+         "steps": "Steps"
+     })
+     df["Organization"] = df["Player"].apply(get_organization)
+     df = df[["Player", "Organization", "Levels Cracked", "Steps"]]
+     return df
+
+ def get_2048_leaderboard(rank_data):
+     data = rank_data.get("2048", {}).get("results", [])
+     df = pd.DataFrame(data)
+     df = df.rename(columns={
+         "model": "Player",
+         "score": "Score",
+         "steps": "Steps",
+         "time": "Time"
+     })
+     df["Organization"] = df["Player"].apply(get_organization)
+     df = df[["Player", "Organization", "Score", "Steps", "Time"]]
+     return df
+
+ def get_candy_leaderboard(rank_data):
+     data = rank_data.get("Candy Crash", {}).get("results", [])
+     df = pd.DataFrame(data)
+     df = df.rename(columns={
+         "model": "Player",
+         "score_runs": "Score Runs",
+         "average_score": "Average Score",
+         "steps": "Steps"
+     })
+     df["Organization"] = df["Player"].apply(get_organization)
+     df = df[["Player", "Organization", "Score Runs", "Average Score", "Steps"]]
+     return df
+
+ def get_tetris_leaderboard(rank_data):
+     data = rank_data.get("Tetris (complete)", {}).get("results", [])
+     df = pd.DataFrame(data)
+     df = df.rename(columns={
+         "model": "Player",
+         "score": "Score",
+         "steps_blocks": "Steps"
+     })
+     df["Organization"] = df["Player"].apply(get_organization)
+     df = df[["Player", "Organization", "Score", "Steps"]]
+     return df
+
+ def get_tetris_planning_leaderboard(rank_data):
+     data = rank_data.get("Tetris (planning only)", {}).get("results", [])
+     df = pd.DataFrame(data)
+     df = df.rename(columns={
+         "model": "Player",
+         "score": "Score",
+         "steps_blocks": "Steps"
+     })
+     df["Organization"] = df["Player"].apply(get_organization)
+     df = df[["Player", "Organization", "Score", "Steps"]]
+     return df
+
+ def calculate_rank_and_completeness(rank_data, selected_games):
+     # Dictionary to store DataFrames for each game
+     game_dfs = {}
+
+     # Get DataFrames for selected games
+     if selected_games.get("Super Mario Bros"):
+         game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
+     if selected_games.get("Sokoban"):
+         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
+     if selected_games.get("2048"):
+         game_dfs["2048"] = get_2048_leaderboard(rank_data)
+     if selected_games.get("Candy Crash"):
+         game_dfs["Candy Crash"] = get_candy_leaderboard(rank_data)
+     if selected_games.get("Tetris (complete)"):
+         game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
+     if selected_games.get("Tetris (planning only)"):
+         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
+
+     # Get all unique players
+     all_players = set()
+     for df in game_dfs.values():
+         all_players.update(df["Player"].unique())
+     all_players = sorted(list(all_players))
+
+     # Create results DataFrame
+     results = []
+     for player in all_players:
+         player_data = {
+             "Player": player,
+             "Organization": get_organization(player)
+         }
+         ranks = []
+         games_played = 0
+
+         # Calculate rank and completeness for each game
+         for game in GAME_ORDER:
+             if game in game_dfs:
+                 df = game_dfs[game]
+                 if player in df["Player"].values:
+                     games_played += 1
+                     # Get player's score based on game type
+                     if game == "Super Mario Bros":
+                         player_score = df[df["Player"] == player]["Score"].iloc[0]
+                         rank = len(df[df["Score"] > player_score]) + 1
+                     elif game == "Sokoban":
+                         # Parse Sokoban score string and get maximum level
+                         levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
+                         try:
+                             # Split by semicolon, strip whitespace, filter empty strings, convert to integers
+                             levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
+                             player_score = max(levels) if levels else 0
+                         except:
+                             player_score = 0
+                         # Calculate rank based on maximum level
+                         rank = len(df[df["Levels Cracked"].apply(
+                             lambda x: max([int(y.strip()) for y in x.split(";") if y.strip()]) > player_score
+                         )]) + 1
+                     elif game == "2048":
+                         player_score = df[df["Player"] == player]["Score"].iloc[0]
+                         rank = len(df[df["Score"] > player_score]) + 1
+                     elif game == "Candy Crash":
+                         player_score = df[df["Player"] == player]["Average Score"].iloc[0]
+                         rank = len(df[df["Average Score"] > player_score]) + 1
+                     elif game == "Tetris (complete)":
+                         player_score = df[df["Player"] == player]["Score"].iloc[0]
+                         rank = len(df[df["Score"] > player_score]) + 1
+                     elif game == "Tetris (planning only)":
+                         player_score = df[df["Player"] == player]["Score"].iloc[0]
+                         rank = len(df[df["Score"] > player_score]) + 1
+
+                     ranks.append(rank)
+                     player_data[f"{game} Score"] = player_score
+                 else:
+                     player_data[f"{game} Score"] = "_"
+
+         # Calculate average rank and completeness for sorting only
+         if ranks:
+             player_data["Sort Rank"] = round(np.mean(ranks), 2)
+             player_data["Games Played"] = games_played
+         else:
+             player_data["Sort Rank"] = float('inf')
+             player_data["Games Played"] = 0
+
+         results.append(player_data)
+
+     # Create DataFrame and sort by average rank and completeness
+     df_results = pd.DataFrame(results)
+     if not df_results.empty:
+         # Sort by average rank (ascending) and completeness (descending)
+         df_results = df_results.sort_values(
+             by=["Sort Rank", "Games Played"],
+             ascending=[True, False]
+         )
+         # Drop the sorting columns
+         df_results = df_results.drop(["Sort Rank", "Games Played"], axis=1)
+
+     return df_results
+
+ def get_combined_leaderboard(rank_data, selected_games):
+     """
+     Get combined leaderboard for selected games
+
+     Args:
+         rank_data (dict): Dictionary containing rank data
+         selected_games (dict): Dictionary of game names and their selection status
+
+     Returns:
+         pd.DataFrame: Combined leaderboard DataFrame
+     """
+     # Dictionary to store DataFrames for each game
+     game_dfs = {}
+
+     # Get DataFrames for selected games
+     if selected_games.get("Super Mario Bros"):
+         game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
+     if selected_games.get("Sokoban"):
+         game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
+     if selected_games.get("2048"):
+         game_dfs["2048"] = get_2048_leaderboard(rank_data)
+     if selected_games.get("Candy Crash"):
+         game_dfs["Candy Crash"] = get_candy_leaderboard(rank_data)
+     if selected_games.get("Tetris (complete)"):
+         game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
+     if selected_games.get("Tetris (planning only)"):
+         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
+
+     # Get all unique players
+     all_players = set()
+     for df in game_dfs.values():
+         all_players.update(df["Player"].unique())
+     all_players = sorted(list(all_players))
+
+     # Create results DataFrame
+     results = []
+     for player in all_players:
+         player_data = {
+             "Player": player,
+             "Organization": get_organization(player)
+         }
+
+         # Add scores for each game
+         for game in GAME_ORDER:
+             if game in game_dfs:
+                 df = game_dfs[game]
+                 if player in df["Player"].values:
+                     if game == "Super Mario Bros":
+                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                     elif game == "Sokoban":
+                         # Parse Sokoban score string and get maximum level
+                         levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
+                         try:
+                             levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
+                             player_data[f"{game} Score"] = max(levels) if levels else 0
+                         except:
+                             player_data[f"{game} Score"] = 0
+                     elif game == "2048":
+                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                     elif game == "Candy Crash":
+                         player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
+                     elif game in ["Tetris (complete)", "Tetris (planning only)"]:
+                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                 else:
+                     player_data[f"{game} Score"] = "_"
+
+         results.append(player_data)
+
+     # Create DataFrame
+     df_results = pd.DataFrame(results)
+
+     # Sort by total score across all games
+     if not df_results.empty:
+         # Calculate total score for each player
+         df_results["Total Score"] = 0
+         for game in GAME_ORDER:
+             if f"{game} Score" in df_results.columns:
+                 df_results["Total Score"] += df_results[f"{game} Score"].apply(
+                     lambda x: float(x) if x != "_" else 0
+                 )
+
+         # Sort by total score in descending order
+         df_results = df_results.sort_values("Total Score", ascending=False)
+
+         # Drop the temporary total score column
+         df_results = df_results.drop("Total Score", axis=1)
+
+     return df_results
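To illustrate the expected shapes, a small hedged example: `get_combined_leaderboard` produces one row per model with a `"{game} Score"` column for each selected game (`"_"` when a model did not play it). The toy `rank_data` dict below is made up but follows the same schema as `rank_data_03_25_2025.json`:

```python
from leaderboard_utils import get_combined_leaderboard, get_organization

# Tiny, made-up rank_data snippet in the same shape as rank_data_03_25_2025.json.
toy_rank_data = {
    "Sokoban": {"runs": 1, "results": [
        {"model": "o3-mini-2025-01-31(medium)", "levels_cracked": "2; 3; 2", "steps": "[17]"},
        {"model": "gpt-4o-2024-11-20", "levels_cracked": "0; 0; 0", "steps": "[68]"},
    ]},
}

print(get_organization("o3-mini-2025-01-31(medium)"))  # -> "openai"
df = get_combined_leaderboard(toy_rank_data, {"Sokoban": True})
print(df[["Player", "Sokoban Score"]])
# The Sokoban score is the best level across runs (3 and 0 here);
# games that were not selected simply do not appear as columns.
```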
rank_data_03_25_2025.json ADDED
@@ -0,0 +1,324 @@
+ {
+   "Super Mario Bros": {
+     "runs": 5,
+     "results": [
+       {
+         "model": "claude-3-7-sonnet-20250219",
+         "score": 710,
+         "progress": "1-1",
+         "time_s": 64.2,
+         "rank": 1
+       },
+       {
+         "model": "gpt-4o-2024-11-20",
+         "score": 560,
+         "progress": "1-1",
+         "time_s": 58.6,
+         "rank": 2
+       },
+       {
+         "model": "gemini-2.0-flash",
+         "score": 320,
+         "progress": "1-1",
+         "time_s": 51.8,
+         "rank": 3
+       },
+       {
+         "model": "claude-3-5-haiku-20241022",
+         "score": 140,
+         "progress": "1-1",
+         "time_s": 76.4,
+         "rank": 4
+       },
+       {
+         "model": "gpt-4.5-preview-2025-02-27",
+         "score": 160,
+         "progress": "1-1",
+         "time_s": 62.8,
+         "rank": 5
+       }
+     ]
+   },
+   "2048": {
+     "runs": 1,
+     "results": [
+       {
+         "model": "claude-3-7-sonnet-20250219(thinking)",
+         "score": 256,
+         "steps": 114,
+         "time": ">200",
+         "rank": 1
+       },
+       {
+         "model": "o1-2024-12-17",
+         "score": 256,
+         "steps": 116,
+         "time": ">200",
+         "rank": 2
+       },
+       {
+         "model": "claude-3-7-sonnet-20250219",
+         "score": 256,
+         "steps": 130,
+         "time": "20:36",
+         "rank": 3
+       },
+       {
+         "model": "deepseek-v3",
+         "score": 256,
+         "steps": 216,
+         "time": "54.02",
+         "rank": 4
+       },
+       {
+         "model": "gemini-2.0-flash",
+         "score": 128,
+         "steps": 111,
+         "time": "18:43",
+         "rank": 5
+       },
+       {
+         "model": "gemini-2.0-flash-thinking-exp-1219",
+         "score": 128,
+         "steps": 132,
+         "time": ">100",
+         "rank": 6
+       },
+       {
+         "model": "gemini-2.5-pro-exp-03-25",
+         "score": 128,
+         "steps": 138,
+         "time": "169",
+         "rank": 7
+       },
+       {
+         "model": "claude-3-5-sonnet-20241022",
+         "score": 64,
+         "steps": 92,
+         "time": "9:2",
+         "rank": 9
+       },
+       {
+         "model": "gpt-4.5-preview-2025-02-27",
+         "score": 34,
+         "steps": 34,
+         "time": "8:25",
+         "rank": 10
+       },
+       {
+         "model": "gpt-4o-2024-11-20",
+         "score": 16,
+         "steps": 21,
+         "time": "1:17",
+         "rank": 11
+       },
+       {
+         "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
+         "score": 128,
+         "steps": 145,
+         "time": ">100",
+         "rank": 8
+       }
+     ]
+   },
+   "Tetris (complete)": {
+     "runs": 3,
+     "results": [
+       {
+         "model": "claude-3-7-sonnet-20250219",
+         "score": 95,
+         "steps_blocks": 27,
+         "rank": 1
+       },
+       {
+         "model": "claude-3-5-haiku-20241022",
+         "score": 90,
+         "steps_blocks": 25,
+         "rank": 2
+       },
+       {
+         "model": "gemini-2.0-flash",
+         "score": 82,
+         "steps_blocks": 23,
+         "rank": 3
+       },
+       {
+         "model": "gpt-4o-2024-11-20",
+         "score": 54,
+         "steps_blocks": 19,
+         "rank": 4
+       }
+     ]
+   },
+   "Tetris (planning only)": {
+     "runs": 3,
+     "results": [
+       {
+         "model": "claude-3-7-sonnet-20250219",
+         "score": 110,
+         "steps_blocks": 29,
+         "rank": 1
+       },
+       {
+         "model": "claude-3-5-haiku-20241022",
+         "score": 92,
+         "steps_blocks": 25,
+         "rank": 2
+       },
+       {
+         "model": "gemini-2.0-flash",
+         "score": 87,
+         "steps_blocks": 24,
+         "rank": 3
+       },
+       {
+         "model": "gpt-4o-2024-11-20",
+         "score": 56,
+         "steps_blocks": 20,
+         "rank": 4
+       }
+     ]
+   },
+   "Candy Crash": {
+     "runs": 3,
+     "results": [
+       {
+         "model": "o3-mini-2025-01-31(medium)",
+         "score_runs": "90;109;120",
+         "average_score": 106.33,
+         "steps": 25,
+         "rank": 1
+       },
+       {
+         "model": "o1-2024-12-17",
+         "score_runs": "96;114;83",
+         "average_score": 97.67,
+         "steps": 25,
+         "rank": 2
+       },
+       {
+         "model": "deepseek-r1",
+         "score_runs": "62;108;105",
+         "average_score": 91.67,
+         "steps": 25,
+         "rank": 3
+       },
+       {
+         "model": "gemini-2.5-pro-exp-03-25",
+         "score_runs": "50;36;68",
+         "average_score": 51.33,
+         "steps": 25,
+         "rank": 4
+       },
+       {
+         "model": "claude-3-7-sonnet-20250219(thinking)",
+         "score_runs": "36;46;24",
+         "average_score": 35.33,
+         "steps": 25,
+         "rank": 5
+       },
+       {
+         "model": "gemini-2.0-flash-thinking-exp-1219",
+         "score_runs": "0;15;39",
+         "average_score": 18,
+         "steps": 25,
+         "rank": 6
+       },
+       {
+         "model": "claude-3-5-sonnet-20241022",
+         "score_runs": "3;0;0",
+         "average_score": 1,
+         "steps": 25,
+         "rank": 7
+       },
+       {
+         "model": "deepseek-v3",
+         "score_runs": "0;0;0",
+         "average_score": 0,
+         "steps": 25,
+         "rank": 9
+       },
+       {
+         "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
+         "score_runs": "6;0;0",
+         "average_score": 2,
+         "steps": 25,
+         "rank": 8
+       }
+     ]
+   },
+   "Sokoban": {
+     "runs": 3,
+     "results": [
+       {
+         "model": "o3-mini-2025-01-31(medium)",
+         "levels_cracked": "2; 3; 2",
+         "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
+         "rank": 1
+       },
+       {
+         "model": "gemini-2.5-pro-exp-03-25",
+         "levels_cracked": "2;2;3",
+         "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
+         "rank": 2
+       },
+       {
+         "model": "claude-3-7-sonnet-20250219(thinking)",
+         "levels_cracked": "1; 2; 0",
+         "steps": "[17,35];[15,40,43];[4]",
+         "rank": 3
+       },
+       {
+         "model": "o1-2024-12-17",
+         "levels_cracked": "1; 1; 1",
+         "steps": null,
+         "rank": 4
+       },
+       {
+         "model": "deepseek-r1",
+         "levels_cracked": "1; 0; 1",
+         "steps": "[19,42];[13];[19,36]",
+         "note": "stuck",
+         "rank": 5
+       },
+       {
+         "model": "o1-mini-2024-09-12",
+         "levels_cracked": "0;1;0",
+         "steps": null,
+         "rank": 6
+       },
+       {
+         "model": "gemini-2.0-flash-thinking-exp-1219",
+         "levels_cracked": "0; 0; 0",
+         "steps": "[23]; [14]; [14]",
+         "rank": 7
+       },
+       {
+         "model": "gpt-4o-2024-11-20",
+         "levels_cracked": "0; 0; 0",
+         "steps": "[68];[105];[168]",
+         "note": "stuck in a loop",
+         "rank": 8
+       },
+       {
+         "model": "claude-3-5-sonnet-20241022",
+         "levels_cracked": "0; 0; 0",
+         "steps": "[21]; [30]; [51]",
+         "note": "stuck in a loop",
+         "rank": 9
+       },
+       {
+         "model": "deepseek-v3",
+         "levels_cracked": "0; 0; 0",
+         "steps": "[9]; [47]; [64]",
+         "rank": 10
+       },
+       {
+         "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
+         "levels_cracked": "0;0;0",
+         "steps": "[5]",
+         "rank": 11
+       }
+     ]
+   }
+ }
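Note that Sokoban's `levels_cracked` (and Candy Crash's `score_runs`) store per-run results as semicolon-separated strings, e.g. `"2; 3; 2"`. A self-contained sketch of the reduction applied by `leaderboard_utils.py` (best level across runs):

```python
def best_sokoban_level(levels_cracked: str) -> int:
    """Best level reached across runs, e.g. "2; 3; 2" -> 3 (0 if unparsable)."""
    try:
        levels = [int(x.strip()) for x in levels_cracked.split(";") if x.strip()]
        return max(levels) if levels else 0
    except (ValueError, AttributeError):
        return 0

assert best_sokoban_level("2; 3; 2") == 3
assert best_sokoban_level("0;0;0") == 0
```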
requirements.txt CHANGED
@@ -13,4 +13,7 @@ python-dateutil
tqdm
transformers
tokenizers>=0.15.0
- sentencepiece
+ sentencepiece
+ seaborn>=0.12.0
+ Pillow>=10.0.0
+ plotly>=5.15.0
src/about.py DELETED
@@ -1,72 +0,0 @@
- from dataclasses import dataclass
- from enum import Enum
-
- @dataclass
- class Task:
-     benchmark: str
-     metric: str
-     col_name: str
-
-
- # Select your tasks here
- # ---------------------------------------------------
- class Tasks(Enum):
-     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-     task0 = Task("anli_r1", "acc", "ANLI")
-     task1 = Task("logiqa", "acc_norm", "LogiQA")
-
- NUM_FEWSHOT = 0  # Change with your few shot
- # ---------------------------------------------------
-
-
-
- # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
-
- # What does your leaderboard evaluate?
- INTRODUCTION_TEXT = """
- Intro text
- """
-
- # Which evaluations are you running? how can people reproduce what you have?
- LLM_BENCHMARKS_TEXT = f"""
- ## How it works
-
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
-
- """
-
- EVALUATION_QUEUE_TEXT = """
- ## Some good practices before submitting a model
-
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
- ```python
- from transformers import AutoConfig, AutoModel, AutoTokenizer
- config = AutoConfig.from_pretrained("your model name", revision=revision)
- model = AutoModel.from_pretrained("your model name", revision=revision)
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
- ```
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
- Note: make sure your model is public!
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
- ### 3) Make sure your model has an open license!
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
- ### 4) Fill up your model card
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
- ## In case of model failure
- If your model is displayed in the `FAILED` category, its execution stopped.
- Make sure you have followed the above steps first.
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
- """
-
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
- CITATION_BUTTON_TEXT = r"""
- """
src/display/css_html_js.py DELETED
@@ -1,105 +0,0 @@
- custom_css = """
-
- .markdown-text {
-     font-size: 16px !important;
- }
-
- #models-to-add-text {
-     font-size: 18px !important;
- }
-
- #citation-button span {
-     font-size: 16px !important;
- }
-
- #citation-button textarea {
-     font-size: 16px !important;
- }
-
- #citation-button > label > button {
-     margin: 6px;
-     transform: scale(1.3);
- }
-
- #leaderboard-table {
-     margin-top: 15px
- }
-
- #leaderboard-table-lite {
-     margin-top: 15px
- }
-
- #search-bar-table-box > div:first-child {
-     background: none;
-     border: none;
- }
-
- #search-bar {
-     padding: 0px;
- }
-
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
- #leaderboard-table td:nth-child(2),
- #leaderboard-table th:nth-child(2) {
-     max-width: 400px;
-     overflow: auto;
-     white-space: nowrap;
- }
-
- .tab-buttons button {
-     font-size: 20px;
- }
-
- #scale-logo {
-     border-style: none !important;
-     box-shadow: none;
-     display: block;
-     margin-left: auto;
-     margin-right: auto;
-     max-width: 600px;
- }
-
- #scale-logo .download {
-     display: none;
- }
- #filter_type {
-     border: 0;
-     padding-left: 0;
-     padding-top: 0;
- }
- #filter_type label {
-     display: flex;
- }
- #filter_type label > span {
-     margin-top: var(--spacing-lg);
-     margin-right: 0.5em;
- }
- #filter_type label > .wrap {
-     width: 103px;
- }
- #filter_type label > .wrap .wrap-inner {
-     padding: 2px;
- }
- #filter_type label > .wrap .wrap-inner input {
-     width: 1px
- }
- #filter-columns-type {
-     border: 0;
-     padding: 0.5;
- }
- #filter-columns-size {
-     border: 0;
-     padding: 0.5;
- }
- #box-filter > .form {
-     border: 0
- }
- """
-
- get_window_url_params = """
- function(url_params) {
-     const params = new URLSearchParams(window.location.search);
-     url_params = Object.fromEntries(params);
-     return url_params;
- }
- """
src/display/formatting.py DELETED
@@ -1,27 +0,0 @@
- def model_hyperlink(link, model_name):
-     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
- def make_clickable_model(model_name):
-     link = f"https://huggingface.co/{model_name}"
-     return model_hyperlink(link, model_name)
-
-
- def styled_error(error):
-     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
-
- def styled_warning(warn):
-     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
-
- def styled_message(message):
-     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
- def has_no_nan_values(df, columns):
-     return df[columns].notna().all(axis=1)
-
-
- def has_nan_values(df, columns):
-     return df[columns].isna().any(axis=1)
src/display/utils.py DELETED
@@ -1,110 +0,0 @@
- from dataclasses import dataclass, make_dataclass
- from enum import Enum
-
- import pandas as pd
-
- from src.about import Tasks
-
- def fields(raw_class):
-     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
- # These classes are for user facing column names,
- # to avoid having to change them all around the code
- # when a modif is needed
- @dataclass
- class ColumnContent:
-     name: str
-     type: str
-     displayed_by_default: bool
-     hidden: bool = False
-     never_hidden: bool = False
-
- ## Leaderboard columns
- auto_eval_column_dict = []
- # Init
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
- # Scores
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
- for task in Tasks:
-     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
- # Model information
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
- # We use make dataclass to dynamically fill the scores from Tasks
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-
- ## For the queue columns in the submission tab
- @dataclass(frozen=True)
- class EvalQueueColumn:  # Queue column
-     model = ColumnContent("model", "markdown", True)
-     revision = ColumnContent("revision", "str", True)
-     private = ColumnContent("private", "bool", True)
-     precision = ColumnContent("precision", "str", True)
-     weight_type = ColumnContent("weight_type", "str", "Original")
-     status = ColumnContent("status", "str", True)
-
- ## All the model information that we might need
- @dataclass
- class ModelDetails:
-     name: str
-     display_name: str = ""
-     symbol: str = ""  # emoji
-
-
- class ModelType(Enum):
-     PT = ModelDetails(name="pretrained", symbol="🟢")
-     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-     RL = ModelDetails(name="RL-tuned", symbol="🟦")
-     Unknown = ModelDetails(name="", symbol="?")
-
-     def to_str(self, separator=" "):
-         return f"{self.value.symbol}{separator}{self.value.name}"
-
-     @staticmethod
-     def from_str(type):
-         if "fine-tuned" in type or "🔶" in type:
-             return ModelType.FT
-         if "pretrained" in type or "🟢" in type:
-             return ModelType.PT
-         if "RL-tuned" in type or "🟦" in type:
-             return ModelType.RL
-         if "instruction-tuned" in type or "⭕" in type:
-             return ModelType.IFT
-         return ModelType.Unknown
-
- class WeightType(Enum):
-     Adapter = ModelDetails("Adapter")
-     Original = ModelDetails("Original")
-     Delta = ModelDetails("Delta")
-
- class Precision(Enum):
-     float16 = ModelDetails("float16")
-     bfloat16 = ModelDetails("bfloat16")
-     Unknown = ModelDetails("?")
-
-     def from_str(precision):
-         if precision in ["torch.float16", "float16"]:
-             return Precision.float16
-         if precision in ["torch.bfloat16", "bfloat16"]:
-             return Precision.bfloat16
-         return Precision.Unknown
-
- # Column selection
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
src/envs.py DELETED
@@ -1,25 +0,0 @@
- import os
-
- from huggingface_hub import HfApi
-
- # Info to change for your repository
- # ----------------------------------
- TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
-
- OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
- # ----------------------------------
-
- REPO_ID = f"{OWNER}/leaderboard"
- QUEUE_REPO = f"{OWNER}/requests"
- RESULTS_REPO = f"{OWNER}/results"
-
- # If you setup a cache later, just change HF_HOME
- CACHE_PATH = os.getenv("HF_HOME", ".")
-
- # Local caches
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
- API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py DELETED
@@ -1,196 +0,0 @@
- import glob
- import json
- import math
- import os
- from dataclasses import dataclass
-
- import dateutil
- import numpy as np
-
- from src.display.formatting import make_clickable_model
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
- from src.submission.check_validity import is_model_on_hub
-
-
- @dataclass
- class EvalResult:
-     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-     """
-     eval_name: str  # org_model_precision (uid)
-     full_model: str  # org/model (path on hub)
-     org: str
-     model: str
-     revision: str  # commit hash, "" if main
-     results: dict
-     precision: Precision = Precision.Unknown
-     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-     weight_type: WeightType = WeightType.Original  # Original or Adapter
-     architecture: str = "Unknown"
-     license: str = "?"
-     likes: int = 0
-     num_params: int = 0
-     date: str = ""  # submission date of request file
-     still_on_hub: bool = False
-
-     @classmethod
-     def init_from_json_file(self, json_filepath):
-         """Inits the result from the specific model result file"""
-         with open(json_filepath) as fp:
-             data = json.load(fp)
-
-         config = data.get("config")
-
-         # Precision
-         precision = Precision.from_str(config.get("model_dtype"))
-
-         # Get model and org
-         org_and_model = config.get("model_name", config.get("model_args", None))
-         org_and_model = org_and_model.split("/", 1)
-
-         if len(org_and_model) == 1:
-             org = None
-             model = org_and_model[0]
-             result_key = f"{model}_{precision.value.name}"
-         else:
-             org = org_and_model[0]
-             model = org_and_model[1]
-             result_key = f"{org}_{model}_{precision.value.name}"
-         full_model = "/".join(org_and_model)
-
-         still_on_hub, _, model_config = is_model_on_hub(
-             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-         )
-         architecture = "?"
-         if model_config is not None:
-             architectures = getattr(model_config, "architectures", None)
-             if architectures:
-                 architecture = ";".join(architectures)
-
-         # Extract results available in this file (some results are split in several files)
-         results = {}
-         for task in Tasks:
-             task = task.value
-
-             # We average all scores of a given metric (not all metrics are present in all files)
-             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-             if accs.size == 0 or any([acc is None for acc in accs]):
-                 continue
-
-             mean_acc = np.mean(accs) * 100.0
-             results[task.benchmark] = mean_acc
-
-         return self(
-             eval_name=result_key,
-             full_model=full_model,
-             org=org,
-             model=model,
-             results=results,
-             precision=precision,
-             revision=config.get("model_sha", ""),
-             still_on_hub=still_on_hub,
-             architecture=architecture
-         )
-
-     def update_with_request_file(self, requests_path):
-         """Finds the relevant request file for the current model and updates info with it"""
-         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-         try:
-             with open(request_file, "r") as f:
-                 request = json.load(f)
-             self.model_type = ModelType.from_str(request.get("model_type", ""))
-             self.weight_type = WeightType[request.get("weight_type", "Original")]
-             self.license = request.get("license", "?")
-             self.likes = request.get("likes", 0)
-             self.num_params = request.get("params", 0)
-             self.date = request.get("submitted_time", "")
-         except Exception:
-             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-     def to_dict(self):
-         """Converts the Eval Result to a dict compatible with our dataframe display"""
-         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-         data_dict = {
-             "eval_name": self.eval_name,  # not a column, just a save name,
-             AutoEvalColumn.precision.name: self.precision.value.name,
-             AutoEvalColumn.model_type.name: self.model_type.value.name,
-             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-             AutoEvalColumn.architecture.name: self.architecture,
-             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-             AutoEvalColumn.revision.name: self.revision,
-             AutoEvalColumn.average.name: average,
-             AutoEvalColumn.license.name: self.license,
-             AutoEvalColumn.likes.name: self.likes,
-             AutoEvalColumn.params.name: self.num_params,
-             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-         }
-
-         for task in Tasks:
-             data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-         return data_dict
-
-
- def get_request_file_for_model(requests_path, model_name, precision):
-     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-     request_files = os.path.join(
-         requests_path,
-         f"{model_name}_eval_request_*.json",
-     )
-     request_files = glob.glob(request_files)
-
-     # Select correct request file (precision)
-     request_file = ""
-     request_files = sorted(request_files, reverse=True)
-     for tmp_request_file in request_files:
-         with open(tmp_request_file, "r") as f:
-             req_content = json.load(f)
-             if (
-                 req_content["status"] in ["FINISHED"]
-                 and req_content["precision"] == precision.split(".")[-1]
-             ):
-                 request_file = tmp_request_file
-     return request_file
-
-
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-     """From the path of the results folder root, extract all needed info for results"""
-     model_result_filepaths = []
-
-     for root, _, files in os.walk(results_path):
-         # We should only have json files in model results
-         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-             continue
-
-         # Sort the files by date
-         try:
-             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-         except dateutil.parser._parser.ParserError:
-             files = [files[-1]]
-
-         for file in files:
-             model_result_filepaths.append(os.path.join(root, file))
-
-     eval_results = {}
-     for model_result_filepath in model_result_filepaths:
-         # Creation of result
-         eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
-
181
- # Store results of same eval together
182
- eval_name = eval_result.eval_name
183
- if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
- else:
186
- eval_results[eval_name] = eval_result
187
-
188
- results = []
189
- for v in eval_results.values():
190
- try:
191
- v.to_dict() # we test if the dict version is complete
192
- results.append(v)
193
- except KeyError: # not all eval values present
194
- continue
195
-
196
- return results
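The removed read_evals.py parsed each per-model result JSON, merged it with the matching request file, and exposed everything through get_raw_eval_results. A minimal sketch of how that entry point was driven before this commit (the local paths are placeholders):

# Illustrative driver for the removed module; paths are placeholders.
from src.leaderboard.read_evals import get_raw_eval_results

eval_results = get_raw_eval_results(
    results_path="./eval-results",   # local mirror of the results dataset
    requests_path="./eval-queue",    # local mirror of the request files
)
rows = [result.to_dict() for result in eval_results]  # one row per model/precision pair
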
src/populate.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
-
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
-
24
-
25
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
- all_evals = []
29
-
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
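populate.py built the leaderboard and evaluation-queue dataframes on top of read_evals. A sketch of the two calls it exposed, with the column lists left as assumed placeholders:

# Illustrative usage of the removed populate helpers; column lists are assumptions.
from src.populate import get_leaderboard_df, get_evaluation_queue_df

DISPLAY_COLS = ["Model", "Average"]            # placeholder display columns
BENCHMARK_COLS = ["Benchmark1", "Benchmark2"]  # placeholder benchmark columns
QUEUE_COLS = ["model", "revision", "status"]   # placeholder queue columns

leaderboard_df = get_leaderboard_df(
    results_path="./eval-results",
    requests_path="./eval-queue",
    cols=DISPLAY_COLS,
    benchmark_cols=BENCHMARK_COLS,
)
finished_df, running_df, pending_df = get_evaluation_queue_df("./eval-queue", QUEUE_COLS)
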
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
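check_validity.py gated submissions: model card and license present, model loadable from the Hub, plus size and prior-submission bookkeeping. A sketch of the two main checks, using a placeholder repo id:

# Illustrative pre-submission checks built on the removed validators.
from src.submission.check_validity import check_model_card, is_model_on_hub

repo_id = "my-org/my-model"  # placeholder
card_ok, card_msg = check_model_card(repo_id)
on_hub, hub_msg, config = is_model_on_hub(repo_id, revision="main", test_tokenizer=True)
if not (card_ok and on_hub):
    print(card_msg or f"Model {repo_id} {hub_msg}")
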
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
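Finally, submit.py validated a request and pushed it to the queue dataset. A hypothetical sketch of how add_new_eval could be wired into a Gradio submission form before this commit (the widget choices are assumptions, not taken from the deleted code):

# Illustrative Gradio wiring for the removed submission flow.
import gradio as gr
from src.submission.submit import add_new_eval

with gr.Blocks() as demo:
    model = gr.Textbox(label="Model name (org/model)")
    base_model = gr.Textbox(label="Base model (for Delta/Adapter weights)")
    revision = gr.Textbox(label="Revision", value="main")
    precision = gr.Dropdown(["float16", "bfloat16"], label="Precision")            # assumed choices
    weight_type = gr.Dropdown(["Original", "Delta", "Adapter"], label="Weight type")
    model_type = gr.Dropdown(["pretrained", "fine-tuned"], label="Model type")      # assumed choices
    status = gr.Markdown()
    gr.Button("Submit for evaluation").click(
        add_new_eval,
        inputs=[model, base_model, revision, precision, weight_type, model_type],
        outputs=status,
    )

demo.launch()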