Yuxuan-Zhang-Dexter committed on
Commit
8e60916
·
1 Parent(s): b308c41

update latest leaderboard info

Browse files
app.py CHANGED
@@ -522,6 +522,14 @@ def build_app():
522
  max-height: none !important;
523
  position: relative !important;
524
  }
 
 
 
 
 
 
 
 
525
 
526
  /* Force scrolling to work on the main container */
527
  .gradio-container, #root, #app {
@@ -708,6 +716,7 @@ def build_app():
708
 
709
  # Add custom JavaScript for table header line breaks
710
  gr.HTML("""
 
711
  <script>
712
  // Function to add line breaks to table headers
713
  function formatTableHeaders() {
@@ -847,6 +856,7 @@ def build_app():
847
  with gr.Column(visible=True) as overall_visualizations:
848
  with gr.Tabs():
849
  with gr.Tab("📈 Radar Chart"):
 
850
  radar_visualization = gr.Plot(
851
  label="Comparative Analysis (Radar Chart)",
852
  elem_classes="visualization-container"
@@ -857,6 +867,10 @@ def build_app():
857
  # label="Comparative Analysis (Group Bar Chart)",
858
  # elem_classes="visualization-container"
859
  # )
 
 
 
 
860
 
861
  # Hidden placeholder for group bar visualization (to maintain code references)
862
  group_bar_visualization = gr.Plot(visible=False)
 
522
  max-height: none !important;
523
  position: relative !important;
524
  }
525
+ .radar-tip {
526
+ font-size: 14px;
527
+ color: #555;
528
+ margin-top: 5px;
529
+ margin-bottom: 20px;
530
+ font-style: italic;
531
+ }
532
+
533
 
534
  /* Force scrolling to work on the main container */
535
  .gradio-container, #root, #app {
 
716
 
717
  # Add custom JavaScript for table header line breaks
718
  gr.HTML("""
719
+
720
  <script>
721
  // Function to add line breaks to table headers
722
  function formatTableHeaders() {
 
856
  with gr.Column(visible=True) as overall_visualizations:
857
  with gr.Tabs():
858
  with gr.Tab("📈 Radar Chart"):
859
+
860
  radar_visualization = gr.Plot(
861
  label="Comparative Analysis (Radar Chart)",
862
  elem_classes="visualization-container"
 
867
  # label="Comparative Analysis (Group Bar Chart)",
868
  # elem_classes="visualization-container"
869
  # )
870
+ gr.Markdown(
871
+ "*💡 Double-click a legend entry to isolate that model. Click additional ones to add them for comparison.*",
872
+ elem_classes="radar-tip"
873
+ )
874
 
875
  # Hidden placeholder for group bar visualization (to maintain code references)
876
  group_bar_visualization = gr.Plot(visible=False)
assets/model_color.json CHANGED
@@ -6,13 +6,17 @@
6
  "gemini-2.0-flash": "#FF4081",
7
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
8
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
 
9
  "gpt-4o-2024-11-20": "#00BFA5",
10
  "gpt-4.5-preview-2025-02-27": "#00796B",
11
  "gpt-4.1-2025-04-14": "#00897B",
12
  "o1-2024-12-17": "#4DB6AC",
13
  "o1-mini-2024-09-12": "#26A69A",
14
  "o3-mini-2025-01-31(medium)": "#80CBC4",
 
 
 
15
  "deepseek-v3": "#FFC107",
16
  "deepseek-r1": "#FFA000",
17
- "Llama-4-Maverick-17B-128E-Instruct-FP8": "#8E24AA"
18
  }
 
6
  "gemini-2.0-flash": "#FF4081",
7
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
8
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
9
+ "gemini-2.5-flash-preview-04-17": "#F06292",
10
  "gpt-4o-2024-11-20": "#00BFA5",
11
  "gpt-4.5-preview-2025-02-27": "#00796B",
12
  "gpt-4.1-2025-04-14": "#00897B",
13
  "o1-2024-12-17": "#4DB6AC",
14
  "o1-mini-2024-09-12": "#26A69A",
15
  "o3-mini-2025-01-31(medium)": "#80CBC4",
16
+ "o3": "#26C6DA",
17
+ "o4-mini": "#00ACC1",
18
+ "grok-3-beta": "#FF7043",
19
  "deepseek-v3": "#FFC107",
20
  "deepseek-r1": "#FFA000",
21
+ "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
22
  }
assets/news.json CHANGED
@@ -1,5 +1,11 @@
1
  {
2
  "news": [
 
 
 
 
 
 
3
  {
4
  "date": "2025-04-15",
5
  "video_link": "https://www.youtube.com/watch?v=q8PMW870yp8",
 
1
  {
2
  "news": [
3
+ {
4
+ "date": "2025-04-23",
5
+ "video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
6
+ "twitter_text": "Zero-Shot AI Gaming Showdown: O3 Multi-Modal Might Sweeps Sokoban & 2048, Lands Top-2 in Phoenix Wright & Candy Crush",
7
+ "twitter_link": "https://x.com/haoailab"
8
+ },
9
  {
10
  "date": "2025-04-15",
11
  "video_link": "https://www.youtube.com/watch?v=q8PMW870yp8",
data_visualization.py CHANGED
@@ -338,16 +338,21 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
338
  mean, std = vals.mean(), vals.std()
339
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
340
 
341
- # Group players by prefix
342
  model_groups = {}
343
  for player in df["Player"]:
344
  prefix = get_model_prefix(player)
345
  model_groups.setdefault(prefix, []).append(player)
346
-
347
- # Order: grouped by prefix, then alphabetically
 
 
 
 
 
348
  grouped_players = []
349
- for prefix in sorted(model_groups):
350
- grouped_players.extend(sorted(model_groups[prefix]))
351
 
352
  fig = go.Figure()
353
 
@@ -363,12 +368,15 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
363
 
364
  r = [row[f"norm_{col}"] for col in game_cols]
365
 
 
 
 
366
  fig.add_trace(go.Scatterpolar(
367
  r=r + [r[0]],
368
  theta=categories + [categories[0]],
369
  mode='lines+markers',
370
  fill='toself',
371
- name=player,
372
  line=dict(color=color, width=4 if is_highlighted else 2),
373
  marker=dict(color=color),
374
  fillcolor=fillcolor,
@@ -379,7 +387,7 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
379
  fig.update_layout(
380
  autosize=False,
381
  width=1000,
382
- height=600,
383
  margin=dict(l=400, r=200, t=20, b=20),
384
  title=dict(
385
  text="AI Normalized Performance Across Games",
@@ -407,12 +415,12 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
407
  ),
408
  legend=dict(
409
  font=dict(size=12),
410
- title="Choose your model: ",
411
  itemsizing='trace',
412
- x=-1.2,
413
- y=0.8,
414
- xanchor='left',
415
  yanchor='top',
 
416
  bgcolor='rgba(255,255,255,0.6)',
417
  bordercolor='gray',
418
  borderwidth=1
 
338
  mean, std = vals.mean(), vals.std()
339
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
340
 
341
+ # Group players by prefix and sort alphabetically
342
  model_groups = {}
343
  for player in df["Player"]:
344
  prefix = get_model_prefix(player)
345
  model_groups.setdefault(prefix, []).append(player)
346
+
347
+ # Sort each group alphabetically
348
+ for prefix in model_groups:
349
+ model_groups[prefix] = sorted(model_groups[prefix], key=str.lower)
350
+
351
+ # Get sorted prefixes and create ordered player list
352
+ sorted_prefixes = sorted(model_groups.keys(), key=str.lower)
353
  grouped_players = []
354
+ for prefix in sorted_prefixes:
355
+ grouped_players.extend(model_groups[prefix])
356
 
357
  fig = go.Figure()
358
 
 
368
 
369
  r = [row[f"norm_{col}"] for col in game_cols]
370
 
371
+ # Convert player name to lowercase for the legend
372
+ display_name = player.lower()
373
+
374
  fig.add_trace(go.Scatterpolar(
375
  r=r + [r[0]],
376
  theta=categories + [categories[0]],
377
  mode='lines+markers',
378
  fill='toself',
379
+ name=display_name, # Use lowercase name in legend
380
  line=dict(color=color, width=4 if is_highlighted else 2),
381
  marker=dict(color=color),
382
  fillcolor=fillcolor,
 
387
  fig.update_layout(
388
  autosize=False,
389
  width=1000,
390
+ height=620, # Increased height to accommodate legend
391
  margin=dict(l=400, r=200, t=20, b=20),
392
  title=dict(
393
  text="AI Normalized Performance Across Games",
 
415
  ),
416
  legend=dict(
417
  font=dict(size=12),
418
+ title="Choose your model 💡 (click / double-click)",
419
  itemsizing='trace',
420
+ x=-1.4, # Moved further left
421
+ y=0.8, # Moved to top
 
422
  yanchor='top',
423
+ xanchor='left',
424
  bgcolor='rgba(255,255,255,0.6)',
425
  bordercolor='gray',
426
  borderwidth=1
rank_data_03_25_2025.json CHANGED
@@ -2,122 +2,157 @@
2
  "Super Mario Bros": {
3
  "runs": 5,
4
  "results": [
 
 
 
 
 
 
 
5
  {
6
  "model": "claude-3-7-sonnet-20250219",
7
  "score": 710,
8
  "progress": "1-1",
9
  "time_s": 64.2,
10
- "rank": 1
11
  },
12
  {
13
  "model": "gpt-4o-2024-11-20",
14
  "score": 560,
15
  "progress": "1-1",
16
  "time_s": 58.6,
17
- "rank": 2
18
  },
19
  {
20
  "model": "gemini-2.0-flash",
21
  "score": 320,
22
  "progress": "1-1",
23
  "time_s": 51.8,
24
- "rank": 3
25
  },
26
  {
27
  "model": "claude-3-5-haiku-20241022",
28
  "score": 140,
29
  "progress": "1-1",
30
  "time_s": 76.4,
31
- "rank": 4
32
  },
33
  {
34
  "model": "gpt-4.5-preview-2025-02-27",
35
  "score": 160,
36
  "progress": "1-1",
37
  "time_s": 62.8,
38
- "rank": 5
39
  }
40
  ]
41
  },
42
  "2048": {
43
  "runs": 1,
44
  "results": [
 
 
 
 
 
 
 
45
  {
46
  "model": "claude-3-7-sonnet-20250219(thinking)",
47
  "score": 256,
48
  "steps": 114,
49
  "time": ">200",
50
- "rank": 1
51
  },
52
  {
53
  "model": "o1-2024-12-17",
54
  "score": 256,
55
  "steps": 116,
56
  "time": ">200",
57
- "rank": 2
58
  },
59
  {
60
  "model": "claude-3-7-sonnet-20250219",
61
  "score": 256,
62
  "steps": 130,
63
  "time": "20:36",
64
- "rank": 3
65
  },
66
  {
67
  "model": "deepseek-v3",
68
  "score": 256,
69
  "steps": 216,
70
  "time": "54.02",
71
- "rank": 4
 
 
 
 
 
 
 
72
  },
73
  {
74
  "model": "gemini-2.0-flash",
75
  "score": 128,
76
  "steps": 111,
77
  "time": "18:43",
78
- "rank": 5
79
  },
80
  {
81
  "model": "gemini-2.0-flash-thinking-exp-1219",
82
  "score": 128,
83
  "steps": 132,
84
  "time": ">100",
85
- "rank": 6
86
  },
87
  {
88
  "model": "gemini-2.5-pro-exp-03-25",
89
  "score": 128,
90
  "steps": 138,
91
  "time": "169",
92
- "rank": 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  },
94
  {
95
  "model": "claude-3-5-sonnet-20241022",
96
  "score": 64,
97
  "steps": 92,
98
  "time": "9:2",
99
- "rank": 9
100
  },
101
  {
102
  "model": "gpt-4.5-preview-2025-02-27",
103
  "score": 34,
104
  "steps": 34,
105
  "time": "8:25",
106
- "rank": 10
107
  },
108
  {
109
  "model": "gpt-4o-2024-11-20",
110
  "score": 16,
111
  "steps": 21,
112
  "time": "1:17",
113
- "rank": 11
114
- },
115
- {
116
- "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
117
- "score": 128,
118
- "steps": 145,
119
- "time": ">100",
120
- "rank": 8
121
  }
122
  ]
123
  },
@@ -182,142 +217,207 @@
182
  "Candy Crush": {
183
  "runs": 3,
184
  "results": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  {
186
  "model": "o3-mini-2025-01-31(medium)",
187
  "score_runs": "90;109;120",
188
  "average_score": 106.33,
189
  "steps": 25,
190
- "rank": 1
191
  },
192
  {
193
  "model": "o1-2024-12-17",
194
  "score_runs": "96;114;83",
195
  "average_score": 97.67,
196
  "steps": 25,
197
- "rank": 2
198
  },
199
  {
200
  "model": "deepseek-r1",
201
  "score_runs": "62;108;105",
202
  "average_score": 91.67,
203
  "steps": 25,
204
- "rank": 3
 
 
 
 
 
 
 
205
  },
206
  {
207
  "model": "gemini-2.5-pro-exp-03-25",
208
  "score_runs": "50;36;68",
209
  "average_score": 51.33,
210
  "steps": 25,
211
- "rank": 4
212
  },
213
  {
214
  "model": "claude-3-7-sonnet-20250219(thinking)",
215
  "score_runs": "36;46;24",
216
  "average_score": 35.33,
217
  "steps": 25,
218
- "rank": 5
219
  },
220
  {
221
  "model": "gemini-2.0-flash-thinking-exp-1219",
222
  "score_runs": "0;15;39",
223
  "average_score": 18,
224
  "steps": 25,
225
- "rank": 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  },
227
  {
228
  "model": "claude-3-5-sonnet-20241022",
229
  "score_runs": "3;0;0",
230
  "average_score": 1,
231
  "steps": 25,
232
- "rank": 7
233
  },
234
  {
235
  "model": "deepseek-v3",
236
  "score_runs": "0;0;0",
237
  "average_score": 0,
238
  "steps": 25,
239
- "rank": 9
240
- },
241
- {
242
- "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
243
- "score_runs": "6;0;0",
244
- "average_score": 2,
245
- "steps": 25,
246
- "rank": 8
247
  }
248
  ]
249
  },
250
  "Sokoban": {
251
  "runs": 3,
252
  "results": [
 
 
 
 
 
 
253
  {
254
  "model": "o3-mini-2025-01-31(medium)",
255
  "levels_cracked": "2; 3; 2",
256
  "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
257
- "rank": 1
258
  },
259
  {
260
  "model": "gemini-2.5-pro-exp-03-25",
261
  "levels_cracked": "2;2;3",
262
  "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
263
- "rank": 2
 
 
 
 
 
 
 
 
 
 
 
 
264
  },
265
  {
266
  "model": "claude-3-7-sonnet-20250219(thinking)",
267
  "levels_cracked": "1; 2; 0",
268
  "steps": "[17,35];[15,40,43];[4]",
269
- "rank": 3
270
  },
271
  {
272
  "model": "o1-2024-12-17",
273
  "levels_cracked": "1; 1; 1",
274
  "steps": null,
275
- "rank": 4
276
  },
277
  {
278
  "model": "deepseek-r1",
279
  "levels_cracked": "1; 0; 1",
280
  "steps": "[19,42];[13];[19,36]",
281
  "note": "stuck",
282
- "rank": 5
283
  },
284
  {
285
  "model": "o1-mini-2024-09-12",
286
  "levels_cracked": "0;1;0",
287
  "steps": null,
288
- "rank": 6
289
  },
290
  {
291
  "model": "gemini-2.0-flash-thinking-exp-1219",
292
  "levels_cracked": "0; 0; 0",
293
  "steps": "[23]; [14]; [14]",
294
- "rank": 7
295
  },
296
  {
297
  "model": "gpt-4o-2024-11-20",
298
  "levels_cracked": "0; 0; 0",
299
  "steps": "[68];[105];[168]",
300
  "note": "stuck in a loop",
301
- "rank": 8
302
  },
303
  {
304
  "model": "claude-3-5-sonnet-20241022",
305
  "levels_cracked": "0; 0; 0",
306
  "steps": "[21]; [30]; [51]",
307
  "note": "stuck in a loop",
308
- "rank": 9
309
  },
310
  {
311
  "model": "deepseek-v3",
312
  "levels_cracked": "0; 0; 0",
313
  "steps": "[9]; [47]; [64]",
314
- "rank": 10
 
 
 
 
 
 
 
 
 
 
 
 
315
  },
316
  {
317
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
318
  "levels_cracked": "0;0;0",
319
  "steps": "[5]",
320
- "rank": 11
321
  }
322
  ]
323
  },
@@ -333,12 +433,21 @@
333
  "score": 26,
334
  "note": "stuck at the end not present evidence"
335
  },
 
 
 
 
 
 
 
 
 
336
  {
337
  "model": "gemini-2.5-pro-exp-03-25",
338
  "levels_cracked": "2; 3",
339
  "lives_left": "[5,5,0]; [5, 5, 4, 0]",
340
  "cracked_details": "4: 0/8",
341
- "rank": 2,
342
  "score": 20,
343
  "note": "failed to present evidence"
344
  },
@@ -347,7 +456,7 @@
347
  "levels_cracked": "1; 1",
348
  "lives_left": "[3,0]; [5,0]",
349
  "cracked_details": "2: 3/9",
350
- "rank": 3,
351
  "score": 8,
352
  "note": "failed to present evidence"
353
  },
@@ -356,7 +465,7 @@
356
  "levels_cracked": "1",
357
  "lives_left": "5, 5",
358
  "cracked_details": "1:1/8",
359
- "rank": 4,
360
  "score": 6,
361
  "note": "stuck in loop"
362
  },
@@ -365,16 +474,25 @@
365
  "levels_cracked": "1",
366
  "lives_left": "[4,5]",
367
  "cracked_details": "1: 1/8",
368
- "rank": 5,
369
  "score": 6,
370
  "note": "stuck in loop"
371
  },
 
 
 
 
 
 
 
 
 
372
  {
373
  "model": "gemini-2.0-flash-thinking-exp-1219",
374
  "levels_cracked": "0",
375
  "lives_left": "0",
376
  "cracked_details": "1: 4/5",
377
- "rank": 6,
378
  "score": 4,
379
  "note": "stuck in the last option section"
380
  },
@@ -383,16 +501,34 @@
383
  "levels_cracked": "0",
384
  "lives_left": "0",
385
  "cracked_details": "1: 4/5",
386
- "rank": 7,
387
  "score": 4,
388
  "note": "stuck in the 3rd evidence present"
389
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  {
391
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
392
  "levels_cracked": "0",
393
  "lives_left": "0",
394
  "cracked_details": "0:0/5",
395
- "rank": 8,
396
  "score": 0,
397
  "note": "failed to present evidence"
398
  }
 
2
  "Super Mario Bros": {
3
  "runs": 5,
4
  "results": [
5
+ {
6
+ "model": "gpt-4.1-2025-04-14",
7
+ "score": 740,
8
+ "progress": "1-1",
9
+ "time_s": 68.6,
10
+ "rank": 1
11
+ },
12
  {
13
  "model": "claude-3-7-sonnet-20250219",
14
  "score": 710,
15
  "progress": "1-1",
16
  "time_s": 64.2,
17
+ "rank": 2
18
  },
19
  {
20
  "model": "gpt-4o-2024-11-20",
21
  "score": 560,
22
  "progress": "1-1",
23
  "time_s": 58.6,
24
+ "rank": 3
25
  },
26
  {
27
  "model": "gemini-2.0-flash",
28
  "score": 320,
29
  "progress": "1-1",
30
  "time_s": 51.8,
31
+ "rank": 4
32
  },
33
  {
34
  "model": "claude-3-5-haiku-20241022",
35
  "score": 140,
36
  "progress": "1-1",
37
  "time_s": 76.4,
38
+ "rank": 5
39
  },
40
  {
41
  "model": "gpt-4.5-preview-2025-02-27",
42
  "score": 160,
43
  "progress": "1-1",
44
  "time_s": 62.8,
45
+ "rank": 6
46
  }
47
  ]
48
  },
49
  "2048": {
50
  "runs": 1,
51
  "results": [
52
+ {
53
+ "model": "o3",
54
+ "score": 256,
55
+ "steps": 108,
56
+ "time": "58:09",
57
+ "rank": 1
58
+ },
59
  {
60
  "model": "claude-3-7-sonnet-20250219(thinking)",
61
  "score": 256,
62
  "steps": 114,
63
  "time": ">200",
64
+ "rank": 2
65
  },
66
  {
67
  "model": "o1-2024-12-17",
68
  "score": 256,
69
  "steps": 116,
70
  "time": ">200",
71
+ "rank": 3
72
  },
73
  {
74
  "model": "claude-3-7-sonnet-20250219",
75
  "score": 256,
76
  "steps": 130,
77
  "time": "20:36",
78
+ "rank": 4
79
  },
80
  {
81
  "model": "deepseek-v3",
82
  "score": 256,
83
  "steps": 216,
84
  "time": "54.02",
85
+ "rank": 5
86
+ },
87
+ {
88
+ "model": "gemini-2.5-flash-preview-04-17",
89
+ "score": 128,
90
+ "steps": 71,
91
+ "time": "41:42",
92
+ "rank": 6
93
  },
94
  {
95
  "model": "gemini-2.0-flash",
96
  "score": 128,
97
  "steps": 111,
98
  "time": "18:43",
99
+ "rank": 7
100
  },
101
  {
102
  "model": "gemini-2.0-flash-thinking-exp-1219",
103
  "score": 128,
104
  "steps": 132,
105
  "time": ">100",
106
+ "rank": 8
107
  },
108
  {
109
  "model": "gemini-2.5-pro-exp-03-25",
110
  "score": 128,
111
  "steps": 138,
112
  "time": "169",
113
+ "rank": 9
114
+ },
115
+ {
116
+ "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
117
+ "score": 128,
118
+ "steps": 145,
119
+ "time": ">100",
120
+ "rank": 10
121
+ },
122
+ {
123
+ "model": "o4-mini",
124
+ "score": 128,
125
+ "steps": "",
126
+ "time": "",
127
+ "rank": 11
128
+ },
129
+ {
130
+ "model": "grok-3-beta",
131
+ "score": 128,
132
+ "steps": "",
133
+ "time": "",
134
+ "rank": 12
135
  },
136
  {
137
  "model": "claude-3-5-sonnet-20241022",
138
  "score": 64,
139
  "steps": 92,
140
  "time": "9:2",
141
+ "rank": 13
142
  },
143
  {
144
  "model": "gpt-4.5-preview-2025-02-27",
145
  "score": 34,
146
  "steps": 34,
147
  "time": "8:25",
148
+ "rank": 14
149
  },
150
  {
151
  "model": "gpt-4o-2024-11-20",
152
  "score": 16,
153
  "steps": 21,
154
  "time": "1:17",
155
+ "rank": 15
 
 
 
 
 
 
 
156
  }
157
  ]
158
  },
 
217
  "Candy Crush": {
218
  "runs": 3,
219
  "results": [
220
+ {
221
+ "model": "o4-mini",
222
+ "score_runs": "123;131",
223
+ "average_score": 127,
224
+ "steps": 25,
225
+ "rank": 1
226
+ },
227
+ {
228
+ "model": "o3",
229
+ "score_runs": "115;122",
230
+ "average_score": 118.5,
231
+ "steps": 25,
232
+ "rank": 2
233
+ },
234
  {
235
  "model": "o3-mini-2025-01-31(medium)",
236
  "score_runs": "90;109;120",
237
  "average_score": 106.33,
238
  "steps": 25,
239
+ "rank": 3
240
  },
241
  {
242
  "model": "o1-2024-12-17",
243
  "score_runs": "96;114;83",
244
  "average_score": 97.67,
245
  "steps": 25,
246
+ "rank": 4
247
  },
248
  {
249
  "model": "deepseek-r1",
250
  "score_runs": "62;108;105",
251
  "average_score": 91.67,
252
  "steps": 25,
253
+ "rank": 5
254
+ },
255
+ {
256
+ "model": "gemini-2.5-flash-preview-04-17",
257
+ "score_runs": "59",
258
+ "average_score": 59,
259
+ "steps": 25,
260
+ "rank": 6
261
  },
262
  {
263
  "model": "gemini-2.5-pro-exp-03-25",
264
  "score_runs": "50;36;68",
265
  "average_score": 51.33,
266
  "steps": 25,
267
+ "rank": 7
268
  },
269
  {
270
  "model": "claude-3-7-sonnet-20250219(thinking)",
271
  "score_runs": "36;46;24",
272
  "average_score": 35.33,
273
  "steps": 25,
274
+ "rank": 8
275
  },
276
  {
277
  "model": "gemini-2.0-flash-thinking-exp-1219",
278
  "score_runs": "0;15;39",
279
  "average_score": 18,
280
  "steps": 25,
281
+ "rank": 9
282
+ },
283
+ {
284
+ "model": "grok-3-beta",
285
+ "score_runs": "11",
286
+ "average_score": 11,
287
+ "steps": 25,
288
+ "rank": 10
289
+ },
290
+ {
291
+ "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
292
+ "score_runs": "6;0;0",
293
+ "average_score": 2,
294
+ "steps": 25,
295
+ "rank": 11
296
+ },
297
+ {
298
+ "model": "gpt-4.1-2025-04-14",
299
+ "score_runs": "0;3;3",
300
+ "average_score": 2,
301
+ "steps": 25,
302
+ "rank": 12
303
  },
304
  {
305
  "model": "claude-3-5-sonnet-20241022",
306
  "score_runs": "3;0;0",
307
  "average_score": 1,
308
  "steps": 25,
309
+ "rank": 13
310
  },
311
  {
312
  "model": "deepseek-v3",
313
  "score_runs": "0;0;0",
314
  "average_score": 0,
315
  "steps": 25,
316
+ "rank": 14
 
 
 
 
 
 
 
317
  }
318
  ]
319
  },
320
  "Sokoban": {
321
  "runs": 3,
322
  "results": [
323
+ {
324
+ "model": "o3",
325
+ "levels_cracked": "4",
326
+ "steps": "[16, 40, 59, 110]",
327
+ "rank": 1
328
+ },
329
  {
330
  "model": "o3-mini-2025-01-31(medium)",
331
  "levels_cracked": "2; 3; 2",
332
  "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
333
+ "rank": 2
334
  },
335
  {
336
  "model": "gemini-2.5-pro-exp-03-25",
337
  "levels_cracked": "2;2;3",
338
  "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
339
+ "rank": 3
340
+ },
341
+ {
342
+ "model": "gemini-2.5-flash-preview-04-17",
343
+ "levels_cracked": "2",
344
+ "steps": "[24, 50, 60]",
345
+ "rank": 4
346
+ },
347
+ {
348
+ "model": "o4-mini",
349
+ "levels_cracked": "2",
350
+ "steps": "",
351
+ "rank": 5
352
  },
353
  {
354
  "model": "claude-3-7-sonnet-20250219(thinking)",
355
  "levels_cracked": "1; 2; 0",
356
  "steps": "[17,35];[15,40,43];[4]",
357
+ "rank": 6
358
  },
359
  {
360
  "model": "o1-2024-12-17",
361
  "levels_cracked": "1; 1; 1",
362
  "steps": null,
363
+ "rank": 7
364
  },
365
  {
366
  "model": "deepseek-r1",
367
  "levels_cracked": "1; 0; 1",
368
  "steps": "[19,42];[13];[19,36]",
369
  "note": "stuck",
370
+ "rank": 8
371
  },
372
  {
373
  "model": "o1-mini-2024-09-12",
374
  "levels_cracked": "0;1;0",
375
  "steps": null,
376
+ "rank": 9
377
  },
378
  {
379
  "model": "gemini-2.0-flash-thinking-exp-1219",
380
  "levels_cracked": "0; 0; 0",
381
  "steps": "[23]; [14]; [14]",
382
+ "rank": 10
383
  },
384
  {
385
  "model": "gpt-4o-2024-11-20",
386
  "levels_cracked": "0; 0; 0",
387
  "steps": "[68];[105];[168]",
388
  "note": "stuck in a loop",
389
+ "rank": 11
390
  },
391
  {
392
  "model": "claude-3-5-sonnet-20241022",
393
  "levels_cracked": "0; 0; 0",
394
  "steps": "[21]; [30]; [51]",
395
  "note": "stuck in a loop",
396
+ "rank": 12
397
  },
398
  {
399
  "model": "deepseek-v3",
400
  "levels_cracked": "0; 0; 0",
401
  "steps": "[9]; [47]; [64]",
402
+ "rank": 13
403
+ },
404
+ {
405
+ "model": "gpt-4.1-2025-04-14",
406
+ "levels_cracked": "0; 0; 0",
407
+ "steps": "[9]; [47]; [64]",
408
+ "rank": 14
409
+ },
410
+ {
411
+ "model": "grok-3-beta",
412
+ "levels_cracked": "0",
413
+ "steps": "",
414
+ "rank": 15
415
  },
416
  {
417
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
418
  "levels_cracked": "0;0;0",
419
  "steps": "[5]",
420
+ "rank": 16
421
  }
422
  ]
423
  },
 
433
  "score": 26,
434
  "note": "stuck at the end not present evidence"
435
  },
436
+ {
437
+ "model": "o3",
438
+ "levels_cracked": "3",
439
+ "lives_left": "[5, 3, 3, 0]",
440
+ "cracked_details": "4: 4/8",
441
+ "rank": 2,
442
+ "score": 23,
443
+ "note": "failed to present evidence"
444
+ },
445
  {
446
  "model": "gemini-2.5-pro-exp-03-25",
447
  "levels_cracked": "2; 3",
448
  "lives_left": "[5,5,0]; [5, 5, 4, 0]",
449
  "cracked_details": "4: 0/8",
450
+ "rank": 3,
451
  "score": 20,
452
  "note": "failed to present evidence"
453
  },
 
456
  "levels_cracked": "1; 1",
457
  "lives_left": "[3,0]; [5,0]",
458
  "cracked_details": "2: 3/9",
459
+ "rank": 4,
460
  "score": 8,
461
  "note": "failed to present evidence"
462
  },
 
465
  "levels_cracked": "1",
466
  "lives_left": "5, 5",
467
  "cracked_details": "1:1/8",
468
+ "rank": 5,
469
  "score": 6,
470
  "note": "stuck in loop"
471
  },
 
474
  "levels_cracked": "1",
475
  "lives_left": "[4,5]",
476
  "cracked_details": "1: 1/8",
477
+ "rank": 6,
478
  "score": 6,
479
  "note": "stuck in loop"
480
  },
481
+ {
482
+ "model": "gemini-2.5-flash-preview-04-17",
483
+ "levels_cracked": "0",
484
+ "lives_left": "0",
485
+ "cracked_details": "1: 4/5",
486
+ "rank": 7,
487
+ "score": 4,
488
+ "note": "stuck in the last option section"
489
+ },
490
  {
491
  "model": "gemini-2.0-flash-thinking-exp-1219",
492
  "levels_cracked": "0",
493
  "lives_left": "0",
494
  "cracked_details": "1: 4/5",
495
+ "rank": 8,
496
  "score": 4,
497
  "note": "stuck in the last option section"
498
  },
 
501
  "levels_cracked": "0",
502
  "lives_left": "0",
503
  "cracked_details": "1: 4/5",
504
+ "rank": 9,
505
  "score": 4,
506
  "note": "stuck in the 3rd evidence present"
507
  },
508
+ {
509
+ "model": "o4-mini",
510
+ "levels_cracked": "0",
511
+ "lives_left": "0",
512
+ "cracked_details": "1:1/5",
513
+ "rank": 10,
514
+ "score": 1,
515
+ "note": "failed to present evidence"
516
+ },
517
+ {
518
+ "model": "grok-3-beta",
519
+ "levels_cracked": "0",
520
+ "lives_left": "0",
521
+ "cracked_details": "1:1/5",
522
+ "rank": 11,
523
+ "score": 1,
524
+ "note": "failed to present evidence"
525
+ },
526
  {
527
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
528
  "levels_cracked": "0",
529
  "lives_left": "0",
530
  "cracked_details": "0:0/5",
531
+ "rank": 12,
532
  "score": 0,
533
  "note": "failed to present evidence"
534
  }