Yuxuan-Zhang-Dexter committed on
Commit
bce84cc
·
1 Parent(s): 5d62091

update new leaderboard data

Browse files
app.py CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime, timedelta
10
  import matplotlib.pyplot as plt
11
  from leaderboard_utils import (
12
  get_organization,
13
- get_mario_leaderboard,
14
  get_sokoban_leaderboard,
15
  get_2048_leaderboard,
16
  get_candy_leaderboard,
@@ -50,20 +50,22 @@ with open(TIME_POINTS["03/25/2025"], "r") as f:
50
  leaderboard_state = {
51
  "current_game": None,
52
  "previous_overall": {
53
- "Super Mario Bros": True,
 
54
  "Sokoban": True,
55
  "2048": True,
56
  "Candy Crush": True,
57
- "Tetris (complete)": True,
58
  "Tetris (planning only)": True,
59
  "Ace Attorney": True
60
  },
61
  "previous_details": {
62
- "Super Mario Bros": False,
 
63
  "Sokoban": False,
64
  "2048": False,
65
  "Candy Crush": False,
66
- "Tetris (complete)": False,
67
  "Tetris (planning only)": False,
68
  "Ace Attorney": False
69
  }
@@ -107,7 +109,7 @@ def prepare_dataframe_for_display(df, for_game=None):
107
  if col.endswith(' Score'):
108
  display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
109
 
110
- # If we're in detailed view, add a formatted rank column
111
  if for_game:
112
  # Sort by relevant score column
113
  score_col = f"{for_game} Score"
@@ -116,10 +118,30 @@ def prepare_dataframe_for_display(df, for_game=None):
116
  display_df[score_col] = pd.to_numeric(display_df[score_col], errors='coerce')
117
  # Sort by score in descending order
118
  display_df = display_df.sort_values(by=score_col, ascending=False)
119
- # Add rank column based on the sort
120
- display_df.insert(0, 'Rank', range(1, len(display_df) + 1))
121
  # Filter out models that didn't participate
122
  display_df = display_df[~display_df[score_col].isna()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  # Add line breaks to column headers
125
  new_columns = {}
@@ -129,9 +151,6 @@ def prepare_dataframe_for_display(df, for_game=None):
129
  game_name = col.replace(' Score', '')
130
  new_col = f"{game_name}\nScore"
131
  new_columns[col] = new_col
132
- # Keep Organization without line breaks
133
- # elif col == 'Organization':
134
- # new_columns[col] = 'Organi-\nzation'
135
 
136
  # Rename columns with new line breaks
137
  if new_columns:
@@ -158,32 +177,35 @@ def update_df_with_height(df):
158
  # max_height=None, # Remove height limitation - COMMENTED OUT
159
  column_widths=col_widths)
160
 
161
- def update_leaderboard(mario_overall, mario_details,
 
162
  sokoban_overall, sokoban_details,
163
  _2048_overall, _2048_details,
164
  candy_overall, candy_details,
165
- tetris_overall, tetris_details,
166
  tetris_plan_overall, tetris_plan_details,
167
  ace_attorney_overall, ace_attorney_details):
168
  global leaderboard_state
169
 
170
  # Convert current checkbox states to dictionary for easier comparison
171
  current_overall = {
172
- "Super Mario Bros": mario_overall,
 
173
  "Sokoban": sokoban_overall,
174
  "2048": _2048_overall,
175
  "Candy Crush": candy_overall,
176
- "Tetris (complete)": tetris_overall,
177
  "Tetris (planning only)": tetris_plan_overall,
178
  "Ace Attorney": ace_attorney_overall
179
  }
180
 
181
  current_details = {
182
- "Super Mario Bros": mario_details,
 
183
  "Sokoban": sokoban_details,
184
  "2048": _2048_details,
185
  "Candy Crush": candy_details,
186
- "Tetris (complete)": tetris_details,
187
  "Tetris (planning only)": tetris_plan_details,
188
  "Ace Attorney": ace_attorney_details
189
  }
@@ -266,11 +288,12 @@ def update_leaderboard(mario_overall, mario_details,
266
 
267
  # Build dictionary for selected games
268
  selected_games = {
269
- "Super Mario Bros": current_overall["Super Mario Bros"],
 
270
  "Sokoban": current_overall["Sokoban"],
271
  "2048": current_overall["2048"],
272
  "Candy Crush": current_overall["Candy Crush"],
273
- "Tetris (complete)": current_overall["Tetris (complete)"],
274
  "Tetris (planning only)": current_overall["Tetris (planning only)"],
275
  "Ace Attorney": current_overall["Ace Attorney"]
276
  }
@@ -278,54 +301,49 @@ def update_leaderboard(mario_overall, mario_details,
278
  # Get the appropriate DataFrame and charts based on current state
279
  if leaderboard_state["current_game"]:
280
  # For detailed view
281
- if leaderboard_state["current_game"] == "Super Mario Bros":
282
- df = get_mario_leaderboard(rank_data)
 
 
283
  elif leaderboard_state["current_game"] == "Sokoban":
284
  df = get_sokoban_leaderboard(rank_data)
285
  elif leaderboard_state["current_game"] == "2048":
286
  df = get_2048_leaderboard(rank_data)
287
  elif leaderboard_state["current_game"] == "Candy Crush":
288
  df = get_candy_leaderboard(rank_data)
289
- elif leaderboard_state["current_game"] == "Tetris (complete)":
290
- df = get_tetris_leaderboard(rank_data)
291
  elif leaderboard_state["current_game"] == "Tetris (planning only)":
292
  df = get_tetris_planning_leaderboard(rank_data)
293
  elif leaderboard_state["current_game"] == "Ace Attorney":
294
  df = get_ace_attorney_leaderboard(rank_data)
 
 
295
 
296
- # Format the DataFrame for display
297
  display_df = prepare_dataframe_for_display(df, leaderboard_state["current_game"])
298
-
299
- # Always create a new chart for detailed view
300
  chart = create_horizontal_bar_chart(df, leaderboard_state["current_game"])
301
- # Use the same chart for all visualizations in detailed view
302
- radar_chart = chart
303
- group_bar_chart = chart
304
  else:
305
  # For overall view
306
- df, _ = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
307
- # Format the DataFrame for display
308
  display_df = prepare_dataframe_for_display(df)
309
- # Use the same selected_games for radar chart
310
  _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
311
- chart = radar_chart
312
- group_bar_chart = radar_chart # Use radar chart instead of bar chart
313
 
314
- # Return exactly 18 values to match the expected outputs
315
- return (update_df_with_height(display_df), chart, radar_chart, radar_chart,
316
- current_overall["Super Mario Bros"], current_details["Super Mario Bros"],
317
  current_overall["Sokoban"], current_details["Sokoban"],
318
  current_overall["2048"], current_details["2048"],
319
  current_overall["Candy Crush"], current_details["Candy Crush"],
320
- current_overall["Tetris (complete)"], current_details["Tetris (complete)"],
321
  current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
322
  current_overall["Ace Attorney"], current_details["Ace Attorney"])
323
 
324
- def update_leaderboard_with_time(time_point, mario_overall, mario_details,
 
325
  sokoban_overall, sokoban_details,
326
  _2048_overall, _2048_details,
327
  candy_overall, candy_details,
328
- tetris_overall, tetris_details,
329
  tetris_plan_overall, tetris_plan_details,
330
  ace_attorney_overall, ace_attorney_details):
331
  # Load rank data for the selected time point
@@ -334,12 +352,13 @@ def update_leaderboard_with_time(time_point, mario_overall, mario_details,
334
  if new_rank_data is not None:
335
  rank_data = new_rank_data
336
 
337
- # Use the existing update_leaderboard function
338
- return update_leaderboard(mario_overall, mario_details,
 
339
  sokoban_overall, sokoban_details,
340
  _2048_overall, _2048_details,
341
  candy_overall, candy_details,
342
- tetris_overall, tetris_details,
343
  tetris_plan_overall, tetris_plan_details,
344
  ace_attorney_overall, ace_attorney_details)
345
 
@@ -348,20 +367,22 @@ def get_initial_state():
348
  return {
349
  "current_game": None,
350
  "previous_overall": {
351
- "Super Mario Bros": True,
 
352
  "Sokoban": True,
353
  "2048": True,
354
  "Candy Crush": True,
355
- "Tetris (complete)": True,
356
  "Tetris (planning only)": True,
357
  "Ace Attorney": True
358
  },
359
  "previous_details": {
360
- "Super Mario Bros": False,
 
361
  "Sokoban": False,
362
  "2048": False,
363
  "Candy Crush": False,
364
- "Tetris (complete)": False,
365
  "Tetris (planning only)": False,
366
  "Ace Attorney": False
367
  }
@@ -370,36 +391,27 @@ def get_initial_state():
370
  def clear_filters():
371
  global leaderboard_state
372
 
373
- # Reset all checkboxes to default state
374
  selected_games = {
375
- "Super Mario Bros": True,
376
  "Sokoban": True,
377
  "2048": True,
378
  "Candy Crush": True,
379
- "Tetris (complete)": True,
380
  "Tetris (planning only)": True,
381
  "Ace Attorney": True
382
  }
383
 
384
- # Get the combined leaderboard and group bar chart
385
  df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
386
-
387
- # Format the DataFrame for display
388
  display_df = prepare_dataframe_for_display(df)
389
-
390
- # Get the radar chart using the same selected games
391
  _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
392
 
393
- # Reset the leaderboard state to match the default checkbox states
394
  leaderboard_state = get_initial_state()
395
 
396
- # Return exactly 16 values to match the expected outputs
397
- return (update_df_with_height(display_df), radar_chart, radar_chart, radar_chart,
398
- True, False, # mario
399
  True, False, # sokoban
400
  True, False, # 2048
401
  True, False, # candy
402
- True, False, # tetris
403
  True, False, # tetris plan
404
  True, False) # ace attorney
405
 
@@ -712,7 +724,7 @@ def build_app():
712
  margin-top: 40px !important;
713
  }
714
  """) as demo:
715
- gr.Markdown("# 🎮 Game Arena: Gaming Agent 🎲")
716
 
717
  # Add custom JavaScript for table header line breaks
718
  gr.HTML("""
@@ -861,29 +873,34 @@ def build_app():
861
  label="Comparative Analysis (Radar Chart)",
862
  elem_classes="visualization-container"
863
  )
864
- # Comment out the Group Bar Chart tab
865
- # with gr.Tab("📊 Group Bar Chart"):
866
- # group_bar_visualization = gr.Plot(
867
- # label="Comparative Analysis (Group Bar Chart)",
868
- # elem_classes="visualization-container"
869
- # )
870
  gr.Markdown(
871
- "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
872
- elem_classes="radar-tip"
 
 
 
 
 
 
873
  )
 
874
 
875
  # Hidden placeholder for group bar visualization (to maintain code references)
876
- group_bar_visualization = gr.Plot(visible=False)
877
 
878
  # Game selection section
879
  with gr.Row():
880
  gr.Markdown("### 🎮 Game Selection")
881
  with gr.Row():
882
- with gr.Column():
883
- gr.Markdown("**🎮 Super Mario Bros**")
884
- mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
885
- mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
886
- with gr.Column():
 
 
 
 
887
  gr.Markdown("**📦 Sokoban**")
888
  sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
889
  sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
@@ -895,10 +912,10 @@ def build_app():
895
  gr.Markdown("**🍬 Candy Crush**")
896
  candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
897
  candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
898
- with gr.Column():
899
- gr.Markdown("**🎯 Tetris (complete)**")
900
- tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
901
- tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
902
  with gr.Column():
903
  gr.Markdown("**📋 Tetris (planning)**")
904
  tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
@@ -927,11 +944,12 @@ def build_app():
927
 
928
  # Get initial leaderboard dataframe
929
  initial_df = get_combined_leaderboard(rank_data, {
930
- "Super Mario Bros": True,
 
931
  "Sokoban": True,
932
  "2048": True,
933
  "Candy Crush": True,
934
- "Tetris (complete)": True,
935
  "Tetris (planning only)": True,
936
  "Ace Attorney": True
937
  })
@@ -967,13 +985,14 @@ def build_app():
967
  with gr.Row():
968
  score_note = add_score_note()
969
 
970
- # List of all checkboxes
971
  checkbox_list = [
972
- mario_overall, mario_details,
 
973
  sokoban_overall, sokoban_details,
974
  _2048_overall, _2048_details,
975
  candy_overall, candy_details,
976
- tetris_overall, tetris_details,
977
  tetris_plan_overall, tetris_plan_details,
978
  ace_attorney_overall, ace_attorney_details
979
  ]
@@ -981,10 +1000,14 @@ def build_app():
981
  # Update visualizations when checkboxes change
982
  def update_visualizations(*checkbox_states):
983
  # Check if any details checkbox is selected
 
984
  is_details_view = any([
985
- checkbox_states[1], checkbox_states[3], checkbox_states[5],
986
- checkbox_states[7], checkbox_states[9], checkbox_states[11],
987
- checkbox_states[13] # Ace Attorney details checkbox
 
 
 
988
  ])
989
 
990
  # Update visibility of visualization blocks
@@ -1010,7 +1033,7 @@ def build_app():
1010
  leaderboard_df,
1011
  detailed_visualization,
1012
  radar_visualization,
1013
- group_bar_visualization
1014
  ] + checkbox_list
1015
  )
1016
 
@@ -1022,7 +1045,7 @@ def build_app():
1022
  leaderboard_df,
1023
  detailed_visualization,
1024
  radar_visualization,
1025
- group_bar_visualization
1026
  ] + checkbox_list
1027
  )
1028
 
@@ -1034,7 +1057,7 @@ def build_app():
1034
  leaderboard_df,
1035
  detailed_visualization,
1036
  radar_visualization,
1037
- group_bar_visualization
1038
  ] + checkbox_list
1039
  )
1040
 
 
10
  import matplotlib.pyplot as plt
11
  from leaderboard_utils import (
12
  get_organization,
13
+ get_mario_planning_leaderboard,
14
  get_sokoban_leaderboard,
15
  get_2048_leaderboard,
16
  get_candy_leaderboard,
 
50
  leaderboard_state = {
51
  "current_game": None,
52
  "previous_overall": {
53
+ # "Super Mario Bros": True, # Commented out
54
+ "Super Mario Bros (planning only)": True,
55
  "Sokoban": True,
56
  "2048": True,
57
  "Candy Crush": True,
58
+ # "Tetris (complete)": True, # Commented out
59
  "Tetris (planning only)": True,
60
  "Ace Attorney": True
61
  },
62
  "previous_details": {
63
+ # "Super Mario Bros": False, # Commented out
64
+ "Super Mario Bros (planning only)": False,
65
  "Sokoban": False,
66
  "2048": False,
67
  "Candy Crush": False,
68
+ # "Tetris (complete)": False, # Commented out
69
  "Tetris (planning only)": False,
70
  "Ace Attorney": False
71
  }
 
109
  if col.endswith(' Score'):
110
  display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
111
 
112
+ # If we're in detailed view, sort by score
113
  if for_game:
114
  # Sort by relevant score column
115
  score_col = f"{for_game} Score"
 
118
  display_df[score_col] = pd.to_numeric(display_df[score_col], errors='coerce')
119
  # Sort by score in descending order
120
  display_df = display_df.sort_values(by=score_col, ascending=False)
 
 
121
  # Filter out models that didn't participate
122
  display_df = display_df[~display_df[score_col].isna()]
123
+ else:
124
+ # For overall view, sort by average of game scores (implicitly used for ranking)
125
+ # but we won't add an explicit 'Rank' or 'Average Rank' column to the final display_df
126
+
127
+ # Calculate an internal sorting key based on average scores, but don't add it to the display_df
128
+ score_cols = [col for col in display_df.columns if col.endswith(' Score')]
129
+ if score_cols:
130
+ temp_sort_df = display_df.copy()
131
+ for col in score_cols:
132
+ temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
133
+
134
+ # Calculate average of the game scores (use mean of ranks from utils for actual ranking logic if different)
135
+ # For display sorting, let's use a simple average of available scores.
136
+ # The actual ranking for 'Average Rank' in leaderboard_utils uses mean of ranks, which is more robust.
137
+ # Here we just need a consistent sort order.
138
+
139
+ # Create a temporary column for sorting
140
+ temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
141
+
142
+ # Sort by this temporary average score (higher is better for scores)
143
+ # and then by Player name as a tie-breaker
144
+ display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
145
 
146
  # Add line breaks to column headers
147
  new_columns = {}
 
151
  game_name = col.replace(' Score', '')
152
  new_col = f"{game_name}\nScore"
153
  new_columns[col] = new_col
 
 
 
154
 
155
  # Rename columns with new line breaks
156
  if new_columns:
 
177
  # max_height=None, # Remove height limitation - COMMENTED OUT
178
  column_widths=col_widths)
179
 
180
+ def update_leaderboard(# mario_overall, mario_details, # Commented out
181
+ mario_plan_overall, mario_plan_details, # Added
182
  sokoban_overall, sokoban_details,
183
  _2048_overall, _2048_details,
184
  candy_overall, candy_details,
185
+ # tetris_overall, tetris_details, # Commented out
186
  tetris_plan_overall, tetris_plan_details,
187
  ace_attorney_overall, ace_attorney_details):
188
  global leaderboard_state
189
 
190
  # Convert current checkbox states to dictionary for easier comparison
191
  current_overall = {
192
+ # "Super Mario Bros": mario_overall, # Commented out
193
+ "Super Mario Bros (planning only)": mario_plan_overall,
194
  "Sokoban": sokoban_overall,
195
  "2048": _2048_overall,
196
  "Candy Crush": candy_overall,
197
+ # "Tetris (complete)": tetris_overall, # Commented out
198
  "Tetris (planning only)": tetris_plan_overall,
199
  "Ace Attorney": ace_attorney_overall
200
  }
201
 
202
  current_details = {
203
+ # "Super Mario Bros": mario_details, # Commented out
204
+ "Super Mario Bros (planning only)": mario_plan_details,
205
  "Sokoban": sokoban_details,
206
  "2048": _2048_details,
207
  "Candy Crush": candy_details,
208
+ # "Tetris (complete)": tetris_details, # Commented out
209
  "Tetris (planning only)": tetris_plan_details,
210
  "Ace Attorney": ace_attorney_details
211
  }
 
288
 
289
  # Build dictionary for selected games
290
  selected_games = {
291
+ # "Super Mario Bros": current_overall["Super Mario Bros"], # Commented out
292
+ "Super Mario Bros (planning only)": current_overall["Super Mario Bros (planning only)"],
293
  "Sokoban": current_overall["Sokoban"],
294
  "2048": current_overall["2048"],
295
  "Candy Crush": current_overall["Candy Crush"],
296
+ # "Tetris (complete)": current_overall["Tetris (complete)"], # Commented out
297
  "Tetris (planning only)": current_overall["Tetris (planning only)"],
298
  "Ace Attorney": current_overall["Ace Attorney"]
299
  }
 
301
  # Get the appropriate DataFrame and charts based on current state
302
  if leaderboard_state["current_game"]:
303
  # For detailed view
304
+ # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
305
+ # df = get_mario_leaderboard(rank_data)
306
+ if leaderboard_state["current_game"] == "Super Mario Bros (planning only)":
307
+ df = get_mario_planning_leaderboard(rank_data)
308
  elif leaderboard_state["current_game"] == "Sokoban":
309
  df = get_sokoban_leaderboard(rank_data)
310
  elif leaderboard_state["current_game"] == "2048":
311
  df = get_2048_leaderboard(rank_data)
312
  elif leaderboard_state["current_game"] == "Candy Crush":
313
  df = get_candy_leaderboard(rank_data)
 
 
314
  elif leaderboard_state["current_game"] == "Tetris (planning only)":
315
  df = get_tetris_planning_leaderboard(rank_data)
316
  elif leaderboard_state["current_game"] == "Ace Attorney":
317
  df = get_ace_attorney_leaderboard(rank_data)
318
+ else: # Should not happen if current_game is one of the known games
319
+ df = pd.DataFrame() # Empty df
320
 
 
321
  display_df = prepare_dataframe_for_display(df, leaderboard_state["current_game"])
 
 
322
  chart = create_horizontal_bar_chart(df, leaderboard_state["current_game"])
323
+ radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
324
+ group_bar_chart = chart
 
325
  else:
326
  # For overall view
327
+ df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
 
328
  display_df = prepare_dataframe_for_display(df)
 
329
  _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
330
+ chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
 
331
 
332
+ # Return values, including all four plot placeholders
333
+ return (update_df_with_height(display_df), chart, radar_chart, group_bar_chart,
334
+ current_overall["Super Mario Bros (planning only)"], current_details["Super Mario Bros (planning only)"],
335
  current_overall["Sokoban"], current_details["Sokoban"],
336
  current_overall["2048"], current_details["2048"],
337
  current_overall["Candy Crush"], current_details["Candy Crush"],
 
338
  current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
339
  current_overall["Ace Attorney"], current_details["Ace Attorney"])
340
 
341
+ def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # Commented out
342
+ mario_plan_overall, mario_plan_details, # Added
343
  sokoban_overall, sokoban_details,
344
  _2048_overall, _2048_details,
345
  candy_overall, candy_details,
346
+ # tetris_overall, tetris_details, # Commented out
347
  tetris_plan_overall, tetris_plan_details,
348
  ace_attorney_overall, ace_attorney_details):
349
  # Load rank data for the selected time point
 
352
  if new_rank_data is not None:
353
  rank_data = new_rank_data
354
 
355
+ # Use the existing update_leaderboard function, including Super Mario (planning only)
356
+ return update_leaderboard(# mario_overall, mario_details, # Commented out
357
+ mario_plan_overall, mario_plan_details, # Added
358
  sokoban_overall, sokoban_details,
359
  _2048_overall, _2048_details,
360
  candy_overall, candy_details,
361
+ # tetris_overall, tetris_details, # Commented out
362
  tetris_plan_overall, tetris_plan_details,
363
  ace_attorney_overall, ace_attorney_details)
364
 
 
367
  return {
368
  "current_game": None,
369
  "previous_overall": {
370
+ # "Super Mario Bros": True, # Commented out
371
+ "Super Mario Bros (planning only)": True,
372
  "Sokoban": True,
373
  "2048": True,
374
  "Candy Crush": True,
375
+ # "Tetris (complete)": True, # Commented out
376
  "Tetris (planning only)": True,
377
  "Ace Attorney": True
378
  },
379
  "previous_details": {
380
+ # "Super Mario Bros": False, # Commented out
381
+ "Super Mario Bros (planning only)": False,
382
  "Sokoban": False,
383
  "2048": False,
384
  "Candy Crush": False,
385
+ # "Tetris (complete)": False, # Commented out
386
  "Tetris (planning only)": False,
387
  "Ace Attorney": False
388
  }
 
391
  def clear_filters():
392
  global leaderboard_state
393
 
 
394
  selected_games = {
395
+ "Super Mario Bros (planning only)": True,
396
  "Sokoban": True,
397
  "2048": True,
398
  "Candy Crush": True,
 
399
  "Tetris (planning only)": True,
400
  "Ace Attorney": True
401
  }
402
 
 
403
  df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
 
 
404
  display_df = prepare_dataframe_for_display(df)
 
 
405
  _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
406
 
 
407
  leaderboard_state = get_initial_state()
408
 
409
+ # Return values, including all four plot placeholders
410
+ return (update_df_with_height(display_df), radar_chart, radar_chart, group_bar_chart,
411
+ True, False, # mario_plan
412
  True, False, # sokoban
413
  True, False, # 2048
414
  True, False, # candy
 
415
  True, False, # tetris plan
416
  True, False) # ace attorney
417
 
 
724
  margin-top: 40px !important;
725
  }
726
  """) as demo:
727
+ gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
728
 
729
  # Add custom JavaScript for table header line breaks
730
  gr.HTML("""
 
873
  label="Comparative Analysis (Radar Chart)",
874
  elem_classes="visualization-container"
875
  )
 
 
 
 
 
 
876
  gr.Markdown(
877
+ "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
878
+ elem_classes="radar-tip"
879
+ )
880
+ # Group Bar Chart tab (re-enabled)
881
+ with gr.Tab("📊 Group Bar Chart"):
882
+ group_bar_visualization = gr.Plot(
883
+ label="Comparative Analysis (Group Bar Chart)",
884
+ elem_classes="visualization-container"
885
  )
886
+
887
 
888
  # Hidden placeholder for group bar visualization is no longer needed: the Group Bar Chart tab defines group_bar_visualization
889
+ # group_bar_visualization = gr.Plot(visible=False)
890
 
891
  # Game selection section
892
  with gr.Row():
893
  gr.Markdown("### 🎮 Game Selection")
894
  with gr.Row():
895
+ # with gr.Column(): # Commented out Super Mario Bros UI
896
+ # gr.Markdown("**🎮 Super Mario Bros**")
897
+ # mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
898
+ # mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
899
+ with gr.Column(): # Added Super Mario Bros (planning only) UI
900
+ gr.Markdown("**📝 Super Mario Bros (planning only)**")
901
+ mario_plan_overall = gr.Checkbox(label="Super Mario Bros (planning only) Score", value=True)
902
+ mario_plan_details = gr.Checkbox(label="Super Mario Bros (planning only) Details", value=False)
903
+ with gr.Column(): # Sokoban is now after mario_plan
904
  gr.Markdown("**📦 Sokoban**")
905
  sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
906
  sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
 
912
  gr.Markdown("**🍬 Candy Crush**")
913
  candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
914
  candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
915
+ # with gr.Column(): # Commented out Tetris (complete) UI
916
+ # gr.Markdown("**🎯 Tetris (complete)**")
917
+ # tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
918
+ # tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
919
  with gr.Column():
920
  gr.Markdown("**📋 Tetris (planning)**")
921
  tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
 
944
 
945
  # Get initial leaderboard dataframe
946
  initial_df = get_combined_leaderboard(rank_data, {
947
+ # "Super Mario Bros": True, # Commented out
948
+ "Super Mario Bros (planning only)": True,
949
  "Sokoban": True,
950
  "2048": True,
951
  "Candy Crush": True,
952
+ # "Tetris (complete)": True, # Commented out
953
  "Tetris (planning only)": True,
954
  "Ace Attorney": True
955
  })
 
985
  with gr.Row():
986
  score_note = add_score_note()
987
 
988
+ # List of all checkboxes, including Super Mario Bros (planning only)
989
  checkbox_list = [
990
+ # mario_overall, mario_details, # Commented out
991
+ mario_plan_overall, mario_plan_details,
992
  sokoban_overall, sokoban_details,
993
  _2048_overall, _2048_details,
994
  candy_overall, candy_details,
995
+ # tetris_overall, tetris_details, # Commented out
996
  tetris_plan_overall, tetris_plan_details,
997
  ace_attorney_overall, ace_attorney_details
998
  ]
 
1000
  # Update visualizations when checkboxes change
1001
  def update_visualizations(*checkbox_states):
1002
  # Check if any details checkbox is selected
1003
+ # Adjusted indices due to addition of Super Mario (planning only)
1004
  is_details_view = any([
1005
+ checkbox_states[1], # Mario Plan details
1006
+ checkbox_states[3], # Sokoban details
1007
+ checkbox_states[5], # 2048 details
1008
+ checkbox_states[7], # Candy Crush details
1009
+ checkbox_states[9], # Tetris (planning only) details
1010
+ checkbox_states[11] # Ace Attorney details
1011
  ])
1012
 
1013
  # Update visibility of visualization blocks
 
1033
  leaderboard_df,
1034
  detailed_visualization,
1035
  radar_visualization,
1036
+ group_bar_visualization # RESTORED
1037
  ] + checkbox_list
1038
  )
1039
 
 
1045
  leaderboard_df,
1046
  detailed_visualization,
1047
  radar_visualization,
1048
+ group_bar_visualization # RESTORED
1049
  ] + checkbox_list
1050
  )
1051
 
 
1057
  leaderboard_df,
1058
  detailed_visualization,
1059
  radar_visualization,
1060
+ group_bar_visualization # RESTORED
1061
  ] + checkbox_list
1062
  )
1063
 
assets/model_color.json CHANGED
@@ -1,12 +1,14 @@
1
  {
2
  "claude-3-7-sonnet-20250219": "#4A90E2",
3
- "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
4
  "claude-3-5-haiku-20241022": "#7FB5E6",
5
  "claude-3-5-sonnet-20241022": "#1A4C7C",
6
  "gemini-2.0-flash": "#FF4081",
7
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
8
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
9
  "gemini-2.5-flash-preview-04-17": "#F06292",
 
 
10
  "gpt-4o-2024-11-20": "#00BFA5",
11
  "gpt-4.5-preview-2025-02-27": "#00796B",
12
  "gpt-4.1-2025-04-14": "#00897B",
@@ -17,7 +19,9 @@
17
  "o4-mini-2025-04-16": "#00ACC1",
18
  "grok-3-beta": "#FF7043",
19
  "grok-3-mini-beta": "#FF8A65",
 
20
  "deepseek-v3": "#FFC107",
21
  "deepseek-r1": "#FFA000",
22
- "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
 
23
  }
 
1
  {
2
  "claude-3-7-sonnet-20250219": "#4A90E2",
3
+ "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
4
  "claude-3-5-haiku-20241022": "#7FB5E6",
5
  "claude-3-5-sonnet-20241022": "#1A4C7C",
6
  "gemini-2.0-flash": "#FF4081",
7
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
8
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
9
  "gemini-2.5-flash-preview-04-17": "#F06292",
10
+ "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
11
+ "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
12
  "gpt-4o-2024-11-20": "#00BFA5",
13
  "gpt-4.5-preview-2025-02-27": "#00796B",
14
  "gpt-4.1-2025-04-14": "#00897B",
 
19
  "o4-mini-2025-04-16": "#00ACC1",
20
  "grok-3-beta": "#FF7043",
21
  "grok-3-mini-beta": "#FF8A65",
22
+ "grok-3-mini-beta (thinking)": "#F57C00",
23
  "deepseek-v3": "#FFC107",
24
  "deepseek-r1": "#FFA000",
25
+ "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
26
+ "Random (x30)": "#9E9E9E"
27
  }
data_visualization.py CHANGED
@@ -56,76 +56,76 @@ def simplify_model_name(name):
56
  return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
57
 
58
  def create_horizontal_bar_chart(df, game_name):
59
- if game_name == "Super Mario Bros":
60
- score_col = "Score"
61
- df_sorted = df.sort_values(by=score_col, ascending=True)
62
- elif game_name == "Sokoban":
63
- # Process Sokoban scores by splitting and getting max level
64
- def get_max_level(levels_str):
65
- try:
66
- # Split by semicolon, strip whitespace, filter empty strings, convert to integers
67
- levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
68
- return max(levels) if levels else 0
69
- except:
70
- return 0
71
-
72
- # Create a temporary column with max levels
73
- df['Max Level'] = df['Levels Cracked'].apply(get_max_level)
74
- df_sorted = df.sort_values(by='Max Level', ascending=True)
75
- score_col = 'Max Level'
76
- elif game_name == "2048":
77
- score_col = "Score"
78
- df_sorted = df.sort_values(by=score_col, ascending=True)
79
- elif game_name == "Candy Crush":
80
- score_col = "Average Score"
81
- df_sorted = df.sort_values(by=score_col, ascending=True)
82
- elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
83
- score_col = "Score"
84
- df_sorted = df.sort_values(by=score_col, ascending=True)
85
- elif game_name == "Ace Attorney":
86
- score_col = "Score"
87
- df_sorted = df.sort_values(by=score_col, ascending=True)
88
- else:
89
- return None
90
-
91
- x = df_sorted[score_col]
92
- y = [f"{row['Player']} [{row['Organization']}]" for _, row in df_sorted.iterrows()]
93
- colors = [MODEL_COLORS.get(row['Player'], '#808080') for _, row in df_sorted.iterrows()]
94
- texts = [f"{v:.1f}" if game_name == "Candy Crush" else f"{int(v)}" for v in x]
95
-
96
- fig = go.Figure(go.Bar(
97
- x=x,
98
- y=y,
99
- orientation='h',
100
- marker_color=colors,
101
- text=texts,
102
- textposition='auto',
103
- hovertemplate='%{y}<br>Score: %{x}<extra></extra>'
104
- ))
 
 
 
 
105
 
106
  fig.update_layout(
107
- autosize=False,
108
- width=1000,
109
- height=600,
110
- margin=dict(l=200, r=200, t=20, b=20),
111
  title=dict(
112
- text=f"{game_name} Performance",
113
- pad=dict(t=10),
114
- font=dict(size=20)
115
  ),
116
- yaxis=dict(automargin=True),
117
- legend=dict(
118
- font=dict(size=12),
119
- itemsizing='trace',
120
- x=1.1,
121
- y=1,
122
- xanchor='left',
123
- yanchor='top',
124
- bgcolor='rgba(255,255,255,0.6)',
125
- bordercolor='gray',
126
- borderwidth=1
127
- )
128
  )
 
129
  return fig
130
 
131
  def create_radar_charts(df):
@@ -324,8 +324,10 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
324
  # Format game names
325
  formatted_games = []
326
  for game in selected_games:
327
- if game == 'Super Mario Bros':
328
  formatted_games.append('Super Mario') # Simplified name
 
 
329
  else:
330
  formatted_games.append(game) # Keep other names as is
331
 
@@ -387,7 +389,7 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
387
  fig.update_layout(
388
  autosize=False,
389
  width=1000,
390
- height=620, # Increased height to accommodate legend
391
  margin=dict(l=400, r=200, t=20, b=20),
392
  title=dict(
393
  text="AI Normalized Performance Across Games",
 
56
  return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
57
 
58
  def create_horizontal_bar_chart(df, game_name):
59
+ """Creates a horizontal bar chart for a given game's leaderboard data."""
60
+
61
+ if df is None or df.empty:
62
+ # Return a placeholder or an empty figure if there's no data
63
+ fig = go.Figure()
64
+ fig.update_layout(
65
+ title=f"No data available for {game_name}",
66
+ xaxis_title="Score",
67
+ yaxis_title="Player",
68
+ plot_bgcolor='rgba(0,0,0,0)',
69
+ paper_bgcolor='rgba(0,0,0,0)',
70
+ font=dict(color='#2c3e50')
71
+ )
72
+ return fig
73
+
74
+ score_col = "Score" # Standardized score column name
75
+
76
+ if score_col not in df.columns:
77
+ fig = go.Figure()
78
+ fig.update_layout(title=f"'{score_col}' column not found for {game_name}")
79
+ return fig
80
+
81
+ # Ensure the score column is numeric for sorting and plotting
82
+ df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
83
+ df_cleaned = df.dropna(subset=[score_col]) # Remove rows where score is NaN after conversion
84
+
85
+ if df_cleaned.empty:
86
+ fig = go.Figure()
87
+ fig.update_layout(title=f"No valid score data to plot for {game_name}")
88
+ return fig
89
+
90
+ # Sort values for chart display (highest score at the top of the chart)
91
+ # The input df is already sorted descending by score from leaderboard_utils
92
+ # Plotly draws the first category at the bottom of the y-axis, so re-sorting ascending=True here puts the player with the lowest score at the bottom and the highest-scoring player at the top
93
+ df_sorted = df_cleaned.sort_values(by=score_col, ascending=True)
94
+
95
+ fig = go.Figure(
96
+ go.Bar(
97
+ y=df_sorted['Player'],
98
+ x=df_sorted[score_col],
99
+ orientation='h',
100
+ marker=dict(
101
+ color=df_sorted[score_col],
102
+ colorscale='Viridis', # Example colorscale, can be changed
103
+ line=dict(color='#2c3e50', width=1)
104
+ ),
105
+ hovertext=df_sorted[score_col].round(2).astype(str) + ' points',
106
+ hoverinfo='y+text'
107
+ )
108
+ )
109
 
110
  fig.update_layout(
 
 
 
 
111
  title=dict(
112
+ text=f'{game_name} Scores',
113
+ x=0.5,
114
+ font=dict(size=20, color='#2c3e50')
115
  ),
116
+ xaxis_title="Score",
117
+ yaxis_title="Player",
118
+ plot_bgcolor='rgba(0,0,0,0)', # Transparent plot background
119
+ paper_bgcolor='rgba(0,0,0,0)', # Transparent paper background
120
+ font=dict(color='#2c3e50'), # Dark text for better readability on light backgrounds
121
+ margin=dict(l=150, r=20, t=50, b=50), # Adjust margins for player names
122
+ yaxis=dict(
123
+ automargin=True,
124
+ tickfont=dict(size=10)
125
+ ),
126
+ xaxis=dict(gridcolor='#e0e0e0') # Light gridlines for x-axis
 
127
  )
128
+
129
  return fig
130
 
131
  def create_radar_charts(df):
 
324
  # Format game names
325
  formatted_games = []
326
  for game in selected_games:
327
+ if game == 'Super Mario Bros (planning only)':
328
  formatted_games.append('Super Mario') # Simplified name
329
+ elif game == 'Tetris (planning only)':
330
+ formatted_games.append('Tetris')
331
  else:
332
  formatted_games.append(game) # Keep other names as is
333
 
 
389
  fig.update_layout(
390
  autosize=False,
391
  width=1000,
392
+ height=700, # Increased height to accommodate legend
393
  margin=dict(l=400, r=200, t=20, b=20),
394
  title=dict(
395
  text="AI Normalized Performance Across Games",
leaderboard_utils.py CHANGED
@@ -4,11 +4,12 @@ import numpy as np
4
 
5
  # Define game order
6
  GAME_ORDER = [
7
- "Super Mario Bros",
 
8
  "Sokoban",
9
  "2048",
10
  "Candy Crush",
11
- "Tetris (complete)",
12
  "Tetris (planning only)",
13
  "Ace Attorney"
14
  ]
@@ -41,31 +42,86 @@ def get_mario_leaderboard(rank_data):
41
  })
42
  df["Organization"] = df["Player"].apply(get_organization)
43
  df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
 
 
44
  return df
45
 
46
  def get_sokoban_leaderboard(rank_data):
47
  data = rank_data.get("Sokoban", {}).get("results", [])
48
  df = pd.DataFrame(data)
49
  df = df.rename(columns={
50
- "model": "Player",
51
- "levels_cracked": "Levels Cracked",
52
- "steps": "Steps"
 
 
53
  })
54
  df["Organization"] = df["Player"].apply(get_organization)
55
- df = df[["Player", "Organization", "Levels Cracked", "Steps"]]
 
 
 
 
 
 
 
 
 
56
  return df
57
 
58
  def get_2048_leaderboard(rank_data):
59
  data = rank_data.get("2048", {}).get("results", [])
 
 
 
 
 
 
 
 
60
  df = pd.DataFrame(data)
 
 
61
  df = df.rename(columns={
62
- "model": "Player",
63
- "score": "Score",
64
- "steps": "Steps",
65
- "time": "Time"
 
66
  })
67
- df["Organization"] = df["Player"].apply(get_organization)
68
- df = df[["Player", "Organization", "Score", "Steps", "Time"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  return df
70
 
71
  def get_candy_leaderboard(rank_data):
@@ -73,12 +129,18 @@ def get_candy_leaderboard(rank_data):
73
  df = pd.DataFrame(data)
74
  df = df.rename(columns={
75
  "model": "Player",
76
- "score_runs": "Score Runs",
77
- "average_score": "Average Score",
78
- "steps": "Steps"
79
  })
80
  df["Organization"] = df["Player"].apply(get_organization)
81
- df = df[["Player", "Organization", "Score Runs", "Average Score", "Steps"]]
 
 
 
 
 
 
 
82
  return df
83
 
84
  def get_tetris_leaderboard(rank_data):
@@ -98,11 +160,19 @@ def get_tetris_planning_leaderboard(rank_data):
98
  df = pd.DataFrame(data)
99
  df = df.rename(columns={
100
  "model": "Player",
101
- "score": "Score",
102
- "steps_blocks": "Steps"
 
103
  })
104
  df["Organization"] = df["Player"].apply(get_organization)
105
- df = df[["Player", "Organization", "Score", "Steps"]]
 
 
 
 
 
 
 
106
  return df
107
 
108
  def get_ace_attorney_leaderboard(rank_data):
@@ -110,14 +180,41 @@ def get_ace_attorney_leaderboard(rank_data):
110
  df = pd.DataFrame(data)
111
  df = df.rename(columns={
112
  "model": "Player",
113
- "levels_cracked": "Levels Cracked",
114
- "lives_left": "Lives Left",
115
- "cracked_details": "Progress",
116
  "score": "Score",
117
- "note": "Notes"
 
118
  })
119
  df["Organization"] = df["Player"].apply(get_organization)
120
- df = df[["Player", "Organization", "Levels Cracked", "Lives Left", "Progress", "Score", "Notes"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  return df
122
 
123
  def calculate_rank_and_completeness(rank_data, selected_games):
@@ -125,16 +222,18 @@ def calculate_rank_and_completeness(rank_data, selected_games):
125
  game_dfs = {}
126
 
127
  # Get DataFrames for selected games
128
- if selected_games.get("Super Mario Bros"):
129
- game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
 
 
130
  if selected_games.get("Sokoban"):
131
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
132
  if selected_games.get("2048"):
133
  game_dfs["2048"] = get_2048_leaderboard(rank_data)
134
  if selected_games.get("Candy Crush"):
135
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
136
- if selected_games.get("Tetris (complete)"):
137
- game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
138
  if selected_games.get("Tetris (planning only)"):
139
  game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
140
  if selected_games.get("Ace Attorney"):
@@ -163,29 +262,22 @@ def calculate_rank_and_completeness(rank_data, selected_games):
163
  if player in df["Player"].values:
164
  games_played += 1
165
  # Get player's score based on game type
166
- if game == "Super Mario Bros":
 
 
 
167
  player_score = df[df["Player"] == player]["Score"].iloc[0]
168
  rank = len(df[df["Score"] > player_score]) + 1
169
  elif game == "Sokoban":
170
- # Parse Sokoban score string and get maximum level
171
- levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
172
- try:
173
- # Split by semicolon, strip whitespace, filter empty strings, convert to integers
174
- levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
175
- player_score = max(levels) if levels else 0
176
- except:
177
- player_score = 0
178
- # Calculate rank based on maximum level
179
- rank = len(df[df["Levels Cracked"].apply(
180
- lambda x: max([int(y.strip()) for y in x.split(";") if y.strip()]) > player_score
181
- )]) + 1
182
  elif game == "2048":
183
  player_score = df[df["Player"] == player]["Score"].iloc[0]
184
  rank = len(df[df["Score"] > player_score]) + 1
185
  elif game == "Candy Crush":
186
- player_score = df[df["Player"] == player]["Average Score"].iloc[0]
187
- rank = len(df[df["Average Score"] > player_score]) + 1
188
- elif game in ["Tetris (complete)", "Tetris (planning only)"]:
189
  player_score = df[df["Player"] == player]["Score"].iloc[0]
190
  rank = len(df[df["Score"] > player_score]) + 1
191
  elif game == "Ace Attorney":
@@ -197,12 +289,12 @@ def calculate_rank_and_completeness(rank_data, selected_games):
197
  else:
198
  player_data[f"{game} Score"] = 'n/a'
199
 
200
- # Calculate average rank and completeness for sorting only
201
  if ranks:
202
- player_data["Sort Rank"] = round(np.mean(ranks), 2)
203
  player_data["Games Played"] = games_played
204
  else:
205
- player_data["Sort Rank"] = float('inf')
206
  player_data["Games Played"] = 0
207
 
208
  results.append(player_data)
@@ -210,13 +302,13 @@ def calculate_rank_and_completeness(rank_data, selected_games):
210
  # Create DataFrame and sort by average rank and completeness
211
  df_results = pd.DataFrame(results)
212
  if not df_results.empty:
213
- # Sort by average rank (ascending) and completeness (descending)
214
  df_results = df_results.sort_values(
215
- by=["Sort Rank", "Games Played"],
216
  ascending=[True, False]
217
  )
218
  # Drop the sorting columns
219
- df_results = df_results.drop(["Sort Rank", "Games Played"], axis=1)
220
 
221
  return df_results
222
 
@@ -235,16 +327,18 @@ def get_combined_leaderboard(rank_data, selected_games):
235
  game_dfs = {}
236
 
237
  # Get DataFrames for selected games
238
- if selected_games.get("Super Mario Bros"):
239
- game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
 
 
240
  if selected_games.get("Sokoban"):
241
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
242
  if selected_games.get("2048"):
243
  game_dfs["2048"] = get_2048_leaderboard(rank_data)
244
  if selected_games.get("Candy Crush"):
245
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
246
- if selected_games.get("Tetris (complete)"):
247
- game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
248
  if selected_games.get("Tetris (planning only)"):
249
  game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
250
  if selected_games.get("Ace Attorney"):
@@ -269,21 +363,17 @@ def get_combined_leaderboard(rank_data, selected_games):
269
  if game in game_dfs:
270
  df = game_dfs[game]
271
  if player in df["Player"].values:
272
- if game == "Super Mario Bros":
 
 
273
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
274
  elif game == "Sokoban":
275
- # Parse Sokoban score string and get maximum level
276
- levels_str = df[df["Player"] == player]["Levels Cracked"].iloc[0]
277
- try:
278
- levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
279
- player_data[f"{game} Score"] = max(levels) if levels else 0
280
- except:
281
- player_data[f"{game} Score"] = 0
282
  elif game == "2048":
283
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
284
  elif game == "Candy Crush":
285
- player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
286
- elif game in ["Tetris (complete)", "Tetris (planning only)"]:
287
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
288
  elif game == "Ace Attorney":
289
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
 
4
 
5
  # Define game order
6
  GAME_ORDER = [
7
+ # "Super Mario Bros", # Commented out
8
+ "Super Mario Bros (planning only)",
9
  "Sokoban",
10
  "2048",
11
  "Candy Crush",
12
+ # "Tetris (complete)", # Commented out
13
  "Tetris (planning only)",
14
  "Ace Attorney"
15
  ]
 
42
  })
43
  df["Organization"] = df["Player"].apply(get_organization)
44
  df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
45
+ if "Score" in df.columns:
46
+ df = df.sort_values("Score", ascending=False)
47
  return df
48
 
49
  def get_sokoban_leaderboard(rank_data):
50
  data = rank_data.get("Sokoban", {}).get("results", [])
51
  df = pd.DataFrame(data)
52
  df = df.rename(columns={
53
+ "model": "Player",
54
+ "score": "Score",
55
+ "steps": "Steps",
56
+ "detail_box_on_target": "Detail Box On Target",
57
+ "cracked_levels": "Levels Cracked"
58
  })
59
  df["Organization"] = df["Player"].apply(get_organization)
60
+
61
+ # Define columns to keep, ensuring 'Score' is present
62
+ columns_to_keep = ["Player", "Organization", "Score", "Levels Cracked", "Detail Box On Target", "Steps"]
63
+ # Filter to only columns that actually exist in the DataFrame after renaming
64
+ df_columns = [col for col in columns_to_keep if col in df.columns]
65
+ df = df[df_columns]
66
+
67
+ if "Score" in df.columns:
68
+ df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
69
+ df = df.sort_values("Score", ascending=False)
70
  return df
71
 
72
  def get_2048_leaderboard(rank_data):
73
  data = rank_data.get("2048", {}).get("results", [])
74
+ # --- Diagnostic Print Removed ---
75
+ # if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
76
+ # print(f"DEBUG_UTILS: Keys in first item of raw data for 2048: {list(data[0].keys())}")
77
+ # elif not data:
78
+ # print("DEBUG_UTILS: Raw data for 2048 is empty.")
79
+ # else:
80
+ # print("DEBUG_UTILS: Raw data for 2048 is not in the expected list of dicts format.")
81
+ # --- End Diagnostic Print Removed ---
82
  df = pd.DataFrame(data)
83
+ # print(f"DEBUG_UTILS: Columns after pd.DataFrame(data): {df.columns.tolist()}") # REMOVED
84
+
85
  df = df.rename(columns={
86
+ "model": "Player",
87
+ "score": "Score", # From new JSON structure
88
+ "details": "Details", # From new JSON structure
89
+ "highest_tail": "Highest Tail" # Added new column
90
+ # Old fields like "steps", "time", "rank" are removed
91
  })
92
+ # print(f"DEBUG_UTILS: Columns after rename: {df.columns.tolist()}") # REMOVED
93
+
94
+ # Ensure 'Player' column exists before applying get_organization
95
+ if "Player" in df.columns:
96
+ df["Organization"] = df["Player"].apply(get_organization)
97
+ else:
98
+ # Handle case where 'Player' column might be missing after rename (should not happen with current logic)
99
+ # print("DEBUG_UTILS: 'Player' column not found after rename, skipping Organization.") # REMOVED
100
+ df["Organization"] = "unknown" # Fallback
101
+
102
+ columns_to_keep = ["Player", "Organization", "Score", "Highest Tail", "Details"] # Added "Highest Tail"
103
+
104
+ # Defensive check for 'Highest Tail' before filtering - REMOVED
105
+ # if 'highest_tail' in df.columns and 'Highest Tail' not in df.columns:
106
+ # print("DEBUG_UTILS: 'highest_tail' (lowercase) found, but 'Highest Tail' (capitalized) not. This indicates a rename issue.")
107
+ # elif 'Highest Tail' not in df.columns and 'highest_tail' not in df.columns:
108
+ # print("DEBUG_UTILS: Neither 'Highest Tail' nor 'highest_tail' found in columns before filtering.")
109
+
110
+ # df_columns = [col for col in columns_to_keep if col in df.columns] # REMOVED logic that used df_columns
111
+ # print(f"DEBUG_UTILS: df_columns selected (columns that are in columns_to_keep AND in df.columns): {df_columns}") # REMOVED
112
+
113
+ # Ensure all columns in columns_to_keep exist in df, fill with np.nan if not
114
+ for col_k in columns_to_keep:
115
+ if col_k not in df.columns:
116
+ # print(f"DEBUG_UTILS: Column '{col_k}' from columns_to_keep not found in DataFrame. Adding it with NaN values.") # REMOVED
117
+ df[col_k] = np.nan # Or some other default like 'n/a' if appropriate
118
+
119
+ df = df[columns_to_keep] # Use columns_to_keep directly after ensuring they exist
120
+ # print(f"DEBUG_UTILS: Columns after final selection: {df.columns.tolist()}") # REMOVED
121
+
122
+ if "Score" in df.columns:
123
+ df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
124
+ df = df.sort_values("Score", ascending=False)
125
  return df
126
 
127
  def get_candy_leaderboard(rank_data):
 
129
  df = pd.DataFrame(data)
130
  df = df.rename(columns={
131
  "model": "Player",
132
+ "score": "Score",
133
+ "details": "Details"
 
134
  })
135
  df["Organization"] = df["Player"].apply(get_organization)
136
+
137
+ columns_to_keep = ["Player", "Organization", "Score", "Details"]
138
+ df_columns = [col for col in columns_to_keep if col in df.columns]
139
+ df = df[df_columns]
140
+
141
+ if "Score" in df.columns:
142
+ df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
143
+ df = df.sort_values("Score", ascending=False)
144
  return df
145
 
146
  def get_tetris_leaderboard(rank_data):
 
160
  df = pd.DataFrame(data)
161
  df = df.rename(columns={
162
  "model": "Player",
163
+ "score": "Score", # From new JSON structure
164
+ "details": "Details" # From new JSON structure
165
+ # Old fields like "steps_blocks", "rank" are removed
166
  })
167
  df["Organization"] = df["Player"].apply(get_organization)
168
+
169
+ columns_to_keep = ["Player", "Organization", "Score", "Details"]
170
+ df_columns = [col for col in columns_to_keep if col in df.columns]
171
+ df = df[df_columns]
172
+
173
+ if "Score" in df.columns:
174
+ df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
175
+ df = df.sort_values("Score", ascending=False)
176
  return df
177
 
178
  def get_ace_attorney_leaderboard(rank_data):
 
180
  df = pd.DataFrame(data)
181
  df = df.rename(columns={
182
  "model": "Player",
 
 
 
183
  "score": "Score",
184
+ "progress": "Progress",
185
+ "evaluator result": "Evaluator Result"
186
  })
187
  df["Organization"] = df["Player"].apply(get_organization)
188
+
189
+ # Define columns to keep, including Evaluator Result
190
+ columns_to_keep = ["Player", "Organization", "Score", "Progress", "Evaluator Result"]
191
+ # Filter to only columns that actually exist in the DataFrame after renaming
192
+ df_columns = [col for col in columns_to_keep if col in df.columns]
193
+ df = df[df_columns]
194
+
195
+ if "Score" in df.columns:
196
+ df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
197
+ df = df.sort_values("Score", ascending=False) # Higher score is better
198
+ return df
199
+
200
+ def get_mario_planning_leaderboard(rank_data):
201
+ data = rank_data.get("Super Mario Bros (planning only)", {}).get("results", [])
202
+ df = pd.DataFrame(data)
203
+ df = df.rename(columns={
204
+ "model": "Player",
205
+ "score": "Score",
206
+ "detail_data": "Detail Data",
207
+ "progress": "Progress"
208
+ })
209
+ df["Organization"] = df["Player"].apply(get_organization)
210
+ # Define columns to keep
211
+ columns_to_keep = ["Player", "Organization", "Score", "Progress", "Detail Data"]
212
+ df_columns = [col for col in columns_to_keep if col in df.columns]
213
+ df = df[df_columns]
214
+
215
+ if "Score" in df.columns:
216
+ df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
217
+ df = df.sort_values("Score", ascending=False)
218
  return df
219
 
220
  def calculate_rank_and_completeness(rank_data, selected_games):
 
222
  game_dfs = {}
223
 
224
  # Get DataFrames for selected games
225
+ # if selected_games.get("Super Mario Bros"): # Commented out
226
+ # game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
227
+ if selected_games.get("Super Mario Bros (planning only)"):
228
+ game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
229
  if selected_games.get("Sokoban"):
230
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
231
  if selected_games.get("2048"):
232
  game_dfs["2048"] = get_2048_leaderboard(rank_data)
233
  if selected_games.get("Candy Crush"):
234
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
235
+ # if selected_games.get("Tetris (complete)"): # Commented out
236
+ # game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
237
  if selected_games.get("Tetris (planning only)"):
238
  game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
239
  if selected_games.get("Ace Attorney"):
 
262
  if player in df["Player"].values:
263
  games_played += 1
264
  # Get player's score based on game type
265
+ # if game == "Super Mario Bros": # Commented out
266
+ # player_score = df[df["Player"] == player]["Score"].iloc[0]
267
+ # rank = len(df[df["Score"] > player_score]) + 1
268
+ if game == "Super Mario Bros (planning only)":
269
  player_score = df[df["Player"] == player]["Score"].iloc[0]
270
  rank = len(df[df["Score"] > player_score]) + 1
271
  elif game == "Sokoban":
272
+ player_score = df[df["Player"] == player]["Score"].iloc[0]
273
+ rank = len(df[df["Score"] > player_score]) + 1
 
 
 
 
 
 
 
 
 
 
274
  elif game == "2048":
275
  player_score = df[df["Player"] == player]["Score"].iloc[0]
276
  rank = len(df[df["Score"] > player_score]) + 1
277
  elif game == "Candy Crush":
278
+ player_score = df[df["Player"] == player]["Score"].iloc[0]
279
+ rank = len(df[df["Score"] > player_score]) + 1
280
+ elif game in ["Tetris (planning only)"]:
281
  player_score = df[df["Player"] == player]["Score"].iloc[0]
282
  rank = len(df[df["Score"] > player_score]) + 1
283
  elif game == "Ace Attorney":
 
289
  else:
290
  player_data[f"{game} Score"] = 'n/a'
291
 
292
+ # Calculate average rank and completeness for sorting
293
  if ranks:
294
+ player_data["Average Rank"] = round(np.mean(ranks), 2)
295
  player_data["Games Played"] = games_played
296
  else:
297
+ player_data["Average Rank"] = float('inf')
298
  player_data["Games Played"] = 0
299
 
300
  results.append(player_data)
 
302
  # Create DataFrame and sort by average rank and completeness
303
  df_results = pd.DataFrame(results)
304
  if not df_results.empty:
305
+ # Sort by average rank (ascending) and games played (descending)
306
  df_results = df_results.sort_values(
307
+ by=["Average Rank", "Games Played"],
308
  ascending=[True, False]
309
  )
310
  # Drop the sorting columns
311
+ df_results = df_results.drop(["Average Rank", "Games Played"], axis=1)
312
 
313
  return df_results
314
 
 
327
  game_dfs = {}
328
 
329
  # Get DataFrames for selected games
330
+ # if selected_games.get("Super Mario Bros"): # Commented out
331
+ # game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
332
+ if selected_games.get("Super Mario Bros (planning only)"):
333
+ game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
334
  if selected_games.get("Sokoban"):
335
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
336
  if selected_games.get("2048"):
337
  game_dfs["2048"] = get_2048_leaderboard(rank_data)
338
  if selected_games.get("Candy Crush"):
339
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
340
+ # if selected_games.get("Tetris (complete)"): # Commented out
341
+ # game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
342
  if selected_games.get("Tetris (planning only)"):
343
  game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
344
  if selected_games.get("Ace Attorney"):
 
363
  if game in game_dfs:
364
  df = game_dfs[game]
365
  if player in df["Player"].values:
366
+ # if game == "Super Mario Bros": # Commented out
367
+ # player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
368
+ if game == "Super Mario Bros (planning only)":
369
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
370
  elif game == "Sokoban":
371
+ player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
 
 
 
 
 
 
372
  elif game == "2048":
373
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
374
  elif game == "Candy Crush":
375
+ player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
376
+ elif game in ["Tetris (planning only)"]:
377
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
378
  elif game == "Ace Attorney":
379
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
rank_data_03_25_2025.json CHANGED
@@ -3,156 +3,200 @@
3
  "runs": 5,
4
  "results": [
5
  {
6
- "model": "gpt-4.1-2025-04-14",
7
- "score": 740,
8
  "progress": "1-1",
9
- "time_s": 68.6,
10
- "rank": 1
11
  },
12
  {
13
- "model": "claude-3-7-sonnet-20250219",
14
- "score": 710,
15
  "progress": "1-1",
16
- "time_s": 64.2,
17
- "rank": 2
18
  },
19
  {
20
  "model": "gpt-4o-2024-11-20",
21
  "score": 560,
22
  "progress": "1-1",
23
- "time_s": 58.6,
24
- "rank": 3
25
  },
26
  {
27
  "model": "gemini-2.0-flash",
28
  "score": 320,
29
  "progress": "1-1",
30
- "time_s": 51.8,
31
- "rank": 4
32
  },
33
  {
34
  "model": "claude-3-5-haiku-20241022",
35
  "score": 140,
36
  "progress": "1-1",
37
- "time_s": 76.4,
38
- "rank": 5
39
  },
40
  {
41
  "model": "gpt-4.5-preview-2025-02-27",
42
  "score": 160,
43
  "progress": "1-1",
44
- "time_s": 62.8,
45
- "rank": 6
46
  }
47
  ]
48
  },
49
- "2048": {
50
- "runs": 1,
51
  "results": [
52
  {
53
- "model": "claude-3-7-sonnet-20250219(thinking)",
54
- "score": 256,
55
- "steps": 114,
56
- "time": ">200",
57
- "rank": 1
58
  },
59
  {
60
- "model": "grok-3-mini-beta",
61
- "score": 256,
62
- "steps": 108,
63
- "time": "58:09",
64
- "rank": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  },
66
  {
67
  "model": "o1-2024-12-17",
68
- "score": 256,
69
- "steps": 116,
70
- "time": ">200",
71
- "rank": 1
72
  },
73
  {
74
  "model": "o3-2025-04-16",
75
- "score": 256,
76
- "steps": 108,
77
- "time": "58:09",
78
- "rank": 1
79
  },
80
  {
81
- "model": "claude-3-7-sonnet-20250219",
82
- "score": 256,
83
- "steps": 130,
84
- "time": "20:36",
85
- "rank": 4
86
  },
87
  {
88
- "model": "deepseek-v3",
89
- "score": 256,
90
- "steps": 216,
91
- "time": "54.02",
92
- "rank": 5
93
- },
 
 
 
 
94
  {
95
- "model": "gemini-2.5-flash-preview-04-17",
96
- "score": 128,
97
- "steps": 71,
98
- "time": "41:42",
99
- "rank": 6
100
  },
101
  {
102
- "model": "gemini-2.0-flash",
103
- "score": 128,
104
- "steps": 111,
105
- "time": "18:43",
106
- "rank": 7
107
  },
108
  {
109
- "model": "gemini-2.0-flash-thinking-exp-1219",
110
- "score": 128,
111
- "steps": 132,
112
- "time": ">100",
113
- "rank": 8
114
  },
115
  {
116
- "model": "gemini-2.5-pro-exp-03-25",
117
- "score": 128,
118
- "steps": 138,
119
- "time": "169",
120
- "rank": 9
121
  },
122
  {
123
- "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
124
- "score": 128,
125
- "steps": 145,
126
- "time": ">100",
127
- "rank": 10
128
  },
129
  {
130
- "model": "o4-mini-2025-04-16",
131
- "score": 128,
132
- "steps": "",
133
- "time": "",
134
- "rank": 11
135
  },
136
  {
137
- "model": "claude-3-5-sonnet-20241022",
138
- "score": 64,
139
- "steps": 92,
140
- "time": "9:2",
141
- "rank": 13
142
  },
143
  {
144
- "model": "gpt-4.5-preview-2025-02-27",
145
- "score": 34,
146
- "steps": 34,
147
- "time": "8:25",
148
- "rank": 14
149
  },
150
  {
151
  "model": "gpt-4o-2024-11-20",
152
- "score": 16,
153
- "steps": 21,
154
- "time": "1:17",
155
- "rank": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  }
157
  ]
158
  },
@@ -189,28 +233,74 @@
189
  "runs": 3,
190
  "results": [
191
  {
192
- "model": "claude-3-7-sonnet-20250219",
193
- "score": 110,
194
- "steps_blocks": 29,
195
- "rank": 1
196
  },
197
  {
198
- "model": "claude-3-5-haiku-20241022",
199
- "score": 92,
200
- "steps_blocks": 25,
201
- "rank": 2
202
  },
203
  {
204
- "model": "gemini-2.0-flash",
205
- "score": 87,
206
- "steps_blocks": 24,
207
- "rank": 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  },
209
  {
210
  "model": "gpt-4o-2024-11-20",
211
- "score": 56,
212
- "steps_blocks": 20,
213
- "rank": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  }
215
  ]
216
  },
@@ -218,102 +308,74 @@
218
  "runs": 3,
219
  "results": [
220
  {
221
- "model": "o4-mini-2025-04-16",
222
- "score_runs": "123,131",
223
- "average_score": 127,
224
- "steps": 25,
225
- "rank": 1
226
  },
227
  {
228
- "model": "o3-2025-04-16",
229
- "score_runs": "115, 122",
230
- "average_score": 118.5,
231
- "steps": 25,
232
- "rank": 2
233
  },
234
  {
235
- "model": "o3-mini-2025-01-31(medium)",
236
- "score_runs": "90;109;120",
237
- "average_score": 106.33,
238
- "steps": 25,
239
- "rank": 3
240
  },
241
  {
242
- "model": "grok-3-mini-beta",
243
- "score_runs": "106",
244
- "average_score": 106,
245
- "steps": 25,
246
- "rank": 4
247
  },
248
  {
249
- "model": "o1-2024-12-17",
250
- "score_runs": "96;114;83",
251
- "average_score": 97.67,
252
- "steps": 25,
253
- "rank": 5
254
  },
255
  {
256
- "model": "deepseek-r1",
257
- "score_runs": "62;108;105",
258
- "average_score": 91.67,
259
- "steps": 25,
260
- "rank": 6
261
  },
262
  {
263
- "model": "gemini-2.5-flash-preview-04-17",
264
- "score_runs": "59",
265
- "average_score": 59,
266
- "steps": 25,
267
- "rank": 7
268
  },
269
  {
270
- "model": "gemini-2.5-pro-exp-03-25",
271
- "score_runs": "50;36;68",
272
- "average_score": 51.33,
273
- "steps": 25,
274
- "rank": 8
275
  },
276
  {
277
- "model": "claude-3-7-sonnet-20250219(thinking)",
278
- "score_runs": "36;46;24",
279
- "average_score": 35.33,
280
- "steps": 25,
281
- "rank": 9
282
  },
283
  {
284
- "model": "gemini-2.0-flash-thinking-exp-1219",
285
- "score_runs": "0;15;39",
286
- "average_score": 18,
287
- "steps": 25,
288
- "rank": 10
289
  },
290
  {
291
- "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
292
- "score_runs": "6;0;0",
293
- "average_score": 2,
294
- "steps": 25,
295
- "rank": 11
296
  },
297
  {
298
- "model": "gpt-4.1-2025-04-14",
299
- "score_runs": "0;3;3",
300
- "average_score": 2,
301
- "steps": 25,
302
- "rank": 12
303
  },
304
  {
305
- "model": "claude-3-5-sonnet-20241022",
306
- "score_runs": "3;0;0",
307
- "average_score": 1,
308
- "steps": 25,
309
- "rank": 13
310
  },
311
  {
312
- "model": "deepseek-v3",
313
- "score_runs": "0;0;0",
314
- "average_score": 0,
315
- "steps": 25,
316
- "rank": 14
317
  }
318
  ]
319
  },
@@ -321,216 +383,177 @@
321
  "runs": 3,
322
  "results": [
323
  {
324
- "model": "o3-2025-04-16",
325
- "levels_cracked": "5",
326
- "steps": "[16, 40, 59, 110]",
327
- "rank": 1
328
- },
329
- {
330
- "model": "grok-3-mini-beta",
331
- "levels_cracked": "3",
332
- "steps": "[14, 36, 55, 78]",
333
- "rank": 2
334
  },
335
  {
336
- "model": "o3-mini-2025-01-31(medium)",
337
- "levels_cracked": "2; 3; 2",
338
- "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
339
- "rank": 3
340
  },
341
  {
342
- "model": "gemini-2.5-pro-exp-03-25",
343
- "levels_cracked": "2;2;3",
344
- "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
345
- "rank": 4
346
- },
347
- {
348
- "model": "gemini-2.5-flash-preview-04-17",
349
- "levels_cracked": "2",
350
- "steps": "[24, 50, 60]",
351
- "rank": 5
352
  },
353
  {
354
- "model": "o4-mini-2025-04-16",
355
- "levels_cracked": "2",
356
- "steps": "",
357
- "rank": 6
358
  },
359
  {
360
- "model": "claude-3-7-sonnet-20250219(thinking)",
361
- "levels_cracked": "1; 2; 0",
362
- "steps": "[17,35];[15,40,43];[4]",
363
- "rank": 7
364
  },
365
  {
366
- "model": "o1-2024-12-17",
367
- "levels_cracked": "1; 1; 1",
368
- "steps": null,
369
- "rank": 8
370
  },
371
  {
372
- "model": "deepseek-r1",
373
- "levels_cracked": "1; 0; 1",
374
- "steps": "[19,42];[13];[19,36]",
375
- "note": "stuck",
376
- "rank": 9
377
  },
378
  {
379
- "model": "o1-mini-2024-09-12",
380
- "levels_cracked": "0;1;0",
381
- "steps": null,
382
- "rank": 10
383
  },
384
  {
385
- "model": "gemini-2.0-flash-thinking-exp-1219",
386
- "levels_cracked": "0; 0; 0",
387
- "steps": "[23]; [14]; [14]",
388
- "rank": 11
389
  },
390
  {
391
- "model": "gpt-4o-2024-11-20",
392
- "levels_cracked": "0; 0; 0",
393
- "steps": "[68];[105];[168]",
394
- "note": "stuck in a loop",
395
- "rank": 12
396
  },
397
  {
398
- "model": "claude-3-5-sonnet-20241022",
399
- "levels_cracked": "0; 0; 0",
400
- "steps": "[21]; [30]; [51]",
401
- "note": "stuck in a loop",
402
- "rank": 13
403
  },
404
  {
405
- "model": "deepseek-v3",
406
- "levels_cracked": "0; 0; 0",
407
- "steps": "[9]; [47]; [64]",
408
- "rank": 14
409
  },
410
  {
411
- "model": "gpt-4.1-2025-04-14",
412
- "levels_cracked": "0; 0; 0",
413
- "steps": "[9]; [47]; [64]",
414
- "rank": 15
415
  },
416
  {
417
- "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
418
- "levels_cracked": "0;0;0",
419
- "steps": "[5]",
420
- "rank": 17
421
  }
422
  ]
423
  },
424
  "Ace Attorney": {
425
- "runs": 2,
426
  "results": [
427
  {
428
- "model": "o1-2024-12-17",
429
- "levels_cracked": "3; 3",
430
- "lives_left": "[5, 3, 3, 0],[4, 5, 3, 0]",
431
- "cracked_details": "4: 7/8",
432
- "rank": 1,
433
- "score": 26,
434
- "note": "stuck at the end not present evidence"
435
  },
436
  {
437
- "model": "o3-2025-04-16",
438
- "levels_cracked": "3",
439
- "lives_left": "[5, 3, 3, 0]",
440
- "cracked_details": "4: 4/8",
441
- "rank": 2,
442
- "score": 23,
443
- "note": "failed to present evidence"
444
- },
445
- {
446
- "model": "gemini-2.5-pro-exp-03-25",
447
- "levels_cracked": "2; 3",
448
- "lives_left": "[5,5,0]; [5, 5, 4, 0]",
449
- "cracked_details": "4: 0/8",
450
- "rank": 3,
451
- "score": 20,
452
- "note": "failed to present evidence"
453
- },
454
- {
455
- "model": "claude-3-7-sonnet-20250219(thinking)",
456
- "levels_cracked": "1; 1",
457
- "lives_left": "[3,0]; [5,0]",
458
- "cracked_details": "2: 3/9",
459
- "rank": 4,
460
- "score": 8,
461
- "note": "failed to present evidence"
462
  },
463
  {
464
- "model": "grok-3-mini-beta",
465
- "levels_cracked": "1",
466
- "lives_left": "[3, 0]",
467
- "cracked_details": "2: 2/9",
468
- "rank": 5,
 
 
 
 
 
 
 
 
469
  "score": 7,
470
- "note": "failed to present evidence"
 
471
  },
472
  {
473
- "model": "claude-3-5-sonnet-20241022",
474
- "levels_cracked": "1",
475
- "lives_left": "5, 5",
476
- "cracked_details": "1:1/8",
477
- "rank": 6,
478
- "score": 6,
479
- "note": "stuck in loop"
 
 
 
480
  },
481
  {
482
  "model": "gpt-4.1-2025-04-14",
483
- "levels_cracked": "1",
484
- "lives_left": "[4,5]",
485
- "cracked_details": "1: 1/8",
486
- "rank": 7,
487
- "score": 6,
488
- "note": "stuck in loop"
489
- },
490
- {
491
- "model": "gemini-2.5-flash-preview-04-17",
492
- "levels_cracked": "0",
493
- "lives_left": "0",
494
- "cracked_details": "1: 4/5",
495
- "rank": 8,
496
- "score": 4,
497
- "note": "stuck in the last option section"
498
  },
499
  {
500
- "model": "gemini-2.0-flash-thinking-exp-1219",
501
- "levels_cracked": "0",
502
- "lives_left": "0",
503
- "cracked_details": "1: 4/5",
504
- "rank": 9,
505
- "score": 4,
506
- "note": "stuck in the last option section"
507
  },
508
  {
509
- "model": "deepseek-r1",
510
- "levels_cracked": "0",
511
- "lives_left": "0",
512
- "cracked_details": "1: 4/5",
513
- "rank": 10,
514
- "score": 4,
515
- "note": "stuck in the 3rd evidence present"
 
 
 
 
 
 
 
 
 
516
  },
517
  {
518
  "model": "o4-mini-2025-04-16",
519
- "levels_cracked": "0",
520
- "lives_left": "0",
521
- "cracked_details": "1:1/5",
522
- "rank": 11,
523
- "score": 1,
524
- "note": "failed to present evidence"
525
- },
526
- {
527
- "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
528
- "levels_cracked": "0",
529
- "lives_left": "0",
530
- "cracked_details": "0:0/5",
531
- "rank": 13,
532
  "score": 0,
533
- "note": "failed to present evidence"
 
534
  }
535
  ]
536
  }
 
3
  "runs": 5,
4
  "results": [
5
  {
6
+ "model": "claude-3-7-sonnet-20250219",
7
+ "score": 710,
8
  "progress": "1-1",
9
+ "time_s": 64.2
 
10
  },
11
  {
12
+ "model": "gpt-4.1-2025-04-14",
13
+ "score": 740,
14
  "progress": "1-1",
15
+ "time_s": 68.6
 
16
  },
17
  {
18
  "model": "gpt-4o-2024-11-20",
19
  "score": 560,
20
  "progress": "1-1",
21
+ "time_s": 58.6
 
22
  },
23
  {
24
  "model": "gemini-2.0-flash",
25
  "score": 320,
26
  "progress": "1-1",
27
+ "time_s": 51.8
 
28
  },
29
  {
30
  "model": "claude-3-5-haiku-20241022",
31
  "score": 140,
32
  "progress": "1-1",
33
+ "time_s": 76.4
 
34
  },
35
  {
36
  "model": "gpt-4.5-preview-2025-02-27",
37
  "score": 160,
38
  "progress": "1-1",
39
+ "time_s": 62.8
 
40
  }
41
  ]
42
  },
43
+ "Super Mario Bros (planning only)": {
44
+ "runs": 3,
45
  "results": [
46
  {
47
+ "model": "claude-3-5-sonnet-20241022",
48
+ "score": 1267.7,
49
+ "detail_data": "709;1532;1562",
50
+ "progress": "1-1"
 
51
  },
52
  {
53
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
54
+ "score": 1418.7,
55
+ "detail_data": "2015;709;1532",
56
+ "progress": "1-1"
57
+ },
58
+ {
59
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
60
+ "score": 1385.0,
61
+ "detail_data": "1672;1266;1247",
62
+ "progress": "1-1"
63
+ },
64
+ {
65
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
66
+ "score": 1498.3,
67
+ "detail_data": "1561;1271;1663",
68
+ "progress": "1-1"
69
+ },
70
+ {
71
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
72
+ "score": 1468.7,
73
+ "detail_data": "898;2008;1500",
74
+ "progress": "1-1"
75
+ },
76
+ {
77
+ "model": "gpt-4.1-2025-04-14",
78
+ "score": 2126.3,
79
+ "detail_data": "1531;722;4126",
80
+ "progress": "1-1"
81
+ },
82
+ {
83
+ "model": "gpt-4o-2024-11-20",
84
+ "score": 2047.3,
85
+ "detail_data": "2017;2590;1535",
86
+ "progress": "1-1"
87
  },
88
  {
89
  "model": "o1-2024-12-17",
90
+ "score": 855,
91
+ "detail_data": "855",
92
+ "progress": "1-1"
 
93
  },
94
  {
95
  "model": "o3-2025-04-16",
96
+ "score": 3445,
97
+ "detail_data": "3445",
98
+ "progress": "1-1"
 
99
  },
100
  {
101
+ "model": "o4-mini-2025-04-16",
102
+ "score": 1448.0,
103
+ "detail_data": "1525;1263;1556",
104
+ "progress": "1-1"
 
105
  },
106
  {
107
+ "model": "Random (x30)",
108
+ "score": 986.97,
109
+ "detail_data": "986.97",
110
+ "progress": "1-1"
111
+ }
112
+ ]
113
+ },
114
+ "2048": {
115
+ "runs": 3,
116
+ "results": [
117
  {
118
+ "model": "claude-3-5-sonnet-20241022",
119
+ "score": 108.2,
120
+ "details": "1352;2860;1532",
121
+ "highest_tail": 128
 
122
  },
123
  {
124
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
125
+ "score": 113.3,
126
+ "details": "2560;3224;2088",
127
+ "highest_tail": 256
 
128
  },
129
  {
130
+ "model": "deepseek-r1",
131
+ "score": 105.2,
132
+ "details": "700;1240;3680",
133
+ "highest_tail": 128
 
134
  },
135
  {
136
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
137
+ "score": 106.6,
138
+ "details": "1304;1316;2472",
139
+ "highest_tail": 256
 
140
  },
141
  {
142
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
143
+ "score": 117.3,
144
+ "details": "5300;2400;3060",
145
+ "highest_tail": 256
 
146
  },
147
  {
148
+ "model": "grok-3-mini-beta (thinking)",
149
+ "score": 118.6,
150
+ "details": "6412;2492;3204",
151
+ "highest_tail": 256
 
152
  },
153
  {
154
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
155
+ "score": 106,
156
+ "details": "1404;1272;2084",
157
+ "highest_tail": 128
 
158
  },
159
  {
160
+ "model": "gpt-4.1-2025-04-14",
161
+ "score": 105.7,
162
+ "details": "1156;2664;1148",
163
+ "highest_tail": 128
 
164
  },
165
  {
166
  "model": "gpt-4o-2024-11-20",
167
+ "score": 106.7,
168
+ "details": "1604;1284;2080",
169
+ "highest_tail": 256
170
+ },
171
+ {
172
+ "model": "o1-2024-12-17",
173
+ "score": 128.9,
174
+ "details": "3132;2004;3136",
175
+ "highest_tail": 512
176
+ },
177
+ {
178
+ "model": "o1-mini-2024-09-12",
179
+ "score": 114.0,
180
+ "details": "21;86;37",
181
+ "highest_tail": 256
182
+ },
183
+ {
184
+ "model": "o3-2025-04-16",
185
+ "score": 128.0,
186
+ "details": "7120",
187
+ "highest_tail": 512
188
+ },
189
+ {
190
+ "model": "o4-mini-2025-04-16",
191
+ "score": 120.6,
192
+ "details": "4928;5456;2912",
193
+ "highest_tail": 256
194
+ },
195
+ {
196
+ "model": "Random (x30)",
197
+ "score": 100.4,
198
+ "details": "",
199
+ "highest_tail": 128
200
  }
201
  ]
202
  },
 
233
  "runs": 3,
234
  "results": [
235
  {
236
+ "model": "claude-3-5-sonnet-20241022",
237
+ "score": 14.7,
238
+ "details": "16;14;14"
 
239
  },
240
  {
241
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
242
+ "score": 16.3,
243
+ "details": "19;15;15"
 
244
  },
245
  {
246
+ "model": "deepseek-r1",
247
+ "score": 14.3,
248
+ "details": "15;14;14"
249
+ },
250
+ {
251
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
252
+ "score": 16.3,
253
+ "details": "20;14;15"
254
+ },
255
+ {
256
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
257
+ "score": 23.3,
258
+ "details": "23;23;24"
259
+ },
260
+ {
261
+ "model": "grok-3-mini-beta (thinking)",
262
+ "score": 21.3,
263
+ "details": "20;15;29"
264
+ },
265
+ {
266
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
267
+ "score": 10.3,
268
+ "details": "9;10;12"
269
+ },
270
+ {
271
+ "model": "gpt-4.1-2025-04-14",
272
+ "score": 13.7,
273
+ "details": "13;14;14"
274
  },
275
  {
276
  "model": "gpt-4o-2024-11-20",
277
+ "score": 14,
278
+ "details": "18;11;13"
279
+ },
280
+ {
281
+ "model": "o1-2024-12-17",
282
+ "score": 35,
283
+ "details": "35"
284
+ },
285
+ {
286
+ "model": "o1-mini-2024-09-12",
287
+ "score": 11.7,
288
+ "details": "11;11;13"
289
+ },
290
+ {
291
+ "model": "o3-2025-04-16",
292
+ "score": 42,
293
+ "details": "42"
294
+ },
295
+ {
296
+ "model": "o4-mini-2025-04-16",
297
+ "score": 25.3,
298
+ "details": "22;35;19"
299
+ },
300
+ {
301
+ "model": "Random (x30)",
302
+ "score": 10.2,
303
+ "details": ""
304
  }
305
  ]
306
  },
 
308
  "runs": 3,
309
  "results": [
310
  {
311
+ "model": "claude-3-5-sonnet-20241022",
312
+ "score": 106,
313
+ "details": "92;165;61"
 
 
314
  },
315
  {
316
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
317
+ "score": 484,
318
+ "details": "535;428;489"
 
 
319
  },
320
  {
321
+ "model": "deepseek-r1",
322
+ "score": 447.3,
323
+ "details": "409;436;497"
 
 
324
  },
325
  {
326
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
327
+ "score": 334.7,
328
+ "details": "259;372;373"
 
 
329
  },
330
  {
331
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
332
+ "score": 416.3,
333
+ "details": "411;414;424"
 
 
334
  },
335
  {
336
+ "model": "grok-3-mini-beta (thinking)",
337
+ "score": 254,
338
+ "details": "299;332;131"
 
 
339
  },
340
  {
341
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
342
+ "score": 128.7,
343
+ "details": "67;139;180"
 
 
344
  },
345
  {
346
+ "model": "gpt-4.1-2025-04-14",
347
+ "score": 182,
348
+ "details": "163;215;168"
 
 
349
  },
350
  {
351
+ "model": "gpt-4o-2024-11-20",
352
+ "score": 147.3,
353
+ "details": "131;104;207"
 
 
354
  },
355
  {
356
+ "model": "o1-2024-12-17",
357
+ "score": 159,
358
+ "details": "159"
 
 
359
  },
360
  {
361
+ "model": "o1-mini-2024-09-12",
362
+ "score": 48,
363
+ "details": "21;86;37"
 
 
364
  },
365
  {
366
+ "model": "o3-2025-04-16",
367
+ "score": 647,
368
+ "details": "647"
 
 
369
  },
370
  {
371
+ "model": "o4-mini-2025-04-16",
372
+ "score": 487.3,
373
+ "details": "259;591;612"
 
 
374
  },
375
  {
376
+ "model": "Random (x30)",
377
+ "score": 116.5,
378
+ "details": ""
 
 
379
  }
380
  ]
381
  },
 
383
  "runs": 3,
384
  "results": [
385
  {
386
+ "model": "claude-3-5-sonnet-20241022",
387
+ "score": 0,
388
+ "detail_box_on_target": "0;0;0",
389
+ "cracked_levels": "0;0;0"
 
 
 
 
 
 
390
  },
391
  {
392
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
393
+ "score": 2.33,
394
+ "detail_box_on_target": "2;4;1",
395
+ "cracked_levels": "1;2;0"
396
  },
397
  {
398
+ "model": "deepseek-r1",
399
+ "score": 1.33,
400
+ "detail_box_on_target": "2;0;2",
401
+ "cracked_levels": "1;0;1"
 
 
 
 
 
 
402
  },
403
  {
404
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
405
+ "score": 1.67,
406
+ "detail_box_on_target": "3;0;2",
407
+ "cracked_levels": "2;0;1"
408
  },
409
  {
410
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
411
+ "score": 4.33,
412
+ "detail_box_on_target": "4;4;5",
413
+ "cracked_levels": "2;2;3"
414
  },
415
  {
416
+ "model": "grok-3-mini-beta (thinking)",
417
+ "score": 5.67,
418
+ "detail_box_on_target": "5;6;6",
419
+ "cracked_levels": "3;3;3"
420
  },
421
  {
422
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
423
+ "score": 0,
424
+ "detail_box_on_target": "0;0;0",
425
+ "cracked_levels": "0;0;0"
 
426
  },
427
  {
428
+ "model": "gpt-4.1-2025-04-14",
429
+ "score": 0,
430
+ "detail_box_on_target": "0;0;0",
431
+ "cracked_levels": "0;0;0"
432
  },
433
  {
434
+ "model": "gpt-4o-2024-11-20",
435
+ "score": 0,
436
+ "detail_box_on_target": "0;0;0",
437
+ "cracked_levels": "0;0;0"
438
  },
439
  {
440
+ "model": "o1-2024-12-17",
441
+ "score": 2.33,
442
+ "detail_box_on_target": "2;2;3",
443
+ "cracked_levels": "1;1;2"
 
444
  },
445
  {
446
+ "model": "o1-mini-2024-09-12",
447
+ "score": 1.33,
448
+ "detail_box_on_target": "1;2;1",
449
+ "cracked_levels": "0;1;0"
 
450
  },
451
  {
452
+ "model": "o3-2025-04-16",
453
+ "score": 8,
454
+ "detail_box_on_target": "10;6",
455
+ "cracked_levels": "5;3"
456
  },
457
  {
458
+ "model": "o4-mini-2025-04-16",
459
+ "score": 5.33,
460
+ "detail_box_on_target": "4;6;6",
461
+ "cracked_levels": "2;2;3"
462
  },
463
  {
464
+ "model": "Random (x30)",
465
+ "score": 0,
466
+ "detail_box_on_target": "0,0,0",
467
+ "cracked_levels": "0,0,0"
468
  }
469
  ]
470
  },
471
  "Ace Attorney": {
472
+ "runs": 1,
473
  "results": [
474
  {
475
+ "model": "claude-3-5-sonnet-20241022",
476
+ "score": 2,
477
+ "progress": "1:2/5",
478
+ "evaluator result": "1/3"
 
 
 
479
  },
480
  {
481
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
482
+ "score": 7,
483
+ "progress": "2:2/9",
484
+ "evaluator result": "5/11"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  },
486
  {
487
+ "model": "deepseek-r1",
488
+ "score": 0,
489
+ "progress": "0",
490
+ "evaluator result": "1/5"
491
+ },
492
+ {
493
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
494
+ "score": 4,
495
+ "progress": "1:4/5",
496
+ "evaluator result": "1/7"
497
+ },
498
+ {
499
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
500
  "score": 7,
501
+ "progress": "2:2/9",
502
+ "evaluator result": "2/3"
503
  },
504
  {
505
+ "model": "grok-3-mini-beta (thinking)",
506
+ "score": 0,
507
+ "progress": "0",
508
+ "evaluator result": "0"
509
+ },
510
+ {
511
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
512
+ "score": 0,
513
+ "progress": "0",
514
+ "evaluator result": "0"
515
  },
516
  {
517
  "model": "gpt-4.1-2025-04-14",
518
+ "score": 2,
519
+ "progress": "1:2/5",
520
+ "evaluator result": "2/3"
 
 
 
 
 
 
 
 
 
 
 
 
521
  },
522
  {
523
+ "model": "gpt-4o-2024-11-20",
524
+ "score": 0,
525
+ "progress": "0",
526
+ "evaluator result": "0"
 
 
 
527
  },
528
  {
529
+ "model": "o1-2024-12-17",
530
+ "score": 16,
531
+ "progress": "3: 2/8",
532
+ "evaluator result": "6/11"
533
+ },
534
+ {
535
+ "model": "o1-mini-2024-09-12",
536
+ "score": 0,
537
+ "progress": "0",
538
+ "evaluator result": "1/5"
539
+ },
540
+ {
541
+ "model": "o3-2025-04-16",
542
+ "score": 16,
543
+ "progress": "3: 2/8",
544
+ "evaluator result": "1/2"
545
  },
546
  {
547
  "model": "o4-mini-2025-04-16",
548
+ "score": 4,
549
+ "progress": "1:4/5",
550
+ "evaluator result": "2/5"
551
+ },
552
+ {
553
+ "model": "Random (x30)",
 
 
 
 
 
 
 
554
  "score": 0,
555
+ "progress": "0",
556
+ "evaluator result": "0"
557
  }
558
  ]
559
  }