Yuxuan-Zhang-Dexter committed on
Commit
a62923a
·
1 Parent(s): 2f1c4f3

optimize radar chart, replace -1 with n/a, and optimize text

Browse files
Files changed (3) hide show
  1. assets/news.json +1 -1
  2. data_visualization.py +82 -40
  3. leaderboard_utils.py +3 -3
assets/news.json CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "date": "2025-04-08",
5
  "video_link": "https://www.youtube.com/watch?v=yoEo2Bk7PGA",
6
- "twitter_text": "LLaMA 4 Maverick dazzles on static tasks but stumbles on dynamic gameplay—our transparent leaderboard exposes the true AI challenge.",
7
  "twitter_link": "https://x.com/haoailab/status/1909712259326394519"
8
  },
9
  {
 
3
  {
4
  "date": "2025-04-08",
5
  "video_link": "https://www.youtube.com/watch?v=yoEo2Bk7PGA",
6
+ "twitter_text": "LLaMA 4 Maverick hacks traditional benchmarks but struggles with real gameplay—our transparent leaderboard exposes the new AI challenge.",
7
  "twitter_link": "https://x.com/haoailab/status/1909712259326394519"
8
  },
9
  {
data_visualization.py CHANGED
@@ -46,7 +46,7 @@ def normalize_values(values, mean, std):
46
  return [50 if v > 0 else 0 for v in values] # Handle zero std case
47
  z_scores = [(v - mean) / std for v in values]
48
  # Scale z-scores to 0-100 range, with mean at 50
49
- scaled_values = [max(0, min(100, (z * 30) + 50)) for z in z_scores]
50
  return scaled_values
51
  def simplify_model_name(name):
52
  if name == "claude-3-7-sonnet-20250219(thinking)":
@@ -55,8 +55,6 @@ def simplify_model_name(name):
55
  return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
56
 
57
  def create_horizontal_bar_chart(df, game_name):
58
-
59
-
60
  if game_name == "Super Mario Bros":
61
  score_col = "Score"
62
  df_sorted = df.sort_values(by=score_col, ascending=True)
@@ -86,10 +84,8 @@ def create_horizontal_bar_chart(df, game_name):
86
  else:
87
  return None
88
 
89
-
90
-
91
  x = df_sorted[score_col]
92
- y = [f"{simplify_model_name(row['Player'])} [{row['Organization']}]" for _, row in df_sorted.iterrows()]
93
  colors = [MODEL_COLORS.get(row['Player'], '#808080') for _, row in df_sorted.iterrows()]
94
  texts = [f"{v:.1f}" if game_name == "Candy Crash" else f"{int(v)}" for v in x]
95
 
@@ -105,16 +101,17 @@ def create_horizontal_bar_chart(df, game_name):
105
 
106
  fig.update_layout(
107
  autosize=False,
108
- width=800,
109
  height=600,
110
- margin=dict(l=150, r=150, t=40, b=200),
111
  title=dict(
112
  text=f"{game_name} Performance",
113
- pad=dict(t=10)
 
114
  ),
115
  yaxis=dict(automargin=True),
116
  legend=dict(
117
- font=dict(size=9),
118
  itemsizing='trace',
119
  x=1.1,
120
  y=1,
@@ -132,7 +129,7 @@ def create_radar_charts(df):
132
  categories = [c.replace(" Score", "") for c in game_cols]
133
 
134
  for col in game_cols:
135
- vals = df[col].replace("_", 0).astype(float)
136
  mean, std = vals.mean(), vals.std()
137
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
138
 
@@ -159,7 +156,7 @@ def create_radar_charts(df):
159
  autosize=False,
160
  width=800,
161
  height=600,
162
- margin=dict(l=80, r=150, t=40, b=100),
163
  title=dict(
164
  text="Radar Chart of AI Performance (Normalized)",
165
  pad=dict(t=10)
@@ -190,7 +187,8 @@ def create_group_bar_chart(df):
190
  for game in GAME_ORDER:
191
  col = f"{game} Score"
192
  if col in df.columns:
193
- df[col] = df[col].replace("_", np.nan).astype(float)
 
194
  if df[col].notna().any():
195
  game_cols[game] = col
196
 
@@ -260,7 +258,7 @@ def create_group_bar_chart(df):
260
  continue
261
 
262
  fig.add_trace(go.Bar(
263
- name=simplify_model_name(player),
264
  x=[game_display_map[game] for game in sorted_games],
265
  y=y_vals,
266
  marker_color=MODEL_COLORS.get(player, '#808080'),
@@ -270,8 +268,8 @@ def create_group_bar_chart(df):
270
  fig.update_layout(
271
  autosize=False,
272
  width=1000,
273
- height=600,
274
- margin=dict(l=80, r=150, t=40, b=200),
275
  title=dict(text="Grouped Bar Chart of AI Models (Consistent Trace Grouping)", pad=dict(t=10)),
276
  xaxis_title="Games",
277
  yaxis_title="Normalized Score",
@@ -285,7 +283,7 @@ def create_group_bar_chart(df):
285
  bargroupgap=0.05, # Gap between bars in a group
286
  uniformtext=dict(mode='hide', minsize=8), # Hide text that doesn't fit
287
  legend=dict(
288
- font=dict(size=9),
289
  itemsizing='trace',
290
  x=1.1,
291
  y=1,
@@ -319,12 +317,26 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
319
  if selected_games is None:
320
  selected_games = ['Super Mario Bros', '2048', 'Candy Crash', 'Sokoban']
321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  game_cols = [f"{game} Score" for game in selected_games]
323
- categories = selected_games
324
-
325
  # Normalize
326
  for col in game_cols:
327
- vals = df[col].replace("_", 0).astype(float)
328
  mean, std = vals.mean(), vals.std()
329
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
330
 
@@ -358,28 +370,49 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
358
  theta=categories + [categories[0]],
359
  mode='lines+markers',
360
  fill='toself',
361
- name=simplify_model_name(row["Player"]),
362
  line=dict(color=color, width=4 if is_highlighted else 2),
363
  marker=dict(color=color),
364
  fillcolor=fillcolor,
365
- opacity=1.0 if is_highlighted else 0.7
 
366
  ))
367
 
368
  fig.update_layout(
369
  autosize=False,
370
- width=800,
371
  height=600,
372
- margin=dict(l=80, r=150, t=40, b=100),
373
  title=dict(
374
- text="Single Radar Chart (Normalized Performance)",
375
- pad=dict(t=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  ),
377
- polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
378
  legend=dict(
379
- font=dict(size=9),
 
380
  itemsizing='trace',
381
- x=1.4,
382
- y=1,
383
  xanchor='left',
384
  yanchor='top',
385
  bgcolor='rgba(255,255,255,0.6)',
@@ -388,6 +421,13 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
388
  )
389
  )
390
 
 
 
 
 
 
 
 
391
  return fig
392
 
393
  def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
@@ -405,7 +445,7 @@ def create_organization_radar_chart(rank_data):
405
 
406
  avg_df = pd.DataFrame([
407
  {
408
- **{col: df[df["Organization"] == org][col].replace("_", 0).astype(float).mean() for col in game_cols},
409
  "Organization": org
410
  }
411
  for org in orgs
@@ -431,7 +471,7 @@ def create_organization_radar_chart(rank_data):
431
  autosize=False,
432
  width=800,
433
  height=600,
434
- margin=dict(l=80, r=150, t=40, b=200),
435
  title=dict(
436
  text="Radar Chart: Organization Performance (Normalized)",
437
  pad=dict(t=10)
@@ -460,7 +500,8 @@ def create_top_players_radar_chart(rank_data, n=5):
460
  categories = [g.replace(" Score", "") for g in game_cols]
461
 
462
  for col in game_cols:
463
- vals = top_df[col].replace("_", 0).astype(float)
 
464
  mean, std = vals.mean(), vals.std()
465
  top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
466
 
@@ -472,14 +513,14 @@ def create_top_players_radar_chart(rank_data, n=5):
472
  theta=categories + [categories[0]],
473
  mode='lines+markers',
474
  fill='toself',
475
- name=simplify_model_name(row["Player"])
476
  ))
477
 
478
  fig.update_layout(
479
  autosize=False,
480
  width=800,
481
  height=600,
482
- margin=dict(l=80, r=150, t=40, b=200),
483
  title=dict(
484
  text=f"Top {n} Players Radar Chart (Normalized)",
485
  pad=dict(t=10)
@@ -515,8 +556,9 @@ def create_player_radar_chart(rank_data, player_name):
515
  categories = [g.replace(" Score", "") for g in game_cols]
516
 
517
  for col in game_cols:
518
- vals = player_df[col].replace("_", 0).astype(float)
519
- mean, std = df[col].replace("_", 0).astype(float).mean(), df[col].replace("_", 0).astype(float).std()
 
520
  player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
521
 
522
  fig = go.Figure()
@@ -527,16 +569,16 @@ def create_player_radar_chart(rank_data, player_name):
527
  theta=categories + [categories[0]],
528
  mode='lines+markers',
529
  fill='toself',
530
- name=simplify_model_name(row["Player"])
531
  ))
532
 
533
  fig.update_layout(
534
  autosize=False,
535
  width=800,
536
  height=600,
537
- margin=dict(l=80, r=150, t=40, b=200),
538
  title=dict(
539
- text=f"{simplify_model_name(player_name)} Radar Chart (Normalized)",
540
  pad=dict(t=10)
541
  ),
542
  polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
 
46
  return [50 if v > 0 else 0 for v in values] # Handle zero std case
47
  z_scores = [(v - mean) / std for v in values]
48
  # Scale z-scores to 0-100 range, with mean at 35
49
+ scaled_values = [max(0, min(100, (z * 30) + 35)) for z in z_scores]
50
  return scaled_values
51
  def simplify_model_name(name):
52
  if name == "claude-3-7-sonnet-20250219(thinking)":
 
55
  return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
56
 
57
  def create_horizontal_bar_chart(df, game_name):
 
 
58
  if game_name == "Super Mario Bros":
59
  score_col = "Score"
60
  df_sorted = df.sort_values(by=score_col, ascending=True)
 
84
  else:
85
  return None
86
 
 
 
87
  x = df_sorted[score_col]
88
+ y = [f"{row['Player']} [{row['Organization']}]" for _, row in df_sorted.iterrows()]
89
  colors = [MODEL_COLORS.get(row['Player'], '#808080') for _, row in df_sorted.iterrows()]
90
  texts = [f"{v:.1f}" if game_name == "Candy Crash" else f"{int(v)}" for v in x]
91
 
 
101
 
102
  fig.update_layout(
103
  autosize=False,
104
+ width=1000,
105
  height=600,
106
+ margin=dict(l=200, r=200, t=20, b=20),
107
  title=dict(
108
  text=f"{game_name} Performance",
109
+ pad=dict(t=10),
110
+ font=dict(size=20)
111
  ),
112
  yaxis=dict(automargin=True),
113
  legend=dict(
114
+ font=dict(size=12),
115
  itemsizing='trace',
116
  x=1.1,
117
  y=1,
 
129
  categories = [c.replace(" Score", "") for c in game_cols]
130
 
131
  for col in game_cols:
132
+ vals = df[col].replace("n/a", 0).astype(float)
133
  mean, std = vals.mean(), vals.std()
134
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
135
 
 
156
  autosize=False,
157
  width=800,
158
  height=600,
159
+ margin=dict(l=80, r=150, t=20, b=20),
160
  title=dict(
161
  text="Radar Chart of AI Performance (Normalized)",
162
  pad=dict(t=10)
 
187
  for game in GAME_ORDER:
188
  col = f"{game} Score"
189
  if col in df.columns:
190
+ # Replace "n/a" with np.nan and handle downcasting properly
191
+ df[col] = df[col].replace("n/a", np.nan).infer_objects(copy=False).astype(float)
192
  if df[col].notna().any():
193
  game_cols[game] = col
194
 
 
258
  continue
259
 
260
  fig.add_trace(go.Bar(
261
+ name=row["Player"],
262
  x=[game_display_map[game] for game in sorted_games],
263
  y=y_vals,
264
  marker_color=MODEL_COLORS.get(player, '#808080'),
 
268
  fig.update_layout(
269
  autosize=False,
270
  width=1000,
271
+ height=800,
272
+ margin=dict(l=200, r=200, t=20, b=20),
273
  title=dict(text="Grouped Bar Chart of AI Models (Consistent Trace Grouping)", pad=dict(t=10)),
274
  xaxis_title="Games",
275
  yaxis_title="Normalized Score",
 
283
  bargroupgap=0.05, # Gap between bars in a group
284
  uniformtext=dict(mode='hide', minsize=8), # Hide text that doesn't fit
285
  legend=dict(
286
+ font=dict(size=12),
287
  itemsizing='trace',
288
  x=1.1,
289
  y=1,
 
317
  if selected_games is None:
318
  selected_games = ['Super Mario Bros', '2048', 'Candy Crash', 'Sokoban']
319
 
320
+ # Format game names with line breaks
321
+ formatted_games = []
322
+ for game in selected_games:
323
+ if game == 'Super Mario Bros':
324
+ formatted_games.append('Super<br>Mario')
325
+ elif game == 'Candy Crash':
326
+ formatted_games.append('Candy<br>Crash')
327
+ elif game == 'Tetris (complete)':
328
+ formatted_games.append('Tetris<br>(complete)')
329
+ elif game == 'Tetris (planning only)':
330
+ formatted_games.append('Tetris<br>(planning)')
331
+ else:
332
+ formatted_games.append(game)
333
+
334
  game_cols = [f"{game} Score" for game in selected_games]
335
+ categories = formatted_games
336
+
337
  # Normalize
338
  for col in game_cols:
339
+ vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
340
  mean, std = vals.mean(), vals.std()
341
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
342
 
 
370
  theta=categories + [categories[0]],
371
  mode='lines+markers',
372
  fill='toself',
373
+ name=player,
374
  line=dict(color=color, width=4 if is_highlighted else 2),
375
  marker=dict(color=color),
376
  fillcolor=fillcolor,
377
+ opacity=1.0 if is_highlighted else 0.7,
378
+ hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
379
  ))
380
 
381
  fig.update_layout(
382
  autosize=False,
383
+ width=1000,
384
  height=600,
385
+ margin=dict(l=400, r=200, t=20, b=20),
386
  title=dict(
387
+ text="AI Normalized Performance Across Games",
388
+ x=0.5,
389
+ xanchor='center',
390
+ yanchor='top',
391
+ y=0.95,
392
+ font=dict(size=20),
393
+ pad=dict(b=20)
394
+ ),
395
+ polar=dict(
396
+ radialaxis=dict(
397
+ visible=True,
398
+ range=[0, 100],
399
+ tickangle=45,
400
+ tickfont=dict(size=12),
401
+ gridcolor='lightgray',
402
+ gridwidth=1,
403
+ angle=45
404
+ ),
405
+ angularaxis=dict(
406
+ tickfont=dict(size=14, weight='bold'),
407
+ tickangle=0
408
+ )
409
  ),
 
410
  legend=dict(
411
+ font=dict(size=12),
412
+ title="Choose your model: ",
413
  itemsizing='trace',
414
+ x=-1.2,
415
+ y=0.8,
416
  xanchor='left',
417
  yanchor='top',
418
  bgcolor='rgba(255,255,255,0.6)',
 
421
  )
422
  )
423
 
424
+ fig.update_layout(
425
+ legend=dict(
426
+ itemclick="toggleothers", # This will make clicked item the only visible one
427
+ itemdoubleclick="toggle" # Double click toggles visibility
428
+ )
429
+ )
430
+
431
  return fig
432
 
433
  def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
 
445
 
446
  avg_df = pd.DataFrame([
447
  {
448
+ **{col: df[df["Organization"] == org][col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean() for col in game_cols},
449
  "Organization": org
450
  }
451
  for org in orgs
 
471
  autosize=False,
472
  width=800,
473
  height=600,
474
+ margin=dict(l=80, r=150, t=20, b=20),
475
  title=dict(
476
  text="Radar Chart: Organization Performance (Normalized)",
477
  pad=dict(t=10)
 
500
  categories = [g.replace(" Score", "") for g in game_cols]
501
 
502
  for col in game_cols:
503
+ # Replace "n/a" with 0 and handle downcasting properly
504
+ vals = top_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
505
  mean, std = vals.mean(), vals.std()
506
  top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
507
 
 
513
  theta=categories + [categories[0]],
514
  mode='lines+markers',
515
  fill='toself',
516
+ name=row["Player"]
517
  ))
518
 
519
  fig.update_layout(
520
  autosize=False,
521
  width=800,
522
  height=600,
523
+ margin=dict(l=80, r=150, t=20, b=20),
524
  title=dict(
525
  text=f"Top {n} Players Radar Chart (Normalized)",
526
  pad=dict(t=10)
 
556
  categories = [g.replace(" Score", "") for g in game_cols]
557
 
558
  for col in game_cols:
559
+ # Replace "n/a" with 0 and handle downcasting properly
560
+ vals = player_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
561
+ mean, std = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean(), df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).std()
562
  player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
563
 
564
  fig = go.Figure()
 
569
  theta=categories + [categories[0]],
570
  mode='lines+markers',
571
  fill='toself',
572
+ name=row["Player"]
573
  ))
574
 
575
  fig.update_layout(
576
  autosize=False,
577
  width=800,
578
  height=600,
579
+ margin=dict(l=80, r=150, t=20, b=20),
580
  title=dict(
581
+ text=f"{row['Player']} Radar Chart (Normalized)",
582
  pad=dict(t=10)
583
  ),
584
  polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
leaderboard_utils.py CHANGED
@@ -175,7 +175,7 @@ def calculate_rank_and_completeness(rank_data, selected_games):
175
  ranks.append(rank)
176
  player_data[f"{game} Score"] = player_score
177
  else:
178
- player_data[f"{game} Score"] = -1
179
 
180
  # Calculate average rank and completeness for sorting only
181
  if ranks:
@@ -264,7 +264,7 @@ def get_combined_leaderboard(rank_data, selected_games):
264
  elif game in ["Tetris (complete)", "Tetris (planning only)"]:
265
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
266
  else:
267
- player_data[f"{game} Score"] = -1
268
 
269
  results.append(player_data)
270
 
@@ -278,7 +278,7 @@ def get_combined_leaderboard(rank_data, selected_games):
278
  for game in GAME_ORDER:
279
  if f"{game} Score" in df_results.columns:
280
  df_results["Total Score"] += df_results[f"{game} Score"].apply(
281
- lambda x: float(x) if x != -1 else 0
282
  )
283
 
284
  # Sort by total score in descending order
 
175
  ranks.append(rank)
176
  player_data[f"{game} Score"] = player_score
177
  else:
178
+ player_data[f"{game} Score"] = 'n/a'
179
 
180
  # Calculate average rank and completeness for sorting only
181
  if ranks:
 
264
  elif game in ["Tetris (complete)", "Tetris (planning only)"]:
265
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
266
  else:
267
+ player_data[f"{game} Score"] = 'n/a'
268
 
269
  results.append(player_data)
270
 
 
278
  for game in GAME_ORDER:
279
  if f"{game} Score" in df_results.columns:
280
  df_results["Total Score"] += df_results[f"{game} Score"].apply(
281
+ lambda x: float(x) if x != 'n/a' else 0
282
  )
283
 
284
  # Sort by total score in descending order