Spaces:
Running
Running
Yuxuan-Zhang-Dexter
committed on
Commit
·
a62923a
1
Parent(s):
2f1c4f3
optimize radar chart, replace -1 with n/a, and optimize text
Browse files
- assets/news.json +1 -1
- data_visualization.py +82 -40
- leaderboard_utils.py +3 -3
assets/news.json
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
{
|
4 |
"date": "2025-04-08",
|
5 |
"video_link": "https://www.youtube.com/watch?v=yoEo2Bk7PGA",
|
6 |
-
"twitter_text": "LLaMA 4 Maverick
|
7 |
"twitter_link": "https://x.com/haoailab/status/1909712259326394519"
|
8 |
},
|
9 |
{
|
|
|
3 |
{
|
4 |
"date": "2025-04-08",
|
5 |
"video_link": "https://www.youtube.com/watch?v=yoEo2Bk7PGA",
|
6 |
+
"twitter_text": "LLaMA 4 Maverick hacks traditional benchmarks but struggles with real gameplay—our transparent leaderboard exposes the new AI challenge.",
|
7 |
"twitter_link": "https://x.com/haoailab/status/1909712259326394519"
|
8 |
},
|
9 |
{
|
data_visualization.py
CHANGED
@@ -46,7 +46,7 @@ def normalize_values(values, mean, std):
|
|
46 |
return [50 if v > 0 else 0 for v in values] # Handle zero std case
|
47 |
z_scores = [(v - mean) / std for v in values]
|
48 |
# Scale z-scores to 0-100 range, with mean at 50
|
49 |
-
scaled_values = [max(0, min(100, (z * 30) +
|
50 |
return scaled_values
|
51 |
def simplify_model_name(name):
|
52 |
if name == "claude-3-7-sonnet-20250219(thinking)":
|
@@ -55,8 +55,6 @@ def simplify_model_name(name):
|
|
55 |
return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
|
56 |
|
57 |
def create_horizontal_bar_chart(df, game_name):
|
58 |
-
|
59 |
-
|
60 |
if game_name == "Super Mario Bros":
|
61 |
score_col = "Score"
|
62 |
df_sorted = df.sort_values(by=score_col, ascending=True)
|
@@ -86,10 +84,8 @@ def create_horizontal_bar_chart(df, game_name):
|
|
86 |
else:
|
87 |
return None
|
88 |
|
89 |
-
|
90 |
-
|
91 |
x = df_sorted[score_col]
|
92 |
-
y = [f"{
|
93 |
colors = [MODEL_COLORS.get(row['Player'], '#808080') for _, row in df_sorted.iterrows()]
|
94 |
texts = [f"{v:.1f}" if game_name == "Candy Crash" else f"{int(v)}" for v in x]
|
95 |
|
@@ -105,16 +101,17 @@ def create_horizontal_bar_chart(df, game_name):
|
|
105 |
|
106 |
fig.update_layout(
|
107 |
autosize=False,
|
108 |
-
width=
|
109 |
height=600,
|
110 |
-
margin=dict(l=
|
111 |
title=dict(
|
112 |
text=f"{game_name} Performance",
|
113 |
-
pad=dict(t=10)
|
|
|
114 |
),
|
115 |
yaxis=dict(automargin=True),
|
116 |
legend=dict(
|
117 |
-
font=dict(size=
|
118 |
itemsizing='trace',
|
119 |
x=1.1,
|
120 |
y=1,
|
@@ -132,7 +129,7 @@ def create_radar_charts(df):
|
|
132 |
categories = [c.replace(" Score", "") for c in game_cols]
|
133 |
|
134 |
for col in game_cols:
|
135 |
-
vals = df[col].replace("
|
136 |
mean, std = vals.mean(), vals.std()
|
137 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
138 |
|
@@ -159,7 +156,7 @@ def create_radar_charts(df):
|
|
159 |
autosize=False,
|
160 |
width=800,
|
161 |
height=600,
|
162 |
-
margin=dict(l=80, r=150, t=
|
163 |
title=dict(
|
164 |
text="Radar Chart of AI Performance (Normalized)",
|
165 |
pad=dict(t=10)
|
@@ -190,7 +187,8 @@ def create_group_bar_chart(df):
|
|
190 |
for game in GAME_ORDER:
|
191 |
col = f"{game} Score"
|
192 |
if col in df.columns:
|
193 |
-
|
|
|
194 |
if df[col].notna().any():
|
195 |
game_cols[game] = col
|
196 |
|
@@ -260,7 +258,7 @@ def create_group_bar_chart(df):
|
|
260 |
continue
|
261 |
|
262 |
fig.add_trace(go.Bar(
|
263 |
-
name=
|
264 |
x=[game_display_map[game] for game in sorted_games],
|
265 |
y=y_vals,
|
266 |
marker_color=MODEL_COLORS.get(player, '#808080'),
|
@@ -270,8 +268,8 @@ def create_group_bar_chart(df):
|
|
270 |
fig.update_layout(
|
271 |
autosize=False,
|
272 |
width=1000,
|
273 |
-
height=
|
274 |
-
margin=dict(l=
|
275 |
title=dict(text="Grouped Bar Chart of AI Models (Consistent Trace Grouping)", pad=dict(t=10)),
|
276 |
xaxis_title="Games",
|
277 |
yaxis_title="Normalized Score",
|
@@ -285,7 +283,7 @@ def create_group_bar_chart(df):
|
|
285 |
bargroupgap=0.05, # Gap between bars in a group
|
286 |
uniformtext=dict(mode='hide', minsize=8), # Hide text that doesn't fit
|
287 |
legend=dict(
|
288 |
-
font=dict(size=
|
289 |
itemsizing='trace',
|
290 |
x=1.1,
|
291 |
y=1,
|
@@ -319,12 +317,26 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
319 |
if selected_games is None:
|
320 |
selected_games = ['Super Mario Bros', '2048', 'Candy Crash', 'Sokoban']
|
321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
game_cols = [f"{game} Score" for game in selected_games]
|
323 |
-
categories =
|
324 |
-
|
325 |
# Normalize
|
326 |
for col in game_cols:
|
327 |
-
vals = df[col].replace("
|
328 |
mean, std = vals.mean(), vals.std()
|
329 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
330 |
|
@@ -358,28 +370,49 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
358 |
theta=categories + [categories[0]],
|
359 |
mode='lines+markers',
|
360 |
fill='toself',
|
361 |
-
name=
|
362 |
line=dict(color=color, width=4 if is_highlighted else 2),
|
363 |
marker=dict(color=color),
|
364 |
fillcolor=fillcolor,
|
365 |
-
opacity=1.0 if is_highlighted else 0.7
|
|
|
366 |
))
|
367 |
|
368 |
fig.update_layout(
|
369 |
autosize=False,
|
370 |
-
width=
|
371 |
height=600,
|
372 |
-
margin=dict(l=
|
373 |
title=dict(
|
374 |
-
text="
|
375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
),
|
377 |
-
polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
|
378 |
legend=dict(
|
379 |
-
font=dict(size=
|
|
|
380 |
itemsizing='trace',
|
381 |
-
x
|
382 |
-
y=
|
383 |
xanchor='left',
|
384 |
yanchor='top',
|
385 |
bgcolor='rgba(255,255,255,0.6)',
|
@@ -388,6 +421,13 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
388 |
)
|
389 |
)
|
390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
return fig
|
392 |
|
393 |
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
|
@@ -405,7 +445,7 @@ def create_organization_radar_chart(rank_data):
|
|
405 |
|
406 |
avg_df = pd.DataFrame([
|
407 |
{
|
408 |
-
**{col: df[df["Organization"] == org][col].replace("
|
409 |
"Organization": org
|
410 |
}
|
411 |
for org in orgs
|
@@ -431,7 +471,7 @@ def create_organization_radar_chart(rank_data):
|
|
431 |
autosize=False,
|
432 |
width=800,
|
433 |
height=600,
|
434 |
-
margin=dict(l=80, r=150, t=
|
435 |
title=dict(
|
436 |
text="Radar Chart: Organization Performance (Normalized)",
|
437 |
pad=dict(t=10)
|
@@ -460,7 +500,8 @@ def create_top_players_radar_chart(rank_data, n=5):
|
|
460 |
categories = [g.replace(" Score", "") for g in game_cols]
|
461 |
|
462 |
for col in game_cols:
|
463 |
-
|
|
|
464 |
mean, std = vals.mean(), vals.std()
|
465 |
top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
466 |
|
@@ -472,14 +513,14 @@ def create_top_players_radar_chart(rank_data, n=5):
|
|
472 |
theta=categories + [categories[0]],
|
473 |
mode='lines+markers',
|
474 |
fill='toself',
|
475 |
-
name=
|
476 |
))
|
477 |
|
478 |
fig.update_layout(
|
479 |
autosize=False,
|
480 |
width=800,
|
481 |
height=600,
|
482 |
-
margin=dict(l=80, r=150, t=
|
483 |
title=dict(
|
484 |
text=f"Top {n} Players Radar Chart (Normalized)",
|
485 |
pad=dict(t=10)
|
@@ -515,8 +556,9 @@ def create_player_radar_chart(rank_data, player_name):
|
|
515 |
categories = [g.replace(" Score", "") for g in game_cols]
|
516 |
|
517 |
for col in game_cols:
|
518 |
-
|
519 |
-
|
|
|
520 |
player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
521 |
|
522 |
fig = go.Figure()
|
@@ -527,16 +569,16 @@ def create_player_radar_chart(rank_data, player_name):
|
|
527 |
theta=categories + [categories[0]],
|
528 |
mode='lines+markers',
|
529 |
fill='toself',
|
530 |
-
name=
|
531 |
))
|
532 |
|
533 |
fig.update_layout(
|
534 |
autosize=False,
|
535 |
width=800,
|
536 |
height=600,
|
537 |
-
margin=dict(l=80, r=150, t=
|
538 |
title=dict(
|
539 |
-
text=f"{
|
540 |
pad=dict(t=10)
|
541 |
),
|
542 |
polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
|
|
|
46 |
return [50 if v > 0 else 0 for v in values] # Handle zero std case
|
47 |
z_scores = [(v - mean) / std for v in values]
|
48 |
# Scale z-scores to 0-100 range, with mean at 50
|
49 |
+
scaled_values = [max(0, min(100, (z * 30) + 35)) for z in z_scores]
|
50 |
return scaled_values
|
51 |
def simplify_model_name(name):
|
52 |
if name == "claude-3-7-sonnet-20250219(thinking)":
|
|
|
55 |
return '-'.join(parts[:4]) + '-...' if len(parts) > 4 else name
|
56 |
|
57 |
def create_horizontal_bar_chart(df, game_name):
|
|
|
|
|
58 |
if game_name == "Super Mario Bros":
|
59 |
score_col = "Score"
|
60 |
df_sorted = df.sort_values(by=score_col, ascending=True)
|
|
|
84 |
else:
|
85 |
return None
|
86 |
|
|
|
|
|
87 |
x = df_sorted[score_col]
|
88 |
+
y = [f"{row['Player']} [{row['Organization']}]" for _, row in df_sorted.iterrows()]
|
89 |
colors = [MODEL_COLORS.get(row['Player'], '#808080') for _, row in df_sorted.iterrows()]
|
90 |
texts = [f"{v:.1f}" if game_name == "Candy Crash" else f"{int(v)}" for v in x]
|
91 |
|
|
|
101 |
|
102 |
fig.update_layout(
|
103 |
autosize=False,
|
104 |
+
width=1000,
|
105 |
height=600,
|
106 |
+
margin=dict(l=200, r=200, t=20, b=20),
|
107 |
title=dict(
|
108 |
text=f"{game_name} Performance",
|
109 |
+
pad=dict(t=10),
|
110 |
+
font=dict(size=20)
|
111 |
),
|
112 |
yaxis=dict(automargin=True),
|
113 |
legend=dict(
|
114 |
+
font=dict(size=12),
|
115 |
itemsizing='trace',
|
116 |
x=1.1,
|
117 |
y=1,
|
|
|
129 |
categories = [c.replace(" Score", "") for c in game_cols]
|
130 |
|
131 |
for col in game_cols:
|
132 |
+
vals = df[col].replace("n/a", 0).astype(float)
|
133 |
mean, std = vals.mean(), vals.std()
|
134 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
135 |
|
|
|
156 |
autosize=False,
|
157 |
width=800,
|
158 |
height=600,
|
159 |
+
margin=dict(l=80, r=150, t=20, b=20),
|
160 |
title=dict(
|
161 |
text="Radar Chart of AI Performance (Normalized)",
|
162 |
pad=dict(t=10)
|
|
|
187 |
for game in GAME_ORDER:
|
188 |
col = f"{game} Score"
|
189 |
if col in df.columns:
|
190 |
+
# Replace "n/a" with np.nan and handle downcasting properly
|
191 |
+
df[col] = df[col].replace("n/a", np.nan).infer_objects(copy=False).astype(float)
|
192 |
if df[col].notna().any():
|
193 |
game_cols[game] = col
|
194 |
|
|
|
258 |
continue
|
259 |
|
260 |
fig.add_trace(go.Bar(
|
261 |
+
name=row["Player"],
|
262 |
x=[game_display_map[game] for game in sorted_games],
|
263 |
y=y_vals,
|
264 |
marker_color=MODEL_COLORS.get(player, '#808080'),
|
|
|
268 |
fig.update_layout(
|
269 |
autosize=False,
|
270 |
width=1000,
|
271 |
+
height=800,
|
272 |
+
margin=dict(l=200, r=200, t=20, b=20),
|
273 |
title=dict(text="Grouped Bar Chart of AI Models (Consistent Trace Grouping)", pad=dict(t=10)),
|
274 |
xaxis_title="Games",
|
275 |
yaxis_title="Normalized Score",
|
|
|
283 |
bargroupgap=0.05, # Gap between bars in a group
|
284 |
uniformtext=dict(mode='hide', minsize=8), # Hide text that doesn't fit
|
285 |
legend=dict(
|
286 |
+
font=dict(size=12),
|
287 |
itemsizing='trace',
|
288 |
x=1.1,
|
289 |
y=1,
|
|
|
317 |
if selected_games is None:
|
318 |
selected_games = ['Super Mario Bros', '2048', 'Candy Crash', 'Sokoban']
|
319 |
|
320 |
+
# Format game names with line breaks
|
321 |
+
formatted_games = []
|
322 |
+
for game in selected_games:
|
323 |
+
if game == 'Super Mario Bros':
|
324 |
+
formatted_games.append('Super<br>Mario')
|
325 |
+
elif game == 'Candy Crash':
|
326 |
+
formatted_games.append('Candy<br>Crash')
|
327 |
+
elif game == 'Tetris (complete)':
|
328 |
+
formatted_games.append('Tetris<br>(complete)')
|
329 |
+
elif game == 'Tetris (planning only)':
|
330 |
+
formatted_games.append('Tetris<br>(planning)')
|
331 |
+
else:
|
332 |
+
formatted_games.append(game)
|
333 |
+
|
334 |
game_cols = [f"{game} Score" for game in selected_games]
|
335 |
+
categories = formatted_games
|
336 |
+
|
337 |
# Normalize
|
338 |
for col in game_cols:
|
339 |
+
vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
|
340 |
mean, std = vals.mean(), vals.std()
|
341 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
342 |
|
|
|
370 |
theta=categories + [categories[0]],
|
371 |
mode='lines+markers',
|
372 |
fill='toself',
|
373 |
+
name=player,
|
374 |
line=dict(color=color, width=4 if is_highlighted else 2),
|
375 |
marker=dict(color=color),
|
376 |
fillcolor=fillcolor,
|
377 |
+
opacity=1.0 if is_highlighted else 0.7,
|
378 |
+
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
379 |
))
|
380 |
|
381 |
fig.update_layout(
|
382 |
autosize=False,
|
383 |
+
width=1000,
|
384 |
height=600,
|
385 |
+
margin=dict(l=400, r=200, t=20, b=20),
|
386 |
title=dict(
|
387 |
+
text="AI Normalized Performance Across Games",
|
388 |
+
x=0.5,
|
389 |
+
xanchor='center',
|
390 |
+
yanchor='top',
|
391 |
+
y=0.95,
|
392 |
+
font=dict(size=20),
|
393 |
+
pad=dict(b=20)
|
394 |
+
),
|
395 |
+
polar=dict(
|
396 |
+
radialaxis=dict(
|
397 |
+
visible=True,
|
398 |
+
range=[0, 100],
|
399 |
+
tickangle=45,
|
400 |
+
tickfont=dict(size=12),
|
401 |
+
gridcolor='lightgray',
|
402 |
+
gridwidth=1,
|
403 |
+
angle=45
|
404 |
+
),
|
405 |
+
angularaxis=dict(
|
406 |
+
tickfont=dict(size=14, weight='bold'),
|
407 |
+
tickangle=0
|
408 |
+
)
|
409 |
),
|
|
|
410 |
legend=dict(
|
411 |
+
font=dict(size=12),
|
412 |
+
title="Choose your model: ",
|
413 |
itemsizing='trace',
|
414 |
+
x=-1.2,
|
415 |
+
y=0.8,
|
416 |
xanchor='left',
|
417 |
yanchor='top',
|
418 |
bgcolor='rgba(255,255,255,0.6)',
|
|
|
421 |
)
|
422 |
)
|
423 |
|
424 |
+
fig.update_layout(
|
425 |
+
legend=dict(
|
426 |
+
itemclick="toggleothers", # This will make clicked item the only visible one
|
427 |
+
itemdoubleclick="toggle" # Double click toggles visibility
|
428 |
+
)
|
429 |
+
)
|
430 |
+
|
431 |
return fig
|
432 |
|
433 |
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
|
|
|
445 |
|
446 |
avg_df = pd.DataFrame([
|
447 |
{
|
448 |
+
**{col: df[df["Organization"] == org][col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean() for col in game_cols},
|
449 |
"Organization": org
|
450 |
}
|
451 |
for org in orgs
|
|
|
471 |
autosize=False,
|
472 |
width=800,
|
473 |
height=600,
|
474 |
+
margin=dict(l=80, r=150, t=20, b=20),
|
475 |
title=dict(
|
476 |
text="Radar Chart: Organization Performance (Normalized)",
|
477 |
pad=dict(t=10)
|
|
|
500 |
categories = [g.replace(" Score", "") for g in game_cols]
|
501 |
|
502 |
for col in game_cols:
|
503 |
+
# Replace "n/a" with 0 and handle downcasting properly
|
504 |
+
vals = top_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
|
505 |
mean, std = vals.mean(), vals.std()
|
506 |
top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
507 |
|
|
|
513 |
theta=categories + [categories[0]],
|
514 |
mode='lines+markers',
|
515 |
fill='toself',
|
516 |
+
name=row["Player"]
|
517 |
))
|
518 |
|
519 |
fig.update_layout(
|
520 |
autosize=False,
|
521 |
width=800,
|
522 |
height=600,
|
523 |
+
margin=dict(l=80, r=150, t=20, b=20),
|
524 |
title=dict(
|
525 |
text=f"Top {n} Players Radar Chart (Normalized)",
|
526 |
pad=dict(t=10)
|
|
|
556 |
categories = [g.replace(" Score", "") for g in game_cols]
|
557 |
|
558 |
for col in game_cols:
|
559 |
+
# Replace "n/a" with 0 and handle downcasting properly
|
560 |
+
vals = player_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
|
561 |
+
mean, std = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean(), df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).std()
|
562 |
player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
563 |
|
564 |
fig = go.Figure()
|
|
|
569 |
theta=categories + [categories[0]],
|
570 |
mode='lines+markers',
|
571 |
fill='toself',
|
572 |
+
name=row["Player"]
|
573 |
))
|
574 |
|
575 |
fig.update_layout(
|
576 |
autosize=False,
|
577 |
width=800,
|
578 |
height=600,
|
579 |
+
margin=dict(l=80, r=150, t=20, b=20),
|
580 |
title=dict(
|
581 |
+
text=f"{row['Player']} Radar Chart (Normalized)",
|
582 |
pad=dict(t=10)
|
583 |
),
|
584 |
polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
|
leaderboard_utils.py
CHANGED
@@ -175,7 +175,7 @@ def calculate_rank_and_completeness(rank_data, selected_games):
|
|
175 |
ranks.append(rank)
|
176 |
player_data[f"{game} Score"] = player_score
|
177 |
else:
|
178 |
-
player_data[f"{game} Score"] =
|
179 |
|
180 |
# Calculate average rank and completeness for sorting only
|
181 |
if ranks:
|
@@ -264,7 +264,7 @@ def get_combined_leaderboard(rank_data, selected_games):
|
|
264 |
elif game in ["Tetris (complete)", "Tetris (planning only)"]:
|
265 |
player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
|
266 |
else:
|
267 |
-
player_data[f"{game} Score"] =
|
268 |
|
269 |
results.append(player_data)
|
270 |
|
@@ -278,7 +278,7 @@ def get_combined_leaderboard(rank_data, selected_games):
|
|
278 |
for game in GAME_ORDER:
|
279 |
if f"{game} Score" in df_results.columns:
|
280 |
df_results["Total Score"] += df_results[f"{game} Score"].apply(
|
281 |
-
lambda x: float(x) if x !=
|
282 |
)
|
283 |
|
284 |
# Sort by total score in descending order
|
|
|
175 |
ranks.append(rank)
|
176 |
player_data[f"{game} Score"] = player_score
|
177 |
else:
|
178 |
+
player_data[f"{game} Score"] = 'n/a'
|
179 |
|
180 |
# Calculate average rank and completeness for sorting only
|
181 |
if ranks:
|
|
|
264 |
elif game in ["Tetris (complete)", "Tetris (planning only)"]:
|
265 |
player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
|
266 |
else:
|
267 |
+
player_data[f"{game} Score"] = 'n/a'
|
268 |
|
269 |
results.append(player_data)
|
270 |
|
|
|
278 |
for game in GAME_ORDER:
|
279 |
if f"{game} Score" in df_results.columns:
|
280 |
df_results["Total Score"] += df_results[f"{game} Score"].apply(
|
281 |
+
lambda x: float(x) if x != 'n/a' else 0
|
282 |
)
|
283 |
|
284 |
# Sort by total score in descending order
|