Add a function to highlight the best value per task
- app.py  +25 -9
- src/leaderboard/read_evals.py  +1 -0
app.py
CHANGED
@@ -52,6 +52,19 @@ TASK_METADATA_GENERATIVE = {
     "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
 }
 
+def highlight_best_per_task(df):
+    """Add a 🔺 marker next to the maximum value in each task column."""
+
+    task_columns = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+
+    df = df.copy()
+    for col in task_columns:
+        if col in df.columns:
+            max_val = df[col].max()
+            df[col] = df[col].apply(
+                lambda x: f"{x:.1f}🔺" if x == max_val else f"{x:.1f}"
+            )
+    return df
 
 def theoretical_performance(df_hash):
     """
@@ -141,8 +154,8 @@ def create_line_chart(dataframe):
 
     # Layout
     fig.update_layout(
-        title="…
-        xaxis_title="#Params (B)", yaxis_title="…
+        title="Average Performance vs #Params",
+        xaxis_title="#Params (B)", yaxis_title="Average Performance",
         template="plotly_white", hovermode="closest",
         font=dict(family="Arial", size=10), dragmode=False,
         xaxis=dict(tickvals=[0, 25, 50, 75, 100, 125], ticktext=["0", "25", "50", "75", "100"]),
@@ -330,6 +343,9 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     # Apply medal assignments
     sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
 
+    # Show the best values for tasks
+    sorted_dataframe = highlight_best_per_task(sorted_dataframe)
+
     field_list = fields(AutoEvalColumn)
 
     return create_leaderboard_base(sorted_dataframe, field_list, hidden_columns)
@@ -342,7 +358,7 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
     # Sort and reset index
-    sorted_dataframe = dataframe.sort_values(by="…
+    sorted_dataframe = dataframe.sort_values(by="Comb. Perf. ⬆️", ascending=False).reset_index(drop=True)
     sorted_dataframe["Rank"] = sorted_dataframe.index + 1
 
     # Apply medal assignments
@@ -542,12 +558,12 @@ def create_gradio_interface():
                 f"{task} Prompt Std": "Prompt Std",
                 f"{task} Best Prompt": "Best Prompt",
                 f"{task} Best Prompt Id": "Best Prompt Id",
-                task: "…
+                task: "Comb. Perf. ⬆️"
             }),
-            default_selection=['Rank', 'Size', 'FS', 'Model', '…
+            default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
                                'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
             hidden_columns=[col for col in LEADERBOARD_DF.columns if
-                            col not in ['Rank', 'Size', 'FS', 'Model', '…
+                            col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
                                         'Prompt Average', 'Prompt Std', 'Best Prompt',
                                         'Best Prompt Id']]
         )
@@ -567,12 +583,12 @@ def create_gradio_interface():
                 f"{task} Prompt Std": "Prompt Std",
                 f"{task} Best Prompt": "Best Prompt",
                 f"{task} Best Prompt Id": "Best Prompt Id",
-                task: "…
+                task: "Comb. Perf. ⬆️"
            }),
-            default_selection=['Rank', 'Size', 'FS', 'Model', '…
+            default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️', 'Prompt Average',
                                'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
             hidden_columns=[col for col in LEADERBOARD_DF.columns if
-                            col not in ['Rank', 'Size', 'FS', 'Model', '…
+                            col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
                                         'Prompt Average', 'Prompt Std', 'Best Prompt',
                                         'Best Prompt Id']]
         )
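For context, a minimal sketch of what the new helper does once the leaderboard frame has been sorted. Only highlight_best_per_task itself comes from the diff above; the frame, the scores, and the column subset are invented for illustration, and the helper is assumed to be in scope (defined as in app.py).

    import pandas as pd

    # Toy frame with two of the task columns from TASK_METADATA_GENERATIVE;
    # the scores are made up.
    df = pd.DataFrame({
        "Model": ["model-a", "model-b", "model-c"],
        "NER": [71.3, 68.9, 71.3],   # tie on the maximum
        "REL": [80.0, 82.5, 79.1],
    })

    highlighted = highlight_best_per_task(df)  # assumed in scope, defined as in app.py above
    print(highlighted["NER"].tolist())  # ['71.3🔺', '68.9', '71.3🔺'] -- ties are all marked
    print(highlighted["REL"].tolist())  # ['80.0', '82.5🔺', '79.1']

Note that the helper turns the task columns into strings, which is presumably why the diff applies it only after the frame has been sorted and medals assigned in init_leaderboard: any numeric sorting or comparison has to happen on the raw floats first.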
src/leaderboard/read_evals.py
CHANGED
@@ -109,6 +109,7 @@ class EvalResult:
             results[task.benchmark] = int(v[task.metric_type][-1:])
         else:
             #results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+            #results[task.benchmark] = float(v[task.metric_type])
             results[task.benchmark] = float(v[task.metric_type])
             #value = float(v[task.metric_type])
             #results[task.benchmark] = round(value, 2)  # Round to 2 decimals