Add AlpacaEval base winrate
Browse files
app.py
CHANGED
|
@@ -64,9 +64,6 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
| 64 |
# MATH reports qem
|
| 65 |
elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
|
| 66 |
value = data["results"]["all"]["qem"]
|
| 67 |
-
# Report length controlled winrate for AlpacaEval
|
| 68 |
-
elif task.lower() == "alpaca_eval":
|
| 69 |
-
value = data["results"][first_result_key]["length_controlled_winrate"] / 100.0
|
| 70 |
else:
|
| 71 |
first_metric_key = next(
|
| 72 |
iter(data["results"][first_result_key])
|
|
@@ -80,12 +77,18 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
| 80 |
level = k.split("|")[1].split(":")[-1]
|
| 81 |
value = v["qem"]
|
| 82 |
df.loc[model_revision, f"{task}_{level}"] = value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
else:
|
| 84 |
df.loc[model_revision, task] = value
|
| 85 |
|
| 86 |
# Put IFEval / BBH / AGIEval / AlpacaEval in first columns
|
| 87 |
-
alpaca_col = df.pop("
|
| 88 |
-
df.insert(1, "
|
| 89 |
ifeval_col = df.pop("Ifeval")
|
| 90 |
df.insert(2, "Ifeval", ifeval_col)
|
| 91 |
bbh_col = df.pop("Bbh")
|
|
|
|
| 64 |
# MATH reports qem
|
| 65 |
elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
|
| 66 |
value = data["results"]["all"]["qem"]
|
|
|
|
|
|
|
|
|
|
| 67 |
else:
|
| 68 |
first_metric_key = next(
|
| 69 |
iter(data["results"][first_result_key])
|
|
|
|
| 77 |
level = k.split("|")[1].split(":")[-1]
|
| 78 |
value = v["qem"]
|
| 79 |
df.loc[model_revision, f"{task}_{level}"] = value
|
| 80 |
+
# For AlpacaEval we report both the base winrate and the length-controlled one
|
| 81 |
+
elif task.lower() == "alpaca_eval":
|
| 82 |
+
value = data["results"][first_result_key]["win_rate"]
|
| 83 |
+
df.loc[model_revision, "Alpaca_eval"] = value / 100.0
|
| 84 |
+
value = data["results"][first_result_key]["length_controlled_winrate"]
|
| 85 |
+
df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
|
| 86 |
else:
|
| 87 |
df.loc[model_revision, task] = value
|
| 88 |
|
| 89 |
# Put IFEval / BBH / AGIEval / AlpacaEval in first columns
|
| 90 |
+
alpaca_col = df.pop("Alpaca_eval_lc")
|
| 91 |
+
df.insert(1, "Alpaca_eval_lc", alpaca_col)
|
| 92 |
ifeval_col = df.pop("Ifeval")
|
| 93 |
df.insert(2, "Ifeval", ifeval_col)
|
| 94 |
bbh_col = df.pop("Bbh")
|