Combine IFEval
Browse files
app.py
CHANGED
|
@@ -49,11 +49,18 @@ def get_leaderboard_df():
|
|
| 49 |
if task.lower() == "truthfulqa":
|
| 50 |
value = data["results"][first_result_key]["truthfulqa_mc2"]
|
| 51 |
df.loc[model_revision, task] = float(value)
|
| 52 |
-
# IFEval has several metrics but we report
|
| 53 |
elif task.lower() == "ifeval":
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# MMLU has several metrics but we report just the average one
|
| 58 |
elif task.lower() == "mmlu":
|
| 59 |
value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
|
|
|
|
| 49 |
if task.lower() == "truthfulqa":
|
| 50 |
value = data["results"][first_result_key]["truthfulqa_mc2"]
|
| 51 |
df.loc[model_revision, task] = float(value)
|
| 52 |
+
# IFEval has several metrics, but we report their average, as in the Llama 3 paper
|
| 53 |
elif task.lower() == "ifeval":
|
| 54 |
+
values = 0.0
|
| 55 |
+
for metric in [
|
| 56 |
+
"prompt_level_loose",
|
| 57 |
+
"prompt_level_strict",
|
| 58 |
+
"inst_level_strict",
|
| 59 |
+
"inst_level_loose",
|
| 60 |
+
]:
|
| 61 |
+
values += data["results"][first_result_key][f"{metric}_acc"]
|
| 62 |
+
value = values / 4
|
| 63 |
+
df.loc[model_revision, f"{task}"] = float(value)
|
| 64 |
# MMLU has several metrics but we report just the average one
|
| 65 |
elif task.lower() == "mmlu":
|
| 66 |
value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
|