Set merge as default
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ Evaluation of H4 and community models across a diverse range of benchmarks from
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
|
| 14 |
-
def get_leaderboard_df(merge_values: bool = False):
|
| 15 |
filepaths = list(Path("eval_results").rglob("*.json"))
|
| 16 |
|
| 17 |
# Parse filepaths to get unique models
|
|
@@ -47,6 +47,9 @@ def get_leaderboard_df(merge_values: bool = False):
|
|
| 47 |
# HellaSwag and ARC reports acc_norm
|
| 48 |
elif task.lower() in ["hellaswag", "arc"]:
|
| 49 |
value = data["results"][first_result_key]["acc_norm"]
|
|
|
|
|
|
|
|
|
|
| 50 |
else:
|
| 51 |
first_metric_key = next(
|
| 52 |
iter(data["results"][first_result_key])
|
|
@@ -76,7 +79,7 @@ def get_leaderboard_df(merge_values: bool = False):
|
|
| 76 |
return df
|
| 77 |
|
| 78 |
|
| 79 |
-
def refresh(merge_values: bool = False):
|
| 80 |
return get_leaderboard_df(merge_values)
|
| 81 |
|
| 82 |
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
|
| 14 |
+
def get_leaderboard_df(merge_values: bool = True):
|
| 15 |
filepaths = list(Path("eval_results").rglob("*.json"))
|
| 16 |
|
| 17 |
# Parse filepaths to get unique models
|
|
|
|
| 47 |
# HellaSwag and ARC reports acc_norm
|
| 48 |
elif task.lower() in ["hellaswag", "arc"]:
|
| 49 |
value = data["results"][first_result_key]["acc_norm"]
|
| 50 |
+
# BBH has several metrics but we report just the average one
|
| 51 |
+
elif task.lower() == "bbh":
|
| 52 |
+
value = [v["em"] for k, v in data["results"].items() if "_average" in k.lower()][0]
|
| 53 |
else:
|
| 54 |
first_metric_key = next(
|
| 55 |
iter(data["results"][first_result_key])
|
|
|
|
| 79 |
return df
|
| 80 |
|
| 81 |
|
| 82 |
+
def refresh(merge_values: bool = True):
|
| 83 |
return get_leaderboard_df(merge_values)
|
| 84 |
|
| 85 |
|