Fix MATH
Browse files
app.py
CHANGED
|
@@ -10,7 +10,7 @@ DESCRIPTION = f"""
|
|
| 10 |
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
|
| 11 |
"""
|
| 12 |
|
| 13 |
-
BENCHMARKS_TO_SKIP = ["math", "mini_math"]
|
| 14 |
|
| 15 |
|
| 16 |
def get_leaderboard_df(agg: str = "max"):
|
|
@@ -100,6 +100,8 @@ def get_leaderboard_df(agg: str = "max"):
|
|
| 100 |
"aimo_kaggle_tora_hard",
|
| 101 |
"aimo_kaggle_tora_medium_extended",
|
| 102 |
"aimo_kaggle_tora_hard_extended",
|
|
|
|
|
|
|
| 103 |
]:
|
| 104 |
for k, v in data["results"].items():
|
| 105 |
value = float(v["qem"]) / 100.0
|
|
@@ -122,7 +124,7 @@ def get_leaderboard_df(agg: str = "max"):
|
|
| 122 |
# Trim AIMO column names
|
| 123 |
df.columns = [c.replace("aimo_", "") for c in df.columns]
|
| 124 |
|
| 125 |
-
df = df.reset_index().rename(columns={"index": "Model"}).round(
|
| 126 |
# Strip off date from model name
|
| 127 |
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
|
| 128 |
|
|
|
|
| 10 |
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
|
| 11 |
"""
|
| 12 |
|
| 13 |
+
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]
|
| 14 |
|
| 15 |
|
| 16 |
def get_leaderboard_df(agg: str = "max"):
|
|
|
|
| 100 |
"aimo_kaggle_tora_hard",
|
| 101 |
"aimo_kaggle_tora_medium_extended",
|
| 102 |
"aimo_kaggle_tora_hard_extended",
|
| 103 |
+
"aimo_math_integer_lvl4",
|
| 104 |
+
"aimo_math_integer_lvl5",
|
| 105 |
]:
|
| 106 |
for k, v in data["results"].items():
|
| 107 |
value = float(v["qem"]) / 100.0
|
|
|
|
| 124 |
# Trim AIMO column names
|
| 125 |
df.columns = [c.replace("aimo_", "") for c in df.columns]
|
| 126 |
|
| 127 |
+
df = df.reset_index().rename(columns={"index": "Model"}).round(4)
|
| 128 |
# Strip off date from model name
|
| 129 |
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
|
| 130 |
|