Nathan Habib committed · Commit e3aaf53 · Parent: 26286b2

add new evals to the leaderboard

Files changed:
- app.py (+22, -19)
- src/assets/hardcoded_evals.py (+3, -0)
- src/assets/text_content.py (+53, -1)
- src/get_model_info/utils.py (+3, -0)
- src/plots/read_results.py (+6, -3)
app.py CHANGED

@@ -88,6 +88,9 @@ BENCHMARK_COLS = [
         AutoEvalColumn.hellaswag,
         AutoEvalColumn.mmlu,
         AutoEvalColumn.truthfulqa,
+        AutoEvalColumn.winogrande,
+        AutoEvalColumn.gsm8k,
+        AutoEvalColumn.drop
     ]
 ]

@@ -107,7 +110,7 @@ update_collections(original_df.copy())
 leaderboard_df = original_df.copy()

 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
-plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
+#plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"

 (

@@ -516,24 +519,24 @@ with demo:
             queue=True,
         )

-        with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
-        [… 17 further removed lines (the previous, uncommented version of the plotting block below) are not rendered in this view]
+        # with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
+        #     with gr.Row():
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["Average ⬆️"],
+        #                 HUMAN_BASELINES,
+        #                 title="Average of Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K", "DROP"],
+        #                 HUMAN_BASELINES,
+        #                 title="Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
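For context, a minimal sketch of what the full BENCHMARK_COLS construct in app.py presumably looks like after this change; the comprehension wrapper and the `c.name` accessor are inferred from the hunk header and the two closing brackets, and are not shown in full by the diff above.

# Sketch only: inferred shape of the surrounding construct in app.py.
BENCHMARK_COLS = [
    c.name
    for c in [
        AutoEvalColumn.arc,
        AutoEvalColumn.hellaswag,
        AutoEvalColumn.mmlu,
        AutoEvalColumn.truthfulqa,
        AutoEvalColumn.winogrande,
        AutoEvalColumn.gsm8k,
        AutoEvalColumn.drop,
    ]
]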
src/assets/hardcoded_evals.py CHANGED

@@ -35,6 +35,9 @@ baseline = {
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.winogrande.name: 50.0,
+    AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }
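A short aside on where these baseline values plausibly come from (my reading of the commit, not stated here directly): the log-likelihood tasks use the random-guessing baseline, while the GSM8K and DROP numbers are reference scores documented in the text_content.py change below.

# Sketch, assuming the log-likelihood baselines are random-guessing accuracies (in %):
def random_baseline(n_choices: int) -> float:
    """Expected accuracy of uniform random guessing over n_choices options."""
    return 100.0 / n_choices

print(random_baseline(4))  # 25.0 -> ARC, HellaSwag, MMLU (4-way multiple choice)
print(random_baseline(2))  # 50.0 -> Winogrande (binary choice between two completions)
# GSM8K (0.21) and DROP (0.47) are not random baselines; they are reference scores
# (GSM8K finetuned-6B paper score, DROP NAQANet leaderboard score), per text_content.py below.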
src/assets/text_content.py CHANGED

@@ -31,7 +31,10 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually at minimum a 6-shot task, as 6 examples are systematically prepended, even when it is launched with 0 few-shot examples.
+- <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
+- <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
+- <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - an English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
 
 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.

@@ -55,6 +58,14 @@ The tasks and few shots parameters are:
 - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
+- Winogrande: 5-shot, *winogrande* (`acc`)
+- GSM8k: 5-shot, *gsm8k* (`acc`)
+- DROP: 3-shot, *drop* (`f1`)
+
+Side note on the baseline scores:
+- for log-likelihood evaluation, we select the random baseline
+- for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
+- for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
 
 ## Quantization
 To get more information about quantization, see:

@@ -166,4 +177,45 @@ CITATION_BUTTON_TEXT = r"""
     eprint={2109.07958},
     archivePrefix={arXiv},
     primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+    title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+    author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+    year={2019},
+    eprint={1907.10641},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+    title={Training Verifiers to Solve Math Word Problems},
+    author={Karl Cobbe and
+            Vineet Kosaraju and
+            Mohammad Bavarian and
+            Mark Chen and
+            Heewoo Jun and
+            Lukasz Kaiser and
+            Matthias Plappert and
+            Jerry Tworek and
+            Jacob Hilton and
+            Reiichiro Nakano and
+            Christopher Hesse and
+            John Schulman},
+    year={2021},
+    eprint={2110.14168},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1903-00161,
+    title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
+           Over Paragraphs},
+    author={Dheeru Dua and
+            Yizhong Wang and
+            Pradeep Dasigi and
+            Gabriel Stanovsky and
+            Sameer Singh and
+            Matt Gardner},
+    year={2019},
+    eprinttype={arXiv},
+    eprint={1903.00161},
+    primaryClass={cs.CL}
 }"""
src/get_model_info/utils.py CHANGED

@@ -29,6 +29,9 @@ class AutoEvalColumn:  # Auto evals column
     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA", "number", True)
+    winogrande = ColumnContent("Winogrande", "number", True)
+    gsm8k = ColumnContent("GSM8K", "number", True)
+    drop = ColumnContent("DROP", "number", True)
     model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False)  # , True)
     license = ColumnContent("Hub License", "str", False)
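The three new columns reuse the existing ColumnContent pattern. As a rough sketch of the shape that constructor presumably has, inferred only from the call sites above (the real definition lives elsewhere in this file and may carry more fields):

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                   # header shown in the leaderboard table, e.g. "Winogrande"
    type: str                   # column dtype for the UI, e.g. "number" or "str"
    displayed_by_default: bool  # whether the column is visible without user selection

# The additions mirror the existing benchmark columns exactly:
winogrande = ColumnContent("Winogrande", "number", True)
gsm8k = ColumnContent("GSM8K", "number", True)
drop = ColumnContent("DROP", "number", True)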
src/plots/read_results.py CHANGED

@@ -8,13 +8,16 @@ import numpy as np
 
 from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
 
-METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
-BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
 BENCH_TO_NAME = {
     "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
     "hendrycksTest": AutoEvalColumn.mmlu.name,
     "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
+    "winogrande": AutoEvalColumn.winogrande.name,
+    "gsm8k": AutoEvalColumn.gsm8k.name,
+    "drop": AutoEvalColumn.drop.name,
 }

@@ -46,7 +49,7 @@ class EvalResult:
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
-        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
+        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 7.0
         data_dict[AutoEvalColumn.still_on_hub.name] = (
             is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
         )
|