Commit 5d28865
Duplicate from HuggingFaceH4/open_llm_leaderboard
Co-authored-by: Edward Beeching <[email protected]>
- .gitattributes +35 -0
- .gitignore +11 -0
- README.md +14 -0
- app.py +518 -0
- requirements.txt +70 -0
- src/assets/css_html_js.py +87 -0
- src/assets/hardcoded_evals.py +38 -0
- src/assets/scale-hf-logo.png +3 -0
- src/assets/text_content.py +166 -0
- src/auto_leaderboard/get_model_metadata.py +56 -0
- src/auto_leaderboard/load_results.py +116 -0
- src/auto_leaderboard/model_metadata_type.py +163 -0
- src/elo_leaderboard/load_results.py +200 -0
- src/elo_leaderboard/visualizations.py +137 -0
- src/init.py +73 -0
- src/utils_display.py +98 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,11 @@
+auto_evals/
+venv/
+__pycache__/
+.env
+.ipynb_checkpoints
+*ipynb
+.vscode/
+
+gpt_4_evals/
+human_evals/
+model_counts.html
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Open LLM Leaderboard
+emoji: 🏆
+colorFrom: green
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.27.0
+app_file: app.py
+pinned: true
+license: apache-2.0
+duplicated_from: HuggingFaceH4/open_llm_leaderboard
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,518 @@
+import json
+import os
+from datetime import datetime, timezone
+
+
+import gradio as gr
+import numpy as np
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from transformers import AutoConfig
+
+from src.auto_leaderboard.get_model_metadata import apply_metadata
+from src.assets.text_content import *
+from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
+from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
+from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
+from src.assets.css_html_js import custom_css, get_window_url_params
+from src.utils_display import AutoEvalColumn, EvalQueueColumn, EloEvalColumn, fields, styled_error, styled_warning, styled_message
+from src.init import load_all_info_from_hub
+
+# clone / pull the lmeh eval data
+H4_TOKEN = os.environ.get("H4_TOKEN", None)
+LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
+HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
+GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
+IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+ADD_PLOTS = False
+
+EVAL_REQUESTS_PATH = "auto_evals/eval_requests"
+
+api = HfApi()
+
+
+def restart_space():
+    api.restart_space(
+        repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
+    )
+
+auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
+
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+
+if not IS_PUBLIC:
+    COLS.insert(2, AutoEvalColumn.is_8bit.name)
+    TYPES.insert(2, AutoEvalColumn.is_8bit.type)
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
+
+ELO_COLS = [c.name for c in fields(EloEvalColumn)]
+ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
+ELO_SORT_COL = EloEvalColumn.gpt4.name
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
+
+
+def get_leaderboard_df():
+    if auto_eval_repo:
+        print("Pulling evaluation results for the leaderboard.")
+        auto_eval_repo.git_pull()
+
+    all_data = get_eval_results_dicts(IS_PUBLIC)
+
+    if not IS_PUBLIC:
+        all_data.append(gpt4_values)
+        all_data.append(gpt35_values)
+
+    all_data.append(baseline)
+    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
+
+    df = pd.DataFrame.from_records(all_data)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[COLS]
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
+    return df
+
+
+def get_evaluation_queue_df():
+    # todo @saylortwift: replace the repo by the one you created for the eval queue
+    if auto_eval_repo:
+        print("Pulling changes for the evaluation queue.")
+        auto_eval_repo.git_pull()
+
+    entries = [
+        entry
+        for entry in os.listdir(EVAL_REQUESTS_PATH)
+        if not entry.startswith(".")
+    ]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data["# params"] = "unknown"
+            data["model"] = make_clickable_model(data["model"])
+            data["revision"] = data.get("revision", "main")
+
+            all_evals.append(data)
+        else:
+            # this is a folder
+            sub_entries = [
+                e
+                for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
+                if not e.startswith(".")
+            ]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                # data["# params"] = get_n_params(data["model"])
+                data["model"] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+    df_pending = pd.DataFrame.from_records(pending_list)
+    df_running = pd.DataFrame.from_records(running_list)
+    df_finished = pd.DataFrame.from_records(finished_list)
+    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
+
+
+def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
+    if human_eval_repo:
+        print("Pulling human_eval_repo changes")
+        human_eval_repo.git_pull()
+
+    all_data = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
+    dataframe = pd.DataFrame.from_records(all_data)
+    dataframe = dataframe.sort_values(by=ELO_SORT_COL, ascending=False)
+    dataframe = dataframe[ELO_COLS]
+    return dataframe
+
+
+def get_elo_elements():
+    df_instruct = pd.read_json("human_evals/without_code.json")
+    df_code_instruct = pd.read_json("human_evals/with_code.json")
+
+    elo_leaderboard = get_elo_leaderboard(
+        df_instruct, df_code_instruct, tie_allowed=False
+    )
+    elo_leaderboard_with_tie_allowed = get_elo_leaderboard(
+        df_instruct, df_code_instruct, tie_allowed=True
+    )
+    plot_1, plot_2, plot_3, plot_4 = get_elo_plots(
+        df_instruct, df_code_instruct, tie_allowed=False
+    )
+
+    return (
+        elo_leaderboard,
+        elo_leaderboard_with_tie_allowed,
+        plot_1,
+        plot_2,
+        plot_3,
+        plot_4,
+    )
+
+
+original_df = get_leaderboard_df()
+leaderboard_df = original_df.copy()
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df()
+(
+    elo_leaderboard,
+    elo_leaderboard_with_tie_allowed,
+    plot_1,
+    plot_2,
+    plot_3,
+    plot_4,
+) = get_elo_elements()
+
+
+def is_model_on_hub(model_name, revision) -> bool:
+    try:
+        AutoConfig.from_pretrained(model_name, revision=revision)
+        return True, None
+
+    except ValueError as e:
+        return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
+
+    except Exception as e:
+        print("Could not get the model config from the hub.: \n", e)
+        return False, "was not found on hub!"
+
+
+def add_new_eval(
+    model: str,
+    base_model: str,
+    revision: str,
+    is_8_bit_eval: bool,
+    private: bool,
+    is_delta_weight: bool,
+):
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # check the model actually exists before adding the eval
+    if revision == "":
+        revision = "main"
+
+    if is_delta_weight:
+        base_model_on_hub, error = is_model_on_hub(base_model, revision)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+
+    model_on_hub, error = is_model_on_hub(model, revision)
+    if not model_on_hub:
+        return styled_error(f'Model "{model}" {error}')
+
+    print("adding new eval")
+
+    eval_entry = {
+        "model": model,
+        "base_model": base_model,
+        "revision": revision,
+        "private": private,
+        "8bit_eval": is_8_bit_eval,
+        "is_delta_weight": is_delta_weight,
+        "status": "PENDING",
+        "submitted_time": current_time,
+    }
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
+
+    # Check for duplicate submission
+    if out_path.split("eval_requests/")[1].lower() in requested_models:
+        return styled_warning("This model has been already submitted.")
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+
+    api.upload_file(
+        path_or_fileobj=out_path,
+        path_in_repo=out_path,
+        repo_id=LMEH_REPO,
+        token=H4_TOKEN,
+        repo_type="dataset",
+    )
+
+    return styled_message("Your request has been submitted to the evaluation queue!")
+
+
+def refresh():
+    leaderboard_df = get_leaderboard_df()
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df()
+    return (
+        leaderboard_df,
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    )
+
+
+def search_table(df, query):
+    filtered_df = df[df[AutoEvalColumn.dummy.name].str.contains(query, case=False)]
+    return filtered_df
+
+
+def change_tab(query_param):
+    query_param = query_param.replace("'", '"')
+    query_param = json.loads(query_param)
+
+    if (
+        isinstance(query_param, dict)
+        and "tab" in query_param
+        and query_param["tab"] == "evaluation"
+    ):
+        return gr.Tabs.update(selected=1)
+    else:
+        return gr.Tabs.update(selected=0)
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    with gr.Row():
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("📙 Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    elem_id="citation-button",
+                ).style(show_copy_button=True)
+        with gr.Column():
+            with gr.Accordion("✨ CHANGELOG", open=False):
+                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("📊 LLM Benchmarks", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Column():
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+                with gr.Box(elem_id="search-bar-table-box"):
+                    search_bar = gr.Textbox(
+                        placeholder="🔍 Search your model and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Tabs(elem_classes="tab-buttons"):
+                    with gr.TabItem("Light View"):
+                        leaderboard_table_lite = gr.components.Dataframe(
+                            value=leaderboard_df[COLS_LITE],
+                            headers=COLS_LITE,
+                            datatype=TYPES_LITE,
+                            max_rows=None,
+                            elem_id="leaderboard-table-lite",
+                        )
+                    with gr.TabItem("Extended Model View"):
+                        leaderboard_table = gr.components.Dataframe(
+                            value=leaderboard_df,
+                            headers=COLS,
+                            datatype=TYPES,
+                            max_rows=None,
+                            elem_id="leaderboard-table",
+                        )
+
+                # Dummy leaderboard for handling the case when the user uses backspace key
+                hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                    value=original_df,
+                    headers=COLS,
+                    datatype=TYPES,
+                    max_rows=None,
+                    visible=False,
+                )
+                search_bar.submit(
+                    search_table,
+                    [hidden_leaderboard_table_for_search, search_bar],
+                    leaderboard_table,
+                )
+
+                # Dummy leaderboard for handling the case when the user uses backspace key
+                hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
+                    value=original_df[COLS_LITE],
+                    headers=COLS_LITE,
+                    datatype=TYPES_LITE,
+                    max_rows=None,
+                    visible=False,
+                )
+                search_bar.submit(
+                    search_table,
+                    [hidden_leaderboard_table_for_search_lite, search_bar],
+                    leaderboard_table_lite,
+                )
+
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Accordion("✅ Finished Evaluations", open=False):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            max_rows=5,
+                        )
+                with gr.Accordion("🔄 Running Evaluation Queue", open=False):
+                    with gr.Row():
+                        running_eval_table = gr.components.Dataframe(
+                            value=running_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            max_rows=5,
+                        )
+
+                with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            max_rows=5,
+                        )
+
+                with gr.Row():
+                    refresh_button = gr.Button("Refresh")
+                    refresh_button.click(
+                        refresh,
+                        inputs=[],
+                        outputs=[
+                            leaderboard_table,
+                            finished_eval_table,
+                            running_eval_table,
+                            pending_eval_table,
+                        ],
+                    )
+                with gr.Accordion("Submit a new model for evaluation"):
+                    with gr.Row():
+                        with gr.Column():
+                            model_name_textbox = gr.Textbox(label="Model name")
+                            revision_name_textbox = gr.Textbox(
+                                label="revision", placeholder="main"
+                            )
+
+                        with gr.Column():
+                            is_8bit_toggle = gr.Checkbox(
+                                False, label="8 bit eval", visible=not IS_PUBLIC
+                            )
+                            private = gr.Checkbox(
+                                False, label="Private", visible=not IS_PUBLIC
+                            )
+                            is_delta_weight = gr.Checkbox(False, label="Delta weights")
+                            base_model_name_textbox = gr.Textbox(
+                                label="base model (for delta)"
+                            )
+
+                    submit_button = gr.Button("Submit Eval")
+                    submission_result = gr.Markdown()
+                    submit_button.click(
+                        add_new_eval,
+                        [
+                            model_name_textbox,
+                            base_model_name_textbox,
+                            revision_name_textbox,
+                            is_8bit_toggle,
+                            private,
+                            is_delta_weight,
+                        ],
+                        submission_result,
+                    )
+        with gr.TabItem(
+            "🧑⚖️ Human & GPT-4 Evaluations 🤖", elem_id="human-gpt-tab-table", id=1
+        ):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
+                with gr.Column(scale=1):
+                    gr.Image(
+                        "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
+                    )
+            gr.Markdown("## No tie allowed")
+            elo_leaderboard_table = gr.components.Dataframe(
+                value=elo_leaderboard,
+                headers=ELO_COLS,
+                datatype=ELO_TYPES,
+                max_rows=5,
+            )
+
+            gr.Markdown("## Tie allowed*")
+            elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
+                value=elo_leaderboard_with_tie_allowed,
+                headers=ELO_COLS,
+                datatype=ELO_TYPES,
+                max_rows=5,
+            )
+
+            gr.Markdown(
+                "\* Results when the scores of 4 and 5 were treated as ties.",
+                elem_classes="markdown-text",
+            )
+
+            gr.Markdown(
+                "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
+                elem_id="models-to-add-text",
+            )
+
+    dummy = gr.Textbox(visible=False)
+    demo.load(
+        change_tab,
+        dummy,
+        tabs,
+        _js=get_window_url_params,
+    )
+    if ADD_PLOTS:
+        with gr.Box():
+            visualization_title = gr.HTML(VISUALIZATION_TITLE)
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
+                    plot_1 = gr.Plot(plot_1, show_label=False)
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
+                    plot_2 = gr.Plot(plot_2, show_label=False)
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
+                    plot_3 = gr.Plot(plot_3, show_label=False)
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
+                    plot_4 = gr.Plot(plot_4, show_label=False)
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=3600)
+scheduler.start()
+demo.queue(concurrency_count=40).launch()
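
Note on the submission flow in app.py: `add_new_eval` serializes each request to a small JSON file and uploads it to the `LMEH_REPO` dataset under `auto_evals/eval_requests/<user>/`. Below is a minimal, standalone sketch of what one such request file would contain; the model name and flag values are hypothetical, only the field names and the file-naming scheme come from the code above.

import json
from datetime import datetime, timezone

# Hypothetical submission: "some-org/some-model" with the default flags.
model, private, is_8_bit_eval, is_delta_weight = "some-org/some-model", False, False, False
user_name, model_path = model.split("/")

eval_entry = {
    "model": model,
    "base_model": "",
    "revision": "main",
    "private": private,
    "8bit_eval": is_8_bit_eval,
    "is_delta_weight": is_delta_weight,
    "status": "PENDING",
    "submitted_time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
}

# Same naming scheme as add_new_eval: one JSON file per request, grouped by user.
out_path = f"auto_evals/eval_requests/{user_name}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
print(out_path)
print(json.dumps(eval_entry, indent=2))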
requirements.txt
ADDED
@@ -0,0 +1,70 @@
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+APScheduler==3.10.1
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+contourpy==1.0.7
+cycler==0.11.0
+datasets==2.12.0
+entrypoints==0.4
+fastapi==0.95.1
+ffmpy==0.3.0
+filelock==3.11.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+gradio==3.27.0
+gradio_client==0.1.3
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.13.4
+idna==3.4
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.4
+numpy==1.24.2
+orjson==3.8.10
+packaging==23.1
+pandas==2.0.0
+Pillow==9.5.0
+plotly==5.14.1
+pyarrow==11.0.0
+pydantic==1.10.7
+pydub==0.25.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.28.2
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.26.1
+toolz==0.12.0
+tqdm==4.65.0
+transformers==4.28.1
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+uc-micro-py==1.0.1
+urllib3==1.26.15
+uvicorn==0.21.1
+websockets==11.0.1
+yarl==1.8.2
src/assets/css_html_js.py
ADDED
@@ -0,0 +1,87 @@
+custom_css = """
+#changelog-text {
+    font-size: 16px !important;
+}
+
+#changelog-text h2 {
+    font-size: 18px !important;
+}
+
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#citation-button span {
+    font-size: 16px !important;
+}
+
+#citation-button textarea {
+    font-size: 16px !important;
+}
+
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+
+#leaderboard-table {
+    margin-top: 15px
+}
+
+#leaderboard-table-lite {
+    margin-top: 15px
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+
+#search-bar {
+    padding: 0px;
+    width: 30%;
+}
+
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+    display: none;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+
+#scale-logo .download {
+    display: none;
+}
+"""
+
+get_window_url_params = """
+    function(url_params) {
+        const params = new URLSearchParams(window.location.search);
+        url_params = Object.fromEntries(params);
+        return url_params;
+    }
+    """
src/assets/hardcoded_evals.py
ADDED
@@ -0,0 +1,38 @@
+from src.utils_display import AutoEvalColumn, model_hyperlink
+
+gpt4_values = {
+    AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
+    AutoEvalColumn.revision.name: "tech report",
+    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.average.name: 84.3,
+    AutoEvalColumn.arc.name: 96.3,
+    AutoEvalColumn.hellaswag.name: 95.3,
+    AutoEvalColumn.mmlu.name: 86.4,
+    AutoEvalColumn.truthfulqa.name: 59.0,
+    AutoEvalColumn.dummy.name: "GPT-4",
+}
+
+gpt35_values = {
+    AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
+    AutoEvalColumn.revision.name: "tech report",
+    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.average.name: 71.9,
+    AutoEvalColumn.arc.name: 85.2,
+    AutoEvalColumn.hellaswag.name: 85.5,
+    AutoEvalColumn.mmlu.name: 70.0,
+    AutoEvalColumn.truthfulqa.name: 47.0,
+    AutoEvalColumn.dummy.name: "GPT-3.5",
+}
+
+baseline = {
+    AutoEvalColumn.model.name: "<p>Baseline</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.average.name: 25.0,
+    AutoEvalColumn.arc.name: 25.0,
+    AutoEvalColumn.hellaswag.name: 25.0,
+    AutoEvalColumn.mmlu.name: 25.0,
+    AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.dummy.name: "baseline",
+}
+
src/assets/scale-hf-logo.png
ADDED
Git LFS Details
src/assets/text_content.py
ADDED
@@ -0,0 +1,166 @@
+CHANGELOG_TEXT = f"""
+## [2023-06-19]
+- Added model type column
+- Hid revision and 8bit columns since all models are the same atm
+
+## [2023-06-16]
+- Refactored code base
+- Added new columns: number of parameters, hub likes, license
+
+## [2023-06-13]
+- Adjust description for TruthfulQA
+
+## [2023-06-12]
+- Add Human & GPT-4 Evaluations
+
+## [2023-06-05]
+- Increase concurrent thread count to 40
+- Search models on ENTER
+
+## [2023-06-02]
+- Add a typeahead search bar
+- Use webhooks to automatically spawn a new Space when someone opens a PR
+- Start recording `submitted_time` for eval requests
+- Limit AutoEvalColumn max-width
+
+## [2023-05-30]
+- Add a citation button
+- Simplify Gradio layout
+
+## [2023-05-29]
+- Auto-restart every hour for the latest results
+- Sync with the internal version (minor style changes)
+
+## [2023-05-24]
+- Add a baseline that has 25.0 for all values
+- Add CHANGELOG
+
+## [2023-05-23]
+- Fix a CSS issue that made the leaderboard hard to read in dark mode
+
+## [2023-05-22]
+- Display a success/error message after submitting evaluation requests
+- Reject duplicate submission
+- Do not display results that have incomplete results
+- Display different queues for jobs that are RUNNING, PENDING, FINISHED status
+
+## [2023-05-15]
+- Fix a typo: from "TruthQA" to "QA"
+
+## [2023-05-10]
+- Fix a bug that prevented auto-refresh
+
+## [2023-05-10]
+- Release the leaderboard to public
+"""
+
+TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
+
+INTRODUCTION_TEXT = f"""
+📐 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
+
+🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
+
+📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT Evaluations)**, the evaluations are performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts.
+"""
+
+LLM_BENCHMARKS_TEXT = f"""
+Evaluation is performed against 4 popular benchmarks:
+- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
+- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
+- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online.
+
+We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
+"""
+
+HUMAN_GPT_EVAL_TEXT = f"""
+Evaluation is performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts. The prompts cover tasks such as brainstorming, creative generation, commonsense reasoning, open question answering, summarization, and code generation. Comparisons are made by humans and a model on a 1-8 Likert scale, where the labeler is required to choose a preference each time. Using these preferences, we create bootstrapped Elo rankings.
+
+We collaborated with **Scale AI** to generate the completions using a professional data labeling workforce on their platform, [following the labeling instructions found here](https://docs.google.com/document/d/1c5-96Lj-UH4lzKjLvJ_MRQaVMjtoEXTYA4dvoAYVCHc/edit?usp=sharing). To understand the evaluation of popular models, we also had GPT-4 label the completions using this prompt.
+
+For more information on the calibration and initiation of these measurements, please refer to the [announcement blog post](https://huggingface.co/blog/llm-leaderboard). We would like to express our gratitude to **LMSYS** for providing a [useful notebook](https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5?usp=sharing) for computing Elo estimates and plots.
+"""
+
+
+EVALUATION_QUEUE_TEXT = f"""
+# Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
+  author = {Edward Beeching, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf},
+  title = {Open LLM Leaderboard},
+  year = {2023},
+  publisher = {Hugging Face},
+  howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
+
+}
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
+@misc{clark2018think,
+      title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+      author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+      year={2018},
+      eprint={1803.05457},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+@misc{zellers2019hellaswag,
+      title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+      author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+      year={2019},
+      eprint={1905.07830},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{hendrycks2021measuring,
+      title={Measuring Massive Multitask Language Understanding},
+      author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+      year={2021},
+      eprint={2009.03300},
+      archivePrefix={arXiv},
+      primaryClass={cs.CY}
+}
+@misc{lin2022truthfulqa,
+      title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+      author={Stephanie Lin and Jacob Hilton and Owain Evans},
+      year={2022},
+      eprint={2109.07958},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}"""
+
+VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
+
+PLOT_1_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
+
+PLOT_2_TITLE = "Comparison Count of Each Combination of Models (not allowing ties)"
+
+PLOT_3_TITLE = "Elo Estimates with error bars (ties allowed)"
+
+PLOT_4_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
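
`HUMAN_GPT_EVAL_TEXT` above states that the pairwise preferences are turned into bootstrapped Elo rankings. The Space's actual computation lives in `src/elo_leaderboard/visualizations.py`, which is not part of the diff shown here; the snippet below is only a generic sketch of that idea (standard online Elo updates plus bootstrap resampling of the battles), with made-up model names.

import random
from collections import defaultdict

def compute_elo(battles, k=32, base=400, init=1000):
    """battles: list of (model_a, model_b, winner), winner in {"model_a", "model_b", "tie"}."""
    ratings = defaultdict(lambda: init)
    for model_a, model_b, winner in battles:
        # Expected score of model_a under the logistic Elo model.
        expected_a = 1 / (1 + 10 ** ((ratings[model_b] - ratings[model_a]) / base))
        score_a = 1.0 if winner == "model_a" else 0.0 if winner == "model_b" else 0.5
        ratings[model_a] += k * (score_a - expected_a)
        ratings[model_b] += k * ((1 - score_a) - (1 - expected_a))
    return dict(ratings)

def bootstrap_elo(battles, rounds=1000, seed=0):
    """Median rating over resampled battle sets gives a more stable ranking."""
    rng = random.Random(seed)
    samples = defaultdict(list)
    for _ in range(rounds):
        resampled = [rng.choice(battles) for _ in battles]
        for model, rating in compute_elo(resampled).items():
            samples[model].append(rating)
    return {m: sorted(r)[len(r) // 2] for m, r in samples.items()}

# Hypothetical battles between two placeholder models.
battles = [("model-x", "model-y", "model_a")] * 7 + [("model-x", "model-y", "model_b")] * 3
print(bootstrap_elo(battles, rounds=100))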
src/auto_leaderboard/get_model_metadata.py
ADDED
@@ -0,0 +1,56 @@
+import re
+from typing import List
+
+from src.utils_display import AutoEvalColumn
+from src.auto_leaderboard.model_metadata_type import get_model_type
+
+from huggingface_hub import HfApi
+import huggingface_hub
+api = HfApi()
+
+
+def get_model_infos_from_hub(leaderboard_data: List[dict]):
+    for model_data in leaderboard_data:
+        model_name = model_data["model_name_for_query"]
+        try:
+            model_info = api.model_info(model_name)
+        except huggingface_hub.utils._errors.RepositoryNotFoundError:
+            model_data[AutoEvalColumn.license.name] = None
+            model_data[AutoEvalColumn.likes.name] = None
+            model_data[AutoEvalColumn.params.name] = None
+            continue
+
+        model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
+        model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
+        model_data[AutoEvalColumn.params.name] = get_model_size(model_name, model_info)
+
+
+def get_model_license(model_info):
+    try:
+        return model_info.cardData["license"]
+    except Exception:
+        return None
+
+def get_model_likes(model_info):
+    return model_info.likes
+
+size_pattern = re.compile(r"\d+(b|m)")
+
+def get_model_size(model_name, model_info):
+    # In billions
+    try:
+        return round(model_info.safetensors["total"] / 1e9, 3)
+    except AttributeError:
+        #print(f"Repository {model_id} does not have safetensors weights")
+        pass
+    try:
+        size_match = re.search(size_pattern, model_name.lower())
+        size = size_match.group(0)
+        return round(int(size[:-1]) if size[-1] == "b" else int(size[:-1]) / 1e3, 3)
+    except AttributeError:
+        return None
+
+
+def apply_metadata(leaderboard_data: List[dict]):
+    get_model_type(leaderboard_data)
+    get_model_infos_from_hub(leaderboard_data)
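
A quick illustration of the fallback in `get_model_size` above: when a repository exposes no safetensors metadata, the parameter count is guessed from the model name with the `\d+(b|m)` pattern. The model names below are only examples; the helper is a hypothetical standalone rewrite of that fallback.

import re

size_pattern = re.compile(r"\d+(b|m)")

def guess_size_in_billions(model_name):
    # Mirrors the regex fallback in get_model_size: "13b" -> 13, "350m" -> 0.35.
    match = re.search(size_pattern, model_name.lower())
    if match is None:
        return None
    size = match.group(0)
    return round(int(size[:-1]) if size[-1] == "b" else int(size[:-1]) / 1e3, 3)

print(guess_size_in_billions("huggyllama/llama-13b"))    # 13
print(guess_size_in_billions("facebook/opt-350m"))       # 0.35
print(guess_size_in_billions("some-org/mystery-model"))  # None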
src/auto_leaderboard/load_results.py
ADDED
@@ -0,0 +1,116 @@
+from dataclasses import dataclass
+
+import glob
+import json
+from typing import Dict, List, Tuple
+
+from src.utils_display import AutoEvalColumn, make_clickable_model
+import numpy as np
+
+# clone / pull the lmeh eval data
+METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
+BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
+BENCH_TO_NAME = {
+    "arc_challenge": AutoEvalColumn.arc.name,
+    "hellaswag": AutoEvalColumn.hellaswag.name,
+    "hendrycks": AutoEvalColumn.mmlu.name,
+    "truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
+}
+
+
+@dataclass
+class EvalResult:
+    eval_name: str
+    org: str
+    model: str
+    revision: str
+    is_8bit: bool
+    results: dict
+
+    def to_dict(self):
+        if self.org is not None:
+            base_model = f"{self.org}/{self.model}"
+        else:
+            base_model = f"{self.model}"
+        data_dict = {}
+
+        data_dict["eval_name"] = self.eval_name  # not a column, just a save name
+        data_dict[AutoEvalColumn.is_8bit.name] = self.is_8bit
+        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
+        data_dict[AutoEvalColumn.dummy.name] = base_model
+        data_dict[AutoEvalColumn.revision.name] = self.revision
+        data_dict[AutoEvalColumn.average.name] = round(
+            sum([v for k, v in self.results.items()]) / 4.0, 1
+        )
+
+        for benchmark in BENCHMARKS:
+            if not benchmark in self.results.keys():
+                self.results[benchmark] = None
+
+        for k, v in BENCH_TO_NAME.items():
+            data_dict[v] = self.results[k]
+
+        return data_dict
+
+
+def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
+    with open(json_filepath) as fp:
+        data = json.load(fp)
+
+    path_split = json_filepath.split("/")
+    org = None
+    model = path_split[-4]
+    is_8bit = path_split[-2] == "8bit"
+    revision = path_split[-3]
+    if len(path_split) == 7:
+        # handles gpt2 type models that don't have an org
+        result_key = f"{model}_{revision}_{is_8bit}"
+    else:
+        org = path_split[-5]
+        result_key = f"{org}_{model}_{revision}_{is_8bit}"
+
+    eval_result = None
+    for benchmark, metric in zip(BENCHMARKS, METRICS):
+        if benchmark in json_filepath:
+            accs = np.array([v[metric] for v in data["results"].values()])
+            mean_acc = round(np.mean(accs) * 100.0, 1)
+            eval_result = EvalResult(
+                result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
+            )
+
+    return result_key, eval_result
+
+
+def get_eval_results(is_public) -> List[EvalResult]:
+    json_filepaths = glob.glob(
+        "auto_evals/eval_results/public/**/16bit/*.json", recursive=True
+    )
+    if not is_public:
+        json_filepaths += glob.glob(
+            "auto_evals/eval_results/private/**/*.json", recursive=True
+        )
+        json_filepaths += glob.glob(
+            "auto_evals/eval_results/private/**/*.json", recursive=True
+        )
+        # include the 8bit evals of public models
+        json_filepaths += glob.glob(
+            "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
+        )
+    eval_results = {}
+
+    for json_filepath in json_filepaths:
+        result_key, eval_result = parse_eval_result(json_filepath)
+        if result_key in eval_results.keys():
+            eval_results[result_key].results.update(eval_result.results)
+        else:
+            eval_results[result_key] = eval_result
+
+    eval_results = [v for v in eval_results.values()]
+
+    return eval_results
+
+
+def get_eval_results_dicts(is_public=True) -> List[Dict]:
+    eval_results = get_eval_results(is_public)
+
+    return [e.to_dict() for e in eval_results]
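
`parse_eval_result` above recovers the org, model, revision and precision purely from the result file's location on disk. A small sketch with a hypothetical path shows the directory layout it assumes (auto_evals/eval_results/public/<org>/<model>/<revision>/<16bit|8bit>/<benchmark>.json); org-less repos such as gpt2 simply drop one path component.

# Hypothetical result file following the layout that parse_eval_result expects.
json_filepath = "auto_evals/eval_results/public/some-org/some-model/main/16bit/arc_challenge.json"

path_split = json_filepath.split("/")
org = path_split[-5] if len(path_split) != 7 else None  # org-less repos have one fewer component
model = path_split[-4]
revision = path_split[-3]
is_8bit = path_split[-2] == "8bit"

print(org, model, revision, is_8bit)  # some-org some-model main False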
src/auto_leaderboard/model_metadata_type.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from enum import Enum
|
2 |
+
from typing import Dict, List
|
3 |
+
|
4 |
+
class ModelType(Enum):
|
5 |
+
PT = "pretrained"
|
6 |
+
SFT = "finetuned"
|
7 |
+
RL = "with RL"
|
8 |
+
|
9 |
+
|
10 |
+
TYPE_METADATA: Dict[str, ModelType] = {
|
11 |
+
"aisquared/dlite-v1-355m": ModelType.SFT,
|
12 |
+
"aisquared/dlite-v2-774m": ModelType.SFT,
|
13 |
+
"aisquared/dlite-v2-1_5b": ModelType.SFT,
|
14 |
+
"TheBloke/wizardLM-7B-HF": ModelType.SFT,
|
15 |
+
"TheBloke/dromedary-65b-lora-HF": ModelType.SFT,
|
16 |
+
"TheBloke/vicuna-13B-1.1-HF": ModelType.SFT,
|
17 |
+
"TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.SFT,
|
18 |
+
"wordcab/llama-natural-instructions-13b": ModelType.SFT,
|
19 |
+
"JosephusCheung/Guanaco": ModelType.SFT,
|
20 |
+
"AlekseyKorshuk/vicuna-7b": ModelType.SFT,
|
21 |
+
"AlekseyKorshuk/chatml-pyg-v1": ModelType.SFT,
|
22 |
+
"concedo/OPT-19M-ChatSalad": ModelType.SFT,
|
23 |
+
"digitous/Javalion-R": ModelType.SFT,
|
24 |
+
"digitous/Alpacino30b": ModelType.SFT,
|
25 |
+
"digitous/Javelin-GPTJ": ModelType.SFT,
|
26 |
+
"anton-l/gpt-j-tiny-random": ModelType.SFT,
|
27 |
+
"IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.SFT,
|
28 |
+
"gpt2-medium": ModelType.PT,
|
29 |
+
"PygmalionAI/pygmalion-6b": ModelType.SFT,
|
30 |
+
"medalpaca/medalpaca-7b": ModelType.SFT,
|
31 |
+
"medalpaca/medalpaca-13b": ModelType.SFT,
|
32 |
+
"chavinlo/alpaca-13b": ModelType.SFT,
|
33 |
+
"chavinlo/alpaca-native": ModelType.SFT,
|
34 |
+
"chavinlo/gpt4-x-alpaca": ModelType.SFT,
|
35 |
+
"hakurei/lotus-12B": ModelType.SFT,
|
36 |
+
"amazon/LightGPT": ModelType.SFT,
|
37 |
+
"shibing624/chinese-llama-plus-13b-hf": ModelType.SFT,
|
38 |
+
"mosaicml/mpt-7b": ModelType.PT,
|
39 |
+
"PSanni/Deer-3b": ModelType.SFT,
|
40 |
+
"bigscience/bloom-1b1": ModelType.PT,
|
41 |
+
"MetaIX/GPT4-X-Alpasta-30b": ModelType.SFT,
|
42 |
+
"EleutherAI/gpt-neox-20b": ModelType.PT,
|
43 |
+
"EleutherAI/gpt-j-6b": ModelType.PT,
|
44 |
+
"roneneldan/TinyStories-28M": ModelType.SFT,
|
45 |
+
"lmsys/vicuna-13b-delta-v1.1": ModelType.SFT,
|
46 |
+
"lmsys/vicuna-7b-delta-v1.1": ModelType.SFT,
|
47 |
+
"abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.SFT,
|
48 |
+
"pillowtalks-ai/delta13b": ModelType.SFT,
|
49 |
+
"bigcode/starcoderplus": ModelType.SFT,
|
50 |
+
"microsoft/DialoGPT-large": ModelType.SFT,
|
51 |
+
"microsoft/CodeGPT-small-py": ModelType.SFT,
|
52 |
+
"Pirr/pythia-13b-deduped-green_devil": ModelType.SFT,
|
53 |
+
"Aeala/GPT4-x-AlpacaDente2-30b": ModelType.SFT,
|
54 |
+
"Aeala/VicUnlocked-alpaca-30b": ModelType.SFT,
|
55 |
+
"dvruette/llama-13b-pretrained-sft-epoch-2": ModelType.SFT,
|
56 |
+
"dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.SFT,
|
57 |
+
"openlm-research/open_llama_3b_350bt_preview": ModelType.PT,
|
58 |
+
"openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
|
59 |
+
"openlm-research/open_llama_7b": ModelType.PT,
|
60 |
+
"openlm-research/open_llama_3b": ModelType.PT,
|
61 |
+
"openlm-research/open_llama_7b_400bt_preview": ModelType.PT,
|
62 |
+
"PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.SFT,
|
63 |
+
"GeorgiaTechResearchInstitute/galactica-6.7b-evol-instruct-70k": ModelType.SFT,
|
64 |
+
"databricks/dolly-v2-7b": ModelType.SFT,
|
65 |
+
"databricks/dolly-v2-3b": ModelType.SFT,
|
66 |
+
"databricks/dolly-v2-12b": ModelType.SFT,
|
67 |
+
"pinkmanlove/llama-65b-hf": ModelType.SFT,
|
68 |
+
"Rachneet/gpt2-xl-alpaca": ModelType.SFT,
|
69 |
+
"Locutusque/gpt2-conversational-or-qa": ModelType.SFT,
|
70 |
+
"NbAiLab/nb-gpt-j-6B-alpaca": ModelType.SFT,
|
71 |
+
"Fredithefish/ScarletPajama-3B-HF": ModelType.SFT,
|
72 |
+
"eachadea/vicuna-7b-1.1": ModelType.SFT,
|
73 |
+
"eachadea/vicuna-13b": ModelType.SFT,
|
74 |
+
"openaccess-ai-collective/wizard-mega-13b": ModelType.SFT,
|
75 |
+
"openaccess-ai-collective/manticore-13b": ModelType.SFT,
|
76 |
+
"openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.SFT,
|
77 |
+
"openaccess-ai-collective/minotaur-13b": ModelType.SFT,
|
78 |
+
"lamini/instruct-tuned-3b": ModelType.SFT,
|
79 |
+
"pythainlp/wangchanglm-7.5B-sft-enth": ModelType.SFT,
|
80 |
+
"pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.SFT,
|
81 |
+
"stabilityai/stablelm-tuned-alpha-7b": ModelType.SFT,
|
82 |
+
"CalderaAI/30B-Lazarus": ModelType.SFT,
|
83 |
+
"KoboldAI/OPT-13B-Nerybus-Mix": ModelType.SFT,
|
84 |
+
"distilgpt2": ModelType.PT,
|
85 |
+
"wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.SFT,
|
86 |
+
"OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.SFT,
|
87 |
+
"junelee/wizard-vicuna-13b": ModelType.SFT,
|
88 |
+
"BreadAi/StoryPy": ModelType.SFT,
|
89 |
+
"togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
|
90 |
+
"togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
|
91 |
+
"Writer/camel-5b-hf": ModelType.SFT,
|
92 |
+
"Writer/palmyra-base": ModelType.PT,
|
93 |
+
"MBZUAI/lamini-neo-125m": ModelType.SFT,
|
94 |
+
"TehVenom/DiffMerge_Pygmalion_Main-onto-V8P4": ModelType.SFT,
|
95 |
+
"vicgalle/gpt2-alpaca-gpt4": ModelType.SFT,
|
96 |
+
"facebook/opt-350m": ModelType.PT,
|
97 |
+
"facebook/opt-125m": ModelType.PT,
|
98 |
+
"facebook/opt-13b": ModelType.PT,
|
99 |
+
"facebook/opt-1.3b": ModelType.PT,
|
100 |
+
"facebook/opt-66b": ModelType.PT,
|
101 |
+
"facebook/galactica-120b": ModelType.PT,
|
102 |
+
"Abe13/jgpt2-v1": ModelType.SFT,
|
103 |
+
"gpt2-xl": ModelType.PT,
|
104 |
+
"HuggingFaceH4/stable-vicuna-13b-2904": ModelType.RL,
|
105 |
+
"HuggingFaceH4/llama-7b-ift-alpaca": ModelType.SFT,
|
106 |
+
"HuggingFaceH4/starchat-alpha": ModelType.SFT,
|
107 |
+
"HuggingFaceH4/starchat-beta": ModelType.SFT,
|
108 |
+
"ausboss/Llama30B-SuperHOT": ModelType.SFT,
|
109 |
+
"ausboss/llama-13b-supercot": ModelType.SFT,
|
110 |
+
"ausboss/llama-30b-supercot": ModelType.SFT,
|
111 |
+
"Neko-Institute-of-Science/metharme-7b": ModelType.SFT,
|
112 |
+
"SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.SFT,
|
113 |
+
"victor123/WizardLM-13B-1.0": ModelType.SFT,
|
114 |
+
"AlpinDale/pygmalion-instruct": ModelType.SFT,
|
115 |
+
"tiiuae/falcon-7b-instruct": ModelType.SFT,
|
116 |
+
"tiiuae/falcon-40b-instruct": ModelType.SFT,
|
117 |
+
"tiiuae/falcon-40b": ModelType.PT,
|
118 |
+
"tiiuae/falcon-7b": ModelType.PT,
|
119 |
+
"cyl/awsome-llama": ModelType.SFT,
|
120 |
+
"xzuyn/Alpacino-SuperCOT-13B": ModelType.SFT,
|
121 |
+
"xzuyn/MedicWizard-7B": ModelType.SFT,
|
122 |
+
"beomi/KoAlpaca-Polyglot-5.8B": ModelType.SFT,
|
123 |
+
"chainyo/alpaca-lora-7b": ModelType.SFT,
|
124 |
+
"Salesforce/codegen-16B-nl": ModelType.PT,
|
125 |
+
"Salesforce/codegen-16B-multi": ModelType.SFT,
|
126 |
+
"ai-forever/rugpt3large_based_on_gpt2": ModelType.SFT,
|
127 |
+
"gpt2-large": ModelType.PT,
|
128 |
+
"huggingface/llama-13b": ModelType.PT,
|
129 |
+
"huggingface/llama-7b": ModelType.PT,
|
130 |
+
"huggingface/llama-65b": ModelType.PT,
|
131 |
+
"huggingface/llama-30b": ModelType.PT,
|
132 |
+
"jondurbin/airoboros-7b": ModelType.SFT,
|
133 |
+
"jondurbin/airoboros-13b": ModelType.SFT,
|
134 |
+
"cerebras/Cerebras-GPT-1.3B": ModelType.PT,
|
135 |
+
"cerebras/Cerebras-GPT-111M": ModelType.PT,
|
136 |
+
"NousResearch/Nous-Hermes-13b": ModelType.SFT,
|
137 |
+
"project-baize/baize-v2-7b": ModelType.SFT,
|
138 |
+
"project-baize/baize-v2-13b": ModelType.SFT,
|
139 |
+
"LLMs/AlpacaGPT4-7B-elina": ModelType.SFT,
|
140 |
+
"LLMs/Vicuna-EvolInstruct-13B": ModelType.SFT,
|
141 |
+
"huggingtweets/jerma985": ModelType.SFT,
|
142 |
+
"huggyllama/llama-65b": ModelType.PT,
|
143 |
+
"WizardLM/WizardLM-13B-1.0": ModelType.SFT,
|
144 |
+
"gpt2": ModelType.PT,
|
145 |
+
"alessandropalla/instruct_gpt2": ModelType.SFT,
|
146 |
+
"MayaPH/FinOPT-Lincoln": ModelType.SFT,
|
147 |
+
"MayaPH/FinOPT-Franklin": ModelType.SFT,
|
148 |
+
"timdettmers/guanaco-33b-merged": ModelType.SFT,
|
149 |
+
"timdettmers/guanaco-65b-merged": ModelType.SFT,
|
150 |
+
"elinas/llama-30b-hf-transformers-4.29": ModelType.SFT,
|
151 |
+
"elinas/chronos-33b": ModelType.SFT,
|
152 |
+
"nmitchko/medguanaco-65b-GPTQ": ModelType.SFT,
|
153 |
+
"xhyi/PT_GPTNEO350_ATG": ModelType.SFT,
|
154 |
+
"h2oai/h2ogpt-oasst1-512-20b": ModelType.SFT,
|
155 |
+
"h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.SFT,
|
156 |
+
"nomic-ai/gpt4all-13b-snoozy": ModelType.SFT,
|
157 |
+
"nomic-ai/gpt4all-j": ModelType.SFT,
|
158 |
+
}
|
159 |
+
|
160 |
+
|
161 |
+
def get_model_type(leaderboard_data: List[dict]):
|
162 |
+
for model_data in leaderboard_data:
|
163 |
+
model_data["Type"] = TYPE_METADATA.get(model_data["model_name_for_query"], "N/A")
|
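A minimal usage sketch of the lookup above, assuming TYPE_METADATA and get_model_type are imported from this module; the rows and the "some-org/unknown-model" id are invented for illustration:

leaderboard_data = [
    {"model_name_for_query": "tiiuae/falcon-40b"},       # present in TYPE_METADATA
    {"model_name_for_query": "some-org/unknown-model"},  # hypothetical id, falls back to "N/A"
]
get_model_type(leaderboard_data)
# First row gets ModelType.PT, the unknown one gets the "N/A" fallback.
print([row["Type"] for row in leaderboard_data])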
src/elo_leaderboard/load_results.py
ADDED
@@ -0,0 +1,200 @@
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import pandas as pd
from datasets import load_dataset

from src.assets.text_content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
from src.utils_display import make_clickable_model, EloEvalColumn
from .visualizations import (
    get_bootstrap_result,
    switch_model_a_b,
    visualize_battle_count,
    visualize_bootstrap_scores,
    visualize_pairwise_win_fraction,
    visualize_rating_count,
)


@dataclass
class EloEvalResult:
    model: str
    gpt_4_all: int
    human_all: int
    human_instruct: int
    human_code_instruct: int
    tie_allowed: bool

    def to_dict(self):
        base_model = f"{self.model}"
        data_dict = {}
        data_dict[EloEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[EloEvalColumn.gpt4.name] = self.gpt_4_all
        data_dict[EloEvalColumn.human_all.name] = self.human_all
        data_dict[EloEvalColumn.human_instruct.name] = self.human_instruct
        data_dict[EloEvalColumn.human_code_instruct.name] = self.human_code_instruct

        return data_dict


def create_eval_df(df, tie_allowed):
    responses = []
    for _, row in df.iterrows():
        if row["status"] == "canceled":
            continue

        rating = row["response"]["annotations"]["Preference"]
        if rating == "NaN":
            continue

        scores = row["response"]["responses"]
        if any(s["Preference"] == "" for s in scores):
            continue

        response = {
            "id": row["task_id"],
            "prompt": row["params"]["templateVariables"]["prompt"],
            "model_a": row["params"]["templateVariables"]["modela"],
            "model_b": row["params"]["templateVariables"]["modelb"],
            "response_a": row["params"]["templateVariables"]["response1"],
            "response_b": row["params"]["templateVariables"]["response2"],
            "rating": int(rating),
            "ratings": [np.array([s["Preference"] for s in scores], dtype=np.int32)],
        }

        if tie_allowed:
            response["win"] = (
                "model_a"
                if response["rating"] < 4
                else "model_b"
                if response["rating"] > 5
                else "tie"
            )
        else:
            response["win"] = "model_a" if response["rating"] < 5 else "model_b"

        responses.append(response)

    return pd.DataFrame(responses)


def create_eval_df_for_gpt(df, tie_allowed):
    responses = []
    for _, row in df.iterrows():
        response = {
            "id": row["review_id"],
            "prompt": row["question"],
            "model_a": row["model1"],
            "model_b": row["model2"],
            "response_a": row["answer1"],
            "response_b": row["answer2"],
            "rating": row["score"][0],
        }

        if tie_allowed:
            response["win"] = (
                "model_a"
                if response["rating"] < 4
                else "model_b"
                if response["rating"] > 5
                else "tie"
            )
        else:
            response["win"] = "model_a" if response["rating"] < 5 else "model_b"

        responses.append(response)

    return pd.DataFrame(responses)


# Compute the Elo rating for each model
def compute_elo(df, k=32, scale=400, base=10, initial_rating=1000):
    rating = defaultdict(lambda: initial_rating)

    for _, model_a, model_b, win in df[["model_a", "model_b", "win"]].itertuples():
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + base ** ((rb - ra) / scale))
        eb = 1 / (1 + base ** ((ra - rb) / scale))
        if win == "model_a":
            sa = 1
        elif win == "model_b":
            sa = 0
        elif win == "tie" or win == "tie (bothbad)":
            sa = 0.5
        else:
            raise Exception(f"unexpected vote {win}")
        rating[model_a] += k * (sa - ea)
        rating[model_b] += k * (1 - sa - eb)

    return rating


def convert_rating_from_float_to_int(df):
    return {model: int(rating) for model, rating in compute_elo(df).items()}


def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
    df_all = pd.concat([df_instruct, df_code_instruct])

    df_gpt_4 = load_dataset(
        "gpt_4_evals/data/",
        split="train",
        revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
    ).to_pandas()

    dfs = [df_instruct, df_code_instruct, df_all]
    elo_ratings = [
        convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
        for df in dfs
    ]

    gpt_4_elo_ratings = convert_rating_from_float_to_int(
        create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
    )
    elo_ratings.append(gpt_4_elo_ratings)

    results = [
        EloEvalResult(
            model=model_name,
            gpt_4_all=elo_ratings[3][model_name],
            human_all=elo_ratings[2][model_name],
            human_instruct=elo_ratings[0][model_name],
            human_code_instruct=elo_ratings[1][model_name],
            tie_allowed=tie_allowed,
        )
        for model_name in elo_ratings[0].keys()
    ]

    return results


def get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed) -> List[Dict]:
    eval_results = get_elo_results(df_instruct, df_code_instruct, tie_allowed)
    return [r.to_dict() for r in eval_results]


def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
    df_instruct = create_eval_df(df_instruct, tie_allowed=tie_allowed)
    df_code_instruct = create_eval_df(df_code_instruct, tie_allowed=tie_allowed)
    df_all = pd.concat([df_instruct, df_code_instruct])
    game = df_all[["model_a", "model_b", "win"]]

    game_switch = switch_model_a_b(game)
    plot_1 = visualize_pairwise_win_fraction(game_switch, PLOT_1_TITLE)

    plot_2 = visualize_battle_count(game_switch, PLOT_2_TITLE)

    BOOTSTRAP_ROUNDS = 1000
    if "bootstrap_elo_lu" not in globals():
        bootstrap_elo_lu = get_bootstrap_result(
            game_switch, compute_elo, BOOTSTRAP_ROUNDS
        )

    plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)

    plot_4 = visualize_rating_count(game, PLOT_4_TITLE)

    return plot_1, plot_2, plot_3, plot_4
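For intuition, a small sketch of compute_elo on a toy battle log, assuming the function is imported from this module; the model names and outcomes below are invented:

import pandas as pd

# Hypothetical battle log: each row is one pairwise comparison.
toy_battles = pd.DataFrame(
    {
        "model_a": ["model-x", "model-x", "model-y"],
        "model_b": ["model-y", "model-z", "model-z"],
        "win": ["model_a", "tie", "model_b"],
    }
)

# Ratings start at initial_rating=1000 and move by at most k=32 per battle.
ratings = compute_elo(toy_battles)
print({model: round(rating) for model, rating in ratings.items()})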
src/elo_leaderboard/visualizations.py
ADDED
@@ -0,0 +1,137 @@
import math

import numpy as np
import pandas as pd
import plotly.express as px


# 1
def compute_pairwise_win_fraction(battles):
    # Times each model wins as Model A
    a_win_ptbl = pd.pivot_table(
        battles[battles["win"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )

    # Table counting times each model wins as Model B
    b_win_ptbl = pd.pivot_table(
        battles[battles["win"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )

    # Table counting number of A-B pairs
    num_battles_ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)

    # Computing the proportion of wins for each model as A and as B
    # against all other models
    row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (num_battles_ptbl + num_battles_ptbl.T)

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]
    return row_beats_col


def visualize_pairwise_win_fraction(battles, title):
    row_beats_col = compute_pairwise_win_fraction(battles)
    fig = px.imshow(row_beats_col, color_continuous_scale="RdBu", text_auto=".2f", title=title)
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>")
    return fig


# 2
def switch_model_a_b(df):
    df_switch = df.copy()
    # switch with probability 0.5
    for i, row in df.iterrows():
        if np.random.rand() < 0.5:
            df_switch.at[i, "model_a"] = row["model_b"]
            df_switch.at[i, "model_b"] = row["model_a"]
            if row["win"] == "model_a":
                df_switch.at[i, "win"] = "model_b"
            elif row["win"] == "model_b":
                df_switch.at[i, "win"] = "model_a"
    return df_switch


def visualize_battle_count(battles, title):
    ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
    battle_counts = ptbl + ptbl.T
    ordering = battle_counts.sum().sort_values(ascending=False).index
    fig = px.imshow(battle_counts.loc[ordering, ordering], title=title, text_auto=True, width=600)
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig


# 3
def get_bootstrap_result(battles, func_compute_elo, num_round):
    rows = [func_compute_elo(battles.sample(frac=1.0, replace=True)) for _ in range(num_round)]
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


def visualize_bootstrap_scores(df, title):
    bars = (
        pd.DataFrame(
            dict(
                lower=df.quantile(0.025),
                rating=df.quantile(0.5),
                upper=df.quantile(0.975),
            )
        )
        .reset_index(names="model")
        .sort_values("rating", ascending=False)
    )
    bars["error_y"] = bars["upper"] - bars["rating"]
    bars["error_y_minus"] = bars["rating"] - bars["lower"]
    bars["rating_rounded"] = np.round(bars["rating"], 2)
    fig = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    fig.update_layout(xaxis_title="Model", yaxis_title="Rating")
    return fig


# 4
def visualize_rating_count(df, title):
    df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
    fig = px.bar(df_all_value_counts, title=title, text_auto=True)

    min_y = df_all_value_counts.min()
    max_y = df_all_value_counts.max()

    y_end = math.ceil(min_y / 100) * 100
    y_begin = math.floor(max_y / 100) * 100

    fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
    fig.update_yaxes(range=[y_begin, y_end])
    # save the plot for the blog:
    fig.write_html("src/assets/model_counts.html", full_html=False, include_plotlyjs="cdn")
    return fig
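A rough sketch of feeding a toy battle log through the pairwise win-fraction heatmap, assuming the helpers above are imported from this module; the model names are invented, and the log is built so every ordered pair appears with both outcomes (in the app, switch_model_a_b is applied first to de-bias which side each model appears on):

import pandas as pd

rows = []
for a, b in [("alpha", "beta"), ("beta", "gamma"), ("gamma", "alpha")]:
    for first, second in [(a, b), (b, a)]:
        rows.append({"model_a": first, "model_b": second, "win": "model_a"})
        rows.append({"model_a": first, "model_b": second, "win": "model_b"})
battles = pd.DataFrame(rows)

# Every off-diagonal cell ends up at 0.5 by construction; the diagonal is blank.
fig = visualize_pairwise_win_fraction(battles, "Toy pairwise win fractions")
fig.show()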
src/init.py
ADDED
@@ -0,0 +1,73 @@
import os
from huggingface_hub import Repository

H4_TOKEN = os.environ.get("H4_TOKEN", None)


def get_all_requested_models(requested_models_dir):
    depth = 1
    file_names = []

    for root, dirs, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            file_names.extend([os.path.join(root, file) for file in files])

    return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])


def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
    auto_eval_repo = None
    requested_models = None
    if H4_TOKEN:
        print("Pulling evaluation requests and results.")
        # try:
        #     shutil.rmtree("./auto_evals/")
        # except:
        #     pass

        auto_eval_repo = Repository(
            local_dir="./auto_evals/",
            clone_from=LMEH_REPO,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        auto_eval_repo.git_pull()

        requested_models_dir = "./auto_evals/eval_requests"
        requested_models = get_all_requested_models(requested_models_dir)

    human_eval_repo = None
    if H4_TOKEN and not os.path.isdir("./human_evals"):
        print("Pulling human evaluation repo")
        human_eval_repo = Repository(
            local_dir="./human_evals/",
            clone_from=HUMAN_EVAL_REPO,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        human_eval_repo.git_pull()

    gpt_4_eval_repo = None
    if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
        print("Pulling GPT-4 evaluation repo")
        gpt_4_eval_repo = Repository(
            local_dir="./gpt_4_evals/",
            clone_from=GPT_4_EVAL_REPO,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        gpt_4_eval_repo.git_pull()

    return auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models


# def load_results(model, benchmark, metric):
#     file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
#     if not os.path.exists(file_path):
#         return 0.0, None
#
#     with open(file_path) as fp:
#         data = json.load(fp)
#     accs = np.array([v[metric] for k, v in data["results"].items()])
#     mean_acc = np.mean(accs)
#     return mean_acc, data["config"]["model_args"]
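A minimal usage sketch with placeholder dataset repo ids (the real ids are configured in app.py), assuming H4_TOKEN is set in the environment:

from src.init import load_all_info_from_hub

# Placeholder repo ids for illustration only.
auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(
    LMEH_REPO="your-org/lmeh-eval-results",
    HUMAN_EVAL_REPO="your-org/human-eval-results",
    GPT_4_EVAL_REPO="your-org/gpt4-eval-results",
)
print(f"{len(requested_models or [])} models already requested for evaluation")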
src/utils_display.py
ADDED
@@ -0,0 +1,98 @@
from dataclasses import dataclass

# These classes are for user-facing column names, to avoid having to change them
# all around the code when a modification is needed
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False


def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    model = ColumnContent("Model", "markdown", True)
    revision = ColumnContent("Revision", "str", True, True)
    model_type = ColumnContent("Type", "bool", False)
    is_8bit = ColumnContent("8bit", "bool", False, True)
    license = ColumnContent("Hub License", "str", False)
    params = ColumnContent("#Params (B)", "number", False)
    likes = ColumnContent("Hub ❤️", "number", False)
    average = ColumnContent("Average ⬆️", "number", True)
    arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
    hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
    mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
    truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
    dummy = ColumnContent("model_name_for_query", "str", True)  # dummy col to implement search bar (hidden by custom CSS)


@dataclass(frozen=True)
class EloEvalColumn:  # Elo evals column
    model = ColumnContent("Model", "markdown", True)
    gpt4 = ColumnContent("GPT-4 (all)", "number", True)
    human_all = ColumnContent("Human (all)", "number", True)
    human_instruct = ColumnContent("Human (instruct)", "number", True)
    human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)


@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    is_8bit = ColumnContent("8bit_eval", "bool", True)
    has_delta_weight = ColumnContent("is_delta_weight", "bool", True)
    status = ColumnContent("status", "str", True)


LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]


KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
MODEL_PAGE = "https://huggingface.co/models"
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"  # overrides the lmsys link defined above
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"


def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"

    if model_name in LLAMAS:
        link = LLAMA_LINK
        model_name = model_name.split("/")[1]
    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
        link = VICUNA_LINK
        model_name = "stable-vicuna-13b"
    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
        link = ALPACA_LINK
        model_name = "alpaca-13b"
    if model_name == "dolly-12b":
        link = DOLLY_LINK
    elif model_name == "vicuna-13b":
        link = VICUNA_LINK
    elif model_name == "koala-13b":
        link = KOALA_LINK
    elif model_name == "oasst-12b":
        link = OASST_LINK
    # else:
    #     link = MODEL_PAGE

    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
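A small usage sketch, assuming the module is importable as src.utils_display:

from src.utils_display import AutoEvalColumn, fields, make_clickable_model

# Column names shown by default in the auto-eval leaderboard table.
default_columns = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
print(default_columns)

# HTML link rendered in the leaderboard's "Model" column.
print(make_clickable_model("tiiuae/falcon-40b"))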