b1sheng and edbeeching (HF Staff) committed
Commit 7ae1238 · 0 Parent(s)

Duplicate from HuggingFaceH4/open_llm_leaderboard

Co-authored-by: Edward Beeching <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ gpt_4_evals/
10
+ human_evals/
11
+ eval-queue/
12
+ eval-results/
13
+ auto_evals/
14
+
15
+ src/assets/model_counts.html
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: Open LLM Leaderboard
3
+ emoji: 🏆
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.27.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ duplicated_from: HuggingFaceH4/open_llm_leaderboard
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,468 @@
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
+ from apscheduler.schedulers.background import BackgroundScheduler
10
+ from huggingface_hub import HfApi
11
+ from transformers import AutoConfig
12
+
13
+ from src.auto_leaderboard.get_model_metadata import apply_metadata
14
+ from src.assets.text_content import *
15
+ from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
+ from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
17
+ from src.assets.css_html_js import custom_css, get_window_url_params
18
+ from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
19
+ from src.init import get_all_requested_models, load_all_info_from_hub
20
+
21
+ # clone / pull the lmeh eval data
22
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
23
+
24
+ QUEUE_REPO = "open-llm-leaderboard/requests"
25
+ RESULTS_REPO = "open-llm-leaderboard/results"
26
+
27
+ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
28
+ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
29
+
30
+ IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
31
+
32
+ EVAL_REQUESTS_PATH = "eval-queue"
33
+ EVAL_RESULTS_PATH = "eval-results"
34
+
35
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
36
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
37
+
38
+ api = HfApi()
39
+
40
+ def restart_space():
41
+ api.restart_space(
42
+ repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
43
+ )
44
+
45
+ eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
46
+
47
+ if not IS_PUBLIC:
48
+ eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
49
+ else:
50
+ eval_queue_private, eval_results_private = None, None
51
+
52
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
53
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
54
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
55
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
56
+
57
+ if not IS_PUBLIC:
58
+ COLS.insert(2, AutoEvalColumn.precision.name)
59
+ TYPES.insert(2, AutoEvalColumn.precision.type)
60
+
61
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
62
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
63
+
64
+ BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
65
+
66
+
67
+ def has_no_nan_values(df, columns):
68
+ return df[columns].notna().all(axis=1)
69
+
70
+
71
+ def has_nan_values(df, columns):
72
+ return df[columns].isna().any(axis=1)
73
+
74
+
75
+ def get_leaderboard_df():
76
+ if eval_results:
77
+ print("Pulling evaluation results for the leaderboard.")
78
+ eval_results.git_pull()
79
+ if eval_results_private:
80
+ print("Pulling evaluation results for the leaderboard.")
81
+ eval_results_private.git_pull()
82
+
83
+ all_data = get_eval_results_dicts(IS_PUBLIC)
84
+
85
+ if not IS_PUBLIC:
86
+ all_data.append(gpt4_values)
87
+ all_data.append(gpt35_values)
88
+
89
+ all_data.append(baseline)
90
+ apply_metadata(all_data) # Populate model type based on known hardcoded values in `model_metadata_type.py`
91
+
92
+ df = pd.DataFrame.from_records(all_data)
93
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
94
+ df = df[COLS]
95
+
96
+ # filter out if any of the benchmarks have not been produced
97
+ df = df[has_no_nan_values(df, BENCHMARK_COLS)]
98
+ return df
99
+
100
+
101
+ def get_evaluation_queue_df():
102
+ if eval_queue:
103
+ print("Pulling changes for the evaluation queue.")
104
+ eval_queue.git_pull()
105
+ if eval_queue_private:
106
+ print("Pulling changes for the evaluation queue.")
107
+ eval_queue_private.git_pull()
108
+
109
+ entries = [
110
+ entry
111
+ for entry in os.listdir(EVAL_REQUESTS_PATH)
112
+ if not entry.startswith(".")
113
+ ]
114
+ all_evals = []
115
+
116
+ for entry in entries:
117
+ if ".json" in entry:
118
+ file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
119
+ with open(file_path) as fp:
120
+ data = json.load(fp)
121
+
122
+ data["# params"] = "unknown"
123
+ data["model"] = make_clickable_model(data["model"])
124
+ data["revision"] = data.get("revision", "main")
125
+
126
+ all_evals.append(data)
127
+ elif ".md" not in entry:
128
+ # this is a folder
129
+ sub_entries = [
130
+ e
131
+ for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
132
+ if not e.startswith(".")
133
+ ]
134
+ for sub_entry in sub_entries:
135
+ file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
136
+ with open(file_path) as fp:
137
+ data = json.load(fp)
138
+
139
+ # data["# params"] = get_n_params(data["model"])
140
+ data["model"] = make_clickable_model(data["model"])
141
+ all_evals.append(data)
142
+
143
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
144
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
145
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
146
+ df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
147
+ df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
148
+ df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
149
+ return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
150
+
151
+
152
+
153
+ original_df = get_leaderboard_df()
154
+ leaderboard_df = original_df.copy()
155
+ (
156
+ finished_eval_queue_df,
157
+ running_eval_queue_df,
158
+ pending_eval_queue_df,
159
+ ) = get_evaluation_queue_df()
160
+
161
+ def is_model_on_hub(model_name, revision):
162
+ try:
163
+ AutoConfig.from_pretrained(model_name, revision=revision)
164
+ return True, None
165
+
166
+ except ValueError as e:
167
+ return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
168
+
169
+ except Exception as e:
170
+ print(f"Could not get the model config from the hub.: {e}")
171
+ return False, "was not found on hub!"
172
+
173
+
174
+ def add_new_eval(
175
+ model: str,
176
+ base_model: str,
177
+ revision: str,
178
+ precision: str,
179
+ private: bool,
180
+ weight_type: str,
181
+ model_type: str,
182
+ ):
183
+ precision = precision.split(" ")[0]
184
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
185
+
186
+ # check the model actually exists before adding the eval
187
+ if revision == "":
188
+ revision = "main"
189
+
190
+ if weight_type in ["Delta", "Adapter"]:
191
+ base_model_on_hub, error = is_model_on_hub(base_model, revision)
192
+ if not base_model_on_hub:
193
+ return styled_error(f'Base model "{base_model}" {error}')
194
+
195
+
196
+ if not weight_type == "Adapter":
197
+ model_on_hub, error = is_model_on_hub(model, revision)
198
+ if not model_on_hub:
199
+ return styled_error(f'Model "{model}" {error}')
200
+
201
+ print("adding new eval")
202
+
203
+ eval_entry = {
204
+ "model": model,
205
+ "base_model": base_model,
206
+ "revision": revision,
207
+ "private": private,
208
+ "precision": precision,
209
+ "weight_type": weight_type,
210
+ "status": "PENDING",
211
+ "submitted_time": current_time,
212
+ "model_type": model_type,
213
+ }
214
+
215
+ user_name = ""
216
+ model_path = model
217
+ if "/" in model:
218
+ user_name = model.split("/")[0]
219
+ model_path = model.split("/")[1]
220
+
221
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
222
+ os.makedirs(OUT_DIR, exist_ok=True)
223
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
224
+
225
+ # Check for duplicate submission
226
+ if out_path.split("eval-queue/")[1].lower() in requested_models:
227
+ return styled_warning("This model has been already submitted.")
228
+
229
+ with open(out_path, "w") as f:
230
+ f.write(json.dumps(eval_entry))
231
+
232
+ api.upload_file(
233
+ path_or_fileobj=out_path,
234
+ path_in_repo=out_path.split("eval-queue/")[1],
235
+ repo_id=QUEUE_REPO,
236
+ token=H4_TOKEN,
237
+ repo_type="dataset",
238
+ commit_message=f"Add {model} to eval queue",
239
+ )
240
+
241
+ # remove the local file
242
+ os.remove(out_path)
243
+
244
+ return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
245
+
246
+
247
+ def refresh():
248
+ leaderboard_df = get_leaderboard_df()
249
+ (
250
+ finished_eval_queue_df,
251
+ running_eval_queue_df,
252
+ pending_eval_queue_df,
253
+ ) = get_evaluation_queue_df()
254
+ return (
255
+ leaderboard_df,
256
+ finished_eval_queue_df,
257
+ running_eval_queue_df,
258
+ pending_eval_queue_df,
259
+ )
260
+
261
+
262
+ def search_table(df, query):
263
+ if AutoEvalColumn.model_type.name in df.columns:
264
+ filtered_df = df[
265
+ (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
266
+ | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
267
+ ]
268
+ else:
269
+ filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
270
+ return filtered_df
271
+
272
+
273
+ def change_tab(query_param):
274
+ query_param = query_param.replace("'", '"')
275
+ query_param = json.loads(query_param)
276
+
277
+ if (
278
+ isinstance(query_param, dict)
279
+ and "tab" in query_param
280
+ and query_param["tab"] == "evaluation"
281
+ ):
282
+ return gr.Tabs.update(selected=1)
283
+ else:
284
+ return gr.Tabs.update(selected=0)
285
+
286
+
287
+ demo = gr.Blocks(css=custom_css)
288
+ with demo:
289
+ gr.HTML(TITLE)
290
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
291
+ with gr.Row():
292
+ with gr.Box(elem_id="search-bar-table-box"):
293
+ search_bar = gr.Textbox(
294
+ placeholder="🔍 Search your model and press ENTER...",
295
+ show_label=False,
296
+ elem_id="search-bar",
297
+ )
298
+
299
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
300
+ with gr.TabItem("🏅 LLM Benchmark (lite)", elem_id="llm-benchmark-tab-table", id=0):
301
+ leaderboard_table_lite = gr.components.Dataframe(
302
+ value=leaderboard_df[COLS_LITE],
303
+ headers=COLS_LITE,
304
+ datatype=TYPES_LITE,
305
+ max_rows=None,
306
+ elem_id="leaderboard-table-lite",
307
+ )
308
+ # Dummy leaderboard for handling the case when the user uses backspace key
309
+ hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
310
+ value=original_df[COLS_LITE],
311
+ headers=COLS_LITE,
312
+ datatype=TYPES_LITE,
313
+ max_rows=None,
314
+ visible=False,
315
+ )
316
+ search_bar.submit(
317
+ search_table,
318
+ [hidden_leaderboard_table_for_search_lite, search_bar],
319
+ leaderboard_table_lite,
320
+ )
321
+
322
+ with gr.TabItem("📊 Extended view", elem_id="llm-benchmark-tab-table", id=1):
323
+ leaderboard_table = gr.components.Dataframe(
324
+ value=leaderboard_df,
325
+ headers=COLS,
326
+ datatype=TYPES,
327
+ max_rows=None,
328
+ elem_id="leaderboard-table",
329
+ )
330
+
331
+ # Dummy leaderboard for handling the case when the user uses backspace key
332
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
333
+ value=original_df,
334
+ headers=COLS,
335
+ datatype=TYPES,
336
+ max_rows=None,
337
+ visible=False,
338
+ )
339
+ search_bar.submit(
340
+ search_table,
341
+ [hidden_leaderboard_table_for_search, search_bar],
342
+ leaderboard_table,
343
+ )
344
+ with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
345
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
346
+
347
+ with gr.TabItem("✉️✨ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
348
+ with gr.Column():
349
+ with gr.Row():
350
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
351
+
352
+ with gr.Column():
353
+ with gr.Accordion(f"✅ Finished Evaluations: {len(finished_eval_queue_df)}", open=False):
354
+ with gr.Row():
355
+ finished_eval_table = gr.components.Dataframe(
356
+ value=finished_eval_queue_df,
357
+ headers=EVAL_COLS,
358
+ datatype=EVAL_TYPES,
359
+ max_rows=5,
360
+ )
361
+ with gr.Accordion(f"🔄 Running Evaluation Queue: {len(running_eval_queue_df)}", open=False):
362
+ with gr.Row():
363
+ running_eval_table = gr.components.Dataframe(
364
+ value=running_eval_queue_df,
365
+ headers=EVAL_COLS,
366
+ datatype=EVAL_TYPES,
367
+ max_rows=5,
368
+ )
369
+
370
+ with gr.Accordion(f"⏳ Pending Evaluation Queue: {len(pending_eval_queue_df)}", open=False):
371
+ with gr.Row():
372
+ pending_eval_table = gr.components.Dataframe(
373
+ value=pending_eval_queue_df,
374
+ headers=EVAL_COLS,
375
+ datatype=EVAL_TYPES,
376
+ max_rows=5,
377
+ )
378
+ with gr.Row():
379
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
380
+
381
+ with gr.Row():
382
+ with gr.Column():
383
+ model_name_textbox = gr.Textbox(label="Model name")
384
+ revision_name_textbox = gr.Textbox(
385
+ label="revision", placeholder="main"
386
+ )
387
+ private = gr.Checkbox(
388
+ False, label="Private", visible=not IS_PUBLIC
389
+ )
390
+ model_type = gr.Dropdown(
391
+ choices=["pretrained", "fine-tuned", "with RL"],
392
+ label="Model type",
393
+ multiselect=False,
394
+ value="pretrained",
395
+ max_choices=1,
396
+ interactive=True,
397
+ )
398
+
399
+ with gr.Column():
400
+ precision = gr.Dropdown(
401
+ choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
402
+ label="Precision",
403
+ multiselect=False,
404
+ value="float16",
405
+ max_choices=1,
406
+ interactive=True,
407
+ )
408
+ weight_type = gr.Dropdown(
409
+ choices=["Original", "Delta", "Adapter"],
410
+ label="Weights type",
411
+ multiselect=False,
412
+ value="Original",
413
+ max_choices=1,
414
+ interactive=True,
415
+ )
416
+ base_model_name_textbox = gr.Textbox(
417
+ label="Base model (for delta or adapter weights)"
418
+ )
419
+
420
+ submit_button = gr.Button("Submit Eval")
421
+ submission_result = gr.Markdown()
422
+ submit_button.click(
423
+ add_new_eval,
424
+ [
425
+ model_name_textbox,
426
+ base_model_name_textbox,
427
+ revision_name_textbox,
428
+ precision,
429
+ private,
430
+ weight_type,
431
+ model_type
432
+ ],
433
+ submission_result,
434
+ )
435
+
436
+ with gr.Row():
437
+ refresh_button = gr.Button("Refresh")
438
+ refresh_button.click(
439
+ refresh,
440
+ inputs=[],
441
+ outputs=[
442
+ leaderboard_table,
443
+ finished_eval_table,
444
+ running_eval_table,
445
+ pending_eval_table,
446
+ ],
447
+ )
448
+
449
+ with gr.Row():
450
+ with gr.Accordion("📙 Citation", open=False):
451
+ citation_button = gr.Textbox(
452
+ value=CITATION_BUTTON_TEXT,
453
+ label=CITATION_BUTTON_LABEL,
454
+ elem_id="citation-button",
455
+ ).style(show_copy_button=True)
456
+
457
+ dummy = gr.Textbox(visible=False)
458
+ demo.load(
459
+ change_tab,
460
+ dummy,
461
+ tabs,
462
+ _js=get_window_url_params,
463
+ )
464
+
465
+ scheduler = BackgroundScheduler()
466
+ scheduler.add_job(restart_space, "interval", seconds=3600)
467
+ scheduler.start()
468
+ demo.queue(concurrency_count=40).launch()
requirements.txt ADDED
@@ -0,0 +1,70 @@
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==4.2.2
5
+ anyio==3.6.2
6
+ APScheduler==3.10.1
7
+ async-timeout==4.0.2
8
+ attrs==23.1.0
9
+ certifi==2022.12.7
10
+ charset-normalizer==3.1.0
11
+ click==8.1.3
12
+ contourpy==1.0.7
13
+ cycler==0.11.0
14
+ datasets==2.12.0
15
+ entrypoints==0.4
16
+ fastapi==0.95.1
17
+ ffmpy==0.3.0
18
+ filelock==3.11.0
19
+ fonttools==4.39.3
20
+ frozenlist==1.3.3
21
+ fsspec==2023.4.0
22
+ gradio==3.27.0
23
+ gradio_client==0.1.3
24
+ h11==0.14.0
25
+ httpcore==0.17.0
26
+ httpx==0.24.0
27
+ huggingface-hub==0.13.4
28
+ idna==3.4
29
+ Jinja2==3.1.2
30
+ jsonschema==4.17.3
31
+ kiwisolver==1.4.4
32
+ linkify-it-py==2.0.0
33
+ markdown-it-py==2.2.0
34
+ MarkupSafe==2.1.2
35
+ matplotlib==3.7.1
36
+ mdit-py-plugins==0.3.3
37
+ mdurl==0.1.2
38
+ multidict==6.0.4
39
+ numpy==1.24.2
40
+ orjson==3.8.10
41
+ packaging==23.1
42
+ pandas==2.0.0
43
+ Pillow==9.5.0
44
+ plotly==5.14.1
45
+ pyarrow==11.0.0
46
+ pydantic==1.10.7
47
+ pydub==0.25.1
48
+ pyparsing==3.0.9
49
+ pyrsistent==0.19.3
50
+ python-dateutil==2.8.2
51
+ python-multipart==0.0.6
52
+ pytz==2023.3
53
+ pytz-deprecation-shim==0.1.0.post0
54
+ PyYAML==6.0
55
+ requests==2.28.2
56
+ semantic-version==2.10.0
57
+ six==1.16.0
58
+ sniffio==1.3.0
59
+ starlette==0.26.1
60
+ toolz==0.12.0
61
+ tqdm==4.65.0
62
+ transformers==4.28.1
63
+ typing_extensions==4.5.0
64
+ tzdata==2023.3
65
+ tzlocal==4.3
66
+ uc-micro-py==1.0.1
67
+ urllib3==1.26.15
68
+ uvicorn==0.21.1
69
+ websockets==11.0.1
70
+ yarl==1.8.2
src/assets/css_html_js.py ADDED
@@ -0,0 +1,87 @@
1
+ custom_css = """
2
+ #changelog-text {
3
+ font-size: 16px !important;
4
+ }
5
+
6
+ #changelog-text h2 {
7
+ font-size: 18px !important;
8
+ }
9
+
10
+ .markdown-text {
11
+ font-size: 16px !important;
12
+ }
13
+
14
+ #models-to-add-text {
15
+ font-size: 18px !important;
16
+ }
17
+
18
+ #citation-button span {
19
+ font-size: 16px !important;
20
+ }
21
+
22
+ #citation-button textarea {
23
+ font-size: 16px !important;
24
+ }
25
+
26
+ #citation-button > label > button {
27
+ margin: 6px;
28
+ transform: scale(1.3);
29
+ }
30
+
31
+ #leaderboard-table {
32
+ margin-top: 15px
33
+ }
34
+
35
+ #leaderboard-table-lite {
36
+ margin-top: 15px
37
+ }
38
+
39
+ #search-bar-table-box > div:first-child {
40
+ background: none;
41
+ border: none;
42
+ }
43
+
44
+ #search-bar {
45
+ padding: 0px;
46
+ width: 30%;
47
+ }
48
+
49
+ /* Hides the final AutoEvalColumn */
50
+ #llm-benchmark-tab-table table td:last-child,
51
+ #llm-benchmark-tab-table table th:last-child {
52
+ display: none;
53
+ }
54
+
55
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
56
+ table td:first-child,
57
+ table th:first-child {
58
+ max-width: 400px;
59
+ overflow: auto;
60
+ white-space: nowrap;
61
+ }
62
+
63
+ .tab-buttons button {
64
+ font-size: 20px;
65
+ }
66
+
67
+ #scale-logo {
68
+ border-style: none !important;
69
+ box-shadow: none;
70
+ display: block;
71
+ margin-left: auto;
72
+ margin-right: auto;
73
+ max-width: 600px;
74
+ }
75
+
76
+ #scale-logo .download {
77
+ display: none;
78
+ }
79
+ """
80
+
81
+ get_window_url_params = """
82
+ function(url_params) {
83
+ const params = new URLSearchParams(window.location.search);
84
+ url_params = Object.fromEntries(params);
85
+ return url_params;
86
+ }
87
+ """
src/assets/hardcoded_evals.py ADDED
@@ -0,0 +1,41 @@
1
+ from src.utils_display import AutoEvalColumn, model_hyperlink
2
+
3
+ gpt4_values = {
4
+ AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
5
+ AutoEvalColumn.revision.name: "tech report",
6
+ AutoEvalColumn.precision.name: None,
7
+ AutoEvalColumn.average.name: 84.3,
8
+ AutoEvalColumn.arc.name: 96.3,
9
+ AutoEvalColumn.hellaswag.name: 95.3,
10
+ AutoEvalColumn.mmlu.name: 86.4,
11
+ AutoEvalColumn.truthfulqa.name: 59.0,
12
+ AutoEvalColumn.dummy.name: "GPT-4",
13
+ AutoEvalColumn.model_type.name: "",
14
+ }
15
+
16
+ gpt35_values = {
17
+ AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
18
+ AutoEvalColumn.revision.name: "tech report",
19
+ AutoEvalColumn.precision.name: None,
20
+ AutoEvalColumn.average.name: 71.9,
21
+ AutoEvalColumn.arc.name: 85.2,
22
+ AutoEvalColumn.hellaswag.name: 85.5,
23
+ AutoEvalColumn.mmlu.name: 70.0,
24
+ AutoEvalColumn.truthfulqa.name: 47.0,
25
+ AutoEvalColumn.dummy.name: "GPT-3.5",
26
+ AutoEvalColumn.model_type.name: "",
27
+ }
28
+
29
+ baseline = {
30
+ AutoEvalColumn.model.name: "<p>Baseline</p>",
31
+ AutoEvalColumn.revision.name: "N/A",
32
+ AutoEvalColumn.precision.name: None,
33
+ AutoEvalColumn.average.name: 25.0,
34
+ AutoEvalColumn.arc.name: 25.0,
35
+ AutoEvalColumn.hellaswag.name: 25.0,
36
+ AutoEvalColumn.mmlu.name: 25.0,
37
+ AutoEvalColumn.truthfulqa.name: 25.0,
38
+ AutoEvalColumn.dummy.name: "baseline",
39
+ AutoEvalColumn.model_type.name: "",
40
+ }
41
+
src/assets/scale-hf-logo.png ADDED

Git LFS Details

  • SHA256: 11a263a1abe4c7c9cf022cbe052dc567dcea164bdfbc111299aae3270e992934
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB
src/assets/text_content.py ADDED
@@ -0,0 +1,216 @@
1
+ CHANGELOG_TEXT = f"""
2
+ ## [2023-06-19]
3
+ - Added model type column
4
+ - Hid revision and 8bit columns since all models are the same atm
5
+
6
+ ## [2023-06-16]
7
+ - Refactored code base
8
+ - Added new columns: number of parameters, hub likes, license
9
+
10
+ ## [2023-06-13]
11
+ - Adjust description for TruthfulQA
12
+
13
+ ## [2023-06-12]
14
+ - Add Human & GPT-4 Evaluations
15
+
16
+ ## [2023-06-05]
17
+ - Increase concurrent thread count to 40
18
+ - Search models on ENTER
19
+
20
+ ## [2023-06-02]
21
+ - Add a typeahead search bar
22
+ - Use webhooks to automatically spawn a new Space when someone opens a PR
23
+ - Start recording `submitted_time` for eval requests
24
+ - Limit AutoEvalColumn max-width
25
+
26
+ ## [2023-05-30]
27
+ - Add a citation button
28
+ - Simplify Gradio layout
29
+
30
+ ## [2023-05-29]
31
+ - Auto-restart every hour for the latest results
32
+ - Sync with the internal version (minor style changes)
33
+
34
+ ## [2023-05-24]
35
+ - Add a baseline that has 25.0 for all values
36
+ - Add CHANGELOG
37
+
38
+ ## [2023-05-23]
39
+ - Fix a CSS issue that made the leaderboard hard to read in dark mode
40
+
41
+ ## [2023-05-22]
42
+ - Display a success/error message after submitting evaluation requests
43
+ - Reject duplicate submission
44
+ - Do not display models with incomplete results
45
+ - Display separate queues for jobs with RUNNING, PENDING, and FINISHED status
46
+
47
+ ## [2023-05-15]
48
+ - Fix a typo: from "TruthQA" to "TruthfulQA"
49
+
50
+ ## [2023-05-10]
51
+ - Fix a bug that prevented auto-refresh
52
+
53
+ ## [2023-05-10]
54
+ - Release the leaderboard to public
55
+ """
56
+
57
+ TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
58
+
59
+ INTRODUCTION_TEXT = f"""
60
+ 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
61
+
62
+ 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for models with non-commercial licenses, such as the original LLaMA release.
63
+
64
+ Other cool benchmarks for LLMs are developed at Hugging Face, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
65
+ """
66
+
67
+ LLM_BENCHMARKS_TEXT = f"""
68
+ # Context
69
+ With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims about their performance, it can be hard to filter out the genuine progress made by the open-source community and to identify the current state of the art.
70
+
71
+ 📈 We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
72
+
73
+ - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
74
+ - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
75
+ - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
76
+ - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
77
+
78
+ For all these evaluations, a higher score is a better score.
79
+ We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
80
+
81
+ # Some good practices before submitting a model
82
+
83
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
84
+ ```python
85
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
86
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
87
+ model = AutoModel.from_pretrained("your model name", revision=revision)
88
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
89
+ ```
90
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
91
+
92
+ Note: make sure your model is public!
93
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!
94
+
95
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
96
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended view`!
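If your checkpoint is currently stored as PyTorch `.bin` files, one way to convert it is to reload the model and save it again with safetensors serialization turned on. A minimal sketch, assuming a recent `transformers` release that supports the `safe_serialization` argument; the repo id and output folder are placeholders:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "your-org/your-model"  # placeholder: use your own repo id

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Writes model.safetensors instead of pytorch_model.bin.
model.save_pretrained("converted-model", safe_serialization=True)
tokenizer.save_pretrained("converted-model")
```

The contents of `converted-model` can then be pushed back to the Hub in place of the `.bin` checkpoint.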
97
+
98
+ ### 3) Make sure your model has an open license!
99
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
100
+
101
+ ### 4) Fill up your model card
102
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
103
+
104
+ # Reproducibility and details
105
+
106
+ ### Details and logs
107
+ You can find:
108
+ - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
109
+ - details on the input/outputs for the models in the `details` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/details
110
+ - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
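If you want to inspect these files programmatically, one option is to snapshot a dataset repo locally with `huggingface_hub`; this is a convenience sketch using `snapshot_download`, not part of the leaderboard code itself:

```python
from huggingface_hub import snapshot_download

# Download the per-model result JSON files from the results dataset.
local_dir = snapshot_download(
    repo_id="open-llm-leaderboard/results",
    repo_type="dataset",
)
print(local_dir)  # local folder containing the downloaded JSON files
```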
111
+
112
+ ### Reproducibility
113
+ To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
114
+ `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
115
+ ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
116
+
117
+ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
118
+ *You can expect results to vary slightly for different batch sizes because of padding.*
119
+
120
+ The tasks and few shots parameters are:
121
+ - ARC: 25-shot, *arc-challenge* (`acc_norm`)
122
+ - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
123
+ - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
124
+ - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
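Putting these together: each benchmark score shown on the leaderboard is the mean of the corresponding sub-task metrics, scaled to a percentage and rounded to one decimal, and the Average column is the mean of the four benchmark scores. A small sketch of that aggregation with placeholder values (the numbers are illustrative, not real results):

```python
# Placeholder benchmark scores, already expressed as percentages.
scores = {
    "arc:challenge": 61.3,   # acc_norm, 25-shot
    "hellaswag": 83.6,       # acc_norm, 10-shot
    "hendrycksTest": 58.1,   # mean acc over the 57 MMLU sub-tasks, 5-shot
    "truthfulqa:mc": 47.8,   # mc2, 0-shot
}

# Average column: mean of the four benchmark scores, one decimal.
average = round(sum(scores.values()) / 4.0, 1)
print(average)  # 62.7
```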
125
+
126
+ ### Quantization
127
+ To get more information about quantization, see:
128
+ - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
129
+ - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
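As a concrete illustration of the 8-bit scheme, loading a model this way typically looks like the sketch below, assuming `bitsandbytes` is installed and a GPU is available; the repo id is a placeholder, and `load_in_8bit` / `device_map` are standard `transformers` arguments rather than leaderboard code:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "your-org/your-model"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_name)
# load_in_8bit uses the bitsandbytes integration; device_map="auto"
# places the quantized weights on the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
)
```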
130
+
131
+ ### Icons
132
+ 🟢 means that the model is pretrained
133
+ 🔶 that it is finetuned
134
+ 🟦 that it was trained with RL.
135
+ If there is no icon, we have not uploaded the information on the model yet; feel free to open an issue with the model information!
136
+
137
+
138
+ # In case of model failure
139
+ If your model is displayed in the `FAILED` category, its execution stopped.
140
+ Make sure you have followed the above steps first.
141
+ If everything is done, check that you can launch the EleutherAI Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
142
+
143
+ """
144
+
145
+ EVALUATION_QUEUE_TEXT = f"""
146
+ # Evaluation Queue for the 🤗 Open LLM Leaderboard
147
+ These models will be automatically evaluated on the 🤗 cluster.
148
+ """
149
+
150
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
151
+ CITATION_BUTTON_TEXT = r"""
152
+ @misc{open-llm-leaderboard,
153
+ author = {Edward Beeching, Clémentine Fourrier, Nathan Habib, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf},
154
+ title = {Open LLM Leaderboard},
155
+ year = {2023},
156
+ publisher = {Hugging Face},
157
+ howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
158
+ }
159
+ @software{eval-harness,
160
+ author = {Gao, Leo and
161
+ Tow, Jonathan and
162
+ Biderman, Stella and
163
+ Black, Sid and
164
+ DiPofi, Anthony and
165
+ Foster, Charles and
166
+ Golding, Laurence and
167
+ Hsu, Jeffrey and
168
+ McDonell, Kyle and
169
+ Muennighoff, Niklas and
170
+ Phang, Jason and
171
+ Reynolds, Laria and
172
+ Tang, Eric and
173
+ Thite, Anish and
174
+ Wang, Ben and
175
+ Wang, Kevin and
176
+ Zou, Andy},
177
+ title = {A framework for few-shot language model evaluation},
178
+ month = sep,
179
+ year = 2021,
180
+ publisher = {Zenodo},
181
+ version = {v0.0.1},
182
+ doi = {10.5281/zenodo.5371628},
183
+ url = {https://doi.org/10.5281/zenodo.5371628}
184
+ }
185
+ @misc{clark2018think,
186
+ title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
187
+ author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
188
+ year={2018},
189
+ eprint={1803.05457},
190
+ archivePrefix={arXiv},
191
+ primaryClass={cs.AI}
192
+ }
193
+ @misc{zellers2019hellaswag,
194
+ title={HellaSwag: Can a Machine Really Finish Your Sentence?},
195
+ author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
196
+ year={2019},
197
+ eprint={1905.07830},
198
+ archivePrefix={arXiv},
199
+ primaryClass={cs.CL}
200
+ }
201
+ @misc{hendrycks2021measuring,
202
+ title={Measuring Massive Multitask Language Understanding},
203
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
204
+ year={2021},
205
+ eprint={2009.03300},
206
+ archivePrefix={arXiv},
207
+ primaryClass={cs.CY}
208
+ }
209
+ @misc{lin2022truthfulqa,
210
+ title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
211
+ author={Stephanie Lin and Jacob Hilton and Owain Evans},
212
+ year={2022},
213
+ eprint={2109.07958},
214
+ archivePrefix={arXiv},
215
+ primaryClass={cs.CL}
216
+ }"""
src/auto_leaderboard/get_model_metadata.py ADDED
@@ -0,0 +1,56 @@
1
+ import re
2
+ import os
3
+ from typing import List
4
+
5
+ from src.utils_display import AutoEvalColumn
6
+ from src.auto_leaderboard.model_metadata_type import get_model_type
7
+
8
+ from huggingface_hub import HfApi
9
+ import huggingface_hub
10
+ api = HfApi(token=os.environ.get("H4_TOKEN", None))
11
+
12
+
13
+ def get_model_infos_from_hub(leaderboard_data: List[dict]):
14
+ for model_data in leaderboard_data:
15
+ model_name = model_data["model_name_for_query"]
16
+ try:
17
+ model_info = api.model_info(model_name)
18
+ except huggingface_hub.utils._errors.RepositoryNotFoundError:
19
+ print("Repo not found!", model_name)
20
+ model_data[AutoEvalColumn.license.name] = None
21
+ model_data[AutoEvalColumn.likes.name] = None
22
+ model_data[AutoEvalColumn.params.name] = get_model_size(model_name, None)
23
+ continue
24
+
25
+ model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
26
+ model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
27
+ model_data[AutoEvalColumn.params.name] = get_model_size(model_name, model_info)
28
+
29
+
30
+ def get_model_license(model_info):
31
+ try:
32
+ return model_info.cardData["license"]
33
+ except Exception:
34
+ return None
35
+
36
+ def get_model_likes(model_info):
37
+ return model_info.likes
38
+
39
+ size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
40
+
41
+ def get_model_size(model_name, model_info):
42
+ # In billions
43
+ try:
44
+ return round(model_info.safetensors["total"] / 1e9, 3)
45
+ except AttributeError:
46
+ try:
47
+ size_match = re.search(size_pattern, model_name.lower())
48
+ size = size_match.group(0)
49
+ return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
50
+ except AttributeError:
51
+ return None
52
+
53
+
54
+ def apply_metadata(leaderboard_data: List[dict]):
55
+ get_model_type(leaderboard_data)
56
+ get_model_infos_from_hub(leaderboard_data)
src/auto_leaderboard/load_results.py ADDED
@@ -0,0 +1,147 @@
1
+ from dataclasses import dataclass
2
+
3
+ import glob
4
+ import json
5
+ import os
6
+ from typing import Dict, List, Tuple
7
+ import dateutil.parser
8
+
9
+ from src.utils_display import AutoEvalColumn, make_clickable_model
10
+ import numpy as np
11
+
12
+ METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
13
+ BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
14
+ BENCH_TO_NAME = {
15
+ "arc:challenge": AutoEvalColumn.arc.name,
16
+ "hellaswag": AutoEvalColumn.hellaswag.name,
17
+ "hendrycksTest": AutoEvalColumn.mmlu.name,
18
+ "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
19
+ }
20
+
21
+
22
+ @dataclass
23
+ class EvalResult:
24
+ eval_name: str
25
+ org: str
26
+ model: str
27
+ revision: str
28
+ results: dict
29
+ precision: str = "16bit"
30
+ model_type: str = ""
31
+ weight_type: str = ""
32
+
33
+ def to_dict(self):
34
+ if self.org is not None:
35
+ base_model = f"{self.org}/{self.model}"
36
+ else:
37
+ base_model = f"{self.model}"
38
+ data_dict = {}
39
+
40
+ data_dict["eval_name"] = self.eval_name # not a column, just a save name
41
+ data_dict["weight_type"] = self.weight_type # not a column, just a save name
42
+ data_dict[AutoEvalColumn.precision.name] = self.precision
43
+ data_dict[AutoEvalColumn.model_type.name] = self.model_type
44
+ data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
45
+ data_dict[AutoEvalColumn.dummy.name] = base_model
46
+ data_dict[AutoEvalColumn.revision.name] = self.revision
47
+ data_dict[AutoEvalColumn.average.name] = round(
48
+ sum([v for k, v in self.results.items()]) / 4.0, 1
49
+ )
50
+
51
+ for benchmark in BENCHMARKS:
52
+ if benchmark not in self.results.keys():
53
+ self.results[benchmark] = None
54
+
55
+ for k, v in BENCH_TO_NAME.items():
56
+ data_dict[v] = self.results[k]
57
+
58
+ return data_dict
59
+
60
+
61
+ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
62
+ with open(json_filepath) as fp:
63
+ data = json.load(fp)
64
+
65
+
66
+ for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
67
+ if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
68
+ return None, [] # we skip models with the wrong version
69
+
70
+ try:
71
+ config = data["config"]
72
+ except KeyError:
73
+ config = data["config_general"]
74
+ model = config.get("model_name", None)
75
+ if model is None:
76
+ model = config.get("model_args", None)
77
+
78
+ model_sha = config.get("model_sha", "")
79
+ eval_sha = config.get("lighteval_sha", "")
80
+ model_split = model.split("/", 1)
81
+
82
+ model = model_split[-1]
83
+
84
+ if len(model_split) == 1:
85
+ org = None
86
+ model = model_split[0]
87
+ result_key = f"{model}_{model_sha}_{eval_sha}"
88
+ else:
89
+ org = model_split[0]
90
+ model = model_split[1]
91
+ result_key = f"{org}_{model}_{model_sha}_{eval_sha}"
92
+
93
+ eval_results = []
94
+ for benchmark, metric in zip(BENCHMARKS, METRICS):
95
+ accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
96
+ if accs.size == 0:
97
+ continue
98
+ mean_acc = round(np.mean(accs) * 100.0, 1)
99
+ eval_results.append(EvalResult(
100
+ eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
101
+ ))
102
+
103
+ return result_key, eval_results
104
+
105
+
106
+ def get_eval_results(is_public) -> List[EvalResult]:
107
+ json_filepaths = []
108
+
109
+ for root, dir, files in os.walk("eval-results"):
110
+ # We should only have json files in model results
111
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
112
+ continue
113
+
114
+ # Sort the files by date
115
+ try:
116
+ files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
117
+ except dateutil.parser._parser.ParserError:
118
+ up_to_date = files[-1]
119
+
120
+ up_to_date = files[-1]
121
+
122
+ if len(files) > 1:
123
+ print(root)
124
+ print(files)
125
+ print(up_to_date)
126
+ print("===")
127
+
128
+ json_filepaths.append(os.path.join(root, up_to_date))
129
+
130
+ eval_results = {}
131
+ for json_filepath in json_filepaths:
132
+ result_key, results = parse_eval_result(json_filepath)
133
+ for eval_result in results:
134
+ if result_key in eval_results.keys():
135
+ eval_results[result_key].results.update(eval_result.results)
136
+ else:
137
+ eval_results[result_key] = eval_result
138
+
139
+ eval_results = [v for v in eval_results.values()]
140
+
141
+ return eval_results
142
+
143
+
144
+ def get_eval_results_dicts(is_public=True) -> List[Dict]:
145
+ eval_results = get_eval_results(is_public)
146
+
147
+ return [e.to_dict() for e in eval_results]
src/auto_leaderboard/model_metadata_type.py ADDED
@@ -0,0 +1,487 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Dict, List
4
+
5
+ from ..utils_display import AutoEvalColumn
6
+
7
+ @dataclass
8
+ class ModelInfo:
9
+ name: str
10
+ symbol: str # emoji
11
+
12
+
13
+ class ModelType(Enum):
14
+ PT = ModelInfo(name="pretrained", symbol="🟢")
15
+ SFT = ModelInfo(name="finetuned", symbol="🔶")
16
+ RL = ModelInfo(name="with RL", symbol="🟦")
17
+
18
+
19
+ TYPE_METADATA: Dict[str, ModelType] = {
20
+ "notstoic/PygmalionCoT-7b": ModelType.SFT,
21
+ "aisquared/dlite-v1-355m": ModelType.SFT,
22
+ "aisquared/dlite-v1-1_5b": ModelType.SFT,
23
+ "aisquared/dlite-v1-774m": ModelType.SFT,
24
+ "aisquared/dlite-v1-124m": ModelType.SFT,
25
+ "aisquared/chopt-2_7b": ModelType.SFT,
26
+ "aisquared/dlite-v2-124m": ModelType.SFT,
27
+ "aisquared/dlite-v2-774m": ModelType.SFT,
28
+ "aisquared/dlite-v2-1_5b": ModelType.SFT,
29
+ "aisquared/chopt-1_3b": ModelType.SFT,
30
+ "aisquared/dlite-v2-355m": ModelType.SFT,
31
+ "TheBloke/tulu-7B-fp16": ModelType.SFT,
32
+ "TheBloke/guanaco-7B-HF": ModelType.SFT,
33
+ "TheBloke/koala-7B-HF": ModelType.SFT,
34
+ "TheBloke/wizardLM-7B-HF": ModelType.SFT,
35
+ "TheBloke/airoboros-13B-HF": ModelType.SFT,
36
+ "TheBloke/koala-13B-HF": ModelType.SFT,
37
+ "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.SFT,
38
+ "TheBloke/dromedary-65b-lora-HF": ModelType.SFT,
39
+ "TheBloke/wizardLM-13B-1.0-fp16": ModelType.SFT,
40
+ "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.SFT,
41
+ "TheBloke/wizard-vicuna-13B-HF": ModelType.SFT,
42
+ "TheBloke/UltraLM-13B-fp16": ModelType.SFT,
43
+ "TheBloke/OpenAssistant-SFT-7-Llama-30B-HF": ModelType.SFT,
44
+ "TheBloke/vicuna-13B-1.1-HF": ModelType.SFT,
45
+ "TheBloke/guanaco-13B-HF": ModelType.SFT,
46
+ "TheBloke/airoboros-7b-gpt4-fp16": ModelType.SFT,
47
+ "TheBloke/Llama-2-13B-fp16": ModelType.PT,
48
+ "TheBloke/Planner-7B-fp16": ModelType.SFT,
49
+ "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.SFT,
50
+ "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.SFT,
51
+ "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.SFT,
52
+ "TheBloke/tulu-13B-fp16": ModelType.SFT,
53
+ "jphme/orca_mini_v2_ger_7b": ModelType.SFT,
54
+ "Ejafa/vicuna_7B_vanilla_1.1": ModelType.SFT,
55
+ "kevinpro/Vicuna-13B-CoT": ModelType.SFT,
56
+ "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.SFT,
57
+ "AlekseyKorshuk/chatml-pyg-v1": ModelType.SFT,
58
+ "concedo/Vicuzard-30B-Uncensored": ModelType.SFT,
59
+ "concedo/OPT-19M-ChatSalad": ModelType.SFT,
60
+ "concedo/Pythia-70M-ChatSalad": ModelType.SFT,
61
+ "digitous/13B-HyperMantis": ModelType.SFT,
62
+ "digitous/Adventien-GPTJ": ModelType.SFT,
63
+ "digitous/Alpacino13b": ModelType.SFT,
64
+ "digitous/GPT-R": ModelType.SFT,
65
+ "digitous/Javelin-R": ModelType.SFT,
66
+ "digitous/Javalion-GPTJ": ModelType.SFT,
67
+ "digitous/Javalion-R": ModelType.SFT,
68
+ "digitous/Skegma-GPTJ": ModelType.SFT,
69
+ "digitous/Alpacino30b": ModelType.SFT,
70
+ "digitous/Janin-GPTJ": ModelType.SFT,
71
+ "digitous/Janin-R": ModelType.SFT,
72
+ "digitous/Javelin-GPTJ": ModelType.SFT,
73
+ "SaylorTwift/gpt2_test": ModelType.PT,
74
+ "anton-l/gpt-j-tiny-random": ModelType.SFT,
75
+ "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.SFT,
76
+ "Lazycuber/pyg-instruct-wizardlm": ModelType.SFT,
77
+ "Lazycuber/Janemalion-6B": ModelType.SFT,
78
+ "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.SFT,
79
+ "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.SFT,
80
+ "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.SFT,
81
+ "gpt2-medium": ModelType.PT,
82
+ "camel-ai/CAMEL-13B-Combined-Data": ModelType.SFT,
83
+ "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.SFT,
84
+ "PygmalionAI/pygmalion-6b": ModelType.SFT,
85
+ "PygmalionAI/metharme-1.3b": ModelType.SFT,
86
+ "PygmalionAI/pygmalion-1.3b": ModelType.SFT,
87
+ "PygmalionAI/pygmalion-350m": ModelType.SFT,
88
+ "PygmalionAI/pygmalion-2.7b": ModelType.SFT,
89
+ "medalpaca/medalpaca-7b": ModelType.SFT,
90
+ "lilloukas/Platypus-30B": ModelType.SFT,
91
+ "lilloukas/GPlatty-30B": ModelType.SFT,
92
+ "mncai/chatdoctor": ModelType.SFT,
93
+ "chaoyi-wu/MedLLaMA_13B": ModelType.SFT,
94
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.SFT,
95
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.SFT,
96
+ "hakurei/instruct-12b": ModelType.SFT,
97
+ "hakurei/lotus-12B": ModelType.SFT,
98
+ "shibing624/chinese-llama-plus-13b-hf": ModelType.SFT,
99
+ "shibing624/chinese-alpaca-plus-7b-hf": ModelType.SFT,
100
+ "shibing624/chinese-alpaca-plus-13b-hf": ModelType.SFT,
101
+ "mosaicml/mpt-7b-instruct": ModelType.SFT,
102
+ "mosaicml/mpt-30b-chat": ModelType.SFT,
103
+ "mosaicml/mpt-7b-storywriter": ModelType.SFT,
104
+ "mosaicml/mpt-30b-instruct": ModelType.SFT,
105
+ "mosaicml/mpt-7b-chat": ModelType.SFT,
106
+ "mosaicml/mpt-30b": ModelType.PT,
107
+ "Corianas/111m": ModelType.SFT,
108
+ "Corianas/Quokka_1.3b": ModelType.SFT,
109
+ "Corianas/256_5epoch": ModelType.SFT,
110
+ "Corianas/Quokka_256m": ModelType.SFT,
111
+ "Corianas/Quokka_590m": ModelType.SFT,
112
+ "Corianas/gpt-j-6B-Dolly": ModelType.SFT,
113
+ "Corianas/Quokka_2.7b": ModelType.SFT,
114
+ "cyberagent/open-calm-7b": ModelType.SFT,
115
+ "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.SFT,
116
+ "THUDM/chatglm2-6b": ModelType.SFT,
117
+ "MetaIX/GPT4-X-Alpasta-30b": ModelType.SFT,
118
+ "NYTK/PULI-GPTrio": ModelType.PT,
119
+ "EleutherAI/pythia-1.3b": ModelType.PT,
120
+ "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
121
+ "EleutherAI/gpt-neo-125m": ModelType.PT,
122
+ "EleutherAI/pythia-160m": ModelType.PT,
123
+ "EleutherAI/gpt-neo-2.7B": ModelType.PT,
124
+ "EleutherAI/pythia-1b-deduped": ModelType.PT,
125
+ "EleutherAI/pythia-6.7b": ModelType.PT,
126
+ "EleutherAI/pythia-70m-deduped": ModelType.PT,
127
+ "EleutherAI/gpt-neox-20b": ModelType.PT,
128
+ "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
129
+ "EleutherAI/pythia-2.7b": ModelType.PT,
130
+ "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
131
+ "EleutherAI/pythia-70m": ModelType.PT,
132
+ "EleutherAI/gpt-j-6b": ModelType.PT,
133
+ "EleutherAI/pythia-12b-deduped": ModelType.PT,
134
+ "EleutherAI/gpt-neo-1.3B": ModelType.PT,
135
+ "EleutherAI/pythia-410m-deduped": ModelType.PT,
136
+ "EleutherAI/pythia-160m-deduped": ModelType.PT,
137
+ "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
138
+ "EleutherAI/pythia-12b": ModelType.PT,
139
+ "roneneldan/TinyStories-33M": ModelType.PT,
140
+ "roneneldan/TinyStories-28M": ModelType.PT,
141
+ "roneneldan/TinyStories-1M": ModelType.PT,
142
+ "roneneldan/TinyStories-8M": ModelType.PT,
143
+ "roneneldan/TinyStories-3M": ModelType.PT,
144
+ "jerryjalapeno/nart-100k-7b": ModelType.SFT,
145
+ "lmsys/vicuna-13b-v1.3": ModelType.SFT,
146
+ "lmsys/vicuna-7b-v1.3": ModelType.SFT,
147
+ "lmsys/vicuna-13b-v1.1": ModelType.SFT,
148
+ "lmsys/vicuna-13b-delta-v1.1": ModelType.SFT,
149
+ "lmsys/vicuna-7b-delta-v1.1": ModelType.SFT,
150
+ "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.SFT,
151
+ "haonan-li/bactrian-x-llama-13b-merged": ModelType.SFT,
152
+ "Gryphe/MythoLogic-13b": ModelType.SFT,
153
+ "Gryphe/MythoBoros-13b": ModelType.SFT,
154
+ "pillowtalks-ai/delta13b": ModelType.SFT,
155
+ "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.SFT,
156
+ "bigcode/tiny_starcoder_py": ModelType.PT,
157
+ "bigcode/starcoderplus": ModelType.SFT,
158
+ "bigcode/gpt_bigcode-santacoder": ModelType.PT,
159
+ "bigcode/starcoder": ModelType.PT,
160
+ "Open-Orca/OpenOrca-Preview1-13B": ModelType.SFT,
161
+ "microsoft/DialoGPT-large": ModelType.SFT,
162
+ "microsoft/DialoGPT-small": ModelType.SFT,
163
+ "microsoft/DialoGPT-medium": ModelType.SFT,
164
+ "microsoft/CodeGPT-small-py": ModelType.SFT,
165
+ "Tincando/fiction_story_generator": ModelType.SFT,
166
+ "Pirr/pythia-13b-deduped-green_devil": ModelType.SFT,
167
+ "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.SFT,
168
+ "Aeala/GPT4-x-AlpacaDente-30b": ModelType.SFT,
169
+ "Aeala/GPT4-x-Alpasta-13b": ModelType.SFT,
170
+ "Aeala/VicUnlocked-alpaca-30b": ModelType.SFT,
171
+ "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.SFT,
172
+ "illuin/test-custom-llama": ModelType.SFT,
173
+ "dvruette/oasst-llama-13b-2-epochs": ModelType.SFT,
174
+ "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.SFT,
175
+ "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
176
+ "dvruette/llama-13b-pretrained": ModelType.PT,
177
+ "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.PT,
178
+ "dvruette/llama-13b-pretrained-sft-do2": ModelType.PT,
179
+ "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.SFT,
180
+ "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.PT,
181
+ "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.SFT,
182
+ "dvruette/gpt-neox-20b-full-precision": ModelType.SFT,
183
+ "dvruette/oasst-llama-13b-1000-steps": ModelType.SFT,
184
+ "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
185
+ "openlm-research/open_llama_7b": ModelType.PT,
186
+ "openlm-research/open_llama_7b_v2": ModelType.PT,
187
+ "openlm-research/open_llama_3b": ModelType.PT,
188
+ "openlm-research/open_llama_13b": ModelType.PT,
189
+ "openlm-research/open_llama_3b_v2": ModelType.PT,
190
+ "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.SFT,
191
+ "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.SFT,
192
+ "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.SFT,
193
+ "databricks/dolly-v2-7b": ModelType.SFT,
194
+ "databricks/dolly-v2-3b": ModelType.SFT,
195
+ "databricks/dolly-v2-12b": ModelType.SFT,
196
+ "Rachneet/gpt2-xl-alpaca": ModelType.SFT,
197
+ "Locutusque/gpt2-conversational-or-qa": ModelType.SFT,
198
+ "psyche/kogpt": ModelType.SFT,
199
+ "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.SFT,
200
+ "Mikael110/llama-2-7b-guanaco-fp16": ModelType.SFT,
201
+ "Mikael110/llama-2-13b-guanaco-fp16": ModelType.SFT,
202
+ "Fredithefish/CrimsonPajama": ModelType.SFT,
203
+ "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.SFT,
204
+ "Fredithefish/ScarletPajama-3B-HF": ModelType.SFT,
205
+ "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.SFT,
206
+ "eachadea/vicuna-13b-1.1": ModelType.SFT,
207
+ "eachadea/vicuna-7b-1.1": ModelType.SFT,
208
+ "eachadea/vicuna-13b": ModelType.SFT,
209
+ "openaccess-ai-collective/wizard-mega-13b": ModelType.SFT,
210
+ "openaccess-ai-collective/manticore-13b": ModelType.SFT,
211
+ "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.SFT,
212
+ "openaccess-ai-collective/minotaur-13b": ModelType.SFT,
213
+ "openaccess-ai-collective/minotaur-13b-fixed": ModelType.SFT,
214
+ "openaccess-ai-collective/hippogriff-30b-chat": ModelType.SFT,
215
+ "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.SFT,
216
+ "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.SFT,
217
+ "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.SFT,
218
+ "euclaise/gpt-neox-122m-minipile-digits": ModelType.SFT,
219
+ "stabilityai/FreeWilly1-Delta-SafeTensor": ModelType.SFT,
220
+ "stabilityai/stablelm-tuned-alpha-7b": ModelType.SFT,
221
+ "stabilityai/FreeWilly2": ModelType.SFT,
222
+ "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
223
+ "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
224
+ "stabilityai/stablelm-tuned-alpha-3b": ModelType.SFT,
225
+ "alibidaran/medical_transcription_generator": ModelType.SFT,
226
+ "CalderaAI/30B-Lazarus": ModelType.SFT,
227
+ "CalderaAI/13B-BlueMethod": ModelType.SFT,
228
+ "CalderaAI/13B-Ouroboros": ModelType.SFT,
229
+ "KoboldAI/OPT-13B-Erebus": ModelType.SFT,
230
+ "KoboldAI/GPT-J-6B-Janeway": ModelType.SFT,
231
+ "KoboldAI/GPT-J-6B-Shinen": ModelType.SFT,
232
+ "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
233
+ "KoboldAI/OPT-6B-nerys-v2": ModelType.SFT,
234
+ "KoboldAI/GPT-NeoX-20B-Skein": ModelType.SFT,
235
+ "KoboldAI/PPO_Pygway-6b-Mix": ModelType.SFT,
236
+ "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
237
+ "KoboldAI/fairseq-dense-125M": ModelType.PT,
238
+ "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.SFT,
239
+ "KoboldAI/OPT-2.7B-Erebus": ModelType.SFT,
240
+ "KoboldAI/OPT-350M-Nerys-v2": ModelType.SFT,
241
+ "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.SFT,
242
+ "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.SFT,
243
+ "KoboldAI/OPT-13B-Nerys-v2": ModelType.SFT,
244
+ "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.SFT,
245
+ "KoboldAI/OPT-6.7B-Erebus": ModelType.SFT,
246
+ "KoboldAI/fairseq-dense-355M": ModelType.PT,
247
+ "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.SFT,
248
+ "KoboldAI/GPT-J-6B-Adventure": ModelType.SFT,
249
+ "KoboldAI/OPT-350M-Erebus": ModelType.SFT,
250
+ "KoboldAI/GPT-J-6B-Skein": ModelType.SFT,
251
+ "KoboldAI/OPT-30B-Erebus": ModelType.SFT,
252
+ "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
253
+ "klosax/open_llama_3b_350bt_preview": ModelType.PT,
254
+ "klosax/openllama-3b-350bt": ModelType.PT,
255
+ "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
256
+ "klosax/open_llama_13b_600bt_preview": ModelType.PT,
257
+ "klosax/open_llama_7b_400bt_preview": ModelType.PT,
258
+ "WeOpenML/Alpaca-7B-v1": ModelType.SFT,
259
+ "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.SFT,
260
+ "TFLai/gpt2-turkish-uncased": ModelType.SFT,
261
+ "ehartford/WizardLM-13B-Uncensored": ModelType.SFT,
262
+ "ehartford/dolphin-llama-13b": ModelType.SFT,
263
+ "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.SFT,
264
+ "ehartford/WizardLM-30B-Uncensored": ModelType.SFT,
265
+ "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.SFT,
266
+ "ehartford/WizardLM-7B-Uncensored": ModelType.SFT,
267
+ "ehartford/based-30b": ModelType.SFT,
268
+ "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.SFT,
269
+ "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.SFT,
270
+ "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.SFT,
271
+ "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.SFT,
272
+ "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.SFT,
273
+ "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.SFT,
274
+ "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.SFT,
275
+ "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.SFT,
276
+ "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.SFT,
277
+ "junelee/wizard-vicuna-13b": ModelType.SFT,
278
+ "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
279
+ "BreadAi/MuseCan": ModelType.PT,
280
+ "BreadAi/MusePy-1-2": ModelType.PT,
281
+ "BreadAi/DiscordPy": ModelType.PT,
282
+ "BreadAi/PM_modelV2": ModelType.PT,
283
+ "BreadAi/gpt-Youtube": ModelType.PT,
284
+ "BreadAi/StoryPy": ModelType.SFT,
285
+ "julianweng/Llama-2-7b-chat-orcah": ModelType.SFT,
286
+ "AGI-inc/lora_moe_7b_baseline": ModelType.SFT,
287
+ "AGI-inc/lora_moe_7b": ModelType.SFT,
288
+ "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.SFT,
289
+ "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.SFT,
290
+ "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.SFT,
291
+ "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
292
+ "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.SFT,
293
+ "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
294
+ "togethercomputer/Pythia-Chat-Base-7B": ModelType.SFT,
295
+ "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
296
+ "togethercomputer/GPT-JT-6B-v1": ModelType.SFT,
297
+ "togethercomputer/GPT-JT-6B-v0": ModelType.SFT,
298
+ "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.SFT,
299
+ "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.SFT,
300
+ "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.SFT,
301
+ "Writer/camel-5b-hf": ModelType.SFT,
302
+ "Writer/palmyra-base": ModelType.PT,
303
+ "MBZUAI/LaMini-GPT-1.5B": ModelType.SFT,
304
+ "MBZUAI/lamini-cerebras-111m": ModelType.SFT,
305
+ "MBZUAI/lamini-neo-1.3b": ModelType.SFT,
306
+ "MBZUAI/lamini-cerebras-1.3b": ModelType.SFT,
307
+ "MBZUAI/lamini-cerebras-256m": ModelType.SFT,
308
+ "MBZUAI/LaMini-GPT-124M": ModelType.SFT,
309
+ "MBZUAI/lamini-neo-125m": ModelType.SFT,
310
+ "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.SFT,
311
+ "TehVenom/PPO_Shygmalion-6b": ModelType.SFT,
312
+ "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.SFT,
313
+ "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.SFT,
314
+ "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.SFT,
315
+ "TehVenom/Dolly_Malion-6b": ModelType.SFT,
316
+ "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.SFT,
317
+ "TehVenom/ChanMalion": ModelType.SFT,
318
+ "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.SFT,
319
+ "TehVenom/Pygmalion-13b-Merged": ModelType.SFT,
320
+ "TehVenom/Metharme-13b-Merged": ModelType.SFT,
321
+ "TehVenom/Dolly_Shygmalion-6b": ModelType.SFT,
322
+ "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.SFT,
323
+ "georgesung/llama2_7b_chat_uncensored": ModelType.SFT,
324
+ "vicgalle/gpt2-alpaca": ModelType.SFT,
325
+ "vicgalle/alpaca-7b": ModelType.SFT,
326
+ "vicgalle/gpt2-alpaca-gpt4": ModelType.SFT,
327
+ "facebook/opt-350m": ModelType.PT,
328
+ "facebook/opt-125m": ModelType.PT,
329
+ "facebook/xglm-4.5B": ModelType.PT,
330
+ "facebook/opt-2.7b": ModelType.PT,
331
+ "facebook/opt-6.7b": ModelType.PT,
332
+ "facebook/galactica-30b": ModelType.PT,
333
+ "facebook/opt-13b": ModelType.PT,
334
+ "facebook/opt-66b": ModelType.PT,
335
+ "facebook/xglm-7.5B": ModelType.PT,
336
+ "facebook/xglm-564M": ModelType.PT,
337
+ "facebook/opt-30b": ModelType.PT,
338
+ "golaxy/gogpt-7b": ModelType.SFT,
339
+ "psmathur/orca_mini_v2_7b": ModelType.SFT,
340
+ "psmathur/orca_mini_7b": ModelType.SFT,
341
+ "psmathur/orca_mini_3b": ModelType.SFT,
342
+ "psmathur/orca_mini_v2_13b": ModelType.SFT,
343
+ "gpt2-xl": ModelType.PT,
344
+ "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.SFT,
345
+ "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.SFT,
346
+ "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.SFT,
347
+ "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.SFT,
348
+ "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.SFT,
349
+ "jzjiao/opt-1.3b-rlhf": ModelType.SFT,
350
+ "HuggingFaceH4/starchat-beta": ModelType.SFT,
351
+ "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.SFT,
352
+ "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.SFT,
353
+ "openchat/openchat_8192": ModelType.SFT,
354
+ "openchat/openchat_v2": ModelType.SFT,
355
+ "openchat/openchat_v2_w": ModelType.SFT,
356
+ "ausboss/llama-13b-supercot": ModelType.SFT,
357
+ "ausboss/llama-30b-supercot": ModelType.SFT,
358
+ "Neko-Institute-of-Science/metharme-7b": ModelType.SFT,
359
+ "Neko-Institute-of-Science/pygmalion-7b": ModelType.SFT,
360
+ "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.SFT,
361
+ "victor123/WizardLM-13B-1.0": ModelType.SFT,
362
+ "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.SFT,
363
+ "baichuan-inc/Baichuan-7B": ModelType.PT,
364
+ "tiiuae/falcon-40b-instruct": ModelType.SFT,
365
+ "tiiuae/falcon-40b": ModelType.PT,
366
+ "tiiuae/falcon-7b": ModelType.PT,
367
+ "YeungNLP/firefly-llama-13b": ModelType.SFT,
368
+ "YeungNLP/firefly-llama-13b-v1.2": ModelType.SFT,
369
+ "YeungNLP/firefly-ziya-13b": ModelType.SFT,
370
+ "shaohang/Sparse0.5_OPT-1.3": ModelType.SFT,
371
+ "xzuyModelType.lpacino-SuperCOT-13B": ModelType.SFT,
372
+ "xzuyn/MedicWizard-7B": ModelType.SFT,
373
+ "beomi/KoAlpaca-Polyglot-5.8B": ModelType.SFT,
374
+ "beomi/llama-2-ko-7b": ModelType.SFT,
375
+ "Salesforce/codegen-6B-multi": ModelType.PT,
376
+ "Salesforce/codegen-16B-nl": ModelType.PT,
377
+ "Salesforce/codegen-6B-nl": ModelType.PT,
378
+ "ai-forever/rugpt3large_based_on_gpt2": ModelType.SFT,
379
+ "gpt2-large": ModelType.PT,
380
+ "frank098/orca_mini_3b_juniper": ModelType.SFT,
381
+ "frank098/WizardLM_13B_juniper": ModelType.SFT,
382
+ "huggingface/llama-13b": ModelType.PT,
383
+ "huggingface/llama-7b": ModelType.PT,
384
+ "huggingface/llama-65b": ModelType.PT,
386
+ "huggingface/llama-30b": ModelType.PT,
387
+ "jondurbiModelType.iroboros-13b-gpt4-1.4": ModelType.SFT,
388
+ "jondurbiModelType.iroboros-7b": ModelType.SFT,
389
+ "jondurbiModelType.iroboros-7b-gpt4-1.4": ModelType.SFT,
390
+ "jondurbiModelType.iroboros-l2-13b-gpt4-1.4.1": ModelType.SFT,
391
+ "jondurbiModelType.iroboros-13b": ModelType.SFT,
392
+ "ariellee/SuperPlatty-30B": ModelType.SFT,
393
+ "danielhanchen/open_llama_3b_600bt_preview": ModelType.SFT,
394
+ "cerebras/Cerebras-GPT-256M": ModelType.PT,
395
+ "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
396
+ "cerebras/Cerebras-GPT-13B": ModelType.PT,
397
+ "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
398
+ "cerebras/Cerebras-GPT-111M": ModelType.PT,
399
+ "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
400
+ "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
401
+ "Yhyu13/llama-30B-hf-openassitant": ModelType.SFT,
402
+ "NousResearch/Nous-Hermes-Llama2-13b": ModelType.SFT,
403
+ "NousResearch/Redmond-Puffin-13B": ModelType.SFT,
404
+ "NousResearch/Nous-Hermes-13b": ModelType.SFT,
405
+ "project-baize/baize-v2-7b": ModelType.SFT,
406
+ "project-baize/baize-v2-13b": ModelType.SFT,
407
+ "LLMs/WizardLM-13B-V1.0": ModelType.SFT,
408
+ "LLMs/AlpacaGPT4-7B-elina": ModelType.SFT,
409
+ "wenge-research/yayi-7b-llama2": ModelType.SFT,
410
+ "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.SFT,
411
+ "llama-anon/instruct-13b": ModelType.SFT,
412
+ "huggingtweets/jerma985": ModelType.SFT,
413
+ "huggingtweets/gladosystem": ModelType.SFT,
414
+ "huggingtweets/bladeecity-jerma985": ModelType.SFT,
415
+ "huggyllama/llama-13b": ModelType.PT,
416
+ "huggyllama/llama-65b": ModelType.PT,
417
+ "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
418
+ "upstage/llama-30b-instruct-2048": ModelType.SFT,
419
+ "upstage/llama-30b-instruct": ModelType.SFT,
420
+ "WizardLM/WizardLM-13B-1.0": ModelType.SFT,
421
+ "WizardLM/WizardLM-30B-V1.0": ModelType.SFT,
422
+ "WizardLM/WizardCoder-15B-V1.0": ModelType.SFT,
423
+ "gpt2": ModelType.PT,
424
+ "keyfan/vicuna-chinese-replication-v1.1": ModelType.SFT,
425
+ "nthngdy/pythia-owt2-70m-100k": ModelType.SFT,
426
+ "nthngdy/pythia-owt2-70m-50k": ModelType.SFT,
427
+ "quantumaikr/KoreanLM-hf": ModelType.SFT,
428
+ "quantumaikr/open_llama_7b_hf": ModelType.SFT,
429
+ "MayaPH/FinOPT-Lincoln": ModelType.SFT,
430
+ "MayaPH/FinOPT-Franklin": ModelType.SFT,
431
+ "MayaPH/GodziLLa-30B": ModelType.SFT,
432
+ "MayaPH/FinOPT-Washington": ModelType.SFT,
433
+ "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.SFT,
434
+ "layoric/llama-2-13b-code-alpaca": ModelType.SFT,
435
+ "CobraMamba/mamba-gpt-3b": ModelType.SFT,
436
+ "timdettmers/guanaco-33b-merged": ModelType.SFT,
437
+ "elinas/chronos-33b": ModelType.SFT,
438
+ "heegyu/RedTulu-Uncensored-3B-0719": ModelType.SFT,
439
+ "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.SFT,
440
+ "heegyu/WizardVicuna-3B-0719": ModelType.SFT,
441
+ "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
442
+ "meta-llama/Llama-2-7b-hf": ModelType.PT,
443
+ "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
444
+ "meta-llama/Llama-2-13b-hf": ModelType.PT,
445
+ "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
446
+ "meta-llama/Llama-2-70b-hf": ModelType.PT,
447
+ "xhyi/PT_GPTNEO350_ATG": ModelType.SFT,
448
+ "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.SFT,
449
+ "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.SFT,
450
+ "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.SFT,
451
+ "h2oai/h2ogpt-oasst1-512-12b": ModelType.SFT,
452
+ "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.SFT,
453
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.SFT,
454
+ "h2oai/h2ogpt-oasst1-512-20b": ModelType.SFT,
455
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.SFT,
456
+ "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.SFT,
457
+ "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.SFT,
458
+ "bofenghuang/vigogne-13b-instruct": ModelType.SFT,
459
+ "Vmware/open-llama-7b-v2-open-instruct": ModelType.SFT,
460
+ "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.SFT,
461
+ "ewof/koishi-instruct-3b": ModelType.SFT,
462
+ }
463
+
464
+
465
+ def get_model_type(leaderboard_data: List[dict]):
466
+ for model_data in leaderboard_data:
467
+ # TODO @clefourrier: restore the weight_type check below once requests are connected with results
468
+ is_delta = False # (model_data["weight_type"] != "Original")
469
+ # Stored information
470
+ if model_data["model_name_for_query"] in TYPE_METADATA:
471
+ model_data[AutoEvalColumn.model_type.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.name
472
+ model_data[AutoEvalColumn.model_type_symbol.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.symbol + ("🔺" if is_delta else "")
473
+ # Inferred from the name or the selected type
474
+ elif model_data[AutoEvalColumn.model_type.name] == "pretrained" or any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
475
+ model_data[AutoEvalColumn.model_type.name] = ModelType.PT.value.name
476
+ model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.PT.value.symbol + ("🔺" if is_delta else "")
477
+ elif model_data[AutoEvalColumn.model_type.name] == "finetuned" or any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
478
+ model_data[AutoEvalColumn.model_type.name] = ModelType.SFT.value.name
479
+ model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.SFT.value.symbol + ("🔺" if is_delta else "")
480
+ elif model_data[AutoEvalColumn.model_type.name] == "with RL" or any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
481
+ model_data[AutoEvalColumn.model_type.name] = ModelType.RL.value.name
482
+ model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.RL.value.symbol + ("🔺" if is_delta else "")
483
+ else:
484
+ model_data[AutoEvalColumn.model_type.name] = "N/A"
485
+ model_data[AutoEvalColumn.model_type_symbol.name] = ("🔺" if is_delta else "")
486
+
487
+
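For reference, a minimal usage sketch of `get_model_type` (not part of the commit): it annotates a leaderboard row in place, first via the `TYPE_METADATA` lookup above and otherwise by inferring the type from the stored type string or the model name. The import path and the example row below are assumptions; the exact printed strings depend on how the `ModelType` enum defines its `name`/`symbol` fields.

```python
# Hypothetical sketch, not part of the commit: annotate a single leaderboard row.
# The module path below is an assumption about where this file lives in the repo.
from src.auto_leaderboard.model_metadata_type import get_model_type

row = {
    "model_name_for_query": "tiiuae/falcon-40b",  # present in TYPE_METADATA as a pretrained model
    "Type": "",   # AutoEvalColumn.model_type.name
    "T": "",      # AutoEvalColumn.model_type_symbol.name
}

get_model_type([row])
# "Type" and "T" should now hold the pretrained label and symbol defined by ModelType.PT.
print(row["Type"], row["T"])
```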
src/init.py ADDED
@@ -0,0 +1,58 @@
1
+ import os
2
+ from huggingface_hub import Repository
3
+
4
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
5
+
6
+
7
+ def get_all_requested_models(requested_models_dir):
8
+ depth = 1
9
+ file_names = []
10
+
11
+ for root, dirs, files in os.walk(requested_models_dir):
12
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
13
+ if current_depth == depth:
14
+ file_names.extend([os.path.join(root, file) for file in files])
15
+
16
+ return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
17
+
18
+ def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
19
+ eval_queue_repo = None
20
+ eval_results_repo = None
21
+ requested_models = None
22
+
23
+ if H4_TOKEN:
24
+ print("Pulling evaluation requests and results.")
25
+
26
+ eval_queue_repo = Repository(
27
+ local_dir=QUEUE_PATH,
28
+ clone_from=QUEUE_REPO,
29
+ use_auth_token=H4_TOKEN,
30
+ repo_type="dataset",
31
+ )
32
+ eval_queue_repo.git_pull()
33
+
34
+ eval_results_repo = Repository(
35
+ local_dir=RESULTS_PATH,
36
+ clone_from=RESULTS_REPO,
37
+ use_auth_token=H4_TOKEN,
38
+ repo_type="dataset",
39
+ )
40
+ eval_results_repo.git_pull()
41
+
42
+ requested_models = get_all_requested_models("eval-queue")
43
+ else:
44
+ print("No HuggingFace token provided. Skipping evaluation requests and results.")
45
+
46
+ return eval_queue_repo, requested_models, eval_results_repo
47
+
48
+
49
+ #def load_results(model, benchmark, metric):
50
+ # file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
51
+ # if not os.path.exists(file_path):
52
+ # return 0.0, None
53
+
54
+ # with open(file_path) as fp:
55
+ # data = json.load(fp)
56
+ # accs = np.array([v[metric] for k, v in data["results"].items()])
57
+ # mean_acc = np.mean(accs)
58
+ # return mean_acc, data["config"]["model_args"]
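A hedged usage sketch of `load_all_info_from_hub` (again, not part of the commit): it clones the request-queue and results datasets when a token is available and returns `None` placeholders otherwise. The repo ids below are placeholders, not the Space's real configuration, and `H4_TOKEN` must be set in the environment for anything to be pulled.

```python
# Hypothetical sketch, not part of the commit; assumes src/ is on the import path.
from src.init import load_all_info_from_hub

eval_queue_repo, requested_models, eval_results_repo = load_all_info_from_hub(
    QUEUE_REPO="my-org/eval-queue",      # placeholder dataset repo id
    RESULTS_REPO="my-org/eval-results",  # placeholder dataset repo id
    QUEUE_PATH="eval-queue",             # local clone dir; matches the hardcoded path above
    RESULTS_PATH="eval-results",
)

if requested_models is None:
    print("H4_TOKEN was not set, so nothing was cloned.")
```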
src/utils_display.py ADDED
@@ -0,0 +1,99 @@
1
+ from dataclasses import dataclass
2
+
3
+ # These classes are for user-facing column names, to avoid having to change them
4
+ # all around the code whenever a modification is needed
5
+ @dataclass
6
+ class ColumnContent:
7
+ name: str
8
+ type: str
9
+ displayed_by_default: bool
10
+ hidden: bool = False
11
+
12
+ def fields(raw_class):
13
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
14
+
15
+ @dataclass(frozen=True)
16
+ class AutoEvalColumn: # Auto evals column
17
+ model_type_symbol = ColumnContent("T", "str", True)
18
+ model = ColumnContent("Model", "markdown", True)
19
+ average = ColumnContent("Average ⬆️", "number", True)
20
+ arc = ColumnContent("ARC", "number", True)
21
+ hellaswag = ColumnContent("HellaSwag", "number", True)
22
+ mmlu = ColumnContent("MMLU", "number", True)
23
+ truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
24
+ model_type = ColumnContent("Type", "str", False)
25
+ precision = ColumnContent("Precision", "str", False, True)
26
+ license = ColumnContent("Hub License", "str", False)
27
+ params = ColumnContent("#Params (B)", "number", False)
28
+ likes = ColumnContent("Hub ❤️", "number", False)
29
+ revision = ColumnContent("Model sha", "str", False, False)
30
+ dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
31
+
32
+ @dataclass(frozen=True)
33
+ class EloEvalColumn: # Elo evals column
34
+ model = ColumnContent("Model", "markdown", True)
35
+ gpt4 = ColumnContent("GPT-4 (all)", "number", True)
36
+ human_all = ColumnContent("Human (all)", "number", True)
37
+ human_instruct = ColumnContent("Human (instruct)", "number", True)
38
+ human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
39
+
40
+
41
+ @dataclass(frozen=True)
42
+ class EvalQueueColumn: # Queue column
43
+ model = ColumnContent("model", "markdown", True)
44
+ revision = ColumnContent("revision", "str", True)
45
+ private = ColumnContent("private", "bool", True)
46
+ precision = ColumnContent("precision", "bool", True)
47
+ weight_type = ColumnContent("weight_type", "str", "Original")
48
+ status = ColumnContent("status", "str", True)
49
+
50
+ LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
51
+
52
+
53
+ KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
54
+ VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
55
+ OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
56
+ DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
57
+ MODEL_PAGE = "https://huggingface.co/models"
58
+ LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
59
+ VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
60
+ ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
61
+
62
+
63
+ def model_hyperlink(link, model_name):
64
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
65
+
66
+
67
+ def make_clickable_model(model_name):
68
+ link = f"https://huggingface.co/{model_name}"
69
+
70
+ if model_name in LLAMAS:
71
+ link = LLAMA_LINK
72
+ model_name = model_name.split("/")[1]
73
+ elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
74
+ link = VICUNA_LINK
75
+ model_name = "stable-vicuna-13b"
76
+ elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
77
+ link = ALPACA_LINK
78
+ model_name = "alpaca-13b"
79
+ if model_name == "dolly-12b":
80
+ link = DOLLY_LINK
81
+ elif model_name == "vicuna-13b":
82
+ link = VICUNA_LINK
83
+ elif model_name == "koala-13b":
84
+ link = KOALA_LINK
85
+ elif model_name == "oasst-12b":
86
+ link = OASST_LINK
87
+ #else:
88
+ # link = MODEL_PAGE
89
+
90
+ return model_hyperlink(link, model_name)
91
+
92
+ def styled_error(error):
93
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
94
+
95
+ def styled_warning(warn):
96
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
97
+
98
+ def styled_message(message):
99
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"