Code formatting
Files changed:
- app.py: +113 / -40
- compare_significance.py: +50 / -18
- server.py: +30 / -11
app.py (CHANGED)

@@ -5,9 +5,15 @@ import pandas as pd
 from gradio.themes.utils.sizes import text_md
 from gradio_modal import Modal

+from content import (
+    HEADER_MARKDOWN,
+    LEADERBOARD_TAB_TITLE_MARKDOWN,
+    SUBMISSION_TAB_TITLE_MARKDOWN,
+    MODAL_SUBMIT_MARKDOWN,
+    SUBMISSION_DETAILS_MARKDOWN,
+    RANKING_AFTER_SUBMISSION_MARKDOWN,
+    MORE_DETAILS_MARKDOWN,
+)
 from server import LeaderboardServer

 leaderboard_server = LeaderboardServer()

@@ -38,37 +44,63 @@ def process_submission(team_name, submission_id, description, link_to_model, sub
         leaderboard_server.prepare_model_for_submission(submission_file, metadata)
     except ValueError as err:
         gr.Warning(str(err))
+        return (
+            gr.update(value='Pre-submit model', visible=True, interactive=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+        )
+    return (
+        gr.update(visible=False),
+        gr.update(visible=True),
+        gr.update(interactive=True, visible=True),
+        gr.update(interactive=True, visible=True),
+        gr.update(visible=True),
+        gr.update(
+            value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]),
+            visible=True,
+        ),
+    )


 def submit_results():
     leaderboard_server.save_pre_submit()
     leaderboard_server.update_leaderboard()
     gr.Info('Submission successful!')
+    return (
+        gr.update(value='Pre-submit model', visible=True, interactive=True),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.DataFrame(value=leaderboard_server.get_leaderboard(), visible=True),
+        gr.update(visible=False),
+        gr.update(choices=leaderboard_server.submission_ids),
+    )


 def erase_pre_submit():
     leaderboard_server.pre_submit = None
+    return (
+        gr.update(value='Pre-submit model', visible=True, interactive=True),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+    )


 def fetch_model_detail(submission_id):
     metadata = leaderboard_server.get_model_detail(submission_id)
+    return (
+        gr.update(value=metadata['description'], visible=True),
+        gr.update(value=metadata['link_to_model'], visible=True)
+    )


 def show_modal():

@@ -82,9 +114,10 @@ def hide_modal():
 def on_application_load():
     leaderboard_server.save_pre_submit()
     leaderboard_server.update_leaderboard()
+    return (
+        gr.DataFrame(value=leaderboard_server.get_leaderboard(), visible=True),
+        gr.update(choices=leaderboard_server.submission_ids)
+    )


 custom_css = """

@@ -143,7 +176,7 @@ footer {visibility: hidden}

 """

+with gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main:
     with gr.Row():
         with gr.Row():
             gr.Markdown(HEADER_MARKDOWN)

@@ -155,17 +188,30 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main

     with gr.Row():
         with gr.Tab("Overall"):
+            results_table = gr.DataFrame(
+                leaderboard_server.get_leaderboard(),
+                interactive=False,
+                label=None,
+                visible=True,
+                elem_classes="leaderboard-table",
+            )
         for c in leaderboard_server.tasks_categories:
             with gr.Tab(c):
+                results_table = gr.DataFrame(
+                    leaderboard_server.get_leaderboard(),
+                    interactive=False,
+                    label=None,
+                    visible=True,
+                    elem_classes="leaderboard-table",
+                )

         with gr.Tab('Model details'):
             gr.Markdown(MORE_DETAILS_MARKDOWN)
+            detail_dropdown = gr.Dropdown(
+                choices=leaderboard_server.submission_ids,
+                label="Select model",
+                interactive=True,
+            )

             with gr.Row():
                 model_description = gr.Text(value='', label='Model description', visible=False, interactive=False)

@@ -174,7 +220,8 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
             detail_dropdown.change(
                 fn=fetch_model_detail,
                 inputs=[detail_dropdown],
+                outputs=[model_description, model_url],
+            )

         with gr.Tab('Submission'):
             with gr.Column():

@@ -213,13 +260,24 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
         pre_submission_btn.click(
             fn=on_submit_pressed,
             concurrency_limit=1,
+            outputs=[pre_submission_btn],
         ).then(
             fn=process_submission,
+            inputs=[
+                submission_team_name_tb,
+                submission_id_tb,
+                description_tb,
+                link_to_model_tb,
+                submission_file_path,
+            ],
+            outputs=[
+                pre_submission_btn,
+                submit_prompt,
+                submission_btn_yes,
+                submission_btn_no,
+                pre_submit_info,
+                pre_submit_table,
+            ],
         )

         submission_btn_yes.click(

@@ -229,8 +287,17 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main

         modal_submit_yes.click(
             fn=submit_results,
+            outputs=[
+                pre_submission_btn,
+                submission_btn_yes,
+                submission_btn_no,
+                submit_prompt,
+                pre_submit_info,
+                pre_submit_table,
+                results_table,
+                modal_submit,
+                detail_dropdown,
+            ],
         )

         modal_submit_no.click(

@@ -240,8 +307,14 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main

         submission_btn_no.click(
             fn=erase_pre_submit,
+            outputs=[
+                pre_submission_btn,
+                submission_btn_yes,
+                submission_btn_no,
+                submit_prompt,
+                pre_submit_info,
+                pre_submit_table,
+            ],
         )
     main.load(on_application_load, inputs=None, outputs=[results_table, detail_dropdown])
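The app.py handlers reformatted above all follow the same Gradio pattern: an event callback returns one gr.update(...) (or component) per entry in its outputs=[...] list, matched by position, to show, hide, or refresh parts of the UI. The following is a minimal, self-contained sketch of that pattern only; it is not part of this commit, and the component names are placeholders.

    import gradio as gr

    def toggle(show: bool):
        # One update per component listed in `outputs`, matched positionally.
        return (
            gr.update(visible=show),                         # show/hide the info panel
            gr.update(interactive=show),                     # enable/disable the button
            gr.update(value="ready" if show else "hidden"),  # refresh the status text
        )

    with gr.Blocks() as demo:
        flag = gr.Checkbox(label="Show details")
        info = gr.Markdown("Some details", visible=False)
        btn = gr.Button("Run", interactive=False)
        status = gr.Textbox(label="Status")
        flag.change(fn=toggle, inputs=[flag], outputs=[info, btn, status])

    if __name__ == "__main__":
        demo.launch()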
compare_significance.py (CHANGED)

@@ -36,7 +36,14 @@ def _get_CMs(i, probabilities, references, thresholds):
                     FN += 1
                 else:
                     TN += 1
+        cm = {
+            "TP": TP,
+            "FP": FP,
+            "TN": TN,
+            "FN": FN,
+            "threshold": threshold,
+            "class": i,
+        }
         confusion_matrices.append(cm)

     return confusion_matrices

@@ -73,16 +80,20 @@ def compute_significance_bootstrap(scores_A, scores_B):
     return pval, delta_orig


+def compute_significance_avg_mcauroc(
+    probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
+    probsB: Sequence[Sequence[float]], referencesB: Sequence[int],
+):
     # compute MC-AUC for model A
     model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
     model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
     delta = np.mean(model_A_scores) - np.mean(model_B_scores)

     # one-tailed test
+    p_value = (
+        (model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
+        / (len(model_A_scores) * len(model_B_scores))
+    )

     return p_value, delta


@@ -114,8 +125,10 @@ def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
     auc_scores_per_class = []
     for i in range(len(n_classes)):
         # for i-th class vs all others
+        fpr[i], _, thresholds[i] = roc_curve(
+            y_true=[1 if x == n_classes[i] else 0 for x in references],
+            y_score=[prob[i] for prob in probs],
+        )

         confusion_matrices = _get_CMs(i, probs, references, thresholds)
         tp, fn = convert_confusion_matrices(confusion_matrices)

@@ -194,13 +207,20 @@ def process_task(task, dataA, dataB, significance_level):
     assert len(dataA[task]) == len(dataB[task])

     if metricA == "avg_mcauroc":
+        p_value, delta = compute_significance_avg_mcauroc(
+            probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+            probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
+        )
     elif metricA in ["acc", "exact_match"]:
+        p_value, delta = compute_significance_ttest(
+            scores_A=dataA[task][0],
+            scores_B=dataB[task][0]
+        )
     elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
+        p_value, delta = compute_significance_bootstrap(
+            scores_A=np.array(dataA[task][0]),
+            scores_B=np.array(dataB[task][0])
+        )
     else:
         raise ValueError(f"Unsupported metric {metricA}")


@@ -228,14 +248,21 @@ def check_significance(fileA, fileB, significance_level=0.05):
         assert len(dataA[task]) == len(dataB[task])

         if metricA == "avg_mcauroc":
+            p_value, delta = compute_significance_avg_mcauroc(
+                probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+                probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
+            )

         elif metricA in ["acc", "exact_match"]:
+            p_value, delta = compute_significance_ttest(
+                scores_A=dataA[task][0],
+                scores_B=dataB[task][0]
+            )
         elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
+            p_value, delta = compute_significance_bootstrap(
+                scores_A=np.array(dataA[task][0]),
+                scores_B=np.array(dataB[task][0])
+            )
         else:
             raise ValueError(f"Unsupported metric {metricA}")
         if delta <= 0:

@@ -253,7 +280,12 @@ def main():
     parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
     parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
     parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
+    parser.add_argument(
+        "--significance_level",
+        type=float,
+        default=0.05,
+        help="Significance level (e.g., 0.05)",
+    )
     args = parser.parse_args()

     result = check_significance(args.modelA, args.modelB, args.significance_level)
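The reformatted p_value expression in compute_significance_avg_mcauroc compares every Monte Carlo AUROC sample of model A against every sample of model B via NumPy broadcasting; the fraction of pairs where A does not beat B serves as the one-tailed p-value estimate. A stand-alone illustration of that computation, with synthetic scores standing in for the output of get_mc_auc_samples (the numbers below are made up):

    import numpy as np

    rng = np.random.default_rng(0)
    # Synthetic MC-AUROC samples for two models (placeholders for get_mc_auc_samples output).
    model_A_scores = rng.normal(0.80, 0.02, size=100)
    model_B_scores = rng.normal(0.78, 0.02, size=100)

    # Broadcasting builds a 100x100 boolean matrix: entry (i, j) is True
    # when sample i of model A is <= sample j of model B.
    worse_or_equal = model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]

    # Fraction of pairs where A fails to improve over B: the one-tailed p-value estimate.
    p_value = worse_or_equal.sum() / (len(model_A_scores) * len(model_B_scores))
    delta = np.mean(model_A_scores) - np.mean(model_B_scores)
    print(f"p_value={p_value:.3f}, delta={delta:.3f}")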
server.py (CHANGED)

@@ -18,12 +18,17 @@ REPO = f"{ORG}/LLM_benchmark_data"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 TASKS_METADATA_PATH = "./tasks_metadata.json"

+
 class LeaderboardServer:
     def __init__(self):
         self.server_address = REPO
         self.repo_type = "dataset"
+        self.local_leaderboard = snapshot_download(
+            self.server_address,
+            repo_type=self.repo_type,
+            token=HF_TOKEN,
+            local_dir="./",
+        )
         self.submisssion_id_to_file = {}  # Map submission ids to file paths
         self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))
         self.tasks_categories = {self.tasks_metadata[task]["category"] for task in self.tasks_metadata}

@@ -33,8 +38,12 @@ class LeaderboardServer:
         self.pre_submit = None

     def update_leaderboard(self):
+        self.local_leaderboard = snapshot_download(
+            self.server_address,
+            repo_type=self.repo_type,
+            token=HF_TOKEN,
+            local_dir="./",
+        )
         self.fetch_existing_models()
         self.tournament_results = self.load_tournament_results()


@@ -96,17 +105,27 @@ class LeaderboardServer:
             else:
                 processed_results.append(local_results)
         dataframe = pd.DataFrame.from_records(processed_results)
+        df_order = (
+            ["submission_id"]
+            + list(self.tasks_metadata.keys())
+            + [
+                col
+                for col in dataframe.columns
+                if col != "submission_id" and col not in self.tasks_metadata.keys()
+            ]
+        )
         dataframe = dataframe[df_order]
+        dataframe = dataframe.rename(
+            columns={key: value["name"] for key, value in self.tasks_metadata.items()}
+        )
         return dataframe

     def start_tournament(self, new_model_id, new_model_file):
         new_tournament = copy.deepcopy(self.tournament_results)
         new_tournament[new_model_id] = {}
+        new_tournament[new_model_id][new_model_id] = {
+            task: False for task in self.tasks_metadata.keys()
+        }

         for model in self.submission_ids:
             res = check_significance(new_model_file, self.submisssion_id_to_file[model])

@@ -124,7 +143,7 @@ class LeaderboardServer:
             data = json.load(f)
         data["metadata"] = metadata
         with open(file, "w") as f:
+            json.dump(data, f, separators=(',', ':'))  # compact JSON

         model_id = metadata["team_name"] + "_" + metadata["submission_id"]
         tournament_results = self.start_tournament(model_id, file)

@@ -145,7 +164,7 @@ class LeaderboardServer:
         # Temporary save tournament results
         tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
         with open(tournament_results_path, "w") as f:
+            json.dump(tournament_results, f, sort_keys=True, indent=2)  # readable JSON

         api.upload_file(
             path_or_fileobj=tournament_results_path,
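The reformatted df_order and rename block in get_leaderboard puts submission_id first, then the task columns in tasks_metadata order, then any remaining columns, and finally maps raw task keys to their display names. A stand-alone sketch of the same idea; the task metadata and records below are hypothetical, not taken from the repository:

    import pandas as pd

    # Hypothetical stand-ins for self.tasks_metadata and the processed results.
    tasks_metadata = {"task_a": {"name": "Task A"}, "task_b": {"name": "Task B"}}
    records = [
        {"submission_id": "team1_run1", "task_b": 0.64, "task_a": 0.71, "link": "https://example.com"},
    ]

    dataframe = pd.DataFrame.from_records(records)

    # submission_id first, then the known tasks, then whatever else is present.
    df_order = (
        ["submission_id"]
        + list(tasks_metadata.keys())
        + [c for c in dataframe.columns if c != "submission_id" and c not in tasks_metadata]
    )
    dataframe = dataframe[df_order]

    # Replace raw task keys with display names for the leaderboard table.
    dataframe = dataframe.rename(columns={k: v["name"] for k, v in tasks_metadata.items()})
    print(dataframe)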