Commit · cb35049
1 Parent(s): ab82350

shorten names

- main.py  +11 -11
- website_texts.py  +12 -12
main.py
CHANGED

@@ -155,8 +155,8 @@ def load_data(filename: str, data_source="data"):
         "method": "Model",
         "elo": "Elo [⬆️]",
         "rank": "Rank [⬇️]",
-        "normalized-score": "
-        "hmr": "Harmonic
+        "normalized-score": "Score [⬆️]",
+        "hmr": "Harmonic Rank [⬇️]",
         "improvability": "Improvability (%) [⬇️]",
         "imputed": "Imputed (%) [⬇️]",
         "imputed_bool": "Imputed",

@@ -332,44 +332,44 @@ def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:

 def _get_lbs() -> tuple[LBContainer, ...]:
     ta = LBContainer(
-        name="🏅
+        name="🏅 Main",
         file_name="full-imputed/tabarena_leaderboard.csv",
         overview_image_name="full-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all datasets including all (imputed) models.",
     )
     ta_lite = LBContainer(
-        name="
+        name="Lite",
         file_name="lite/full-imputed/tabarena_leaderboard.csv",
         overview_image_name="lite/full-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for one split (1st fold, 1st repeat) for all datasets including all (imputed) models.",
     )
     ta_clf = LBContainer(
-        name="
+        name="Classification",
         file_name="full-imputed-cls/tabarena_leaderboard.csv",
         overview_image_name="full-imputed-cls/tuning-impact-elo.png",
         blurb="Leaderboard for all 38 classification datasets including all (imputed) models.",
     )
     ta_reg = LBContainer(
-        name="
+        name="Regression",
         file_name="full-imputed-reg/tabarena_leaderboard.csv",
         # FIXME: get overview image without TabICL
         overview_image_name="full-imputed-reg/tuning-impact-elo.png",
         blurb="Leaderboard for all 13 regression datasets including all (imputed) models.",
     )
     ta_tabicl = LBContainer(
-        name="⚡
+        name="⚡ TabICL-data",
         file_name="tabicl-imputed/tabarena_leaderboard.csv",
         overview_image_name="tabicl-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all 36 datasets within the constraints of TabICL including all (imputed) models.",
     )
     ta_tabpfn = LBContainer(
-        name="⚡
+        name="⚡ TabPFN-data",
         file_name="tabpfn-imputed/tabarena_leaderboard.csv",
         overview_image_name="tabpfn-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all 33 datasets within the constraints of TabPFN including all (imputed) models.",
     )
     ta_tabpfn_tabicl = LBContainer(
-        name="
+        name="TabPFN/ICL-data",
         file_name="tabpfn-tabicl/tabarena_leaderboard.csv",
         overview_image_name="tabpfn-tabicl/tuning-impact-elo.png",
         blurb="Leaderboard for all 26 datasets within the constraints of TabPFN and TabICL including all models.",

@@ -435,7 +435,7 @@ def main():
     ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl = _get_lbs()

     # -- LB Overview
-    gr.Markdown("## 🗺️ Overview")
+    gr.Markdown("## 🗺️ TabArena Overview")
     ordered_lbs = [
         ta,
         ta_clf,

@@ -447,7 +447,7 @@ def main():
     ]
     make_overview_leaderboard(lbs=ordered_lbs)

-    gr.Markdown("## 🏆 Leaderboards")
+    gr.Markdown("## 🏆 TabArena Leaderboards")
     with gr.Tabs(elem_classes="tab-buttons"):
         for lb_id, lb in enumerate(ordered_lbs):
             with gr.TabItem(lb.name, elem_id="llm-benchmark-tab-table", id=lb_id):
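The LBContainer class itself is not part of this diff. As a rough illustrative sketch only (field types and the dataclass form are assumptions inferred from the keyword arguments above, not taken from the repository), it could look like this:

    from dataclasses import dataclass

    @dataclass
    class LBContainer:
        """Hypothetical container for one leaderboard tab (fields inferred from the diff above)."""
        name: str                 # tab label, e.g. "🏅 Main"
        file_name: str            # CSV file holding the leaderboard table
        overview_image_name: str  # PNG shown on the overview page
        blurb: str                # short description rendered with the leaderboard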
website_texts.py
CHANGED

@@ -28,8 +28,7 @@ metrics. See `More Details` for more information on the metrics.
 task or dataset size constraints (e.g. TabPFN, TabICL). In general, imputation
 negatively represents the model performance, punishing the model for not being able
 to run on all datasets. We provide leaderboards computed only on the subset of datasets
-where TabPFN, TabICL, or both can run. We denote these leaderboards by
-`X-compatible` postfix.
+where TabPFN, TabICL, or both can run. We denote these leaderboards by `X-data`.
 """
 OVERVIEW_REF_PIPE = """
 The leaderboard includes a reference pipeline, which is applied

@@ -73,20 +72,21 @@ random forest configuration across all figures, and perform 100 rounds of bootst
 to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
 classification, log-loss for multiclass classification, and RMSE for regression.

-####
-Following TabRepo, we
-
-
+#### Score
+Following TabRepo, we compute a normalized score to provide an additional relative
+comparison. We linearly rescale the error such that the best method has a normalized
+score of one, and the median method has a normalized score of 0. Scores below zero
+are clipped to zero. These scores are then averaged across datasets.

 #### Average Rank
 Ranks of methods are computed on each dataset (lower is better) and averaged.

-#### Harmonic
-
-
-
-always mediocre, as the former are more
-other methods.
+#### Harmonic Rank
+We compute the harmonic mean of ranks across datasets. The harmonic mean of ranks,
+1/((1/N) * sum(1/rank_i for i in range(N))), more strongly favors methods having very
+low ranks on some datasets. It therefore favors methods that are sometimes very good
+and sometimes very bad over methods that are always mediocre, as the former are more
+likely to be useful in conjunction with other methods.

 #### Improvability
 We introduce improvability as a metric that measures how many percent lower the error
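The normalized score added in the `#### Score` text above can be written down directly. The following is a small illustrative sketch of that rescaling for a single dataset (function and variable names are mine, not from the repository):

    import numpy as np

    def normalized_scores(errors: np.ndarray) -> np.ndarray:
        """Rescale per-method errors on one dataset: best method -> 1, median method -> 0.

        `errors` holds one error value per method (lower is better). Scores below
        zero are clipped to zero, as described in the leaderboard text.
        """
        best = errors.min()
        median = np.median(errors)
        if median == best:  # degenerate case: every method matches the best error
            return np.ones_like(errors, dtype=float)
        scores = (median - errors) / (median - best)
        return np.clip(scores, 0.0, 1.0)

Per the description, these per-dataset scores would then be averaged across datasets for each method.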
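Likewise, the harmonic rank formula quoted above, 1/((1/N) * sum(1/rank_i for i in range(N))), is simply the harmonic mean of one method's per-dataset ranks; a minimal sketch (helper name assumed):

    def harmonic_rank(ranks: list[float]) -> float:
        """Harmonic mean of a method's ranks across N datasets.

        A few very low ranks pull this value down strongly, so methods that are
        sometimes very good are favored over methods that are always mediocre.
        """
        n = len(ranks)
        return n / sum(1.0 / r for r in ranks)

For example, ranks [1, 9] give a harmonic rank of 2 / (1 + 1/9) = 1.8, while the always-mediocre ranks [5, 5] give 5.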