Commit · cb35049
1 Parent(s): ab82350

shorten names

- main.py  +11 -11
- website_texts.py  +12 -12
main.py
CHANGED

@@ -155,8 +155,8 @@ def load_data(filename: str, data_source="data"):
         "method": "Model",
         "elo": "Elo [⬆️]",
         "rank": "Rank [⬇️]",
-        "normalized-score": "
-        "hmr": "Harmonic
+        "normalized-score": "Score [⬆️]",
+        "hmr": "Harmonic Rank [⬇️]",
         "improvability": "Improvability (%) [⬇️]",
         "imputed": "Imputed (%) [⬇️]",
         "imputed_bool": "Imputed",

@@ -332,44 +332,44 @@ def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:

 def _get_lbs() -> tuple[LBContainer, ...]:
     ta = LBContainer(
-        name="🏅
+        name="🏅 Main",
         file_name="full-imputed/tabarena_leaderboard.csv",
         overview_image_name="full-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all datasets including all (imputed) models.",
     )
     ta_lite = LBContainer(
-        name="
+        name="Lite",
         file_name="lite/full-imputed/tabarena_leaderboard.csv",
         overview_image_name="lite/full-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for one split (1st fold, 1st repeat) for all datasets including all (imputed) models.",
     )
     ta_clf = LBContainer(
-        name="
+        name="Classification",
         file_name="full-imputed-cls/tabarena_leaderboard.csv",
         overview_image_name="full-imputed-cls/tuning-impact-elo.png",
         blurb="Leaderboard for all 38 classification datasets including all (imputed) models.",
     )
     ta_reg = LBContainer(
-        name="
+        name="Regression",
         file_name="full-imputed-reg/tabarena_leaderboard.csv",
         # FIXME: get overview image without TabICL
         overview_image_name="full-imputed-reg/tuning-impact-elo.png",
         blurb="Leaderboard for all 13 regression datasets including all (imputed) models.",
     )
     ta_tabicl = LBContainer(
-        name="⚡
+        name="⚡ TabICL-data",
         file_name="tabicl-imputed/tabarena_leaderboard.csv",
         overview_image_name="tabicl-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all 36 datasets within the constraints of TabICL including all (imputed) models.",
     )
     ta_tabpfn = LBContainer(
-        name="⚡
+        name="⚡ TabPFN-data",
         file_name="tabpfn-imputed/tabarena_leaderboard.csv",
         overview_image_name="tabpfn-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all 33 datasets within the constraints of TabPFN including all (imputed) models.",
     )
     ta_tabpfn_tabicl = LBContainer(
-        name="
+        name="TabPFN/ICL-data",
         file_name="tabpfn-tabicl/tabarena_leaderboard.csv",
         overview_image_name="tabpfn-tabicl/tuning-impact-elo.png",
         blurb="Leaderboard for all 26 datasets within the constraints of TabPFN and TabICL including all models.",

@@ -435,7 +435,7 @@ def main():
     ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl = _get_lbs()

     # -- LB Overview
-    gr.Markdown("## 🗺️ Overview")
+    gr.Markdown("## 🗺️ TabArena Overview")
     ordered_lbs = [
         ta,
         ta_clf,

@@ -447,7 +447,7 @@ def main():
     ]
     make_overview_leaderboard(lbs=ordered_lbs)

-    gr.Markdown("## 🏆 Leaderboards")
+    gr.Markdown("## 🏆 TabArena Leaderboards")
     with gr.Tabs(elem_classes="tab-buttons"):
         for lb_id, lb in enumerate(ordered_lbs):
             with gr.TabItem(lb.name, elem_id="llm-benchmark-tab-table", id=lb_id):
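The LBContainer class itself is not part of this diff. As a rough illustrative sketch only (field types and the dataclass form are assumptions inferred from the keyword arguments above, not taken from the repository), it could look like this:

    from dataclasses import dataclass

    @dataclass
    class LBContainer:
        """Hypothetical container for one leaderboard tab (fields inferred from the diff above)."""
        name: str                 # tab label, e.g. "🏅 Main"
        file_name: str            # CSV file holding the leaderboard table
        overview_image_name: str  # PNG shown on the overview page
        blurb: str                # short description rendered with the leaderboard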
website_texts.py
CHANGED

@@ -28,8 +28,7 @@ metrics. See `More Details` for more information on the metrics.
 task or dataset size constraints (e.g. TabPFN, TabICL). In general, imputation
 negatively represents the model performance, punishing the model for not being able
 to run on all datasets. We provide leaderboards computed only on the subset of datasets
-where TabPFN, TabICL, or both can run. We denote these leaderboards by
-`X-compatible` postfix.
+where TabPFN, TabICL, or both can run. We denote these leaderboards by `X-data`.
 """
 OVERVIEW_REF_PIPE = """
 The leaderboard includes a reference pipeline, which is applied

@@ -73,20 +72,21 @@ random forest configuration across all figures, and perform 100 rounds of bootst
 to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
 classification, log-loss for multiclass classification, and RMSE for regression.

-####
-Following TabRepo, we
-
-
+#### Score
+Following TabRepo, we compute a normalized score to provide an additional relative
+comparison. We linearly rescale the error such that the best method has a normalized
+score of one, and the median method has a normalized score of 0. Scores below zero
+are clipped to zero. These scores are then averaged across datasets.

 #### Average Rank
 Ranks of methods are computed on each dataset (lower is better) and averaged.

-#### Harmonic
-
-
-
-always mediocre, as the former are more
-other methods.
+#### Harmonic Rank
+We compute the harmonic mean of ranks across datasets. The harmonic mean of ranks,
+1/((1/N) * sum(1/rank_i for i in range(N))), more strongly favors methods having very
+low ranks on some datasets. It therefore favors methods that are sometimes very good
+and sometimes very bad over methods that are always mediocre, as the former are more
+likely to be useful in conjunction with other methods.

 #### Improvability
 We introduce improvability as a metric that measures how many percent lower the error
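The normalized score added in the `#### Score` text above can be written down directly. The following is a small illustrative sketch of that rescaling for a single dataset (function and variable names are mine, not from the repository):

    import numpy as np

    def normalized_scores(errors: np.ndarray) -> np.ndarray:
        """Rescale per-method errors on one dataset: best method -> 1, median method -> 0.

        `errors` holds one error value per method (lower is better). Scores below
        zero are clipped to zero, as described in the leaderboard text.
        """
        best = errors.min()
        median = np.median(errors)
        if median == best:  # degenerate case: every method matches the best error
            return np.ones_like(errors, dtype=float)
        scores = (median - errors) / (median - best)
        return np.clip(scores, 0.0, 1.0)

Per the description, these per-dataset scores would then be averaged across datasets for each method.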
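Likewise, the harmonic rank formula quoted above, 1/((1/N) * sum(1/rank_i for i in range(N))), is simply the harmonic mean of one method's per-dataset ranks; a minimal sketch (helper name assumed):

    def harmonic_rank(ranks: list[float]) -> float:
        """Harmonic mean of a method's ranks across N datasets.

        A few very low ranks pull this value down strongly, so methods that are
        sometimes very good are favored over methods that are always mediocre.
        """
        n = len(ranks)
        return n / sum(1.0 / r for r in ranks)

For example, ranks [1, 9] give a harmonic rank of 2 / (1 + 1/9) = 1.8, while the always-mediocre ranks [5, 5] give 5.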