LennartPurucker committed
Commit cb35049 · 1 Parent(s): ab82350

shorten names

Files changed (2)
  1. main.py +11 -11
  2. website_texts.py +12 -12
main.py CHANGED
@@ -155,8 +155,8 @@ def load_data(filename: str, data_source="data"):
         "method": "Model",
         "elo": "Elo [⬆️]",
         "rank": "Rank [⬇️]",
-        "normalized-score": "Normalized Score [⬆️]",
-        "hmr": "Harmonic Mean Rank [⬇️]",
+        "normalized-score": "Score [⬆️]",
+        "hmr": "Harmonic Rank [⬇️]",
         "improvability": "Improvability (%) [⬇️]",
         "imputed": "Imputed (%) [⬇️]",
         "imputed_bool": "Imputed",
@@ -332,44 +332,44 @@ def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
 
 def _get_lbs() -> tuple[LBContainer, ...]:
     ta = LBContainer(
-        name="🏅 TabArena",
+        name="🏅 Main",
         file_name="full-imputed/tabarena_leaderboard.csv",
         overview_image_name="full-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all datasets including all (imputed) models.",
     )
     ta_lite = LBContainer(
-        name="TabArena-Lite",
+        name="Lite",
         file_name="lite/full-imputed/tabarena_leaderboard.csv",
         overview_image_name="lite/full-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for one split (1st fold, 1st repeat) for all datasets including all (imputed) models.",
     )
     ta_clf = LBContainer(
-        name="TabArena-Classification",
+        name="Classification",
         file_name="full-imputed-cls/tabarena_leaderboard.csv",
         overview_image_name="full-imputed-cls/tuning-impact-elo.png",
         blurb="Leaderboard for all 38 classification datasets including all (imputed) models.",
     )
     ta_reg = LBContainer(
-        name="TabArena-Regression",
+        name="Regression",
         file_name="full-imputed-reg/tabarena_leaderboard.csv",
         # FIXME: get overview image without TabICL
         overview_image_name="full-imputed-reg/tuning-impact-elo.png",
         blurb="Leaderboard for all 13 regression datasets including all (imputed) models.",
     )
     ta_tabicl = LBContainer(
-        name="⚡ TabArena (TabICL-compatible)",
+        name="⚡ TabICL-data",
         file_name="tabicl-imputed/tabarena_leaderboard.csv",
         overview_image_name="tabicl-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all 36 datasets within the constraints of TabICL including all (imputed) models.",
     )
     ta_tabpfn = LBContainer(
-        name="⚡ TabArena (TabPFN-compatible)",
+        name="⚡ TabPFN-data",
         file_name="tabpfn-imputed/tabarena_leaderboard.csv",
         overview_image_name="tabpfn-imputed/tuning-impact-elo.png",
         blurb="Leaderboard for all 33 datasets within the constraints of TabPFN including all (imputed) models.",
     )
     ta_tabpfn_tabicl = LBContainer(
-        name="TabArena (TabPFN-TabICL-compatible)",
+        name="TabPFN/ICL-data",
         file_name="tabpfn-tabicl/tabarena_leaderboard.csv",
         overview_image_name="tabpfn-tabicl/tuning-impact-elo.png",
         blurb="Leaderboard for all 26 datasets within the constraints of TabPFN and TabICL including all models.",
@@ -435,7 +435,7 @@ def main():
         ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl = _get_lbs()
 
         # -- LB Overview
-        gr.Markdown("## 🗺️ Overview")
+        gr.Markdown("## 🗺️ TabArena Overview")
         ordered_lbs = [
             ta,
             ta_clf,
@@ -447,7 +447,7 @@
         ]
         make_overview_leaderboard(lbs=ordered_lbs)
 
-        gr.Markdown("## 🏆 Leaderboards")
+        gr.Markdown("## 🏆 TabArena Leaderboards")
         with gr.Tabs(elem_classes="tab-buttons"):
            for lb_id, lb in enumerate(ordered_lbs):
                with gr.TabItem(lb.name, elem_id="llm-benchmark-tab-table", id=lb_id):
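`LBContainer` itself is not part of this diff. As a rough sketch of the structure the renamed tabs rely on, assuming it is a plain dataclass whose fields mirror the keyword arguments above (an assumption, not the repository's actual definition):

```python
from dataclasses import dataclass

import gradio as gr


@dataclass
class LBContainer:
    """Hypothetical container matching the keyword arguments used in the diff above."""

    name: str                 # tab label, e.g. "🏅 Main" or "⚡ TabPFN-data"
    file_name: str            # leaderboard CSV, relative to the data directory
    overview_image_name: str  # per-leaderboard overview figure
    blurb: str                # one-line description shown above the table


def render_tabs(ordered_lbs: list[LBContainer]) -> None:
    """Minimal sketch of the tab loop from main(); assumes an active gr.Blocks context."""
    with gr.Tabs(elem_classes="tab-buttons"):
        for lb_id, lb in enumerate(ordered_lbs):
            with gr.TabItem(lb.name, elem_id="llm-benchmark-tab-table", id=lb_id):
                gr.Markdown(lb.blurb)  # placeholder; the real app builds the Leaderboard here
```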
website_texts.py CHANGED
@@ -28,8 +28,7 @@ metrics. See `More Details` for more information on the metrics.
 task or dataset size constraints (e.g. TabPFN, TabICL). In general, imputation
 negatively represents the model performance, punishing the model for not being able
 to run on all datasets. We provide leaderboards computed only on the subset of datasets
-where TabPFN, TabICL, or both can run. We denote these leaderboards by adding a
-`X-compatible` postfix.
+where TabPFN, TabICL, or both can run. We denote these leaderboards by `X-data`.
 """
 OVERVIEW_REF_PIPE = """
 The leaderboard includes a reference pipeline, which is applied
@@ -73,20 +72,21 @@ random forest configuration across all figures, and perform 100 rounds of bootst
 to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
 classification, log-loss for multiclass classification, and RMSE for regression.
 
-#### Normalized Score
-Following TabRepo, we linearly rescale the error such that the best method has a
-normalized score of one, and the median method has a normalized score of 0. Scores
-below zero are clipped to zero. These scores are then averaged across datasets.
+#### Score
+Following TabRepo, we compute a normalized score to provide an additional relative
+comparison. We linearly rescale the error such that the best method has a normalized
+score of one, and the median method has a normalized score of 0. Scores below zero
+are clipped to zero. These scores are then averaged across datasets.
 
 #### Average Rank
 Ranks of methods are computed on each dataset (lower is better) and averaged.
 
-#### Harmonic Mean Rank
-Taking the harmonic mean of ranks, 1/((1/N) * sum(1/rank_i for i in range(N))),
-more strongly favors methods having very low ranks on some datasets. It therefore favors
-methods that are sometimes very good and sometimes very bad over methods that are
-always mediocre, as the former are more likely to be useful in conjunction with
-other methods.
+#### Harmonic Rank
+We compute the harmonic mean of ranks across datasets. The harmonic mean of ranks,
+1/((1/N) * sum(1/rank_i for i in range(N))), more strongly favors methods having very
+low ranks on some datasets. It therefore favors methods that are sometimes very good
+and sometimes very bad over methods that are always mediocre, as the former are more
+likely to be useful in conjunction with other methods.
 
 #### Improvability
 We introduce improvability as a metric that measures how many percent lower the error
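To make the renamed `Score` and `Harmonic Rank` columns concrete, here is a small sketch of both computations as described in the text above. The function names, array shapes, and toy numbers are illustrative and not taken from the TabArena codebase.

```python
import numpy as np


def normalized_scores(errors: np.ndarray) -> np.ndarray:
    """Per-dataset 'Score': best method -> 1, median method -> 0, clipped at 0.

    `errors` has shape (n_methods, n_datasets); lower error is better.
    """
    best = errors.min(axis=0)
    median = np.median(errors, axis=0)
    # Linear rescaling of the error; degenerate datasets (best == median) score 0.
    denom = np.where(median > best, median - best, np.inf)
    scores = (median - errors) / denom
    return np.clip(scores, 0.0, None)


def harmonic_mean_rank(ranks: np.ndarray) -> np.ndarray:
    """Harmonic mean of per-dataset ranks: N / sum(1 / rank_i)."""
    n_datasets = ranks.shape[1]
    return n_datasets / (1.0 / ranks).sum(axis=1)


# Toy example with 3 methods on 4 datasets (illustrative numbers only, no ties).
errors = np.array(
    [
        [0.10, 0.20, 0.30, 0.25],  # method A
        [0.12, 0.18, 0.35, 0.40],  # method B
        [0.30, 0.50, 0.28, 0.26],  # method C
    ]
)
ranks = errors.argsort(axis=0).argsort(axis=0) + 1  # 1 = best on that dataset

print(normalized_scores(errors).mean(axis=1))  # Score, averaged across datasets
print(harmonic_mean_rank(ranks))               # strongly rewards occasional rank-1 results
```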