LennartPurucker committed
Commit e3793a3 · 1 Parent(s): 22b0fe7

maint: make more nice looking

data/leaderboard-classification.csv.zip DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d9e301dfeeb8cc7092301268aad4e4c9922517b6288101a588b5a15f5a0aaca9
-size 4679
data/leaderboard-regression.csv.zip DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d9e301dfeeb8cc7092301268aad4e4c9922517b6288101a588b5a15f5a0aaca9
-size 4679
data/{leaderboard-all.csv.zip → tabarena_leaderboard.csv.zip} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d9e301dfeeb8cc7092301268aad4e4c9922517b6288101a588b5a15f5a0aaca9
-size 4679
+oid sha256:c95306a347a69e561a82562c1a6306f5a9f6819a60f458d5e350639c35cde848
+size 10202
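
The combined leaderboard now ships as a single LFS-tracked archive, data/tabarena_leaderboard.csv.zip, which the load_data("tabarena_leaderboard") call in main.py (see the diff below) picks up. As a quick local sanity check, here is a minimal sketch, assuming the archive holds one CSV whose columns match those referenced in load_data (e.g. method, elo, elo+, elo-, rank):

    # Minimal sketch: inspect the renamed leaderboard archive locally.
    # The column names below are assumptions taken from the load_data() diff.
    import pandas as pd

    df = pd.read_csv("data/tabarena_leaderboard.csv.zip")  # pandas reads single-file zip archives directly
    print(df[["method", "elo", "elo+", "elo-", "rank"]].head())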
main.py CHANGED
@@ -67,7 +67,7 @@ def rename_map(model_name: str) -> str:
         "MNCA": "ModernNCA",
         "NN_TORCH": "TorchMLP",
         "FASTAI": "FastaiMLP",
-        "TABPFN": "TabPFNv2",
+        "TABPFNV2": "TabPFNv2",
         "EBM": "EBM",
         "TABDPT": "TabDPT",
         "TABICL": "TabICL",
@@ -88,8 +88,6 @@ def load_data(filename: str):
         f"Loaded dataframe with {len(df_leaderboard)} rows and columns {df_leaderboard.columns}"
     )
 
-    # sort by ELO
-    df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
 
     # add model family information
 
@@ -101,22 +99,51 @@ def load_data(filename: str):
     )
     df_leaderboard["method"] = df_leaderboard["method"].apply(rename_map)
 
+    # elo,elo+,elo-,mrr
+    df_leaderboard["Elo 95% CI"] = (
+        "+"
+        + df_leaderboard["elo+"].round(0).astype(int).astype(str)
+        + "/-"
+        + df_leaderboard["elo-"].round(0).astype(int).astype(str)
+    )
     # select only the columns we want to display
     df_leaderboard = df_leaderboard.loc[
-        :, ["Type", "TypeName", "method", "elo", "rank", "time_train_s", "time_infer_s"]
+        :,
+        [
+            "Type",
+            "TypeName",
+            "method",
+            "elo",
+            "Elo 95% CI",
+            "rank",
+            "normalized-error",
+            "median_time_train_s_per_1K",
+            "median_time_infer_s_per_1K",
+        ],
     ]
 
     # round for better display
-    df_leaderboard = df_leaderboard.round(1)
+    df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(0)
+    df_leaderboard[["median_time_train_s_per_1K", "rank"]] = df_leaderboard[
+        ["median_time_train_s_per_1K", "rank"]
+    ].round(2)
+    df_leaderboard[["normalized-error", "median_time_infer_s_per_1K"]] = df_leaderboard[
+        ["normalized-error", "median_time_infer_s_per_1K"]
+    ].round(3)
+
+    df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
+    df_leaderboard = df_leaderboard.reset_index(drop=True)
+    df_leaderboard = df_leaderboard.reset_index(names="#")
 
     # rename some columns
     return df_leaderboard.rename(
         columns={
-            "time_train_s": "training time (s) [⬇️]",
-            "time_infer_s": "inference time (s) [⬇️]",
+            "median_time_train_s_per_1K": "Median Train Time (s/1K) [⬇️]",
+            "median_time_infer_s_per_1K": "Median Predict Time (s/1K) [⬇️]",
             "method": "Model",
             "elo": "Elo [⬆️]",
             "rank": "Rank [⬇️]",
+            "normalized-error": "Normalized Error [⬇️]",
         }
     )
 
@@ -138,6 +165,26 @@ def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
         "(tuned + ensemble)"
     ) | df_leaderboard["Model"].str.endswith("(4h)")
 
+    # Add Imputed count postfix
+    mask = df_leaderboard["Model"].str.startswith("TabPFNv2")
+    df_leaderboard.loc[mask, "Model"] = (
+        df_leaderboard.loc[mask, "Model"] + " [35.29% IMPUTED]"
+    )
+    mask = df_leaderboard["Model"].str.startswith("TabICL")
+    df_leaderboard.loc[mask, "Model"] = (
+        df_leaderboard.loc[mask, "Model"] + " [29.41% IMPUTED]"
+    )
+
+    df_leaderboard["Imputed"] = df_leaderboard["Model"].str.startswith(
+        "TabPFNv2"
+    ) | df_leaderboard["Model"].str.startswith("TabICL")
+    df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
+        {
+            True: "Imputed",
+            False: "Not Imputed",
+        }
+    )
+
     return Leaderboard(
         value=df_leaderboard,
         select_columns=SelectColumns(
@@ -152,17 +199,29 @@ def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
             "Only Default",
             "Only Tuned",
             "Only Tuned + Ensemble",
+            "Imputed",
         ],
         search_columns=["Model", "Type"],
         filter_columns=[
-            ColumnFilter(
-                "TypeFiler", type="checkboxgroup", label="Filter by Model Type"
-            ),
+            ColumnFilter("TypeFiler", type="checkboxgroup", label="Model Types."),
             ColumnFilter("Only Default", type="boolean", default=False),
             ColumnFilter("Only Tuned", type="boolean", default=False),
             ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
+            ColumnFilter(
+                "Imputed",
+                type="checkboxgroup",
+                label="(Not) Imputed Models.",
+                info="We impute the performance for models that cannot run on all"
+                " datasets due to task or dataset size constraints (e.g. TabPFN,"
+                " TabICL). We impute with the performance of a default RandomForest. "
+                " We add a postfix [X% IMPUTED] to the model if any results were "
+                "imputed. The X% shows the percentage of"
+                " datasets that were imputed. In general, imputation negatively"
+                " represents the model performance, punishing the model for not"
+                " being able to run on all datasets.",
+            ),
         ],
-        bool_checkboxgroup_label="Custom Views (Exclusive, only toggle one at a time):",
+        bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
     )
 
 
@@ -174,7 +233,7 @@ def main():
 
     with gr.Tabs(elem_classes="tab-buttons"):
         with gr.TabItem("🏅 Overall", elem_id="llm-benchmark-tab-table", id=2):
-            df_leaderboard = load_data("leaderboard-all")
+            df_leaderboard = load_data("tabarena_leaderboard")
             make_leaderboard(df_leaderboard)
 
     # TODO: decide on which subsets we want to support here.
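
The info text on the new Imputed filter above states the policy: models that cannot run on every dataset (e.g. TabPFNv2, TabICL) have their missing results filled with the score of a default RandomForest, and the affected rows get an [X% IMPUTED] postfix. That imputation happens upstream of this app; the following is only a hedged sketch of what such a step could look like, where df_results, its column names, and the baseline label are illustrative assumptions rather than code from this repository:

    # Hypothetical sketch of the imputation policy described in the filter's info text.
    # df_results, the column names, and "RandomForest (default)" are illustrative only.
    import pandas as pd

    def impute_missing_results(
        df_results: pd.DataFrame, baseline: str = "RandomForest (default)"
    ) -> pd.DataFrame:
        """Fill each model's missing (model, dataset) scores with the baseline's score."""
        wide = df_results.pivot(index="dataset", columns="method", values="score")
        imputed_share = wide.isna().mean()  # fraction of datasets imputed, per model
        wide = wide.apply(lambda col: col.fillna(wide[baseline]))
        long = wide.reset_index().melt(id_vars="dataset", var_name="method", value_name="score")
        long["imputed_pct"] = long["method"].map((imputed_share * 100).round(2))
        return long

Under this sketch, the [35.29% IMPUTED] and [29.41% IMPUTED] postfixes added in make_leaderboard would correspond to the imputed_pct values for TabPFNv2 and TabICL.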