Commit d425853
Parent(s): cb51391

add: more metric cols

Files changed:
- data/tabarena_leaderboard.csv.zip (+2 -2)
- main.py (+68 -16)
data/tabarena_leaderboard.csv.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7b23c724927320a54d5e4edcf1b2d938bc818c1dfd5461f1a8d204bb0b44d095
+size 10582
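Note: this file is tracked with Git LFS, so the diff only shows the updated pointer (new object hash and size), not the data itself. The sketch below is not part of the Space's code; it assumes the LFS object has been fetched (`git lfs pull`) and that the archive contains a single CSV, which pandas can read directly.

```python
# Sketch only: inspect the refreshed leaderboard data locally.
# Assumes `git lfs pull` has been run and the zip holds exactly one CSV.
import pandas as pd

df = pd.read_csv("data/tabarena_leaderboard.csv.zip")
print(df.shape)
print(sorted(df.columns))
```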
main.py CHANGED
@@ -26,6 +26,9 @@ tuned configurations. Each model is implemented in a tested real-world pipeline
 optimized to get the most out of the model by the maintainers of TabArena, and where
 possible together with the authors of the model.

+**Metrics:** The leaderboard is ranked based on Elo. We present several additional
+metrics. See the `About` tab for more information on the metrics.
+
 **Reference Pipeline:** The leaderboard includes a reference pipeline, which is applied
 independently of the tuning protocol and constraints we constructed for models within TabArena.
 The reference pipeline aims to represent the performance quickly achievable by a
@@ -39,22 +42,68 @@ The current leaderboard is based on TabArena-v0.1.


 ABOUT_TEXT = """
+TabArena is a living benchmark system for predictive machine learning on tabular data.
+We introduce TabArena and provide an overview of TabArena-v0.1 in our paper: TBA.
+
 ## Using TabArena for Benchmarking
 To compare your own methods to the pre-computed results for all models on the leaderboard,
 you can use the TabArena framework. For examples on how to use TabArena for benchmarking,
 please see https://github.com/TabArena/tabarena_benchmarking_examples

+## Contributing to the Leaderboard; Contributing Models
+For guidelines on how to contribute your model to TabArena, or the result of your model
+to the official leaderboard, please see the appendix of our paper: TBA.
+
 ## Contributing Data
 For anything related to the datasets used in TabArena, please see https://github.com/TabArena/tabarena_dataset_curation

-
-
-
+---
+
+## Leaderboard Documentation
+
+The leaderboard is ranked by Elo and includes several other metrics. Here is a short
+description of these metrics:
+
+#### Elo
+We evaluate models using the Elo rating system, following Chatbot Arena. Elo is a
+pairwise comparison-based rating system where each model's rating predicts its expected
+win probability against others, with a 400-point Elo gap corresponding to a 10 to 1
+(91\%) expected win rate. We calibrate 1000 Elo to the performance of our default
+random forest configuration across all figures, and perform 100 rounds of bootstrapping
+to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
+classification, log-loss for multiclass classification, and RMSE for regression.
+
+#### Normalized Score
+Following TabRepo, we linearly rescale the error such that the best method has a
+normalized score of one, and the median method has a normalized score of zero. Scores
+below zero are clipped to zero. These scores are then averaged across datasets.

-
-
+#### Average Rank
+Ranks of methods are computed on each dataset (lower is better) and averaged.

-
+#### Harmonic Mean Rank
+Taking the harmonic mean of ranks, 1/((1/N) * sum(1/rank_i for i in range(N))),
+more strongly favors methods having very low ranks on some datasets. It therefore favors
+methods that are sometimes very good and sometimes very bad over methods that are
+always mediocre, as the former are more likely to be useful in conjunction with
+other methods.
+
+#### Improvability
+We introduce improvability as a metric that measures how many percent lower the error
+of the best method is than that of the current method on a dataset. This is then averaged over
+datasets. Formally, for a single dataset, improvability is (err_i - besterr_i)/err_i * 100\%.
+Improvability is always between $0\%$ and $100\%$.
+
+---
+
+## Contact
+
+For most inquiries, please open issues in the relevant GitHub repository or here on
+HuggingFace.
+
+For any other inquiries related to TabArena, please reach out to: [email protected]
+
+### Core Maintainers
 The current core maintainers of TabArena are:
 [Nick Erickson](https://github.com/Innixma),
 [Lennart Purucker](https://github.com/LennartPurucker/),
@@ -139,6 +188,9 @@ def load_data(filename: str):
         + df_leaderboard["elo-"].round(0).astype(int).astype(str)
     )
     # select only the columns we want to display
+    df_leaderboard["normalized-score"] = 1 - df_leaderboard["normalized-error"]
+    df_leaderboard["hmr"] = 1 / df_leaderboard["mrr"]
+    df_leaderboard["improvability"] = 100 * df_leaderboard["champ_delta"]
     df_leaderboard = df_leaderboard.loc[
         :,
         [
@@ -147,8 +199,10 @@ def load_data(filename: str):
             "method",
             "elo",
             "Elo 95% CI",
+            "normalized-score",
             "rank",
-            "
+            "hmr",
+            "improvability",
             "median_time_train_s_per_1K",
             "median_time_infer_s_per_1K",
         ],
@@ -158,11 +212,11 @@ def load_data(filename: str):
     df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(
         0
     )
-    df_leaderboard[["median_time_train_s_per_1K", "rank"]] = df_leaderboard[
-        ["median_time_train_s_per_1K", "rank"]
+    df_leaderboard[["median_time_train_s_per_1K", "rank", "hmr"]] = df_leaderboard[
+        ["median_time_train_s_per_1K", "rank", "hmr"]
     ].round(2)
-    df_leaderboard[["normalized-
-        ["normalized-
+    df_leaderboard[["normalized-score", "median_time_infer_s_per_1K", "improvability"]] = df_leaderboard[
+        ["normalized-score", "median_time_infer_s_per_1K", "improvability"]
     ].round(3)

     df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
@@ -177,14 +231,12 @@ def load_data(filename: str):
             "method": "Model",
             "elo": "Elo [⬆️]",
             "rank": "Rank [⬇️]",
-            "normalized-
+            "normalized-score": "Normalized Score [⬆️]",
+            "hmr": "Harmonic Mean Rank [⬇️]",
+            "improvability": "Improvability (%) [⬇️]",
         }
     )

-    # TODO show ELO +/- sem
-    # TODO: rename and re-order columns
-
-
 def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
     df_leaderboard["TypeFiler"] = df_leaderboard["TypeName"].apply(
         lambda m: f"{m} {model_type_emoji[m]}"
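The Elo section added to ABOUT_TEXT states that a 400-point gap corresponds to roughly 10-to-1 (about 91%) expected wins. That figure follows from the standard Elo win expectancy; the snippet below is illustrative only and is not TabArena's rating code.

```python
# Illustrative only, not TabArena's Elo-fitting code: the standard Elo win
# expectancy, showing that a 400-point rating gap gives ~10:1 (about 91%) odds.
def elo_win_probability(rating_a: float, rating_b: float) -> float:
    """Expected probability that a model rated `rating_a` beats one rated `rating_b`."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))

print(elo_win_probability(1400, 1000))  # ~0.909, i.e. roughly 10 wins out of 11
```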
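The new columns in `load_data` are derived from precomputed aggregates (`normalized-error`, `mrr`, `champ_delta`). The sketch below uses toy per-dataset numbers to show how such aggregates relate to the definitions added to ABOUT_TEXT; the variable names and values are illustrative, not part of the Space's data or API.

```python
# Toy sketch of the per-dataset quantities behind the new leaderboard columns.
# Values and names are illustrative; main.py reads these as precomputed aggregates.
import numpy as np

err = np.array([0.20, 0.35, 0.10])         # one method's error on three datasets
best_err = np.array([0.15, 0.35, 0.02])    # best method's error per dataset
median_err = np.array([0.30, 0.50, 0.12])  # median method's error per dataset
rank = np.array([3, 1, 5])                 # this method's rank per dataset

# Normalized score: best method -> 1, median method -> 0, clipped at 0, then averaged.
norm_error = np.clip((err - best_err) / (median_err - best_err), 0.0, 1.0)
normalized_score = np.mean(1.0 - norm_error)

# Harmonic mean rank: 1 / mean(1 / rank).
hmr = 1.0 / np.mean(1.0 / rank)

# Improvability: how many percent lower the best error is than this method's error.
improvability = np.mean((err - best_err) / err) * 100

print(round(normalized_score, 3), round(hmr, 2), round(improvability, 1))
```

Since the harmonic mean of ranks is the reciprocal of the mean reciprocal rank, `main.py` can compute it directly as `1 / df_leaderboard["mrr"]`.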
|