geoalgo committed on
Commit
16d8300
·
1 Parent(s): 6d5f6e0

add comments

Browse files
Files changed (2) hide show
  1. constants.py +7 -7
  2. main.py +24 -10
constants.py CHANGED
@@ -1,4 +1,4 @@
1
- class MethodTypes:
2
  col_name: str = "method_type"
3
  automl: str = "AutoML"
4
  tree: str = "Tree-based"
@@ -10,10 +10,10 @@ class MethodTypes:
10
 
11
 
12
  model_type_emoji = {
13
- MethodTypes.tree: "🌴",
14
- MethodTypes.foundational: "🧠",
15
- MethodTypes.finetuned: "🌐",
16
- MethodTypes.automl: "🤖",
17
- MethodTypes.baseline: "πŸ“",
18
- MethodTypes.other: "❓",
19
  }
 
1
+ class Constants:
2
  col_name: str = "method_type"
3
  automl: str = "AutoML"
4
  tree: str = "Tree-based"
 
10
 
11
 
12
  model_type_emoji = {
13
+ Constants.tree: "🌴",
14
+ Constants.foundational: "🧠",
15
+ Constants.finetuned: "🌐",
16
+ Constants.automl: "🤖",
17
+ Constants.baseline: "πŸ“",
18
+ Constants.other: "❓",
19
  }
main.py CHANGED
@@ -5,14 +5,14 @@ import pandas as pd
5
  import gradio as gr
6
  from gradio_leaderboard import Leaderboard, ColumnFilter
7
 
8
- from constants import MethodTypes, model_type_emoji
9
 
10
 
11
  TITLE = """<h1 align="center" id="space-title">TabArena: Public leaderboard for Tabular methods</h1>"""
12
 
13
  INTRODUCTION_TEXT = ("TabArena Leaderboard measures the performance of tabular models on a collection of tabular "
14
  "datasets manually curated. The datasets are collected to make sure they are tabular, with "
15
- "permissive license without ethical issues and so on, we refer to the paper XXX for a "
16
  "description of our approach.")
17
 
18
  ABOUT_TEXT = f"""
@@ -37,30 +37,44 @@ TODO update when arxiv version is ready,
37
 
38
  def get_model_family(model_name: str) -> str:
39
  prefixes_mapping = {
40
- MethodTypes.automl: ["AutoGluon"],
41
- MethodTypes.finetuned: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
42
- MethodTypes.tree: ["GBM", "CAT", "EBM", "XGB"],
43
- MethodTypes.foundational: ["TABDPT", "TABICL", "TABPFN"],
44
- MethodTypes.baseline: ["KNN", "LR"]
45
  }
46
  for method_type, prefixes in prefixes_mapping.items():
47
  for prefix in prefixes:
48
  if prefix.lower() in model_name.lower():
49
  return method_type
50
- return MethodTypes.other
51
 
52
 
53
  def load_data(filename: str):
54
  df_leaderboard = pd.read_csv(Path(__file__).parent / "data" / f"{filename}.csv.zip")
55
  print(f"Loaded dataframe with {len(df_leaderboard)} rows and columns {df_leaderboard.columns}")
56
- df_leaderboard["family"] = df_leaderboard.loc[:, "method"].apply(get_model_family)
57
- df_leaderboard["family"] = df_leaderboard.loc[:, "family"].apply(lambda s: s + " " + model_type_emoji[s])
 
 
 
 
 
 
 
 
58
  df_leaderboard = df_leaderboard.loc[:, ["method", "family", "time_train_s", "time_infer_s", "rank", "elo"]]
 
 
59
  df_leaderboard = df_leaderboard.round(1)
 
 
60
  df_leaderboard.rename(columns={
61
  "time_train_s": "training time (s)",
62
  "time_infer_s": "inference time (s)",
63
  }, inplace=True)
 
 
64
  return df_leaderboard
65
 
66
 
 
5
  import gradio as gr
6
  from gradio_leaderboard import Leaderboard, ColumnFilter
7
 
8
+ from constants import Constants, model_type_emoji
9
 
10
 
11
  TITLE = """<h1 align="center" id="space-title">TabArena: Public leaderboard for Tabular methods</h1>"""
12
 
13
  INTRODUCTION_TEXT = ("TabArena Leaderboard measures the performance of tabular models on a collection of tabular "
14
  "datasets manually curated. The datasets are collected to make sure they are tabular, with "
15
+ "permissive license without ethical issues and so on, we refer to the paper for a full "
16
  "description of our approach.")
17
 
18
  ABOUT_TEXT = f"""
 
37
 
38
  def get_model_family(model_name: str) -> str:
39
  prefixes_mapping = {
40
+ Constants.automl: ["AutoGluon"],
41
+ Constants.finetuned: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
42
+ Constants.tree: ["GBM", "CAT", "EBM", "XGB"],
43
+ Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
44
+ Constants.baseline: ["KNN", "LR"]
45
  }
46
  for method_type, prefixes in prefixes_mapping.items():
47
  for prefix in prefixes:
48
  if prefix.lower() in model_name.lower():
49
  return method_type
50
+ return Constants.other
51
 
52
 
53
  def load_data(filename: str):
54
  df_leaderboard = pd.read_csv(Path(__file__).parent / "data" / f"{filename}.csv.zip")
55
  print(f"Loaded dataframe with {len(df_leaderboard)} rows and columns {df_leaderboard.columns}")
56
+
57
+ # sort by ELO
58
+ df_leaderboard.sort_values(by="elo", ascending=False, inplace=True)
59
+
60
+ # add model family information
61
+ df_leaderboard["family"] = df_leaderboard.loc[:, "method"].apply(
62
+ lambda s: get_model_family(s) + " " + model_type_emoji[get_model_family(s)]
63
+ )
64
+
65
+ # select only the columns we want to display
66
  df_leaderboard = df_leaderboard.loc[:, ["method", "family", "time_train_s", "time_infer_s", "rank", "elo"]]
67
+
68
+ # round for better display
69
  df_leaderboard = df_leaderboard.round(1)
70
+
71
+ # rename some columns
72
  df_leaderboard.rename(columns={
73
  "time_train_s": "training time (s)",
74
  "time_infer_s": "inference time (s)",
75
  }, inplace=True)
76
+
77
+ # TODO show ELO +/- sem
78
  return df_leaderboard
79
 
80