rntc (Claude) committed
Commit 25c6939 · 1 parent: 9f4fde3

Fix leaderboard startup and simplify to core functionality


Major fixes:
- Fix fields() function to work with both make_dataclass and @dataclass
- Fix column name mapping (model -> Model, average -> Average, etc.)
- Fix JSON file filtering logic that was skipping result files
- Fix search_columns references to use correct case-sensitive names
- Remove unnecessary metadata columns (precision, license, params, etc.)
- Simplify to core leaderboard: Model name + task scores + average

The app now starts successfully and displays a clean leaderboard
focused on model performance comparison across NER tasks.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (4)
  1. app.py +4 -24
  2. src/display/utils.py +10 -8
  3. src/leaderboard/read_evals.py +11 -19
  4. src/populate.py +2 -2
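
Before the per-file diffs, a quick sketch of the naming convention behind several of these fixes: internal attribute keys are lowercase ("model", "average", task names), while the user-facing display names ("Model", "Average", the task col_names) are what actually become DataFrame columns, so anything that searches or sorts the frame must use the display names. The task entry below is a hypothetical placeholder, not taken from the repo.

```python
# Internal key -> display name (the DataFrame column). The task entry is a
# made-up example; real names come from Tasks in src/about.py.
internal_to_display = {
    "model": "Model",        # rendered as a clickable model link
    "average": "Average",    # mean over the task scores
    "emea_ner": "EMEA NER",  # placeholder for the first task's col_name
}

# Consequence: search_columns=["Model"] matches a column, ["model"] does not.
```
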
app.py CHANGED
@@ -67,36 +67,16 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=["model", "license"],
+        search_columns=["Model"],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
-    # Build filter columns based on available data
+    # Build filter columns - simplified since we removed most metadata columns
    filter_columns = []
 
-    # Add precision filter only if precision column has data
-    if "precision" in dataframe.columns and not dataframe["precision"].isna().all():
-        filter_columns.append(ColumnFilter("precision", type="checkboxgroup", label="Precision"))
-
-    # Add params filter only if params column has data
-    if "params" in dataframe.columns and not dataframe["params"].isna().all():
-        filter_columns.append(ColumnFilter(
-            "params",
-            type="slider",
-            min=0.01,
-            max=150,
-            label="Select the number of parameters (B)",
-        ))
-
-    # Add still_on_hub filter only if column has data
-    if "still_on_hub" in dataframe.columns and not dataframe["still_on_hub"].isna().all():
-        filter_columns.append(ColumnFilter(
-            "still_on_hub", type="boolean", label="Deleted/incomplete", default=True
-        ))
-
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -105,7 +85,7 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=["Model"],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=filter_columns,
         bool_checkboxgroup_label="Hide models",
@@ -173,7 +153,7 @@ with demo:
             model_name_textbox = gr.Textbox(label="Nom du modèle")
             revision_name_textbox = gr.Textbox(label="Révision commit", placeholder="main")
             precision = gr.Dropdown(
-                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                choices=["float16", "bfloat16"],
                 label="Précision",
                 multiselect=False,
                 value="float16",
src/display/utils.py CHANGED
@@ -6,7 +6,15 @@ import pandas as pd
 from src.about import Tasks
 
 def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+    if hasattr(raw_class, '__dataclass_fields__'):
+        # For make_dataclass created classes
+        if raw_class.__dataclass_fields__:
+            return [field.type for field in raw_class.__dataclass_fields__.values()]
+        else:
+            # For regular @dataclass with empty __dataclass_fields__, check __dict__
+            return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__" and hasattr(v, 'name')]
+    # Fallback for non-dataclass
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__" and hasattr(v, 'name')]
 
 
 # These classes are for user facing column names,
@@ -30,13 +38,7 @@ auto_eval_column_dict.append(("average", ColumnContent("Average", "number", True
 #Scores
 for task in Tasks:
     auto_eval_column_dict.append((task.name, ColumnContent(task.value.col_name, "number", True)))
-# Model information
-auto_eval_column_dict.append(("precision", ColumnContent("Precision", "str", False)))
-auto_eval_column_dict.append(("license", ColumnContent("Hub License", "str", False)))
-auto_eval_column_dict.append(("params", ColumnContent("#Params (B)", "number", False)))
-auto_eval_column_dict.append(("likes", ColumnContent("Hub ❤️", "number", False)))
-auto_eval_column_dict.append(("still_on_hub", ColumnContent("Available on the hub", "bool", False)))
-auto_eval_column_dict.append(("revision", ColumnContent("Model sha", "str", False, False)))
+# Model information - simplified to only essential columns
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
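
To make the fields() fix concrete: when make_dataclass is given two-element (name, instance) tuples, the ColumnContent instance ends up as the field's type annotation, so it has to be read back via __dataclass_fields__[...].type. A minimal self-contained sketch, using a stripped-down stand-in for ColumnContent (the real class has more attributes):

```python
from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:  # stand-in; the repo's version has more attributes
    name: str
    type: str
    displayed_by_default: bool

def fields(raw_class):
    # Condensed version of the fixed helper: instances passed as the "type"
    # in (name, instance) tuples are recovered from __dataclass_fields__.
    if getattr(raw_class, "__dataclass_fields__", None):
        return [f.type for f in raw_class.__dataclass_fields__.values()]
    return [v for k, v in vars(raw_class).items()
            if not k.startswith("__") and hasattr(v, "name")]

cols = [
    ("model", ColumnContent("Model", "markdown", True)),
    ("average", ColumnContent("Average", "number", True)),
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)

print([c.name for c in fields(AutoEvalColumn)])  # ['Model', 'Average']
```
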
src/leaderboard/read_evals.py CHANGED
@@ -129,18 +129,9 @@ class EvalResult:
         average = sum(valid_results) / len(valid_results) if valid_results else 0.0
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            "precision": self.precision.value.name,
-            "model_type": self.model_type.value.name,
-            "model_type_symbol": self.model_type.value.symbol,
-            "weight_type": self.weight_type.value.name,
-            "architecture": self.architecture,
-            "model": make_clickable_model(self.full_model),
-            "revision": self.revision,
-            "average": average,
-            "license": self.license,
-            "likes": self.likes,
-            "params": self.num_params,
-            "still_on_hub": self.still_on_hub,
+            "T": self.model_type.value.symbol,
+            "Model": make_clickable_model(self.full_model),
+            "Average": average,
         }
 
         for task in Tasks:
@@ -176,17 +167,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+        # We need at least one json file in model results
+        json_files = [f for f in files if f.endswith(".json")]
+        if len(json_files) == 0:
             continue
 
-        # Sort the files by date
+        # Sort the JSON files by date
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
+            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except Exception:
+            json_files = [json_files[-1]] if json_files else []
 
-        for file in files:
+        for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
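
The file-collection fix can be exercised in isolation. Here is a rough sketch of the same walk logic as a standalone helper; the results_<date>.json naming is an assumption about how result files are written, and the broad except mirrors the commit rather than targeting a specific parser error:

```python
import os

def collect_result_files(results_path: str) -> list[str]:
    """Mirror of the fixed walk above: stray non-JSON files (README.md,
    .gitattributes, ...) no longer cause a results directory to be skipped."""
    model_result_filepaths = []
    for root, _, files in os.walk(results_path):
        json_files = [f for f in files if f.endswith(".json")]
        if not json_files:
            continue
        try:
            # Assumes names like results_2024-05-01T12-00-00.000000.json;
            # the [:-7] slice trims the trailing fraction before sorting.
            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except Exception:
            json_files = [json_files[-1]]
        model_result_filepaths.extend(os.path.join(root, f) for f in json_files)
    return model_result_filepaths
```
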
src/populate.py CHANGED
@@ -25,12 +25,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # Sort by the first task (EMEA NER) since we don't have an average for NER tasks
     # If no results exist yet, just sort by model name
     first_task = list(Tasks)[0]  # emea_ner
-    task_col_name = getattr(AutoEvalColumn, first_task.name).name
+    task_col_name = first_task.value.col_name  # Use the col_name directly
     if task_col_name in df.columns:
         df = df.sort_values(by=[task_col_name], ascending=False)
     else:
         # Fallback to sorting by model name if no task results yet
-        df = df.sort_values(by=[AutoEvalColumn.model.name], ascending=True)
+        df = df.sort_values(by=["Model"], ascending=True)
 
     # Only select columns that exist in the DataFrame
     available_cols = [col for col in cols if col in df.columns]
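
A toy illustration of the new sort behavior; the column names assume the display names from utils.py, and "EMEA NER" as the first task's col_name is a guess based on the comment above:

```python
import pandas as pd

df = pd.DataFrame({
    "Model": ["model-b", "model-a"],
    "Average": [0.71, 0.65],
    "EMEA NER": [0.80, 0.60],
})

task_col_name = "EMEA NER"  # would come from first_task.value.col_name
if task_col_name in df.columns:
    df = df.sort_values(by=[task_col_name], ascending=False)
else:
    df = df.sort_values(by=["Model"], ascending=True)

print(df.iloc[0]["Model"])  # model-b, the top scorer on the first task
```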