Fix leaderboard startup and simplify to core functionality
Major fixes:
- Fix fields() function to work with both make_dataclass and @dataclass
- Fix column name mapping (model -> Model, average -> Average, etc.)
- Fix JSON file filtering logic that was skipping result files
- Fix search_columns references to use correct case-sensitive names
- Remove unnecessary metadata columns (precision, license, params, etc.)
- Simplify to core leaderboard: Model name + task scores + average
The app now starts successfully and displays a clean leaderboard
focused on model performance comparison across NER tasks.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
Files changed:
- app.py +4 -24
- src/display/utils.py +10 -8
- src/leaderboard/read_evals.py +11 -19
- src/populate.py +2 -2
app.py CHANGED

```diff
@@ -67,36 +67,16 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=["model"],
+        search_columns=["Model"],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
-    # Build filter columns
+    # Build filter columns - simplified since we removed most metadata columns
     filter_columns = []
 
-    # Add precision filter only if precision column has data
-    if "precision" in dataframe.columns and not dataframe["precision"].isna().all():
-        filter_columns.append(ColumnFilter("precision", type="checkboxgroup", label="Precision"))
-
-    # Add params filter only if params column has data
-    if "params" in dataframe.columns and not dataframe["params"].isna().all():
-        filter_columns.append(ColumnFilter(
-            "params",
-            type="slider",
-            min=0.01,
-            max=150,
-            label="Select the number of parameters (B)",
-        ))
-
-    # Add still_on_hub filter only if column has data
-    if "still_on_hub" in dataframe.columns and not dataframe["still_on_hub"].isna().all():
-        filter_columns.append(ColumnFilter(
-            "still_on_hub", type="boolean", label="Deleted/incomplete", default=True
-        ))
-
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -105,7 +85,7 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=["model"],
+        search_columns=["Model"],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=filter_columns,
         bool_checkboxgroup_label="Hide models",
@@ -173,7 +153,7 @@ with demo:
                 model_name_textbox = gr.Textbox(label="Nom du modèle")
                 revision_name_textbox = gr.Textbox(label="Révision commit", placeholder="main")
                 precision = gr.Dropdown(
-                    choices=[…],
+                    choices=["float16", "bfloat16"],
                     label="Précision",
                     multiselect=False,
                     value="float16",
```
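Both `search_columns` hunks are the startup-critical part of this fix: the `Leaderboard` component matches search (and filter) column names against the dataframe's headers verbatim, so after the display-name mapping they must be `"Model"`, not `"model"`. A minimal sketch of the simplified setup, with a toy dataframe standing in for `get_leaderboard_df()` output (the `"EMEA NER"` column name is hypothetical):

```python
import pandas as pd
import gradio as gr
from gradio_leaderboard import Leaderboard, SelectColumns

# Stand-in for get_leaderboard_df() output; headers are the display-cased names
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "EMEA NER": [0.91, 0.87],  # hypothetical task column
    "Average": [0.91, 0.87],
})

with gr.Blocks() as demo:
    Leaderboard(
        value=df,
        datatype=["markdown", "number", "number"],
        select_columns=SelectColumns(
            default_selection=list(df.columns),
            cant_deselect=["Model"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],  # must match the header exactly; "model" finds nothing
        filter_columns=[],
        interactive=False,
    )

# demo.launch()
```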
src/display/utils.py CHANGED

```diff
@@ -6,7 +6,15 @@ import pandas as pd
 from src.about import Tasks
 
 def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+    if hasattr(raw_class, '__dataclass_fields__'):
+        # For make_dataclass created classes
+        if raw_class.__dataclass_fields__:
+            return [field.type for field in raw_class.__dataclass_fields__.values()]
+        else:
+            # For regular @dataclass with empty __dataclass_fields__, check __dict__
+            return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__" and hasattr(v, 'name')]
+    # Fallback for non-dataclass
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__" and hasattr(v, 'name')]
 
 
 # These classes are for user facing column names,
@@ -30,13 +38,7 @@ auto_eval_column_dict.append(("average", ColumnContent("Average", "number", True)))
 #Scores
 for task in Tasks:
     auto_eval_column_dict.append((task.name, ColumnContent(task.value.col_name, "number", True)))
-# Model information
-auto_eval_column_dict.append(("precision", ColumnContent("Precision", "str", False)))
-auto_eval_column_dict.append(("license", ColumnContent("Hub License", "str", False)))
-auto_eval_column_dict.append(("params", ColumnContent("#Params (B)", "number", False)))
-auto_eval_column_dict.append(("likes", ColumnContent("Hub ❤️", "number", False)))
-auto_eval_column_dict.append(("still_on_hub", ColumnContent("Available on the hub", "bool", False)))
-auto_eval_column_dict.append(("revision", ColumnContent("Model sha", "str", False, False)))
+# Model information - simplified to only essential columns
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
```
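The two-branch `fields()` is needed because of how the columns are declared above it: each `auto_eval_column_dict` entry is a `(name, ColumnContent(...))` pair, and `make_dataclass` treats the second tuple element as the field's annotation, so the `ColumnContent` instance lands in `field.type` rather than in a default value. A self-contained sketch (with a stand-in `ColumnContent`, assuming the template's field layout) showing where the instance ends up:

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # stand-in for the real class in this file
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# (name, annotation) pairs: the ColumnContent instance is used as the annotation
Auto = make_dataclass(
    "Auto",
    [("model", ColumnContent("Model", "markdown", True))],
    frozen=True,
)

field = next(iter(Auto.__dataclass_fields__.values()))
print(field.type.name)  # "Model", hence `return [field.type for field in ...]`
```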
src/leaderboard/read_evals.py CHANGED

```diff
@@ -129,18 +129,9 @@ class EvalResult:
         average = sum(valid_results) / len(valid_results) if valid_results else 0.0
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            "precision": self.precision.value.name,
-            "model_type": self.model_type.value.name,
-            "model_type_symbol": self.model_type.value.symbol,
-            "weight_type": self.weight_type.value.name,
-            "architecture": self.architecture,
-            "model": make_clickable_model(self.full_model),
-            "revision": self.revision,
-            "average": average,
-            "license": self.license,
-            "likes": self.likes,
-            "params": self.num_params,
-            "still_on_hub": self.still_on_hub,
+            "T": self.model_type.value.symbol,
+            "Model": make_clickable_model(self.full_model),
+            "Average": average,
         }
 
         for task in Tasks:
@@ -176,17 +167,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+        # We need at least one json file in model results
+        json_files = [f for f in files if f.endswith(".json")]
+        if len(json_files) == 0:
             continue
 
-        # Sort the files by date
+        # Sort the JSON files by date
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
+            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except Exception:
+            json_files = [json_files[-1]] if json_files else []
 
-        for file in files:
+        for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
```
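Two things are worth noting in the second hunk. The old walk skipped any results directory containing even one non-JSON file (a stray `.gitattributes` or README was enough), which is the "skipping result files" bug from the commit message; the new code filters to JSON first. The sort key then strips the prefix, the `.json` suffix, and a seven-character microsecond tail, leaving a lexicographically sortable timestamp. A sketch, assuming the template's `results_<timestamp>.<microseconds>.json` naming:

```python
# Hypothetical result file following the template's naming convention
fname = "results_2024-05-02T10-15-30.123456.json"

# Strip ".json" and "results_", then drop the 7-char ".123456" microsecond tail
key = fname.removesuffix(".json").removeprefix("results_")[:-7]
print(key)  # 2024-05-02T10-15-30

json_files = [
    "results_2024-05-02T10-15-30.123456.json",
    "results_2023-11-01T08-00-00.000001.json",
]
json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
print(json_files[-1])  # newest file sorts last: results_2024-05-02T10-15-30.123456.json
```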
src/populate.py CHANGED

```diff
@@ -25,12 +25,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     # Sort by the first task (EMEA NER) since we don't have an average for NER tasks
     # If no results exist yet, just sort by model name
     first_task = list(Tasks)[0]  # emea_ner
-    task_col_name = …
+    task_col_name = first_task.value.col_name  # Use the col_name directly
     if task_col_name in df.columns:
         df = df.sort_values(by=[task_col_name], ascending=False)
     else:
         # Fallback to sorting by model name if no task results yet
-        df = df.sort_values(by=["model"], ascending=True)
+        df = df.sort_values(by=["Model"], ascending=True)
 
     # Only select columns that exist in the DataFrame
     available_cols = [col for col in cols if col in df.columns]
```