Spaces:
Sleeping
Sleeping
Fix critical stability issues in leaderboard
Browse files
- Fix KeyError when tasks missing from evaluation results
- Prevent division by zero in average calculations
- Add safe DataFrame column access in filtering
- Fix file path bugs in subdirectory processing
- Add JSON error handling for malformed queue files
- Improve license access with proper fallbacks
- Make filter columns dynamic based on available data
These fixes prevent major crash scenarios when handling edge cases,
empty data, or malformed evaluation files.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- app.py +25 -26
- src/display/formatting.py +6 -1
- src/leaderboard/read_evals.py +3 -2
- src/populate.py +20 -11
- src/submission/submit.py +5 -1
app.py
CHANGED
@@ -69,23 +69,34 @@ def init_leaderboard(dataframe):
|
|
69 |
),
|
70 |
search_columns=["model", "license"],
|
71 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
72 |
-
filter_columns=[
|
73 |
-
ColumnFilter("precision", type="checkboxgroup", label="Precision"),
|
74 |
-
ColumnFilter(
|
75 |
-
"params",
|
76 |
-
type="slider",
|
77 |
-
min=0.01,
|
78 |
-
max=150,
|
79 |
-
label="Select the number of parameters (B)",
|
80 |
-
),
|
81 |
-
ColumnFilter(
|
82 |
-
"still_on_hub", type="boolean", label="Deleted/incomplete", default=True
|
83 |
-
),
|
84 |
-
],
|
85 |
bool_checkboxgroup_label="Hide models",
|
86 |
interactive=False,
|
87 |
)
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
return Leaderboard(
|
90 |
value=dataframe,
|
91 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
@@ -96,19 +107,7 @@ def init_leaderboard(dataframe):
|
|
96 |
),
|
97 |
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
98 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
99 |
-
filter_columns=
|
100 |
-
ColumnFilter("precision", type="checkboxgroup", label="Precision"),
|
101 |
-
ColumnFilter(
|
102 |
-
"params",
|
103 |
-
type="slider",
|
104 |
-
min=0.01,
|
105 |
-
max=150,
|
106 |
-
label="Select the number of parameters (B)",
|
107 |
-
),
|
108 |
-
ColumnFilter(
|
109 |
-
"still_on_hub", type="boolean", label="Deleted/incomplete", default=True
|
110 |
-
),
|
111 |
-
],
|
112 |
bool_checkboxgroup_label="Hide models",
|
113 |
interactive=False,
|
114 |
)
|
|
|
69 |
),
|
70 |
search_columns=["model", "license"],
|
71 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
72 |
+
filter_columns=[],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
bool_checkboxgroup_label="Hide models",
|
74 |
interactive=False,
|
75 |
)
|
76 |
|
77 |
+
# Build filter columns based on available data
|
78 |
+
filter_columns = []
|
79 |
+
|
80 |
+
# Add precision filter only if precision column has data
|
81 |
+
if "precision" in dataframe.columns and not dataframe["precision"].isna().all():
|
82 |
+
filter_columns.append(ColumnFilter("precision", type="checkboxgroup", label="Precision"))
|
83 |
+
|
84 |
+
# Add params filter only if params column has data
|
85 |
+
if "params" in dataframe.columns and not dataframe["params"].isna().all():
|
86 |
+
filter_columns.append(ColumnFilter(
|
87 |
+
"params",
|
88 |
+
type="slider",
|
89 |
+
min=0.01,
|
90 |
+
max=150,
|
91 |
+
label="Select the number of parameters (B)",
|
92 |
+
))
|
93 |
+
|
94 |
+
# Add still_on_hub filter only if column has data
|
95 |
+
if "still_on_hub" in dataframe.columns and not dataframe["still_on_hub"].isna().all():
|
96 |
+
filter_columns.append(ColumnFilter(
|
97 |
+
"still_on_hub", type="boolean", label="Deleted/incomplete", default=True
|
98 |
+
))
|
99 |
+
|
100 |
return Leaderboard(
|
101 |
value=dataframe,
|
102 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
|
|
107 |
),
|
108 |
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
109 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
110 |
+
filter_columns=filter_columns,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
bool_checkboxgroup_label="Hide models",
|
112 |
interactive=False,
|
113 |
)
|
src/display/formatting.py
CHANGED
@@ -20,7 +20,12 @@ def styled_message(message):
|
|
20 |
|
21 |
|
22 |
def has_no_nan_values(df, columns):
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
def has_nan_values(df, columns):
|
|
|
20 |
|
21 |
|
22 |
def has_no_nan_values(df, columns):
|
23 |
+
if df.empty:
|
24 |
+
return pd.Series([], dtype=bool)
|
25 |
+
existing_cols = [col for col in columns if col in df.columns]
|
26 |
+
if not existing_cols:
|
27 |
+
return pd.Series([True] * len(df), index=df.index)
|
28 |
+
return df[existing_cols].notna().all(axis=1)
|
29 |
|
30 |
|
31 |
def has_nan_values(df, columns):
|
src/leaderboard/read_evals.py
CHANGED
@@ -125,7 +125,8 @@ class EvalResult:
|
|
125 |
|
126 |
def to_dict(self):
|
127 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
128 |
-
|
|
|
129 |
data_dict = {
|
130 |
"eval_name": self.eval_name, # not a column, just a save name,
|
131 |
"precision": self.precision.value.name,
|
@@ -143,7 +144,7 @@ class EvalResult:
|
|
143 |
}
|
144 |
|
145 |
for task in Tasks:
|
146 |
-
data_dict[task.value.col_name] = self.results
|
147 |
|
148 |
return data_dict
|
149 |
|
|
|
125 |
|
126 |
def to_dict(self):
|
127 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
128 |
+
valid_results = [v for v in self.results.values() if v is not None]
|
129 |
+
average = sum(valid_results) / len(valid_results) if valid_results else 0.0
|
130 |
data_dict = {
|
131 |
"eval_name": self.eval_name, # not a column, just a save name,
|
132 |
"precision": self.precision.value.name,
|
|
|
144 |
}
|
145 |
|
146 |
for task in Tasks:
|
147 |
+
data_dict[task.value.col_name] = self.results.get(task.value.benchmark, None)
|
148 |
|
149 |
return data_dict
|
150 |
|
src/populate.py
CHANGED
@@ -49,24 +49,33 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
49 |
for entry in entries:
|
50 |
if ".json" in entry:
|
51 |
file_path = os.path.join(save_path, entry)
|
52 |
-
|
53 |
-
|
|
|
54 |
|
55 |
-
|
56 |
-
|
57 |
|
58 |
-
|
|
|
|
|
|
|
59 |
elif ".md" not in entry:
|
60 |
# this is a folder
|
61 |
-
sub_entries = [e for e in os.listdir(
|
|
|
62 |
for sub_entry in sub_entries:
|
63 |
file_path = os.path.join(save_path, entry, sub_entry)
|
64 |
-
|
65 |
-
|
|
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
70 |
|
71 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
72 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
|
|
49 |
for entry in entries:
|
50 |
if ".json" in entry:
|
51 |
file_path = os.path.join(save_path, entry)
|
52 |
+
try:
|
53 |
+
with open(file_path) as fp:
|
54 |
+
data = json.load(fp)
|
55 |
|
56 |
+
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
57 |
+
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
58 |
|
59 |
+
all_evals.append(data)
|
60 |
+
except (json.JSONDecodeError, KeyError, IOError) as e:
|
61 |
+
print(f"Error processing {file_path}: {e}")
|
62 |
+
continue
|
63 |
elif ".md" not in entry:
|
64 |
# this is a folder
|
65 |
+
sub_entries = [e for e in os.listdir(os.path.join(save_path, entry))
|
66 |
+
if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
|
67 |
for sub_entry in sub_entries:
|
68 |
file_path = os.path.join(save_path, entry, sub_entry)
|
69 |
+
try:
|
70 |
+
with open(file_path) as fp:
|
71 |
+
data = json.load(fp)
|
72 |
|
73 |
+
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
74 |
+
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
75 |
+
all_evals.append(data)
|
76 |
+
except (json.JSONDecodeError, KeyError, IOError) as e:
|
77 |
+
print(f"Error processing {file_path}: {e}")
|
78 |
+
continue
|
79 |
|
80 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
81 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
src/submission/submit.py
CHANGED
@@ -52,7 +52,11 @@ def add_new_eval(
|
|
52 |
|
53 |
# Were the model card and license filled?
|
54 |
try:
|
55 |
-
license =
|
|
|
|
|
|
|
|
|
56 |
except Exception:
|
57 |
return styled_error("Please select a license for your model")
|
58 |
|
|
|
52 |
|
53 |
# Were the model card and license filled?
|
54 |
try:
|
55 |
+
license = "Unknown"
|
56 |
+
if hasattr(model_info, 'cardData') and model_info.cardData:
|
57 |
+
license = model_info.cardData.get("license", "Unknown")
|
58 |
+
if license == "Unknown":
|
59 |
+
return styled_error("Please select a license for your model")
|
60 |
except Exception:
|
61 |
return styled_error("Please select a license for your model")
|
62 |
|