jasonshaoshun committed · Commit 9fd4b06 · Parent(s): 2817fcb

debug

Files changed:
- app.py +76 -8
- src/display/utils.py +45 -1
app.py CHANGED

@@ -94,16 +94,48 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard_mib_subgraph(dataframe, track):
-    # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
+
+
+
+# def init_leaderboard_mib_subgraph(dataframe, track):
+#     # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
 
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
+#     if dataframe is None or dataframe.empty:
+#         raise ValueError("Leaderboard DataFrame is empty or None.")
+
+#     # filter for correct track
+#     # dataframe = dataframe.loc[dataframe["Track"] == track]
+
+#     # print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
+
+#     return Leaderboard(
+#         value=dataframe,
+#         datatype=[c.type for c in fields(AutoEvalColumn_mib_subgraph)],
+#         select_columns=SelectColumns(
+#             default_selection=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default],
+#             cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.never_hidden],
+#             label="Select Columns to Display:",
+#         ),
+#         search_columns=["Method"],  # Changed from AutoEvalColumn_mib_subgraph.model.name to "Method"
+#         hide_columns=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.hidden],
+#         bool_checkboxgroup_label="Hide models",
+#         interactive=False,
+#     )
+
+
 
-    # filter for correct track
-    # dataframe = dataframe.loc[dataframe["Track"] == track]
 
-    # print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
+def init_leaderboard_mib_subgraph(dataframe, track):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    # Get unique tasks and models for filters
+    tasks = list(set(task.value.benchmark for task in TasksMib_Subgraph))
+    models = list(set(
+        model
+        for task in TasksMib_Subgraph
+        for model in task.value.models
+    ))
 
     return Leaderboard(
         value=dataframe,
@@ -113,12 +145,40 @@ def init_leaderboard_mib_subgraph(dataframe, track):
             cant_deselect=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=["Method"],  # Changed from AutoEvalColumn_mib_subgraph.model.name to "Method"
+        column_filters=[
+            ColumnFilter(
+                column="task_filter",
+                choices=tasks,
+                label="Filter by Task:",
+                default=None
+            ),
+            ColumnFilter(
+                column="model_filter",
+                choices=models,
+                label="Filter by Model:",
+                default=None
+            )
+        ],
+        search_columns=["Method"],
         hide_columns=[c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.hidden],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 def init_leaderboard_mib_causalgraph(dataframe, track):
     # print("Debugging column issues:")
     # print("\nActual DataFrame columns:")
@@ -233,7 +293,15 @@ with demo:
         # with gr.Row():
        #     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
+        # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
+        #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
+            # Add description for filters
+            gr.Markdown("""
+            ### Filtering Options
+            Use the dropdown menus below to filter results by specific tasks or models.
+            You can combine filters to see specific task-model combinations.
+            """)
             leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
         # Then modify the Causal Graph tab section
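
The rewritten init_leaderboard_mib_subgraph derives its filter choices directly from the TasksMib_Subgraph enum before handing them to the two ColumnFilter entries. Below is a self-contained sketch of that derivation; the Task record shape and the example benchmark/model values are assumptions for illustration, not the repo's actual task list.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # e.g. "ioi"
    models: list     # e.g. ["gpt2", "qwen2.5"]

class TasksMib_Subgraph(Enum):
    # Illustrative members only; the real enum is defined elsewhere in the repo.
    task0 = Task("ioi", ["gpt2", "qwen2.5"])
    task1 = Task("mcqa", ["qwen2.5", "llama3"])

# Same comprehensions as in the new init_leaderboard_mib_subgraph above:
tasks = list(set(task.value.benchmark for task in TasksMib_Subgraph))
models = list(set(
    model
    for task in TasksMib_Subgraph
    for model in task.value.models
))

print(tasks)   # e.g. ['ioi', 'mcqa'] (set order is arbitrary)
print(models)  # e.g. ['gpt2', 'qwen2.5', 'llama3']

These tasks and models lists are exactly what the hunk above passes as choices= to the task and model filters. Whether the Leaderboard constructor in the installed gradio_leaderboard version accepts a column_filters keyword as written here is worth verifying against that package's signature, which is not shown in this diff.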
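The last app.py hunk only touches the Gradio layout: it adds an explanatory Markdown block above the leaderboard inside the Subgraph tab. A minimal stand-alone sketch of that layout follows; the real Space renders the gradio_leaderboard component where the placeholder Markdown sits, and the Tabs wrapper is assumed from the surrounding (unshown) context.

import gradio as gr

# Minimal stand-in for the "Subgraph" tab wiring added in the last hunk.
# The placeholder Markdown at the bottom marks where init_leaderboard_mib_subgraph
# would attach the actual Leaderboard component.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
            # Add description for filters
            gr.Markdown("""
            ### Filtering Options
            Use the dropdown menus below to filter results by specific tasks or models.
            You can combine filters to see specific task-model combinations.
            """)
            gr.Markdown("(Leaderboard component goes here)")

if __name__ == "__main__":
    demo.launch()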
src/display/utils.py CHANGED

@@ -60,15 +60,49 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_
 
 
 
+# auto_eval_column_dict_mib_subgraph = []
+
+# # Method name column
+# auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# # For each task and model combination
+# for task in TasksMib_Subgraph:
+#     for model in task.value.models:
+#         col_name = f"{task.value.benchmark}_{model}"    # ioi_gpt2, mcqa_qwen2.5, etc.
+#         auto_eval_column_dict_mib_subgraph.append([
+#             col_name,
+#             ColumnContent,
+#             ColumnContent(col_name, "number", True)
+#         ])
+
+# # Average column
+# auto_eval_column_dict_mib_subgraph.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+
 auto_eval_column_dict_mib_subgraph = []
 
 # Method name column
 auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
 
+# Add task filter column
+task_values = list(set(task.value.benchmark for task in TasksMib_Subgraph))
+auto_eval_column_dict_mib_subgraph.append(
+    ["task_filter", ColumnContent, ColumnContent("Task", "str", True, never_hidden=True)]
+)
+
+# Add model filter column
+model_values = list(set(
+    model
+    for task in TasksMib_Subgraph
+    for model in task.value.models
+))
+auto_eval_column_dict_mib_subgraph.append(
+    ["model_filter", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)]
+)
+
 # For each task and model combination
 for task in TasksMib_Subgraph:
     for model in task.value.models:
-        col_name = f"{task.value.benchmark}_{model}"    # ioi_gpt2, mcqa_qwen2.5, etc.
+        col_name = f"{task.value.benchmark}_{model}"
         auto_eval_column_dict_mib_subgraph.append([
             col_name,
             ColumnContent,
@@ -79,6 +113,10 @@ for task in TasksMib_Subgraph:
 auto_eval_column_dict_mib_subgraph.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 
 
+
+
+
+
 # Create the dataclass for MIB columns
 AutoEvalColumn_mib_subgraph = make_dataclass("AutoEvalColumn_mib_subgraph", auto_eval_column_dict_mib_subgraph, frozen=True)
 
@@ -102,6 +140,12 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
+
+
+
+
+
+
 # # Initialize the MIB causal graph columns
 # auto_eval_column_dict_mib_causalgraph = []
 
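
For context on what the column list above feeds: make_dataclass turns each [name, type, default] entry into a field of the frozen AutoEvalColumn_mib_subgraph class, and app.py reads those ColumnContent defaults back (via the template's fields helper) to build default_selection, cant_deselect, and hide_columns. The sketch below shows that round trip; the ColumnContent definition and the fields helper are assumed stand-ins modelled on the common leaderboard template, since neither appears in this diff.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Assumed stand-in; mirrors how entries are constructed above.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Assumed helper: returns the ColumnContent defaults stored as class
    # attributes (this is not dataclasses.fields).
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

cols = [
    ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)],
    ["ioi_gpt2", ColumnContent, ColumnContent("ioi_gpt2", "number", True)],
    ["average", ColumnContent, ColumnContent("Average", "number", True)],
]

# Same call as above: each [name, type, default] triple becomes a field of a
# frozen dataclass, with the ColumnContent instance kept as the field default.
AutoEvalColumn_mib_subgraph = make_dataclass("AutoEvalColumn_mib_subgraph", cols, frozen=True)

# app.py then derives the Leaderboard configuration from those defaults:
default_selection = [c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.displayed_by_default]
never_hidden = [c.name for c in fields(AutoEvalColumn_mib_subgraph) if c.never_hidden]
print(default_selection)  # ['Method', 'ioi_gpt2', 'Average']
print(never_hidden)       # ['Method']

Note that the new task_filter and model_filter entries only declare leaderboard columns named "Task" and "Model"; populating those columns in the results DataFrame is not part of this commit.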