Update app.py
app.py CHANGED
@@ -79,7 +79,7 @@ except Exception:
 
 
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
-LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causal(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
+# LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causal(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
@@ -116,103 +116,6 @@ def init_leaderboard_mib(dataframe, track):
         interactive=False,
     )
 
-
-def calculate_best_layer_scores(task_data):
-    """
-    Calculate the best scores across layers for output token and location
-
-    Args:
-        task_data: Dictionary containing task scores for different layers
-
-    Returns:
-        Dictionary with best scores and corresponding layer
-    """
-    output_token_scores = [layer_data['output_token'] for layer_data in task_data.values()]
-    output_location_scores = [layer_data['output_location'] for layer_data in task_data.values()]
-
-    best_output_token = max(output_token_scores)
-    best_output_location = max(output_location_scores)
-
-    # Find the layer with the best combined performance
-    layer_scores = [(layer, layer_data['output_token'] + layer_data['output_location'])
-                    for layer, layer_data in task_data.items()]
-    best_layer = max(layer_scores, key=lambda x: x[1])[0]
-
-    return {
-        'output_token': best_output_token,
-        'output_location': best_output_location,
-        'best_layer': int(best_layer)
-    }
-
-def process_single_method(json_data):
-    """
-    Process results for a single method into summary rows
-
-    Args:
-        json_data: Dictionary containing results for one method
-
-    Returns:
-        List of summary rows for the method
-    """
-    summary_rows = []
-    method_name = json_data['method_name']
-
-    for model_result in json_data['results']:
-        model_id = model_result['model_id']
-        task_data = model_result['task_scores']['MCQA']
-
-        best_scores = calculate_best_layer_scores(task_data)
-
-        summary_row = {
-            'Method': method_name,
-            'Model': model_id,
-            'Best Output Token Score': best_scores['output_token'],
-            'Best Output Location Score': best_scores['output_location'],
-            'Best Layer': best_scores['best_layer']
-        }
-        summary_rows.append(summary_row)
-
-    return summary_rows
-
-def init_leaderboard_mib_causal(json_data_list, track):
-    """
-    Creates a leaderboard summary for causal intervention results from multiple methods
-
-    Args:
-        json_data_list: List of dictionaries containing results for different methods
-        track: Track identifier (currently unused but maintained for compatibility)
-
-    Returns:
-        Leaderboard object containing processed and formatted results
-    """
-    # Process all methods
-    all_summary_data = []
-    for method_data in json_data_list:
-        method_summary = process_single_method(method_data)
-        all_summary_data.extend(method_summary)
-
-    # Convert to DataFrame
-    results_df = pd.DataFrame(all_summary_data)
-
-    # Sort by best score (using output token score as primary metric)
-    results_df = results_df.sort_values('Best Output Token Score', ascending=False)
-
-    # Round numeric columns to 3 decimal places
-    numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
-    results_df[numeric_cols] = results_df[numeric_cols].round(3)
-
-    return Leaderboard(
-        value=results_df,
-        datatype=['text', 'text', 'number', 'number', 'number'],
-        select_columns=SelectColumns(
-            default_selection=['Method', 'Model', 'Best Output Token Score', 'Best Output Location Score', 'Best Layer'],
-            cant_deselect=['Method', 'Model'],
-            label="Select Metrics to Display:",
-        ),
-        search_columns=['Method', 'Model'],
-        interactive=False,
-    )
-
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -280,8 +183,8 @@ with demo:
             leaderboard = init_leaderboard_mib(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
             # leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
 
-        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
-            leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
+        # with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
+        #     leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
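Note on the removed helpers: calculate_best_layer_scores and process_single_method expect per-method result dictionaries keyed exactly as the removed code indexes them ('method_name', 'results', 'model_id', 'task_scores' -> 'MCQA' -> per-layer 'output_token'/'output_location' scores). A minimal sketch of that assumed structure follows, with invented method, model, and score values, showing the summary row the removed code would have produced:

# Hypothetical input matching the keys the removed helpers index into;
# the method name, model id, and scores are invented for illustration only.
sample_method = {
    "method_name": "example-method",
    "results": [
        {
            "model_id": "example-model",
            "task_scores": {
                "MCQA": {
                    # Per-layer scores; layer keys must be int()-convertible,
                    # since calculate_best_layer_scores returns int(best_layer).
                    "0": {"output_token": 0.41, "output_location": 0.38},
                    "1": {"output_token": 0.57, "output_location": 0.49},
                },
            },
        },
    ],
}

# With the removed functions back in scope, this would yield one summary row:
# [{'Method': 'example-method', 'Model': 'example-model',
#   'Best Output Token Score': 0.57, 'Best Output Location Score': 0.49,
#   'Best Layer': 1}]
rows = process_single_method(sample_method)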