Merge branch 'main' of https://huggingface.co/spaces/mech-interp-bench/leaderboard
- app.py +90 -18
- src/about.py +7 -4
- src/leaderboard/read_evals.py +251 -297
app.py
CHANGED
@@ -399,34 +399,76 @@ def init_leaderboard_mib_subgraph(dataframe, track):
 
 
 
+# @dataclass
+# class TaskMIB_Causalgraph:
+#     benchmark: str        # task name in json (ioi/arithmetic)
+#     models: list[str]     # list of models to show as sub-columns
+#     col_name: str         # display name in leaderboard
+#     metrics: list[str]    # metrics to store (average_score)
+
+# class TasksMib_Causalgraph(Enum):
+#     task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
+#     task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
+#     task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+#     task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+
+#     @classmethod
+#     def get_all_tasks(cls):
+#         """Returns a list of all task benchmarks"""
+#         return [task.value.benchmark for task in cls]
+
+#     @classmethod
+#     def get_all_models(cls):
+#         """Returns a list of all unique models across all tasks"""
+#         models = set()
+#         for task in cls:
+#             models.update(task.value.models)
+#         return sorted(list(models))
+
+# ioi_task
+# 4_answer_MCQA
+
 
 def init_leaderboard_mib_causalgraph(dataframe, track):
     model_name_mapping = {
         "Qwen2ForCausalLM": "Qwen-2.5",
         "GPT2ForCausalLM": "GPT-2",
+        "GPT2LMHeadModel": "GPT-2",
         "Gemma2ForCausalLM": "Gemma-2",
         "LlamaForCausalLM": "Llama-3.1"
     }
 
     benchmark_mapping = {
-        "
-        "
+        "ioi_task": "IOI",
+        "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
-        "
-        "
+        "ARC_easy": "ARC (Easy)",
+        "RAVEL_task": "RAVEL"
+    }
+
+    target_variables_mapping = {
+        "output_token": "Output Token",
+        "output_position": "Output Position",
+        "answer_pointer": "Answer Pointer",
+        "answer": "Answer",
+        "Continent": "Continent",
+        "Language": "Language",
+        "Country": "Country",
+        "Language": "Language"
     }
 
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-
-
-
+            for target_variables in task.value.target_variables:
+                field_name = f"{model}_{task.value.col_name}_{target_variables}"
+                display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
+                display_mapping[field_name] = display_name
+
 
     renamed_df = dataframe.rename(columns=display_mapping)
-
-    print(renamed_df)
+
 
     # Create only necessary columns
     return Leaderboard(
@@ -488,8 +530,10 @@ def get_hf_username(hf_repo):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
+TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 
+
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
@@ -648,21 +692,21 @@ with demo:
         # Then modify the Causal Graph tab section
         with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
             with gr.Tabs() as causalgraph_tabs:
-                with gr.TabItem("Detailed View", id=0):
-
-
-
-
-                with gr.TabItem("
+                # with gr.TabItem("Detailed View", id=0):
+                #     leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
+                #         LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
+                #         "Causal Graph"
+                #     )
+                with gr.TabItem("Highest View", id=0):
                    gr.Markdown("""
                     ### Filtering Options
                     Use the dropdown menus below to filter results by specific tasks or models.
                     You can combine filters to see specific task-model combinations.
                     """)
                     task_substring_checkbox = gr.CheckboxGroup(
-                        choices=
+                        choices=TASK_CAUSAL_SUBSTRINGS,
                         label="View tasks:",
-                        value=
+                        value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
@@ -685,11 +729,39 @@ with demo:
                         inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
                         outputs=leaderboard_aggregated
                     )
-                with gr.TabItem("
+                with gr.TabItem("Averaged View", id=1):
+
+                    task_substring_checkbox = gr.CheckboxGroup(
+                        choices=TASK_CAUSAL_SUBSTRINGS,
+                        label="View tasks:",
+                        value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
+                    )
+                    model_substring_checkbox = gr.CheckboxGroup(
+                        choices = MODEL_SUBSTRINGS,
+                        label = "View models:",
+                        value = MODEL_SUBSTRINGS
+                    )
+
                     leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
                         LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
                         "Causal Graph"
                     )
+                    original_leaderboard = gr.State(value=data)
+                    task_substring_checkbox.change(
+                        fn=update_leaderboard,
+                        inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
+                        outputs=leaderboard_averaged
+                    )
+                    model_substring_checkbox.change(
+                        fn=update_leaderboard,
+                        inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
+                        outputs=leaderboard_averaged
+                    )
+
+                    # leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
+                    #     LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
+                    #     "Causal Graph"
+                    # )
 
         with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
             # Track selection
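Note: the nested loop added to init_leaderboard_mib_causalgraph expands every (task, model, target variable) combination from TasksMib_Causalgraph into a renamed leaderboard column. The following is a minimal, self-contained sketch of that mapping logic (not part of the commit); the two-entry task list and the truncated mapping dicts are hypothetical stand-ins for the real enum and dictionaries.

from dataclasses import dataclass

@dataclass
class MiniTask:
    col_name: str
    models: list
    target_variables: list

# Hypothetical stand-ins for TasksMib_Causalgraph entries
tasks = [
    MiniTask("ioi_task", ["GPT2ForCausalLM"], ["output_token", "output_position"]),
    MiniTask("4_answer_MCQA", ["Qwen2ForCausalLM"], ["answer_pointer", "answer"]),
]

benchmark_mapping = {"ioi_task": "IOI", "4_answer_MCQA": "MCQA"}
model_name_mapping = {"GPT2ForCausalLM": "GPT-2", "Qwen2ForCausalLM": "Qwen-2.5"}
target_variables_mapping = {
    "output_token": "Output Token", "output_position": "Output Position",
    "answer_pointer": "Answer Pointer", "answer": "Answer",
}

display_mapping = {}
for task in tasks:
    for model in task.models:
        for tv in task.target_variables:
            # Raw column name as produced by the results loader
            field_name = f"{model}_{task.col_name}_{tv}"
            # Human-readable header shown in the leaderboard
            display_mapping[field_name] = (
                f"{benchmark_mapping[task.col_name]} - "
                f"{model_name_mapping[model]} - {target_variables_mapping[tv]}"
            )

# e.g. "GPT2ForCausalLM_ioi_task_output_token" -> "IOI - GPT-2 - Output Token"
print(display_mapping)

The renamed headers are what the TASK_CAUSAL_SUBSTRINGS / MODEL_SUBSTRINGS checkboxes later filter on by substring.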
src/about.py
CHANGED
@@ -78,12 +78,15 @@ class TaskMIB_Causalgraph:
     models: list[str]       # list of models to show as sub-columns
     col_name: str           # display name in leaderboard
     metrics: list[str]      # metrics to store (average_score)
+    target_variables: list[str]
+
+
 
 class TasksMib_Causalgraph(Enum):
-    task0 =
-    task1 =
-    task2 =
-    task3 =
+    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
+    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
+    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
+    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])
 
     @classmethod
     def get_all_tasks(cls):
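Note: with the new target_variables field, each enum entry now carries the list of causal variables that app.py iterates over when building columns. A small self-contained sketch of the same dataclass-in-Enum pattern (not part of the commit; the entries below are made up):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    models: list
    col_name: str
    metrics: list
    target_variables: list

class Tasks(Enum):
    # Hypothetical entries mirroring the shape of TasksMib_Causalgraph
    task0 = Task("ioi", ["GPT2LMHeadModel"], "ioi_task", ["average_score"], ["output_token", "output_position"])
    task1 = Task("mcqa", ["Qwen2ForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]

print(Tasks.get_all_tasks())               # ['ioi', 'mcqa']
print(Tasks.task0.value.target_variables)  # ['output_token', 'output_position']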
src/leaderboard/read_evals.py
CHANGED
@@ -2,19 +2,22 @@ import glob
 import json
 import math
 import os
+import re
+import ast
 from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
+from collections import defaultdict
 
 import dateutil
 import numpy as np
+import pandas as pd
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, AutoEvalColumn_mib_causalgraph
 from src.submission.check_validity import is_model_on_hub
-from src.about import TasksMib_Subgraph
+from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 
-from typing import List, Dict, Any
-from collections import defaultdict
-import pandas as pd
 
 
 
@@ -205,224 +208,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
 
 
 
-# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
-#     """
-#     Process a single JSON file and convert it to a DataFrame.
-
-#     Args:
-#         json_file: Dictionary containing the analysis results
-#         method_counter: Counter for handling duplicate method names
-
-#     Returns:
-#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
-#     """
-#     method_name = json_file['method_name']
-#     unique_method_name = f"{method_name}_{method_counter}"
-#     method_scores = []
-
-#     for result in json_file['results']:
-#         model = result['model_id']
-
-#         for task, scores in result['task_scores'].items():
-#             # Process each layer's data
-#             intervention_scores = defaultdict(list)
-
-#             for layer_data in scores:
-#                 for intervention_data in layer_data['layer_scores']:
-#                     # Calculate average score for counterfactuals
-#                     avg_cf_score = np.mean([
-#                         cf['score']
-#                         for cf in intervention_data['counterfactual_scores']
-#                     ])
-
-#                     if np.isnan(avg_cf_score):
-#                         avg_cf_score = 0.0
-
-#                     # Group scores by intervention
-#                     intervention_key = '_'.join(intervention_data['intervention'])
-#                     intervention_scores[intervention_key].append(avg_cf_score)
-
-#             # Average across layers for each intervention
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-
-#     # Sort by column names for consistency
-#     method_scores.sort(key=lambda x: x[0])
-#     data = {
-#         unique_method_name: {
-#             col: score for col, score in method_scores
-#         }
-#     }
-
-#     return pd.DataFrame.from_dict(data, orient='index')
-
-# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
-#     model_result_filepaths = []
-
-#     # print(f"Scanning directory: {results_path}")
-#     for root, dirnames, files in os.walk(results_path):
-#         # print(f"Current directory: {root}")
-#         # print(f"Found files: {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         try:
-#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-#         except dateutil.parser._parser.ParserError:
-#             files = [files[-1]]
-
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     # print(f"Found json files: {model_result_filepaths}")
-
-#     method_counters = defaultdict(int)
-#     dataframes = []
-
-#     for json_file in model_result_filepaths:
-#         try:
-#             with open(filepath, 'r') as f:
-#                 json_data = json.load(f)
-#             method_name = json_data['method_name']
-#             method_counters[method_name] += 1
-
-#             # Process single JSON file
-#             df = process_single_json(json_data, method_counters[method_name])
-#             dataframes.append(df)
-#         except Exception as e:
-#             print(f"Error processing {json_file}: {e}")
-#             continue
-
-#     return dataframes
-
-
-
-
-from dataclasses import dataclass
-import json
-import numpy as np
-import pandas as pd
-from typing import Dict, List, Any
-import os
-from datetime import datetime
-import dateutil
-from collections import defaultdict
-
-@dataclass
-class EvalResult_MIB_CAUSALGRAPH:
-    """Represents one full evaluation for a method across all models in MIB for causal graph track."""
-    method_name: str      # name of the interpretation method
-    results: Dict         # nested dict of results for each model and task
-
-    def init_from_json_file(self, json_filepath: str):
-        """Inits results from the method result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        method_name = data.get("method_name")
-
-        # Initialize results dictionary
-        results = {}
-        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
-            results[task] = {}
-
-        # Process each model's results
-        for result in data.get("results", []):
-            model_id = result.get("model_id", "")
-            model_name = model_id.replace(".", "_")
-
-            for task, scores in result.get("task_scores", {}).items():
-                intervention_scores = defaultdict(list)
-
-                for layer_data in scores:
-                    for intervention_data in layer_data['layer_scores']:
-                        # Calculate average score for counterfactuals
-                        avg_cf_score = np.mean([
-                            cf['score'] if 'score' in cf else 0
-                            for cf in intervention_data['counterfactual_scores']
-                        ])
-
-                        if np.isnan(avg_cf_score):
-                            avg_cf_score = 0.0
-
-                        intervention_key = '_'.join(intervention_data['intervention'])
-                        intervention_scores[intervention_key].append(avg_cf_score)
-
-                # Average across layers for each intervention
-                results[task][model_name] = {
-                    interv: np.mean(scores) if scores else 0.0
-                    for interv, scores in intervention_scores.items()
-                }
-
-        return EvalResult_MIB_CAUSALGRAPH(
-            method_name=method_name,
-            results=results
-        )
-
-    def to_dict(self, metric_type="average"):
-        """Converts the Eval Result to a dict for dataframe display"""
-        data_dict = {
-            "Method": self.method_name,
-            "Average": "-"  # Initialize first to make the order consistent
-        }
-
-        # Initialize columns for all task-model combinations
-        all_scores = []
-        for task, task_results in self.results.items():
-            for model, intervention_scores in task_results.items():
-                if not intervention_scores:
-                    continue
-
-                col_name = f"{task}_{model}"
-                scores = list(intervention_scores.values())
-                if not scores:
-                    data_dict[col_name] = '-'
-                    continue
-
-                avg_score = np.mean(scores)
-                data_dict[col_name] = f"{avg_score:.3f}"
-                all_scores.append(avg_score)
-
-        data_dict["Average"] = f"{np.mean(all_scores):.3f}"
-        return data_dict
-
-
 
 
-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """
-#     Aggregates rows with the same base method name by taking the max value for each column.
-#     Works with Method as a regular column instead of index.
-#     """
-#     df_copy = df.copy()
-#     print("\nBase methods extraction:")
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                     else name for name in df_copy['Method']]
-#     print(f"Original methods: {df_copy['Method'].tolist()}")
-#     print(f"Base methods: {base_methods}")
 
 
-#     df_copy['base_method'] = base_methods
-
-#     # Convert scores to numeric values
-#     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
-#     for col in score_columns:
-#         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
-
-#     # Group by base method name and take the max
-#     aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
-
-#     # Reset index to make base_method a regular column and rename it to Method
-#     aggregated_df = aggregated_df.reset_index()
-#     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
-
-#     # Convert back to string format
-#     for col in score_columns:
-#         aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
-
-#     return aggregated_df
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
@@ -444,21 +233,21 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     # Convert scores to numeric values
     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
     for col in score_columns:
-        df_copy[col] = df_copy[col]
+        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
 
     # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)
 
-    #
+    # Reset index to make base_method a regular column and rename it to Method
     aggregated_df = aggregated_df.reset_index()
     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
 
-    # Convert numeric values back to strings with 3 decimal places
-    for col in score_columns:
-        aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else x)
-
     return aggregated_df
 
+
+
+
+
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a DataFrame where columns are model_task and cells are averaged over interventions.
@@ -467,99 +256,264 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     # Create a copy of the DataFrame
     df_copy = df.copy()
 
-    #
-
-    df_copy = df_copy.drop('Average', axis=1)
-
-    # Get score columns (excluding Method)
-    score_columns = [col for col in df_copy.columns if col != 'Method']
+    # Get all columns except Method and Average
+    columns_to_process = [col for col in df_copy.columns if col not in ['Method', 'Average']]
 
-    #
-
-
-
-
-
-
-
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
+    # Extract model and task information from column names
+    model_task_groups = defaultdict(list)
+    for col in columns_to_process:
+        # Split by underscore and extract model, task
+        parts = col.split('_')
+        if len(parts) >= 2:
+            model_task = f"{parts[0]}_{parts[1]}"
+            model_task_groups[model_task].append(col)
 
-    # Create new DataFrame with Method
+    # Create new DataFrame with Method and averaged columns
     averaged_data = []
     for _, row in df_copy.iterrows():
-
+        new_row = {'Method': row['Method']}
+
+        # Calculate average for each model_task group
         for model_task, cols in model_task_groups.items():
-
-
-
+            values = [row[col] for col in cols if pd.notna(row[col])]
+            if values:
+                new_row[model_task] = round(np.mean(values), 3)
+            else:
+                new_row[model_task] = np.nan
+
+        # Calculate overall average
+        model_task_values = [v for k, v in new_row.items() if k != 'Method' and pd.notna(v)]
+        if model_task_values:
+            new_row['Average'] = round(np.mean(model_task_values), 3)
+        else:
+            new_row['Average'] = np.nan
+
+        averaged_data.append(new_row)
 
+    # Create DataFrame and sort by Average
     averaged_df = pd.DataFrame(averaged_data)
-
-
-    averaged_df = averaged_df.sort_values('Average', ascending=False)
+    if 'Average' in averaged_df.columns:
+        averaged_df = averaged_df.sort_values('Average', ascending=False)
 
     return averaged_df
 
 
-def get_raw_eval_results_mib_causalgraph(results_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """From the path of the results folder root, extract all needed info for MIB causal graph results"""
-    model_result_filepaths = []
-
-    for root, dirnames, files in os.walk(results_path):
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
 
-
-
-
-
-
-
-
-
 
-
-
-
 
-
-    #
-    if
-        data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
 
-
 
-    if not data_dicts:
-        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
 
-
-
-
-
-
-
 
-    print("Before aggregation:")
-    print(detailed_df)
 
-    # Create aggregated DataFrame
-    aggregated_df = aggregate_methods(detailed_df)
 
-    # Create intervention-averaged DataFrame
-    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
 
-    return detailed_df, aggregated_df, intervention_averaged_df
 
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models for causal variable localization."""
+    eval_name: str           # method name as identifier
+    method_name: str         # name of the interpretation method
+    model_name: str          # name of the model
+    task_name: str           # name of the task
+    target_variables: str    # target variables (e.g., "answer", "answer_pointer")
+    average_accuracy: float  # average accuracy score
+    highest_accuracy: float  # highest accuracy score
+
+    @staticmethod
+    def init_from_consolidated_json(json_data: Dict):
+        """
+        Initialize results from the consolidated JSON format, treating each entry as a separate result
+
+        Args:
+            json_data: The parsed JSON data with tuple keys
+
+        Returns:
+            List of EvalResult_MIB_CAUSALGRAPH objects
+        """
+        results = []
+
+        for key, entry in json_data.items():
+            try:
+                # Parse tuple key: "('method', 'model', 'task', 'variable')"
+                try:
+                    key_tuple = ast.literal_eval(key)
+                    method_name, model_name, task_name, target_variable = key_tuple
+                except:
+                    # Alternative parsing with regex
+                    pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
+                    match = re.match(pattern, key)
+                    if match:
+                        method_name, model_name, task_name, target_variable = match.groups()
+                    else:
+                        print(f"Couldn't parse key: {key}")
+                        continue
+
+                # Get average and highest accuracy
+                average_accuracy = entry.get("average_accuracy", 0.0)
+                highest_accuracy = entry.get("highest_accuracy", 0.0)
+
+                # Create a result object for this entry
+                result = EvalResult_MIB_CAUSALGRAPH(
+                    eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
+                    method_name=method_name,
+                    model_name=model_name,
+                    task_name=task_name,
+                    target_variables=target_variable,
+                    average_accuracy=average_accuracy,
+                    highest_accuracy=highest_accuracy
+                )
+
+                results.append(result)
+
+            except Exception as e:
+                print(f"Error processing entry {key}: {e}")
+                continue
+
+        return results
+
+    def to_dict(self, metric_type="Highest"):
+        """
+        Converts the Eval Result to a dict for dataframe display
+
+        Args:
+            metric_type: Either "Mean" to use average_accuracy or "Highest" to use highest_accuracy
+        """
+        # Create column name in the exact format requested
+        # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
+        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
+        print(f"col_name is {col_name}")
+
+        # Select the appropriate accuracy metric based on metric_type
+        score = self.average_accuracy if metric_type == "Mean" else self.highest_accuracy
+
+        # Create data dictionary with method name and the score
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+            col_name: score
+        }
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Processes the consolidated JSON format for causal variable localization results
+    Treats each entry as a separate result and then combines them by method
+
+    Args:
+        results_path: Path to the directory containing results
+
+    Returns:
+        Tuple of four DataFrames:
+        - detailed_df_highest: Detailed view with highest accuracy scores
+        - detailed_df_mean: Detailed view with mean accuracy scores
+        - intervention_averaged_highest_df: Averaged by intervention using highest accuracy
+        - intervention_averaged_mean_df: Averaged by intervention using mean accuracy
+    """
+    # Find the consolidated JSON file
+    json_files = []
+    for root, _, files in os.walk(results_path):
+        for file in files:
+            if file.endswith('.json'):
+                json_files.append(os.path.join(root, file))
+
+    if not json_files:
+        print(f"No JSON files found in {results_path}")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Load and process the consolidated JSON format
+    raw_data = None
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r') as f:
+                data = json.load(f)
+
+            # Check if this is the consolidated format by examining a sample key
+            sample_key = next(iter(data), None)
+            if sample_key and isinstance(sample_key, str) and '(' in sample_key and ')' in sample_key:
+                raw_data = data
+                print(f"Found consolidated data file: {json_file}")
+                break
+        except Exception as e:
+            print(f"Error reading {json_file}: {e}")
+
+    if raw_data is None:
+        print("No valid consolidated JSON file found")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Get all results
+    eval_results = EvalResult_MIB_CAUSALGRAPH.init_from_consolidated_json(raw_data)
+
+    if not eval_results:
+        print("No results could be extracted from the JSON data")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Create two sets of dictionaries - one for highest accuracy and one for mean accuracy
+    highest_results = [result.to_dict(metric_type="Highest") for result in eval_results]
+    mean_results = [result.to_dict(metric_type="Mean") for result in eval_results]
+
+    # Process highest accuracy results
+    # Group results by method
+    highest_method_groups = {}
+    for result_dict in highest_results:
+        method = result_dict["Method"]
+        if method not in highest_method_groups:
+            highest_method_groups[method] = {
+                "eval_name": method,
+                "Method": method
+            }
+
+        # Copy all score columns to the method's group
+        for key, value in result_dict.items():
+            if key not in ["eval_name", "Method"]:
+                highest_method_groups[method][key] = value
+
+    # Create the detailed DataFrame for highest accuracy
+    highest_records = list(highest_method_groups.values())
+    detailed_df_highest = pd.DataFrame(highest_records)
+
+    # Process mean accuracy results
+    # Group results by method
+    mean_method_groups = {}
+    for result_dict in mean_results:
+        method = result_dict["Method"]
+        if method not in mean_method_groups:
+            mean_method_groups[method] = {
+                "eval_name": method,
+                "Method": method
+            }
+
+        # Copy all score columns to the method's group
+        for key, value in result_dict.items():
+            if key not in ["eval_name", "Method"]:
+                mean_method_groups[method][key] = value
+
+    # Create the detailed DataFrame for mean accuracy
+    mean_records = list(mean_method_groups.values())
+    detailed_df_mean = pd.DataFrame(mean_records)
+
+    if detailed_df_highest.empty or detailed_df_mean.empty:
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Calculate and add Average column for both DataFrames
+    score_columns_highest = [col for col in detailed_df_highest.columns if col not in ["eval_name", "Method"]]
+    score_columns_mean = [col for col in detailed_df_mean.columns if col not in ["eval_name", "Method"]]
+
+    if score_columns_highest:
+        detailed_df_highest["Average"] = detailed_df_highest[score_columns_highest].mean(axis=1).round(3)
+
+    if score_columns_mean:
+        detailed_df_mean["Average"] = detailed_df_mean[score_columns_mean].mean(axis=1).round(3)
+
+    # Sort by Average descending
+    if "Average" in detailed_df_highest.columns:
+        detailed_df_highest = detailed_df_highest.sort_values("Average", ascending=False)
+
+    if "Average" in detailed_df_mean.columns:
+        detailed_df_mean = detailed_df_mean.sort_values("Average", ascending=False)
+
+    # # Create intervention-averaged DataFrames for both metrics
+    # intervention_averaged_highest_df = create_intervention_averaged_df(detailed_df_highest)
+    # intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
+
+    # return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
+    return detailed_df_highest, detailed_df_highest, detailed_df_mean
513 |
|
|
|
|
|
514 |
|
|
|
|
|
515 |
|
|
|
|
|
516 |
|
|
|
517 |
|
518 |
|
519 |
|
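Note: create_intervention_averaged_df (unchanged in signature but rewritten above) groups score columns by their first two underscore-separated parts, i.e. a model_task prefix, and averages within each group. A minimal sketch of that grouping on hypothetical columns (not part of the commit):

import numpy as np
import pandas as pd
from collections import defaultdict

# Hypothetical detailed frame: score columns are model_task_variable
df = pd.DataFrame({
    "Method": ["DAS"],
    "GPT2_ioi_output_token": [0.81],
    "GPT2_ioi_output_position": [0.65],
    "Average": [0.73],
})

# Group columns by the model_task prefix, mirroring the col.split('_') logic
groups = defaultdict(list)
for col in df.columns:
    if col in ("Method", "Average"):
        continue
    parts = col.split("_")
    if len(parts) >= 2:
        groups[f"{parts[0]}_{parts[1]}"].append(col)

row = df.iloc[0]
averaged = {"Method": row["Method"]}
for model_task, cols in groups.items():
    averaged[model_task] = round(np.mean([row[c] for c in cols]), 3)
print(averaged)  # {'Method': 'DAS', 'GPT2_ioi': 0.73}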