Commit 475701c by jasonshaoshun
Parent: a2e0e8f

add target variable to causal graph

Files changed:
- app.py (+23 -12)
- src/about.py (+5 -4)
- src/leaderboard/read_evals.py (+4 -1)
app.py
CHANGED
```diff
@@ -443,23 +443,32 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
         "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
-        "…
-        "…
+        "ARC_easy": "ARC (Easy)",
+        "RAVEL_task": "RAVEL"
+    }
+
+    target_variables_mapping = {
+        "output_token": "Output Token",
+        "output_position": "Output Position",
+        "answer_pointer": "Answer Pointer",
+        "answer": "Answer",
+        "Continent": "Continent",
+        "Language": "Language",
+        "Country": "Country",
+        "Language": "Language"
     }
 
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-            …
-            …
-            …
-            …
+            for target_variables in task.value.target_variables:
+                field_name = f"{model}_{task.value.col_name}_{target_variables}"
+                display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
+                display_mapping[field_name] = display_name
 
-    # print("\nDebugging display_mapping:", display_mapping)
 
     renamed_df = dataframe.rename(columns=display_mapping)
 
-    # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())
 
     # Create only necessary columns
     return Leaderboard(
```
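The nested loop added in this hunk produces one leaderboard column per (task, model, target variable) triple. The sketch below reproduces that expansion with a stubbed-down enum; `model_name_mapping` and the `benchmark_mapping` entries outside this hunk are not shown in the diff, so the values marked "assumed" are placeholders rather than the app's real mappings.

```python
# Minimal sketch of the renaming logic added in this hunk. TaskStub/TasksStub
# stand in for TaskMIB_Causalgraph/TasksMib_Causalgraph from src/about.py, and
# the mapping values marked "assumed" are placeholders, not the app's real ones.
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskStub:
    benchmark: str
    models: list
    col_name: str
    metrics: list
    target_variables: list

class TasksStub(Enum):
    ioi = TaskStub("ioi", ["GPT2LMHeadModel", "Qwen2ForCausalLM"], "ioi_task",
                   ["average_score"], ["output_token", "output_position"])

benchmark_mapping = {"ioi_task": "IOI"}                        # assumed entry
model_name_mapping = {"GPT2LMHeadModel": "GPT-2",              # assumed entries
                      "Qwen2ForCausalLM": "Qwen-2.5"}
target_variables_mapping = {"output_token": "Output Token",
                            "output_position": "Output Position"}

display_mapping = {}
for task in TasksStub:
    for model in task.value.models:
        for tv in task.value.target_variables:
            field_name = f"{model}_{task.value.col_name}_{tv}"
            display_name = (f"{benchmark_mapping[task.value.col_name]} - "
                            f"{model_name_mapping[model]} - "
                            f"{target_variables_mapping[tv]}")
            display_mapping[field_name] = display_name

# e.g. 'GPT2LMHeadModel_ioi_task_output_token' -> 'IOI - GPT-2 - Output Token'
print(display_mapping)
```

One detail worth noting: the committed `target_variables_mapping` lists `"Language": "Language"` twice. Python dict literals keep only the last occurrence of a duplicate key, so the repetition is harmless, just redundant.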
```diff
@@ -521,8 +530,10 @@ def get_hf_username(hf_repo):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
+TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 
+
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
@@ -693,9 +704,9 @@ with demo:
                     You can combine filters to see specific task-model combinations.
                     """)
                     task_substring_checkbox = gr.CheckboxGroup(
-                        choices=…
+                        choices=TASK_CAUSAL_SUBSTRINGS,
                         label="View tasks:",
-                        value=…
+                        value=TASK_CAUSAL_SUBSTRINGS, # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
@@ -721,9 +732,9 @@ with demo:
                 with gr.TabItem("Averaged View", id=1):
 
                     task_substring_checkbox = gr.CheckboxGroup(
-                        choices=…
+                        choices=TASK_CAUSAL_SUBSTRINGS,
                         label="View tasks:",
-                        value=…
+                        value=TASK_CAUSAL_SUBSTRINGS, # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
```
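The new `TASK_CAUSAL_SUBSTRINGS` constant uses the display spellings ("ARC (Easy)", "RAVEL") because the checkboxes filter the already-renamed columns, and it mirrors the four causal-graph tasks defined in `TasksMib_Causalgraph` in src/about.py, which is why "Arithmetic" appears in `TASK_SUBSTRINGS` but not here. The body of `filter_columns_by_substrings` is not part of this diff; the sketch below is one plausible reading of its signature, shown only to illustrate how the default checkbox values interact with the renamed columns.

```python
# Hedged sketch only: the real filter_columns_by_substrings body is not in this
# diff. Assumed behaviour: keep a column when it matches at least one selected
# task substring and at least one selected model substring.
from typing import List
import pandas as pd

TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]

def filter_columns_by_substrings(dataframe: pd.DataFrame,
                                 selected_task_substrings: List[str],
                                 selected_model_substrings: List[str]) -> pd.DataFrame:
    kept = [col for col in dataframe.columns
            if any(t in col for t in selected_task_substrings)
            and any(m in col for m in selected_model_substrings)]
    return dataframe[kept]

df = pd.DataFrame(columns=["IOI - GPT-2 - Output Token",
                           "RAVEL - Qwen-2.5 - Country",
                           "MCQA - Llama-3.1 - Answer"])

# With the new defaults every causal-graph column survives; unticking "RAVEL"
# in the CheckboxGroup would drop the second column.
print(filter_columns_by_substrings(df, TASK_CAUSAL_SUBSTRINGS, MODEL_SUBSTRINGS).columns.tolist())
```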
src/about.py
CHANGED
```diff
@@ -78,14 +78,15 @@ class TaskMIB_Causalgraph:
     models: list[str] # list of models to show as sub-columns
     col_name: str # display name in leaderboard
     metrics: list[str] # metrics to store (average_score)
+    target_variables: list[str]
 
 
 
 class TasksMib_Causalgraph(Enum):
-    task0 = …
-    task1 = …
-    task2 = …
-    task3 = …
+    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
+    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
+    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
+    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])
 
     @classmethod
     def get_all_tasks(cls):
```
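Each enum member now carries its task's target variables alongside the model list, so downstream code can derive the full causal-graph column layout from the enum alone. The compact stand-in below mirrors the pattern with a shortened model list; the name of the first dataclass field is outside this hunk, so `benchmark` is a guess, and `expected_columns` is a hypothetical helper, not part of the commit.

```python
# Stand-in mirroring the TaskMIB_Causalgraph pattern after this change. The
# first field name ("benchmark") and the expected_columns helper are assumptions
# for illustration; the remaining fields follow the diff.
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str
    models: list[str]           # list of models to show as sub-columns
    col_name: str               # display name in leaderboard
    metrics: list[str]          # metrics to store (average_score)
    target_variables: list[str]

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph("ioi", ["GPT2LMHeadModel"], "ioi_task",
                                ["average_score"], ["output_token", "output_position"])
    task2 = TaskMIB_Causalgraph("ravel", ["GPT2LMHeadModel"], "RAVEL_task",
                                ["average_score"], ["Continent", "Language", "Country"])

    @classmethod
    def expected_columns(cls):
        # one raw column per (model, task, target variable), matching app.py
        return [f"{m}_{t.value.col_name}_{tv}"
                for t in cls for m in t.value.models for tv in t.value.target_variables]

print(TasksMib_Causalgraph.expected_columns())
```

Note that the committed task2 ("ravel") entry lists "Language" twice in its target_variables, so this kind of expansion emits the same column name twice; the duplicate is absorbed when the names are used as dict keys, but it adds nothing.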
src/leaderboard/read_evals.py
CHANGED
```diff
@@ -298,6 +298,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     return averaged_df
 
 
+
 @dataclass
 class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method across all models for causal variable localization."""
@@ -370,7 +371,7 @@ class EvalResult_MIB_CAUSALGRAPH:
         """
         # Create column name in the exact format requested
         # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
-        col_name = f"{self.model_name}_{self.task_name}"
+        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
         print(f"col_name is {col_name}")
 
         # Select the appropriate accuracy metric based on metric_type
@@ -526,6 +527,8 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
 
 
 
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
```
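The `col_name` change is the read-side counterpart of the new `display_mapping` in app.py: with several target variables per task, keying results on model and task alone lets a later result overwrite an earlier one. A minimal illustration, with invented scores:

```python
# Why the col_name change matters: with two target variables per task, keying a
# result column by model and task alone lets the second result overwrite the
# first. The scores below are invented for illustration.
results = [
    {"model_name": "GPT2LMHeadModel", "task_name": "ioi_task",
     "target_variables": "output_token", "average_score": 0.71},
    {"model_name": "GPT2LMHeadModel", "task_name": "ioi_task",
     "target_variables": "output_position", "average_score": 0.64},
]

old_cols = {f"{r['model_name']}_{r['task_name']}": r["average_score"]
            for r in results}
new_cols = {f"{r['model_name']}_{r['task_name']}_{r['target_variables']}": r["average_score"]
            for r in results}

print(old_cols)  # 1 column: the output_position score silently replaced output_token
print(new_cols)  # 2 columns: one per target variable, matching display_mapping in app.py
```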