Commit 475701c by jasonshaoshun
Parent: a2e0e8f

add target variable to causal graph

Files changed:
- app.py (+23 -12)
- src/about.py (+5 -4)
- src/leaderboard/read_evals.py (+4 -1)
app.py
CHANGED
```diff
@@ -443,23 +443,32 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
         "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
-        "…
-        "…
+        "ARC_easy": "ARC (Easy)",
+        "RAVEL_task": "RAVEL"
+    }
+
+    target_variables_mapping = {
+        "output_token": "Output Token",
+        "output_position": "Output Position",
+        "answer_pointer": "Answer Pointer",
+        "answer": "Answer",
+        "Continent": "Continent",
+        "Language": "Language",
+        "Country": "Country",
+        "Language": "Language"
     }
 
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-            …
-            …
-            …
-            …
+            for target_variables in task.value.target_variables:
+                field_name = f"{model}_{task.value.col_name}_{target_variables}"
+                display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
+                display_mapping[field_name] = display_name
 
-    # print("\nDebugging display_mapping:", display_mapping)
 
     renamed_df = dataframe.rename(columns=display_mapping)
 
-    # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())
 
     # Create only necessary columns
     return Leaderboard(
```
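The nested loop added in this hunk produces one leaderboard column per (task, model, target variable) triple. The sketch below reproduces that expansion with a stubbed-down enum; `model_name_mapping` and the `benchmark_mapping` entries outside this hunk are not shown in the diff, so the values marked "assumed" are placeholders rather than the app's real mappings.

```python
# Minimal sketch of the renaming logic added in this hunk. TaskStub/TasksStub
# stand in for TaskMIB_Causalgraph/TasksMib_Causalgraph from src/about.py, and
# the mapping values marked "assumed" are placeholders, not the app's real ones.
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskStub:
    benchmark: str
    models: list
    col_name: str
    metrics: list
    target_variables: list

class TasksStub(Enum):
    ioi = TaskStub("ioi", ["GPT2LMHeadModel", "Qwen2ForCausalLM"], "ioi_task",
                   ["average_score"], ["output_token", "output_position"])

benchmark_mapping = {"ioi_task": "IOI"}                        # assumed entry
model_name_mapping = {"GPT2LMHeadModel": "GPT-2",              # assumed entries
                      "Qwen2ForCausalLM": "Qwen-2.5"}
target_variables_mapping = {"output_token": "Output Token",
                            "output_position": "Output Position"}

display_mapping = {}
for task in TasksStub:
    for model in task.value.models:
        for tv in task.value.target_variables:
            field_name = f"{model}_{task.value.col_name}_{tv}"
            display_name = (f"{benchmark_mapping[task.value.col_name]} - "
                            f"{model_name_mapping[model]} - "
                            f"{target_variables_mapping[tv]}")
            display_mapping[field_name] = display_name

# e.g. 'GPT2LMHeadModel_ioi_task_output_token' -> 'IOI - GPT-2 - Output Token'
print(display_mapping)
```

One detail worth noting: the committed `target_variables_mapping` lists `"Language": "Language"` twice. Python dict literals keep only the last occurrence of a duplicate key, so the repetition is harmless, just redundant.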
```diff
@@ -521,8 +530,10 @@ def get_hf_username(hf_repo):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
+TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 
+
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
@@ -693,9 +704,9 @@ with demo:
                     You can combine filters to see specific task-model combinations.
                     """)
                     task_substring_checkbox = gr.CheckboxGroup(
-                        choices=…
+                        choices=TASK_CAUSAL_SUBSTRINGS,
                         label="View tasks:",
-                        value=…
+                        value=TASK_CAUSAL_SUBSTRINGS, # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
@@ -721,9 +732,9 @@ with demo:
                 with gr.TabItem("Averaged View", id=1):
 
                     task_substring_checkbox = gr.CheckboxGroup(
-                        choices=…
+                        choices=TASK_CAUSAL_SUBSTRINGS,
                         label="View tasks:",
-                        value=…
+                        value=TASK_CAUSAL_SUBSTRINGS, # Default to all substrings selected
                     )
                     model_substring_checkbox = gr.CheckboxGroup(
                         choices = MODEL_SUBSTRINGS,
```
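The new `TASK_CAUSAL_SUBSTRINGS` constant uses the display spellings ("ARC (Easy)", "RAVEL") because the checkboxes filter the already-renamed columns, and it mirrors the four causal-graph tasks defined in `TasksMib_Causalgraph` in src/about.py, which is why "Arithmetic" appears in `TASK_SUBSTRINGS` but not here. The body of `filter_columns_by_substrings` is not part of this diff; the sketch below is one plausible reading of its signature, shown only to illustrate how the default checkbox values interact with the renamed columns.

```python
# Hedged sketch only: the real filter_columns_by_substrings body is not in this
# diff. Assumed behaviour: keep a column when it matches at least one selected
# task substring and at least one selected model substring.
from typing import List
import pandas as pd

TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]

def filter_columns_by_substrings(dataframe: pd.DataFrame,
                                 selected_task_substrings: List[str],
                                 selected_model_substrings: List[str]) -> pd.DataFrame:
    kept = [col for col in dataframe.columns
            if any(t in col for t in selected_task_substrings)
            and any(m in col for m in selected_model_substrings)]
    return dataframe[kept]

df = pd.DataFrame(columns=["IOI - GPT-2 - Output Token",
                           "RAVEL - Qwen-2.5 - Country",
                           "MCQA - Llama-3.1 - Answer"])

# With the new defaults every causal-graph column survives; unticking "RAVEL"
# in the CheckboxGroup would drop the second column.
print(filter_columns_by_substrings(df, TASK_CAUSAL_SUBSTRINGS, MODEL_SUBSTRINGS).columns.tolist())
```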
src/about.py
CHANGED
```diff
@@ -78,14 +78,15 @@ class TaskMIB_Causalgraph:
     models: list[str] # list of models to show as sub-columns
     col_name: str # display name in leaderboard
     metrics: list[str] # metrics to store (average_score)
+    target_variables: list[str]
 
 
 
 class TasksMib_Causalgraph(Enum):
-    task0 = …
-    task1 = …
-    task2 = …
-    task3 = …
+    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
+    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
+    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
+    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])
 
     @classmethod
     def get_all_tasks(cls):
```
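Each enum member now carries its task's target variables alongside the model list, so downstream code can derive the full causal-graph column layout from the enum alone. The compact stand-in below mirrors the pattern with a shortened model list; the name of the first dataclass field is outside this hunk, so `benchmark` is a guess, and `expected_columns` is a hypothetical helper, not part of the commit.

```python
# Stand-in mirroring the TaskMIB_Causalgraph pattern after this change. The
# first field name ("benchmark") and the expected_columns helper are assumptions
# for illustration; the remaining fields follow the diff.
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str
    models: list[str]           # list of models to show as sub-columns
    col_name: str               # display name in leaderboard
    metrics: list[str]          # metrics to store (average_score)
    target_variables: list[str]

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph("ioi", ["GPT2LMHeadModel"], "ioi_task",
                                ["average_score"], ["output_token", "output_position"])
    task2 = TaskMIB_Causalgraph("ravel", ["GPT2LMHeadModel"], "RAVEL_task",
                                ["average_score"], ["Continent", "Language", "Country"])

    @classmethod
    def expected_columns(cls):
        # one raw column per (model, task, target variable), matching app.py
        return [f"{m}_{t.value.col_name}_{tv}"
                for t in cls for m in t.value.models for tv in t.value.target_variables]

print(TasksMib_Causalgraph.expected_columns())
```

Note that the committed task2 ("ravel") entry lists "Language" twice in its target_variables, so this kind of expansion emits the same column name twice; the duplicate is absorbed when the names are used as dict keys, but it adds nothing.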
src/leaderboard/read_evals.py
CHANGED
```diff
@@ -298,6 +298,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     return averaged_df
 
 
+
 @dataclass
 class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method across all models for causal variable localization."""
@@ -370,7 +371,7 @@ class EvalResult_MIB_CAUSALGRAPH:
         """
         # Create column name in the exact format requested
         # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
-        col_name = f"{self.model_name}_{self.task_name}"
+        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
         print(f"col_name is {col_name}")
 
         # Select the appropriate accuracy metric based on metric_type
@@ -526,6 +527,8 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
 
 
 
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
```
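The `col_name` change is the read-side counterpart of the new `display_mapping` in app.py: with several target variables per task, keying results on model and task alone lets a later result overwrite an earlier one. A minimal illustration, with invented scores:

```python
# Why the col_name change matters: with two target variables per task, keying a
# result column by model and task alone lets the second result overwrite the
# first. The scores below are invented for illustration.
results = [
    {"model_name": "GPT2LMHeadModel", "task_name": "ioi_task",
     "target_variables": "output_token", "average_score": 0.71},
    {"model_name": "GPT2LMHeadModel", "task_name": "ioi_task",
     "target_variables": "output_position", "average_score": 0.64},
]

old_cols = {f"{r['model_name']}_{r['task_name']}": r["average_score"]
            for r in results}
new_cols = {f"{r['model_name']}_{r['task_name']}_{r['target_variables']}": r["average_score"]
            for r in results}

print(old_cols)  # 1 column: the output_position score silently replaced output_token
print(new_cols)  # 2 columns: one per target variable, matching display_mapping in app.py
```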