jasonshaoshun committed
Commit: 475701c
Parent(s): a2e0e8f

add target variable to causal graph

Files changed (3):
  1. app.py +23 -12
  2. src/about.py +5 -4
  3. src/leaderboard/read_evals.py +4 -1
app.py CHANGED
@@ -443,23 +443,32 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
         "4_answer_MCQA": "MCQA",
         "arithmetic_addition": "Arithmetic (+)",
         "arithmetic_subtraction": "Arithmetic (-)",
-        "arc_easy": "ARC (Easy)",
-        "arc_challenge": "ARC (Challenge)"
+        "ARC_easy": "ARC (Easy)",
+        "RAVEL_task": "RAVEL"
+    }
+
+    target_variables_mapping = {
+        "output_token": "Output Token",
+        "output_position": "Output Position",
+        "answer_pointer": "Answer Pointer",
+        "answer": "Answer",
+        "Continent": "Continent",
+        "Language": "Language",
+        "Country": "Country",
+        "Language": "Language"
     }
 
     display_mapping = {}
     for task in TasksMib_Causalgraph:
         for model in task.value.models:
-            # print(f"Task: {task.value.benchmark}, Model: {model}")
-            field_name = f"{model}_{task.value.col_name}"
-            display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
-            display_mapping[field_name] = display_name
+            for target_variables in task.value.target_variables:
+                field_name = f"{model}_{task.value.col_name}_{target_variables}"
+                display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]} - {target_variables_mapping[target_variables]}"
+                display_mapping[field_name] = display_name
 
-    # print("\nDebugging display_mapping:", display_mapping)
 
     renamed_df = dataframe.rename(columns=display_mapping)
 
-    # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())
 
     # Create only necessary columns
     return Leaderboard(
@@ -521,8 +530,10 @@ def get_hf_username(hf_repo):
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 TASK_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC"]
+TASK_CAUSAL_SUBSTRINGS = ["IOI", "MCQA", "ARC (Easy)", "RAVEL"]
 MODEL_SUBSTRINGS = ["GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
 
+
 def filter_columns_by_substrings(dataframe: pd.DataFrame, selected_task_substrings: List[str],
                                  selected_model_substrings: List[str]) -> pd.DataFrame:
     """
@@ -693,9 +704,9 @@ with demo:
                 You can combine filters to see specific task-model combinations.
                 """)
                 task_substring_checkbox = gr.CheckboxGroup(
-                    choices=TASK_SUBSTRINGS,
+                    choices=TASK_CAUSAL_SUBSTRINGS,
                     label="View tasks:",
-                    value=TASK_SUBSTRINGS,  # Default to all substrings selected
+                    value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
                 )
                 model_substring_checkbox = gr.CheckboxGroup(
                     choices = MODEL_SUBSTRINGS,
@@ -721,9 +732,9 @@ with demo:
             with gr.TabItem("Averaged View", id=1):
 
                 task_substring_checkbox = gr.CheckboxGroup(
-                    choices=TASK_SUBSTRINGS,
+                    choices=TASK_CAUSAL_SUBSTRINGS,
                     label="View tasks:",
-                    value=TASK_SUBSTRINGS,  # Default to all substrings selected
+                    value=TASK_CAUSAL_SUBSTRINGS,  # Default to all substrings selected
                 )
                 model_substring_checkbox = gr.CheckboxGroup(
                     choices = MODEL_SUBSTRINGS,
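
As a quick sanity check on the new naming scheme, here is a minimal, self-contained sketch (not the app's code) of how the three-part column keys "{model}_{col_name}_{target_variable}" now map to display labels. The TaskSpec tuple and the reduced task, model, and target-variable lists below are illustrative stand-ins for TasksMib_Causalgraph and the app's own mappings.

# Illustrative sketch only: reduced mappings standing in for the app's
# benchmark_mapping, model_name_mapping, and TasksMib_Causalgraph.
from collections import namedtuple

TaskSpec = namedtuple("TaskSpec", ["col_name", "models", "target_variables"])  # hypothetical stand-in

tasks = [
    TaskSpec("ioi_task", ["Qwen2ForCausalLM"], ["output_token", "output_position"]),
    TaskSpec("RAVEL_task", ["Gemma2ForCausalLM"], ["Continent", "Country"]),
]
benchmark_mapping = {"ioi_task": "IOI", "RAVEL_task": "RAVEL"}
model_name_mapping = {"Qwen2ForCausalLM": "Qwen-2.5", "Gemma2ForCausalLM": "Gemma-2"}
target_variables_mapping = {
    "output_token": "Output Token", "output_position": "Output Position",
    "Continent": "Continent", "Country": "Country",
}

display_mapping = {}
for task in tasks:
    for model in task.models:
        for tv in task.target_variables:
            # Raw dataframe column -> human-readable leaderboard header
            display_mapping[f"{model}_{task.col_name}_{tv}"] = (
                f"{benchmark_mapping[task.col_name]} - "
                f"{model_name_mapping[model]} - {target_variables_mapping[tv]}"
            )

print(display_mapping["Qwen2ForCausalLM_ioi_task_output_token"])  # IOI - Qwen-2.5 - Output Token

The new TASK_CAUSAL_SUBSTRINGS list mirrors these display labels ("ARC (Easy)", "RAVEL"), presumably so the checkbox filters match the renamed causal-graph columns rather than the subgraph task names.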
src/about.py CHANGED
@@ -78,14 +78,15 @@ class TaskMIB_Causalgraph:
     models: list[str]     # list of models to show as sub-columns
     col_name: str         # display name in leaderboard
     metrics: list[str]    # metrics to store (average_score)
+    target_variables: list[str]
 
 
 
 class TasksMib_Causalgraph(Enum):
-    task0 = TaskMIB_Subgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"])
-    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
-    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
-    task3 = TaskMIB_Subgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
+    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
+    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL_task", ["average_score"], ["Continent", "Language", "Country", "Language"])
+    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])
 
     @classmethod
     def get_all_tasks(cls):
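
For context, a standalone sketch of how the extended spec enumerates one leaderboard column per (model, task, target variable). Field names follow this diff (the `benchmark` field is inferred from the app.py code above), and the two-task, one-model subset is illustrative rather than the module's real definitions.

from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str
    models: list[str]
    col_name: str
    metrics: list[str]
    target_variables: list[str]  # new field added by this commit

class TasksMib_Causalgraph(Enum):
    # Reduced, illustrative subset of the real task/model definitions
    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM"], "ioi_task",
                                ["average_score"], ["output_token", "output_position"])
    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM"], "4_answer_MCQA",
                                ["average_score"], ["answer_pointer", "answer"])

# One raw result column per (model, task, target variable), matching app.py's field_name format
columns = [
    f"{model}_{task.value.col_name}_{tv}"
    for task in TasksMib_Causalgraph
    for model in task.value.models
    for tv in task.value.target_variables
]
print(columns)
# ['Qwen2ForCausalLM_ioi_task_output_token', 'Qwen2ForCausalLM_ioi_task_output_position',
#  'Qwen2ForCausalLM_4_answer_MCQA_answer_pointer', 'Qwen2ForCausalLM_4_answer_MCQA_answer']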
src/leaderboard/read_evals.py CHANGED
@@ -298,6 +298,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     return averaged_df
 
 
+
 @dataclass
 class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method across all models for causal variable localization."""
@@ -370,7 +371,7 @@ class EvalResult_MIB_CAUSALGRAPH:
         """
         # Create column name in the exact format requested
         # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
-        col_name = f"{self.model_name}_{self.task_name}"
+        col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
         print(f"col_name is {col_name}")
 
         # Select the appropriate accuracy metric based on metric_type
@@ -526,6 +527,8 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
 
 
 
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.