jasonshaoshun committed
Commit 53c7136
1 Parent(s): 202dbe2
Files changed (5)
  1. app.py +2 -2
  2. src/about.py +3 -3
  3. src/display/utils.py +30 -8
  4. src/leaderboard/read_evals.py +26 -31
  5. src/populate.py +51 -18
app.py CHANGED
@@ -75,7 +75,6 @@ except Exception:
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
-
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
     EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
@@ -84,6 +83,7 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
     BENCHMARK_COLS_MIB_CAUSALGRAPH
 )
 
+
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
 
@@ -210,7 +210,7 @@ with demo:
 
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
             leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-
+
         # Then modify the Causal Graph tab section
         with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            with gr.Tabs() as causalgraph_tabs:
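
Context for this hunk: get_leaderboard_df_mib_causalgraph now returns three DataFrames (detailed, aggregated, intervention-averaged), and the Causal Graph tab gains nested sub-tabs to show them. A minimal runnable sketch of that wiring, assuming a hypothetical init_leaderboard_mib_causalgraph helper mirroring init_leaderboard_mib_subgraph and stand-in frames (neither is code from this commit):

import gradio as gr
import pandas as pd

# Stand-in frames with shapes like those the loaders above return.
detailed_df = pd.DataFrame({"Method": ["method_a"], "layer0_output_token_symbol_counterfactual": [0.42]})
aggregated_df = pd.DataFrame({"Method": ["method_a"], "llamaforcausallm_mcqa_output_token": [0.42]})
averaged_df = pd.DataFrame({"Method": ["method_a"], "llamaforcausallm_mcqa": [0.42]})

def init_leaderboard_mib_causalgraph(df, title):
    # Hypothetical helper, assumed analogous to init_leaderboard_mib_subgraph.
    return gr.Dataframe(value=df, label=title)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            with gr.Tabs() as causalgraph_tabs:
                with gr.TabItem("Detailed", id=0):
                    init_leaderboard_mib_causalgraph(detailed_df, "Detailed")
                with gr.TabItem("Aggregated", id=1):
                    init_leaderboard_mib_causalgraph(aggregated_df, "Aggregated")
                with gr.TabItem("Averaged", id=2):
                    init_leaderboard_mib_causalgraph(averaged_df, "Averaged")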
src/about.py CHANGED
@@ -47,7 +47,7 @@ class TasksMib_Subgraph(Enum):
 @dataclass
 class TaskMIB_Causalgraph:
     benchmark: str  # MCQA
-    models: list[str]  # LlamaForCausalLM
+    models: list[str]  # List of all models
     layers: list[str]  # 0-31
     col_name: str  # display name in leaderboard
     interventions: list[str]  # output_token, output_location
@@ -57,8 +57,8 @@ class TaskMIB_Causalgraph:
 class TasksMib_Causalgraph(Enum):
     task0 = TaskMIB_Causalgraph(
         "MCQA",
-        ["LlamaForCausalLM"],
-        [str(i) for i in range(32)],
+        ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],  # Updated model list
+        [str(i) for i in range(32)],  # 0-31 layers
         "mcqa",
         ["output_token", "output_location"],
         ["symbol_counterfactual", "randomLetter_counterfactual",
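
The hunk above widens TasksMib_Causalgraph from one model to three. A self-contained sketch of the dataclass/Enum pattern, trimmed to the fields shown in this diff (the real class also carries the counterfactuals list), to show what iterating task.value.models now yields:

from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str
    models: list[str]
    layers: list[str]
    col_name: str
    interventions: list[str]

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph(
        "MCQA",
        ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],
        [str(i) for i in range(32)],
        "mcqa",
        ["output_token", "output_location"],
    )

for task in TasksMib_Causalgraph:
    print(task.value.benchmark, task.value.models)
# MCQA ['LlamaForCausalLM', 'Qwen2ForCausalLM', 'Gemma2ForCausalLM']

Because downstream loops iterate this enum, adding model names here is what widens the leaderboard to all three models.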
src/display/utils.py CHANGED
@@ -102,7 +102,22 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-# Initialize the MIB causal graph columns
+# # Initialize the MIB causal graph columns
+# auto_eval_column_dict_mib_causalgraph = []
+
+# # Method name column
+# auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# # For each model-task-intervention combination
+# for task in TasksMib_Causalgraph:
+#     for model in task.value.models:
+#         for intervention in task.value.interventions:
+#             col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
+#             auto_eval_column_dict_mib_causalgraph.append([
+#                 col_name,
+#                 ColumnContent,
+#                 ColumnContent(col_name, "number", True)
+#             ])
 auto_eval_column_dict_mib_causalgraph = []
 
 # Method name column
@@ -111,13 +126,20 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
 # For each model-task-intervention combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:
-        for intervention in task.value.interventions:
-            col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
-            auto_eval_column_dict_mib_causalgraph.append([
-                col_name,
-                ColumnContent,
-                ColumnContent(col_name, "number", True)
-            ])
+        for layer in task.value.layers:
+            for intervention in task.value.interventions:
+                for counterfactual in task.value.counterfactuals:
+                    col_name = f"layer{layer}_{intervention}_{counterfactual}"
+                    field_name = col_name.lower()
+                    auto_eval_column_dict_mib_causalgraph.append([
+                        field_name,
+                        ColumnContent,
+                        ColumnContent(col_name, "number", True)
+                    ])
+
+
+
+
 
 # Create the dataclass
 AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)
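
The reworked loop emits one numeric column per layer × intervention × counterfactual, so the column set grows multiplicatively: 32 layers × 2 interventions × N counterfactuals per task. A stand-alone sketch of the naming scheme, with a trimmed two-item counterfactual list for illustration (the full list lives in TasksMib_Causalgraph in src/about.py):

# Stand-in data mirroring TasksMib_Causalgraph; trimmed for illustration.
layers = [str(i) for i in range(32)]
interventions = ["output_token", "output_location"]
counterfactuals = ["symbol_counterfactual", "randomLetter_counterfactual"]

columns = [
    f"layer{layer}_{intervention}_{counterfactual}"
    for layer in layers
    for intervention in interventions
    for counterfactual in counterfactuals
]

print(len(columns))  # 32 * 2 * 2 = 128
print(columns[0])    # layer0_output_token_symbol_counterfactual

Note that in the hunk the display name keeps the original casing (e.g. randomLetter_counterfactual) while the dataclass field name is lower-cased via col_name.lower().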
src/leaderboard/read_evals.py CHANGED
@@ -187,46 +187,41 @@ class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method in MIB causalgraph."""
     eval_name: str
     method_name: str
-    results: Dict
-
+    results: Dict
+
     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
-
+
         method_name = data.get("method_name")
         results = {}
 
-        # First average across counterfactuals
-        for result in data.get("results", []):
-            model_id = result.get("model_id")
-            task_scores = result.get("task_scores", {})
-
-            model_results = {}
-            for task, scores in task_scores.items():
-                layer_scores = []
-                for layer_data in scores:
-                    layer = layer_data.get("layer")
-                    layer_scores_data = []
-
-                    for intervention_data in layer_data.get("layer_scores", []):
-                        # Average across counterfactuals
-                        avg_score = np.mean([cf['score'] for cf in intervention_data['counterfactual_scores']])
-                        if np.isnan(avg_score):
-                            avg_score = 0.0
-                        layer_scores_data.append({
-                            'intervention': intervention_data['intervention'][0],
-                            'score': avg_score
-                        })
-
-                    layer_scores.append({
-                        'layer': layer,
-                        'scores': layer_scores_data
-                    })
-
-                model_results[task] = layer_scores
-
-            results[model_id] = model_results
+        # Get results for each model
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")  # Will be one of the three models
+            task_scores = model_result.get("task_scores", {})
+
+            # Process MCQA task scores
+            mcqa_scores = {}
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                layer_scores = layer_data.get("layer_scores", [])
+
+                # Store scores for each intervention and counterfactual
+                for intervention_data in layer_scores:
+                    intervention = intervention_data["intervention"][0]
+                    counterfactual_scores = intervention_data["counterfactual_scores"]
+
+                    for cf_score in counterfactual_scores:
+                        counterfactual = cf_score["counterfactual"][0]
+                        score = cf_score["score"]
+
+                        # Create key for this combination
+                        key = f"layer{layer}_{intervention}_{counterfactual}"
+                        mcqa_scores[key] = score
+
+            results[model_id] = mcqa_scores
 
         return EvalResult_MIB_CAUSALGRAPH(
             eval_name=method_name,
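
The accessors in the new init_from_json_file imply a particular result-file layout: per-counterfactual scores are now kept rather than averaged away. The JSON below is an inferred sketch of that schema (values are illustrative, not from a real result file); the loop reproduces the parsing logic from the hunk and shows the flat keys it produces:

# Inferred result-file shape; all concrete values here are made up.
example = {
    "method_name": "example_method",
    "results": [
        {
            "model_id": "LlamaForCausalLM",
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.42},
                                ],
                            },
                        ],
                    },
                ],
            },
        },
    ],
}

results = {}
for model_result in example["results"]:
    mcqa_scores = {}
    for layer_data in model_result["task_scores"].get("MCQA", []):
        layer = layer_data["layer"]
        for intervention_data in layer_data["layer_scores"]:
            intervention = intervention_data["intervention"][0]
            for cf in intervention_data["counterfactual_scores"]:
                key = f"layer{layer}_{intervention}_{cf['counterfactual'][0]}"
                mcqa_scores[key] = cf["score"]
    results[model_result["model_id"]] = mcqa_scores

print(results)
# {'LlamaForCausalLM': {'layer0_output_token_symbol_counterfactual': 0.42}}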
src/populate.py CHANGED
@@ -133,37 +133,70 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
 
     # return averaged_df
 
+# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
+#     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
+#     df_copy = df.copy()
+
+#     # Store Method column if it exists
+#     method_col = None
+#     if 'Method' in df_copy.columns:
+#         method_col = df_copy['Method']
+#         df_copy = df_copy.drop('Method', axis=1)
+
+#     # Remove eval_name if present
+#     if 'eval_name' in df_copy.columns:
+#         df_copy = df_copy.drop('eval_name', axis=1)
+
+#     # Group columns by model_task
+#     model_task_groups = {}
+#     for col in df_copy.columns:
+#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
+#         if model_task not in model_task_groups:
+#             model_task_groups[model_task] = []
+#         model_task_groups[model_task].append(col)
+
+#     # Create new DataFrame with averaged intervention scores
+#     averaged_df = pd.DataFrame({
+#         model_task: df_copy[cols].mean(axis=1).round(3)
+#         for model_task, cols in model_task_groups.items()
+#     })
+
+#     # Add Method column back
+#     if method_col is not None:
+#         averaged_df.insert(0, 'Method', method_col)
+
+#     return averaged_df
+
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
     df_copy = df.copy()
 
-    # Store Method column if it exists
+    # Store Method column
    method_col = None
    if 'Method' in df_copy.columns:
        method_col = df_copy['Method']
        df_copy = df_copy.drop('Method', axis=1)
 
-    # Remove eval_name if present
    if 'eval_name' in df_copy.columns:
        df_copy = df_copy.drop('eval_name', axis=1)
 
-    # Group columns by model_task
-    model_task_groups = {}
-    for col in df_copy.columns:
-        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
-
-    # Create new DataFrame with averaged intervention scores
-    averaged_df = pd.DataFrame({
-        model_task: df_copy[cols].mean(axis=1).round(3)
-        for model_task, cols in model_task_groups.items()
-    })
-
-    # Add Method column back
+    # Group columns by model and task
+    result_cols = {}
+    for task in TasksMib_Causalgraph:
+        for model in task.value.models:  # Will iterate over all three models
+            model = model.lower()
+            for intervention in task.value.interventions:
+                col_name = f"{model}_{task.value.benchmark.lower()}_{intervention}"
+                matching_cols = [c for c in df_copy.columns if c.startswith(col_name)]
+                if matching_cols:
+                    result_cols[col_name] = matching_cols
+
+    averaged_df = pd.DataFrame()
    if method_col is not None:
-        averaged_df.insert(0, 'Method', method_col)
+        averaged_df['Method'] = method_col
+
+    for col_name, cols in result_cols.items():
+        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(3)
 
     return averaged_df
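
A quick check of the averaging behavior on a toy frame. This stand-alone version hard-codes the column groups that the function above derives from TasksMib_Causalgraph by prefix matching, so it runs without the repo's imports; the mean-and-round step is the same:

import pandas as pd

df = pd.DataFrame({
    "Method": ["method_a", "method_b"],
    "llamaforcausallm_mcqa_output_token_cf1": [0.8, 0.6],
    "llamaforcausallm_mcqa_output_token_cf2": [0.4, 0.2],
})

# Column groups create_intervention_averaged_df would build by prefix
# matching; hard-coded here for the toy example.
result_cols = {
    "llamaforcausallm_mcqa_output_token": [
        "llamaforcausallm_mcqa_output_token_cf1",
        "llamaforcausallm_mcqa_output_token_cf2",
    ],
}

averaged_df = pd.DataFrame()
averaged_df["Method"] = df["Method"]
for col_name, cols in result_cols.items():
    averaged_df[col_name] = df[cols].mean(axis=1).round(3)

print(averaged_df)
#      Method  llamaforcausallm_mcqa_output_token
# 0  method_a                                 0.6
# 1  method_b                                 0.4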