jasonshaoshun committed on
Commit
29701ab
·
1 Parent(s): f65df62
src/display/utils.py CHANGED
@@ -140,7 +140,30 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
140
  # ColumnContent(col_name, "number", True)
141
  # ])
142
 
143
- # In utils.py, modify auto_eval_column_dict_mib_causalgraph:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  auto_eval_column_dict_mib_causalgraph = []
145
 
146
  # Method name column
@@ -148,12 +171,12 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
148
 
149
  # For each model-task-intervention-counterfactual combination
150
  for task in TasksMib_Causalgraph:
151
- for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names
 
152
  for layer in task.value.layers:
153
  for intervention in task.value.interventions:
154
  for counterfactual in task.value.counterfactuals:
155
- # Match the exact format from the data
156
- col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
157
  auto_eval_column_dict_mib_causalgraph.append([
158
  col_name,
159
  ColumnContent,
 
140
  # ColumnContent(col_name, "number", True)
141
  # ])
142
 
143
+ # # In utils.py, modify auto_eval_column_dict_mib_causalgraph:
144
+ # auto_eval_column_dict_mib_causalgraph = []
145
+
146
+ # # Method name column
147
+ # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
148
+
149
+ # # For each model-task-intervention-counterfactual combination
150
+ # for task in TasksMib_Causalgraph:
151
+ # for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names
152
+ # for layer in task.value.layers:
153
+ # for intervention in task.value.interventions:
154
+ # for counterfactual in task.value.counterfactuals:
155
+ # # Match the exact format from the data
156
+ # col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
157
+ # auto_eval_column_dict_mib_causalgraph.append([
158
+ # col_name,
159
+ # ColumnContent,
160
+ # ColumnContent(col_name, "number", True)
161
+ # ])
162
+
163
+
164
+
165
+
166
+
167
  auto_eval_column_dict_mib_causalgraph = []
168
 
169
  # Method name column
 
171
 
172
  # For each model-task-intervention-counterfactual combination
173
  for task in TasksMib_Causalgraph:
174
+ for model in task.value.models: # Use exact model names from JSON
175
+ model_name = model # Don't convert to lowercase
176
  for layer in task.value.layers:
177
  for intervention in task.value.interventions:
178
  for counterfactual in task.value.counterfactuals:
179
+ col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}"
 
180
  auto_eval_column_dict_mib_causalgraph.append([
181
  col_name,
182
  ColumnContent,
src/leaderboard/read_evals.py CHANGED
@@ -182,52 +182,52 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
182
 
183
 
184
 
185
- @dataclass
186
- class EvalResult_MIB_CAUSALGRAPH:
187
- """Represents one full evaluation for a method in MIB causalgraph."""
188
- eval_name: str
189
- method_name: str
190
- results: Dict
191
-
192
- def init_from_json_file(self, json_filepath):
193
- """Inits results from the method result file"""
194
- with open(json_filepath) as fp:
195
- data = json.load(fp)
196
-
197
- method_name = data.get("method_name")
198
- results = {}
199
 
200
- # Get results for each model
201
- for model_result in data.get("results", []):
202
- model_id = model_result.get("model_id", "") # Will be one of the three models
203
- task_scores = model_result.get("task_scores", {})
204
 
205
- # Process MCQA task scores
206
- mcqa_scores = {}
207
- for layer_data in task_scores.get("MCQA", []):
208
- layer = layer_data.get("layer")
209
- layer_scores = layer_data.get("layer_scores", [])
210
 
211
- # Store scores for each intervention and counterfactual
212
- for intervention_data in layer_scores:
213
- intervention = intervention_data["intervention"][0]
214
- counterfactual_scores = intervention_data["counterfactual_scores"]
215
 
216
- for cf_score in counterfactual_scores:
217
- counterfactual = cf_score["counterfactual"][0]
218
- score = cf_score["score"]
219
 
220
- # Create key for this combination
221
- key = f"layer{layer}_{intervention}_{counterfactual}"
222
- mcqa_scores[key] = score
223
 
224
- results[model_id] = mcqa_scores
225
 
226
- return EvalResult_MIB_CAUSALGRAPH(
227
- eval_name=method_name,
228
- method_name=method_name,
229
- results=results
230
- )
231
 
232
  # def to_dict(self):
233
  # """Converts the Eval Result to a dict for dataframe display"""
@@ -308,24 +308,90 @@ class EvalResult_MIB_CAUSALGRAPH:
308
  # data_dict[col_name] = intervention_data['score']
309
 
310
  # return data_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  def to_dict(self):
312
  """Converts the Eval Result to a dict for dataframe display"""
313
  data_dict = {
314
  "eval_name": self.eval_name,
315
  "Method": self.method_name,
316
  }
317
-
318
- # Process each model's results
319
- for model_id, scores in self.results.items():
320
- model_name = model_id.lower()
321
- # The scores are already in the format we want
322
- for key, value in scores.items():
323
- col_name = f"{model_name}_{key}"
324
- data_dict[col_name] = value
325
-
326
  return data_dict
327
 
328
 
 
 
 
 
 
 
 
329
  # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
330
  # """Extract evaluation results for MIB causalgraph"""
331
  # model_result_filepaths = []
 
182
 
183
 
184
 
185
+ # @dataclass
186
+ # class EvalResult_MIB_CAUSALGRAPH:
187
+ # """Represents one full evaluation for a method in MIB causalgraph."""
188
+ # eval_name: str
189
+ # method_name: str
190
+ # results: Dict
191
+
192
+ # def init_from_json_file(self, json_filepath):
193
+ # """Inits results from the method result file"""
194
+ # with open(json_filepath) as fp:
195
+ # data = json.load(fp)
196
+
197
+ # method_name = data.get("method_name")
198
+ # results = {}
199
 
200
+ # # Get results for each model
201
+ # for model_result in data.get("results", []):
202
+ # model_id = model_result.get("model_id", "") # Will be one of the three models
203
+ # task_scores = model_result.get("task_scores", {})
204
 
205
+ # # Process MCQA task scores
206
+ # mcqa_scores = {}
207
+ # for layer_data in task_scores.get("MCQA", []):
208
+ # layer = layer_data.get("layer")
209
+ # layer_scores = layer_data.get("layer_scores", [])
210
 
211
+ # # Store scores for each intervention and counterfactual
212
+ # for intervention_data in layer_scores:
213
+ # intervention = intervention_data["intervention"][0]
214
+ # counterfactual_scores = intervention_data["counterfactual_scores"]
215
 
216
+ # for cf_score in counterfactual_scores:
217
+ # counterfactual = cf_score["counterfactual"][0]
218
+ # score = cf_score["score"]
219
 
220
+ # # Create key for this combination
221
+ # key = f"layer{layer}_{intervention}_{counterfactual}"
222
+ # mcqa_scores[key] = score
223
 
224
+ # results[model_id] = mcqa_scores
225
 
226
+ # return EvalResult_MIB_CAUSALGRAPH(
227
+ # eval_name=method_name,
228
+ # method_name=method_name,
229
+ # results=results
230
+ # )
231
 
232
  # def to_dict(self):
233
  # """Converts the Eval Result to a dict for dataframe display"""
 
308
  # data_dict[col_name] = intervention_data['score']
309
 
310
  # return data_dict
311
+
312
+
313
+ # def to_dict(self):
314
+ # """Converts the Eval Result to a dict for dataframe display"""
315
+ # data_dict = {
316
+ # "eval_name": self.eval_name,
317
+ # "Method": self.method_name,
318
+ # }
319
+
320
+ # # Process each model's results
321
+ # for model_id, scores in self.results.items():
322
+ # model_name = model_id.lower()
323
+ # # The scores are already in the format we want
324
+ # for key, value in scores.items():
325
+ # col_name = f"{model_name}_{key}"
326
+ # data_dict[col_name] = value
327
+
328
+ # return data_dict
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+ @dataclass
338
+ class EvalResult_MIB_CAUSALGRAPH:
339
+ eval_name: str
340
+ method_name: str
341
+ results: Dict
342
+
343
+ def init_from_json_file(self, json_filepath):
344
+ """Inits results from the method result file"""
345
+ with open(json_filepath) as fp:
346
+ data = json.load(fp)
347
+
348
+ method_name = data.get("method_name")
349
+ results = {}
350
+
351
+ # Process each model's results
352
+ for model_result in data.get("results", []):
353
+ model_id = model_result.get("model_id", "")
354
+ task_scores = model_result.get("task_scores", {})
355
+
356
+ # Process MCQA scores
357
+ for layer_data in task_scores.get("MCQA", []):
358
+ layer = layer_data.get("layer")
359
+ for score_data in layer_data.get("layer_scores", []):
360
+ intervention = score_data["intervention"][0]
361
+ for cf_score in score_data["counterfactual_scores"]:
362
+ counterfactual = cf_score["counterfactual"][0]
363
+ score = cf_score["score"]
364
+
365
+ # Create key matching the expected column format
366
+ key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
367
+ results[key] = score
368
+
369
+ return EvalResult_MIB_CAUSALGRAPH(
370
+ eval_name=method_name,
371
+ method_name=method_name,
372
+ results=results
373
+ )
374
+
375
  def to_dict(self):
376
  """Converts the Eval Result to a dict for dataframe display"""
377
  data_dict = {
378
  "eval_name": self.eval_name,
379
  "Method": self.method_name,
380
  }
381
+
382
+ # Add all results directly
383
+ data_dict.update(self.results)
384
+
 
 
 
 
 
385
  return data_dict
386
 
387
 
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
  # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
396
  # """Extract evaluation results for MIB causalgraph"""
397
  # model_result_filepaths = []
src/populate.py CHANGED
@@ -221,6 +221,25 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
221
  # # Only return detailed_df for display
222
  # return detailed_df
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
225
  print(f"results_path is {results_path}, requests_path is {requests_path}")
226
  raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
@@ -228,7 +247,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
228
  # Convert each result to dict format for detailed df
229
  all_data_json = [v.to_dict() for v in raw_data]
230
  detailed_df = pd.DataFrame.from_records(all_data_json)
231
- print("Columns in detailed_df:", detailed_df.columns.tolist()) # Print actual columns
232
 
233
  # Create aggregated df
234
  aggregated_df = aggregate_methods(detailed_df)
 
221
  # # Only return detailed_df for display
222
  # return detailed_df
223
 
224
+ # def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
225
+ # print(f"results_path is {results_path}, requests_path is {requests_path}")
226
+ # raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
227
+
228
+ # # Convert each result to dict format for detailed df
229
+ # all_data_json = [v.to_dict() for v in raw_data]
230
+ # detailed_df = pd.DataFrame.from_records(all_data_json)
231
+ # print("Columns in detailed_df:", detailed_df.columns.tolist()) # Print actual columns
232
+
233
+ # # Create aggregated df
234
+ # aggregated_df = aggregate_methods(detailed_df)
235
+ # print("Columns in aggregated_df:", aggregated_df.columns.tolist())
236
+
237
+ # # Create intervention-averaged df
238
+ # intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
239
+ # print("Columns in intervention_averaged_df:", intervention_averaged_df.columns.tolist())
240
+
241
+ # return detailed_df, aggregated_df, intervention_averaged_df
242
+
243
  def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
244
  print(f"results_path is {results_path}, requests_path is {requests_path}")
245
  raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
 
247
  # Convert each result to dict format for detailed df
248
  all_data_json = [v.to_dict() for v in raw_data]
249
  detailed_df = pd.DataFrame.from_records(all_data_json)
250
+ print("Columns in detailed_df:", detailed_df.columns.tolist())
251
 
252
  # Create aggregated df
253
  aggregated_df = aggregate_methods(detailed_df)