jasonshaoshun committed
Commit 0ae72a8 · 1 Parent(s): 89390c2

causal-track debug

Files changed (1)
  1. src/leaderboard/read_evals.py +85 -91
src/leaderboard/read_evals.py CHANGED
@@ -174,103 +174,100 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 
 
 
-# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
-#     """
-#     Process a single JSON file and convert it to a DataFrame.
-
-#     Args:
-#         json_file: Dictionary containing the analysis results
-#         method_counter: Counter for handling duplicate method names
-
-#     Returns:
-#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
-#     """
-#     method_name = json_file['method_name']
-#     unique_method_name = f"{method_name}_{method_counter}"
-#     method_scores = []
-
-#     for result in json_file['results']:
-#         model = result['model_id']
-
-#         for task, scores in result['task_scores'].items():
-#             # Process each layer's data
-#             intervention_scores = defaultdict(list)
-
-#             for layer_data in scores:
-#                 for intervention_data in layer_data['layer_scores']:
-#                     # Calculate average score for counterfactuals
-#                     avg_cf_score = np.mean([
-#                         cf['score']
-#                         for cf in intervention_data['counterfactual_scores']
-#                     ])
-
-#                     if np.isnan(avg_cf_score):
-#                         avg_cf_score = 0.0
-
-#                     # Group scores by intervention
-#                     intervention_key = '_'.join(intervention_data['intervention'])
-#                     intervention_scores[intervention_key].append(avg_cf_score)
-
-#             # Average across layers for each intervention
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-
-#     # Sort by column names for consistency
-#     method_scores.sort(key=lambda x: x[0])
-#     data = {
-#         unique_method_name: {
-#             col: score for col, score in method_scores
-#         }
-#     }
-
-#     return pd.DataFrame.from_dict(data, orient='index')
-
-# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
-#     model_result_filepaths = []
-
-#     # print(f"Scanning directory: {results_path}")
-#     for root, dirnames, files in os.walk(results_path):
-#         # print(f"Current directory: {root}")
-#         # print(f"Found files: {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         try:
-#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-#         except dateutil.parser._parser.ParserError:
-#             files = [files[-1]]
-
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     # print(f"Found json files: {model_result_filepaths}")
-
-#     method_counters = defaultdict(int)
-#     dataframes = []
-
-#     for json_file in model_result_filepaths:
-#         try:
-#             with open(filepath, 'r') as f:
-#                 json_data = json.load(f)
-#             method_name = json_data['method_name']
-#             method_counters[method_name] += 1
-
-#             # Process single JSON file
-#             df = process_single_json(json_data, method_counters[method_name])
-#             dataframes.append(df)
-#         except Exception as e:
-#             print(f"Error processing {json_file}: {e}")
-#             continue
-
-#     # # Concatenate all DataFrames
-#     # if dataframes:
-#     #     final_df = pd.concat(dataframes, axis=0)
-#     #     return final_df
-#     # else:
-#     #     return pd.DataFrame()
-#     return dataframes
+def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
+    """
+    Process a single JSON file and convert it to a DataFrame.
+
+    Args:
+        json_file: Dictionary containing the analysis results
+        method_counter: Counter for handling duplicate method names
+
+    Returns:
+        pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
+    """
+    method_name = json_file['method_name']
+    unique_method_name = f"{method_name}_{method_counter}"
+    method_scores = []
+
+    for result in json_file['results']:
+        model = result['model_id']
+
+        for task, scores in result['task_scores'].items():
+            # Process each layer's data
+            intervention_scores = defaultdict(list)
+
+            for layer_data in scores:
+                for intervention_data in layer_data['layer_scores']:
+                    # Calculate average score for counterfactuals
+                    avg_cf_score = np.mean([
+                        cf['score']
+                        for cf in intervention_data['counterfactual_scores']
+                    ])
+
+                    if np.isnan(avg_cf_score):
+                        avg_cf_score = 0.0
+
+                    # Group scores by intervention
+                    intervention_key = '_'.join(intervention_data['intervention'])
+                    intervention_scores[intervention_key].append(avg_cf_score)
+
+            # Average across layers for each intervention
+            for intervention, layer_scores in intervention_scores.items():
+                column = f"{model}_{task}_{intervention}"
+                avg_score = np.mean(layer_scores) if layer_scores else 0.0
+                method_scores.append((column, f"{avg_score:.3f}"))
+
+    # Sort by column names for consistency
+    method_scores.sort(key=lambda x: x[0])
+    data = {
+        unique_method_name: {
+            col: score for col, score in method_scores
+        }
+    }
+
+    return pd.DataFrame.from_dict(data, orient='index')
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    model_result_filepaths = []
+
+    # print(f"Scanning directory: {results_path}")
+    for root, dirnames, files in os.walk(results_path):
+        # print(f"Current directory: {root}")
+        # print(f"Found files: {files}")
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    # print(f"Found json files: {model_result_filepaths}")
+
+    method_counters = defaultdict(int)
+    dataframes = []
+
+    for json_file in model_result_filepaths:
+        try:
+            with open(filepath, 'r') as f:
+                json_data = json.load(f)
+            method_name = json_data['method_name']
+            method_counters[method_name] += 1
+
+            # Process single JSON file
+            df = process_single_json(json_data, method_counters[method_name])
+            dataframes.append(df)
+        except Exception as e:
+            print(f"Error processing {json_file}: {e}")
+            continue
+
+    return dataframes
+
+
+
 
 from dataclasses import dataclass
 import json
@@ -285,7 +282,6 @@ from collections import defaultdict
 @dataclass
 class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method across all models in MIB for causal graph track."""
-    eval_name: str # method name as identifier
     method_name: str # name of the interpretation method
     results: Dict # nested dict of results for each model and task
 
@@ -298,7 +294,7 @@ class EvalResult_MIB_CAUSALGRAPH:
 
         # Initialize results dictionary
         results = {}
-        for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
+        for task in ["MCQA"]:
             results[task] = {}
 
         # Process each model's results
@@ -330,7 +326,6 @@ class EvalResult_MIB_CAUSALGRAPH:
         }
 
         return EvalResult_MIB_CAUSALGRAPH(
-            eval_name=method_name,
             method_name=method_name,
             results=results
         )
@@ -338,7 +333,6 @@ class EvalResult_MIB_CAUSALGRAPH:
     def to_dict(self, metric_type="average"):
        """Converts the Eval Result to a dict for dataframe display"""
        data_dict = {
-            "eval_name": self.eval_name,
            "Method": self.method_name,
        }
 
@@ -359,7 +353,7 @@ class EvalResult_MIB_CAUSALGRAPH:
            data_dict[col_name] = f"{avg_score:.3f}"
            all_scores.append(avg_score)
 
-        data_dict["Average"] = f"{np.mean(all_scores):.3f}" if all_scores else '-'
+        data_dict["Average"] = f"{np.mean(all_scores):.3f}"
        return data_dict
 
 
@@ -480,8 +474,8 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
    # Create the detailed DataFrame
    detailed_df = pd.DataFrame(data_dicts)
    detailed_df.set_index("Method", inplace=True)
-    if "eval_name" in detailed_df.columns:
-        detailed_df.drop("eval_name", axis=1, inplace=True)
+    # if "eval_name" in detailed_df.columns:
+    #     detailed_df.drop("eval_name", axis=1, inplace=True)
 
    # Create aggregated DataFrame
    aggregated_df = aggregate_methods(detailed_df)
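
For reference, below is a minimal sketch of the input that the re-enabled process_single_json appears to expect, inferred from the keys it reads (method_name, results, model_id, task_scores, layer_scores, intervention, counterfactual_scores, score). The model id, task name, intervention and counterfactual names, scores, and the import path are illustrative assumptions, not values taken from this commit; the sketch also feeds an already-loaded dict straight into process_single_json, since the committed loop in get_raw_eval_results_mib_causalgraph opens filepath while iterating json_file.

# Hypothetical input for process_single_json; the schema is inferred from the
# keys the function reads, and every concrete value below is made up.
from src.leaderboard.read_evals import process_single_json  # assumed import path

sample_json = {
    "method_name": "example_method",
    "results": [
        {
            "model_id": "example-model",
            "task_scores": {
                "MCQA": [  # one entry per layer
                    {
                        "layer_scores": [
                            {
                                "intervention": ["example_intervention"],
                                "counterfactual_scores": [
                                    {"counterfactual": "cf_a", "score": 0.41},
                                    {"counterfactual": "cf_b", "score": 0.37},
                                ],
                            }
                        ]
                    }
                ]
            },
        }
    ],
}

# Yields a one-row DataFrame indexed "example_method_1" with a single
# "example-model_MCQA_example_intervention" column holding the layer-averaged
# counterfactual score, formatted as "0.390".
df = process_single_json(sample_json, method_counter=1)
print(df)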