jasonshaoshun commited on
Commit
691f4a8
·
1 Parent(s): 0ae72a8

causal-track debug

Browse files
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +91 -95
src/leaderboard/read_evals.py CHANGED
@@ -174,97 +174,97 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
174
 
175
 
176
 
177
- def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
178
- """
179
- Process a single JSON file and convert it to a DataFrame.
180
 
181
- Args:
182
- json_file: Dictionary containing the analysis results
183
- method_counter: Counter for handling duplicate method names
184
 
185
- Returns:
186
- pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
187
- """
188
- method_name = json_file['method_name']
189
- unique_method_name = f"{method_name}_{method_counter}"
190
- method_scores = []
191
-
192
- for result in json_file['results']:
193
- model = result['model_id']
194
 
195
- for task, scores in result['task_scores'].items():
196
- # Process each layer's data
197
- intervention_scores = defaultdict(list)
198
 
199
- for layer_data in scores:
200
- for intervention_data in layer_data['layer_scores']:
201
- # Calculate average score for counterfactuals
202
- avg_cf_score = np.mean([
203
- cf['score']
204
- for cf in intervention_data['counterfactual_scores']
205
- ])
206
 
207
- if np.isnan(avg_cf_score):
208
- avg_cf_score = 0.0
209
 
210
- # Group scores by intervention
211
- intervention_key = '_'.join(intervention_data['intervention'])
212
- intervention_scores[intervention_key].append(avg_cf_score)
213
 
214
- # Average across layers for each intervention
215
- for intervention, layer_scores in intervention_scores.items():
216
- column = f"{model}_{task}_{intervention}"
217
- avg_score = np.mean(layer_scores) if layer_scores else 0.0
218
- method_scores.append((column, f"{avg_score:.3f}"))
219
-
220
- # Sort by column names for consistency
221
- method_scores.sort(key=lambda x: x[0])
222
- data = {
223
- unique_method_name: {
224
- col: score for col, score in method_scores
225
- }
226
- }
227
-
228
- return pd.DataFrame.from_dict(data, orient='index')
229
-
230
- def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
231
- model_result_filepaths = []
232
-
233
- # print(f"Scanning directory: {results_path}")
234
- for root, dirnames, files in os.walk(results_path):
235
- # print(f"Current directory: {root}")
236
- # print(f"Found files: {files}")
237
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
238
- continue
239
 
240
- try:
241
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
242
- except dateutil.parser._parser.ParserError:
243
- files = [files[-1]]
244
 
245
- for file in files:
246
- model_result_filepaths.append(os.path.join(root, file))
247
 
248
- # print(f"Found json files: {model_result_filepaths}")
249
 
250
- method_counters = defaultdict(int)
251
- dataframes = []
252
 
253
- for json_file in model_result_filepaths:
254
- try:
255
- with open(filepath, 'r') as f:
256
- json_data = json.load(f)
257
- method_name = json_data['method_name']
258
- method_counters[method_name] += 1
259
 
260
- # Process single JSON file
261
- df = process_single_json(json_data, method_counters[method_name])
262
- dataframes.append(df)
263
- except Exception as e:
264
- print(f"Error processing {json_file}: {e}")
265
- continue
266
 
267
- return dataframes
268
 
269
 
270
 
@@ -449,24 +449,20 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
449
  data_dicts = []
450
 
451
  for filepath in model_result_filepaths:
452
- try:
453
- with open(filepath, 'r') as f:
454
- json_data = json.load(f)
455
- method_name = json_data['method_name']
456
- method_counters[method_name] += 1
457
-
458
- eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
459
- result = eval_result.init_from_json_file(filepath)
460
- data_dict = result.to_dict()
461
-
462
- # Add method counter to the method name if it's not the first instance
463
- if method_counters[method_name] > 1:
464
- data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
465
-
466
- data_dicts.append(data_dict)
467
- except Exception as e:
468
- print(f"Error processing {filepath}: {e}")
469
- continue
470
 
471
  if not data_dicts:
472
  return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
 
174
 
175
 
176
 
177
+ # def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
178
+ # """
179
+ # Process a single JSON file and convert it to a DataFrame.
180
 
181
+ # Args:
182
+ # json_file: Dictionary containing the analysis results
183
+ # method_counter: Counter for handling duplicate method names
184
 
185
+ # Returns:
186
+ # pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
187
+ # """
188
+ # method_name = json_file['method_name']
189
+ # unique_method_name = f"{method_name}_{method_counter}"
190
+ # method_scores = []
191
+
192
+ # for result in json_file['results']:
193
+ # model = result['model_id']
194
 
195
+ # for task, scores in result['task_scores'].items():
196
+ # # Process each layer's data
197
+ # intervention_scores = defaultdict(list)
198
 
199
+ # for layer_data in scores:
200
+ # for intervention_data in layer_data['layer_scores']:
201
+ # # Calculate average score for counterfactuals
202
+ # avg_cf_score = np.mean([
203
+ # cf['score']
204
+ # for cf in intervention_data['counterfactual_scores']
205
+ # ])
206
 
207
+ # if np.isnan(avg_cf_score):
208
+ # avg_cf_score = 0.0
209
 
210
+ # # Group scores by intervention
211
+ # intervention_key = '_'.join(intervention_data['intervention'])
212
+ # intervention_scores[intervention_key].append(avg_cf_score)
213
 
214
+ # # Average across layers for each intervention
215
+ # for intervention, layer_scores in intervention_scores.items():
216
+ # column = f"{model}_{task}_{intervention}"
217
+ # avg_score = np.mean(layer_scores) if layer_scores else 0.0
218
+ # method_scores.append((column, f"{avg_score:.3f}"))
219
+
220
+ # # Sort by column names for consistency
221
+ # method_scores.sort(key=lambda x: x[0])
222
+ # data = {
223
+ # unique_method_name: {
224
+ # col: score for col, score in method_scores
225
+ # }
226
+ # }
227
+
228
+ # return pd.DataFrame.from_dict(data, orient='index')
229
+
230
+ # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
231
+ # model_result_filepaths = []
232
+
233
+ # # print(f"Scanning directory: {results_path}")
234
+ # for root, dirnames, files in os.walk(results_path):
235
+ # # print(f"Current directory: {root}")
236
+ # # print(f"Found files: {files}")
237
+ # if len(files) == 0 or any([not f.endswith(".json") for f in files]):
238
+ # continue
239
 
240
+ # try:
241
+ # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
242
+ # except dateutil.parser._parser.ParserError:
243
+ # files = [files[-1]]
244
 
245
+ # for file in files:
246
+ # model_result_filepaths.append(os.path.join(root, file))
247
 
248
+ # # print(f"Found json files: {model_result_filepaths}")
249
 
250
+ # method_counters = defaultdict(int)
251
+ # dataframes = []
252
 
253
+ # for json_file in model_result_filepaths:
254
+ # try:
255
+ # with open(filepath, 'r') as f:
256
+ # json_data = json.load(f)
257
+ # method_name = json_data['method_name']
258
+ # method_counters[method_name] += 1
259
 
260
+ # # Process single JSON file
261
+ # df = process_single_json(json_data, method_counters[method_name])
262
+ # dataframes.append(df)
263
+ # except Exception as e:
264
+ # print(f"Error processing {json_file}: {e}")
265
+ # continue
266
 
267
+ # return dataframes
268
 
269
 
270
 
 
449
  data_dicts = []
450
 
451
  for filepath in model_result_filepaths:
452
+ with open(filepath, 'r') as f:
453
+ json_data = json.load(f)
454
+ method_name = json_data['method_name']
455
+ method_counters[method_name] += 1
456
+
457
+ eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
458
+ result = eval_result.init_from_json_file(filepath)
459
+ data_dict = result.to_dict()
460
+
461
+ # Add method counter to the method name if it's not the first instance
462
+ if method_counters[method_name] > 1:
463
+ data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
464
+
465
+ data_dicts.append(data_dict)
 
 
 
 
466
 
467
  if not data_dicts:
468
  return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()