jasonshaoshun committed on
Commit
89390c2
·
1 Parent(s): 5f51841

causal-track debug

Browse files
Files changed (2) hide show
  1. src/leaderboard/read_evals.py +108 -13
  2. src/populate.py +11 -9
src/leaderboard/read_evals.py CHANGED
@@ -275,10 +275,12 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
275
  from dataclasses import dataclass
276
  import json
277
  import numpy as np
 
278
  from typing import Dict, List, Any
279
  import os
280
  from datetime import datetime
281
  import dateutil
 
282
 
283
  @dataclass
284
  class EvalResult_MIB_CAUSALGRAPH:
@@ -354,14 +356,86 @@ class EvalResult_MIB_CAUSALGRAPH:
354
  continue
355
 
356
  avg_score = np.mean(scores)
357
- data_dict[col_name] = round(avg_score, 3)
358
  all_scores.append(avg_score)
359
 
360
- data_dict["Average"] = round(np.mean(all_scores), 3) if all_scores else '-'
361
  return data_dict
362
 
363
 
364
- def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  """From the path of the results folder root, extract all needed info for MIB causal graph results"""
366
  model_result_filepaths = []
367
 
@@ -377,24 +451,45 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
377
  for file in files:
378
  model_result_filepaths.append(os.path.join(root, file))
379
 
380
- eval_results = []
381
- for model_result_filepath in model_result_filepaths:
 
 
382
  try:
383
- eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {}) # Create empty instance
384
- result = eval_result.init_from_json_file(model_result_filepath)
385
- # Verify the result can be converted to dict format
386
- result.to_dict()
387
- eval_results.append(result)
 
 
 
 
 
 
 
 
 
388
  except Exception as e:
389
- print(f"Error processing {model_result_filepath}: {e}")
390
  continue
391
 
392
- return eval_results
393
-
394
 
 
 
 
 
 
395
 
 
 
396
 
 
 
397
 
 
398
 
399
 
400
 
 
275
  from dataclasses import dataclass
276
  import json
277
  import numpy as np
278
+ import pandas as pd
279
  from typing import Dict, List, Any
280
  import os
281
  from datetime import datetime
282
  import dateutil
283
+ from collections import defaultdict
284
 
285
  @dataclass
286
  class EvalResult_MIB_CAUSALGRAPH:
 
356
  continue
357
 
358
  avg_score = np.mean(scores)
359
+ data_dict[col_name] = f"{avg_score:.3f}"
360
  all_scores.append(avg_score)
361
 
362
+ data_dict["Average"] = f"{np.mean(all_scores):.3f}" if all_scores else '-'
363
  return data_dict
364
 
365
 
366
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse repeated runs of a method into one row holding its best scores.

    An index label that ends in a numeric ``_<n>`` suffix (for example
    ``"my_method_2"``) is treated as another run of the label without that
    suffix (``"my_method"``).  All runs of the same base method are merged by
    taking the column-wise maximum.

    Args:
        df: Frame indexed by method name.  Cells are normally scores formatted
            as strings (e.g. ``"0.123"``); plain numbers are accepted too.
            Cells that cannot be read as a number (missing values, the ``'-'``
            placeholder) count as ``0.0``.

    Returns:
        A new frame indexed by base method name whose cells are the per-column
        maxima, rounded to three decimals and formatted as ``"x.xxx"`` strings.
        ``df`` itself is left untouched.
    """

    def base_name(name):
        # Only the trailing "_<digits>" counter is removed.  Splitting on the
        # *last* underscore keeps method names that themselves contain
        # underscores intact ("my_method_2" -> "my_method", not "my").
        if not isinstance(name, str):
            return name
        stem, sep, suffix = name.rpartition('_')
        return stem if sep and suffix.isdigit() else name

    def to_score(value):
        # Unparseable or missing cells are scored 0.0 so they can never win
        # the max() below; real numbers and numeric strings keep their value.
        try:
            score = float(value)
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if pd.isna(score) else score

    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    numeric_df = df.copy().apply(lambda column: column.map(to_score))

    # Relabel rows with their base method name, then keep the best run.
    numeric_df.index = [base_name(name) for name in numeric_df.index]
    best_df = numeric_df.groupby(level=0).max().round(3)

    # Convert back to the string format used by the rest of the leaderboard.
    return best_df.apply(lambda column: column.map(lambda score: f"{score:.3f}"))
393
+
394
+
395
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """Average scores over the columns that share a ``model_task`` prefix.

    Every column name is split on ``'_'`` and its first two parts, re-joined
    with ``'_'``, form the group key.  Columns with the same key are averaged
    row by row into a single column named after that key.

    Args:
        df: Frame of scores, one row per method.  Cells are normally scores
            formatted as strings (e.g. ``"0.123"``); plain numbers are
            accepted too.  Cells that cannot be read as a number (missing
            values, a ``'-'`` placeholder) count as ``0.0``.  An existing
            ``'Average'`` column is ignored and recomputed.

    Returns:
        A new numeric frame with one column per group key plus an
        ``'Average'`` column (row mean of the group columns), all rounded to
        three decimals, with rows sorted by ``'Average'`` in descending order.
        ``df`` itself is left untouched.
    """

    def to_score(value):
        # Unparseable or missing cells are scored 0.0; real numbers and
        # numeric strings keep their value.
        try:
            score = float(value)
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if pd.isna(score) else score

    # The incoming 'Average' is stale once columns are regrouped, so drop it.
    if 'Average' in df.columns:
        scores_df = df.drop(columns='Average')
    else:
        scores_df = df.copy()

    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    numeric_df = scores_df.apply(lambda column: column.map(to_score))

    # Bucket the columns by the first two '_'-separated parts of their name.
    model_task_groups = {}
    for col in numeric_df.columns:
        model_task = '_'.join(col.split('_')[:2])
        model_task_groups.setdefault(model_task, []).append(col)

    # One averaged column per bucket.
    averaged_df = pd.DataFrame({
        model_task: numeric_df[cols].mean(axis=1).round(3)
        for model_task, cols in model_task_groups.items()
    })

    # Overall average across the bucket columns, then rank best-first.
    averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
    return averaged_df.sort_values('Average', ascending=False)
436
+
437
+
438
+ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
439
  """From the path of the results folder root, extract all needed info for MIB causal graph results"""
440
  model_result_filepaths = []
441
 
 
451
  for file in files:
452
  model_result_filepaths.append(os.path.join(root, file))
453
 
454
+ method_counters = defaultdict(int)
455
+ data_dicts = []
456
+
457
+ for filepath in model_result_filepaths:
458
  try:
459
+ with open(filepath, 'r') as f:
460
+ json_data = json.load(f)
461
+ method_name = json_data['method_name']
462
+ method_counters[method_name] += 1
463
+
464
+ eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
465
+ result = eval_result.init_from_json_file(filepath)
466
+ data_dict = result.to_dict()
467
+
468
+ # Add method counter to the method name if it's not the first instance
469
+ if method_counters[method_name] > 1:
470
+ data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"
471
+
472
+ data_dicts.append(data_dict)
473
  except Exception as e:
474
+ print(f"Error processing {filepath}: {e}")
475
  continue
476
 
477
+ if not data_dicts:
478
+ return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
479
 
480
+ # Create the detailed DataFrame
481
+ detailed_df = pd.DataFrame(data_dicts)
482
+ detailed_df.set_index("Method", inplace=True)
483
+ if "eval_name" in detailed_df.columns:
484
+ detailed_df.drop("eval_name", axis=1, inplace=True)
485
 
486
+ # Create aggregated DataFrame
487
+ aggregated_df = aggregate_methods(detailed_df)
488
 
489
+ # Create intervention-averaged DataFrame
490
+ intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
491
 
492
+ return detailed_df, aggregated_df, intervention_averaged_df
493
 
494
 
495
 
src/populate.py CHANGED
@@ -127,14 +127,16 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
127
  def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
128
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
129
 
130
- raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
131
- all_data_json = [v.to_dict() for v in raw_data]
 
132
  detailed_df = pd.DataFrame.from_records(all_data_json)
133
 
134
- # detailed_df = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
 
135
 
136
- # Print the actual columns for debugging
137
- print("Original columns:", detailed_df.columns.tolist())
138
 
139
  # # Rename columns to match schema
140
  # column_mapping = {}
@@ -149,11 +151,11 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) ->
149
 
150
  # detailed_df = detailed_df.rename(columns=column_mapping)
151
 
152
- # Create aggregated df
153
- aggregated_df = aggregate_methods(detailed_df)
154
 
155
- # Create intervention-averaged df
156
- intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
157
 
158
  # print("Transformed columns:", detailed_df.columns.tolist())
159
 
 
127
  def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
128
  # print(f"results_path is {results_path}, requests_path is {requests_path}")
129
 
130
+ raw_detailed_df, raw_aggregated_df, raw_intervention_averaged_df = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
131
+
132
+ all_data_json = [v.to_dict() for v in raw_detailed_df]
133
  detailed_df = pd.DataFrame.from_records(all_data_json)
134
 
135
+ all_data_json = [v.to_dict() for v in raw_aggregated_df]
136
+ aggregated_df = pd.DataFrame.from_records(all_data_json)
137
 
138
+ all_data_json = [v.to_dict() for v in raw_intervention_averaged_df]
139
+ intervention_averaged_df = pd.DataFrame.from_records(all_data_json)
140
 
141
  # # Rename columns to match schema
142
  # column_mapping = {}
 
151
 
152
  # detailed_df = detailed_df.rename(columns=column_mapping)
153
 
154
+ # # Create aggregated df
155
+ # aggregated_df = aggregate_methods(detailed_df)
156
 
157
+ # # Create intervention-averaged df
158
+ # intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
159
 
160
  # print("Transformed columns:", detailed_df.columns.tolist())
161