jasonshaoshun committed on
Commit
0ab8298
·
1 Parent(s): 6ea8a3e

causal-track debug

Browse files
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +99 -25
src/leaderboard/read_evals.py CHANGED
@@ -357,38 +357,112 @@ class EvalResult_MIB_CAUSALGRAPH:
357
  return data_dict
358
 
359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
361
  """
362
  Aggregates rows with the same base method name by taking the max value for each column.
 
363
  """
364
  # Create a copy of the DataFrame
365
  df_copy = df.copy()
366
 
367
  # Extract base method names (remove _2, _3, etc. suffixes)
368
  base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
369
- else name for name in df_copy.index]
370
- df_copy.index = base_methods
371
 
372
  # Convert scores to numeric values
373
- def extract_score(score_str):
374
- if isinstance(score_str, str):
375
- return float(score_str)
376
- return 0.0
377
-
378
- numeric_df = df_copy.applymap(extract_score)
379
 
380
  # Group by base method name and take the max
381
- aggregated_df = numeric_df.groupby(level=0).max().round(3)
 
 
 
 
382
 
383
  # Convert back to string format
384
- aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
 
385
 
386
  return aggregated_df
387
 
388
-
389
  def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
390
  """
391
  Creates a DataFrame where columns are model_task and cells are averaged over interventions.
 
392
  """
393
  # Create a copy of the DataFrame
394
  df_copy = df.copy()
@@ -397,31 +471,31 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
397
  if 'Average' in df_copy.columns:
398
  df_copy = df_copy.drop('Average', axis=1)
399
 
400
- # Function to extract score value from string
401
- def extract_score(score_str):
402
- if isinstance(score_str, str):
403
- return float(score_str)
404
- return 0.0
405
 
406
  # Convert all scores to numeric values
407
- numeric_df = df_copy.applymap(extract_score)
 
408
 
409
  # Group columns by model_task
410
  model_task_groups = {}
411
- for col in numeric_df.columns:
412
  model_task = '_'.join(col.split('_')[:2]) # Get model_task part
413
  if model_task not in model_task_groups:
414
  model_task_groups[model_task] = []
415
  model_task_groups[model_task].append(col)
416
 
417
- # Create new DataFrame with averaged intervention scores
418
- averaged_df = pd.DataFrame({
419
- model_task: numeric_df[cols].mean(axis=1).round(3)
420
- for model_task, cols in model_task_groups.items()
421
- })
 
 
 
422
 
423
- # Add overall average column
424
- averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
425
 
426
  # Sort by Average column
427
  averaged_df = averaged_df.sort_values('Average', ascending=False)
 
357
  return data_dict
358
 
359
 
360
+ # def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
361
+ # """
362
+ # Aggregates rows with the same base method name by taking the max value for each column.
363
+ # """
364
+ # # Create a copy of the DataFrame
365
+ # df_copy = df.copy()
366
+
367
+ # # Extract base method names (remove _2, _3, etc. suffixes)
368
+ # base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
369
+ # else name for name in df_copy.index]
370
+ # df_copy.index = base_methods
371
+
372
+ # # Convert scores to numeric values
373
+ # def extract_score(score_str):
374
+ # if isinstance(score_str, str):
375
+ # return float(score_str)
376
+ # return 0.0
377
+
378
+ # numeric_df = df_copy.applymap(extract_score)
379
+
380
+ # # Group by base method name and take the max
381
+ # aggregated_df = numeric_df.groupby(level=0).max().round(3)
382
+
383
+ # # Convert back to string format
384
+ # aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
385
+
386
+ # return aggregated_df
387
+
388
+
389
+ # def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
390
+ # """
391
+ # Creates a DataFrame where columns are model_task and cells are averaged over interventions.
392
+ # """
393
+ # # Create a copy of the DataFrame
394
+ # df_copy = df.copy()
395
+
396
+ # # Remove the Average column if it exists
397
+ # if 'Average' in df_copy.columns:
398
+ # df_copy = df_copy.drop('Average', axis=1)
399
+
400
+ # # Function to extract score value from string
401
+ # def extract_score(score_str):
402
+ # if isinstance(score_str, str):
403
+ # return float(score_str)
404
+ # return 0.0
405
+
406
+ # # Convert all scores to numeric values
407
+ # numeric_df = df_copy.applymap(extract_score)
408
+
409
+ # # Group columns by model_task
410
+ # model_task_groups = {}
411
+ # for col in numeric_df.columns:
412
+ # model_task = '_'.join(col.split('_')[:2]) # Get model_task part
413
+ # if model_task not in model_task_groups:
414
+ # model_task_groups[model_task] = []
415
+ # model_task_groups[model_task].append(col)
416
+
417
+ # # Create new DataFrame with averaged intervention scores
418
+ # averaged_df = pd.DataFrame({
419
+ # model_task: numeric_df[cols].mean(axis=1).round(3)
420
+ # for model_task, cols in model_task_groups.items()
421
+ # })
422
+
423
+ # # Add overall average column
424
+ # averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
425
+
426
+ # # Sort by Average column
427
+ # averaged_df = averaged_df.sort_values('Average', ascending=False)
428
+
429
+ # return averaged_df
430
+
431
  def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
432
  """
433
  Aggregates rows with the same base method name by taking the max value for each column.
434
+ Works with Method as a regular column instead of index.
435
  """
436
  # Create a copy of the DataFrame
437
  df_copy = df.copy()
438
 
439
  # Extract base method names (remove _2, _3, etc. suffixes)
440
  base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
441
+ else name for name in df_copy['Method']]
442
+ df_copy['base_method'] = base_methods
443
 
444
  # Convert scores to numeric values
445
+ score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
446
+ for col in score_columns:
447
+ df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
 
 
 
448
 
449
  # Group by base method name and take the max
450
+ aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
451
+
452
+ # Reset index to make base_method a regular column and rename it to Method
453
+ aggregated_df = aggregated_df.reset_index()
454
+ aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
455
 
456
  # Convert back to string format
457
+ for col in score_columns:
458
+ aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
459
 
460
  return aggregated_df
461
 
 
462
  def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
463
  """
464
  Creates a DataFrame where columns are model_task and cells are averaged over interventions.
465
+ Works with Method as a regular column.
466
  """
467
  # Create a copy of the DataFrame
468
  df_copy = df.copy()
 
471
  if 'Average' in df_copy.columns:
472
  df_copy = df_copy.drop('Average', axis=1)
473
 
474
+ # Get score columns (excluding Method)
475
+ score_columns = [col for col in df_copy.columns if col != 'Method']
 
 
 
476
 
477
  # Convert all scores to numeric values
478
+ for col in score_columns:
479
+ df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
480
 
481
  # Group columns by model_task
482
  model_task_groups = {}
483
+ for col in score_columns:
484
  model_task = '_'.join(col.split('_')[:2]) # Get model_task part
485
  if model_task not in model_task_groups:
486
  model_task_groups[model_task] = []
487
  model_task_groups[model_task].append(col)
488
 
489
+ # Create new DataFrame with Method column and averaged intervention scores
490
+ averaged_data = []
491
+ for _, row in df_copy.iterrows():
492
+ averaged_row = {'Method': row['Method']}
493
+ for model_task, cols in model_task_groups.items():
494
+ averaged_row[model_task] = np.mean([row[col] for col in cols]).round(3)
495
+ averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(3)
496
+ averaged_data.append(averaged_row)
497
 
498
+ averaged_df = pd.DataFrame(averaged_data)
 
499
 
500
  # Sort by Average column
501
  averaged_df = averaged_df.sort_values('Average', ascending=False)