Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
0ab8298
1
Parent(s):
6ea8a3e
causal-track debug
Browse files- src/leaderboard/read_evals.py +99 -25
src/leaderboard/read_evals.py
CHANGED
@@ -357,38 +357,112 @@ class EvalResult_MIB_CAUSALGRAPH:
|
|
357 |
return data_dict
|
358 |
|
359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
360 |
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
|
361 |
"""
|
362 |
Aggregates rows with the same base method name by taking the max value for each column.
|
|
|
363 |
"""
|
364 |
# Create a copy of the DataFrame
|
365 |
df_copy = df.copy()
|
366 |
|
367 |
# Extract base method names (remove _2, _3, etc. suffixes)
|
368 |
base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
|
369 |
-
else name for name in df_copy
|
370 |
-
df_copy
|
371 |
|
372 |
# Convert scores to numeric values
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
return 0.0
|
377 |
-
|
378 |
-
numeric_df = df_copy.applymap(extract_score)
|
379 |
|
380 |
# Group by base method name and take the max
|
381 |
-
aggregated_df =
|
|
|
|
|
|
|
|
|
382 |
|
383 |
# Convert back to string format
|
384 |
-
|
|
|
385 |
|
386 |
return aggregated_df
|
387 |
|
388 |
-
|
389 |
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
|
390 |
"""
|
391 |
Creates a DataFrame where columns are model_task and cells are averaged over interventions.
|
|
|
392 |
"""
|
393 |
# Create a copy of the DataFrame
|
394 |
df_copy = df.copy()
|
@@ -397,31 +471,31 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
397 |
if 'Average' in df_copy.columns:
|
398 |
df_copy = df_copy.drop('Average', axis=1)
|
399 |
|
400 |
-
#
|
401 |
-
|
402 |
-
if isinstance(score_str, str):
|
403 |
-
return float(score_str)
|
404 |
-
return 0.0
|
405 |
|
406 |
# Convert all scores to numeric values
|
407 |
-
|
|
|
408 |
|
409 |
# Group columns by model_task
|
410 |
model_task_groups = {}
|
411 |
-
for col in
|
412 |
model_task = '_'.join(col.split('_')[:2]) # Get model_task part
|
413 |
if model_task not in model_task_groups:
|
414 |
model_task_groups[model_task] = []
|
415 |
model_task_groups[model_task].append(col)
|
416 |
|
417 |
-
# Create new DataFrame with averaged intervention scores
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
|
|
|
|
|
|
422 |
|
423 |
-
|
424 |
-
averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
|
425 |
|
426 |
# Sort by Average column
|
427 |
averaged_df = averaged_df.sort_values('Average', ascending=False)
|
|
|
357 |
return data_dict
|
358 |
|
359 |
|
360 |
+
# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
|
361 |
+
# """
|
362 |
+
# Aggregates rows with the same base method name by taking the max value for each column.
|
363 |
+
# """
|
364 |
+
# # Create a copy of the DataFrame
|
365 |
+
# df_copy = df.copy()
|
366 |
+
|
367 |
+
# # Extract base method names (remove _2, _3, etc. suffixes)
|
368 |
+
# base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
|
369 |
+
# else name for name in df_copy.index]
|
370 |
+
# df_copy.index = base_methods
|
371 |
+
|
372 |
+
# # Convert scores to numeric values
|
373 |
+
# def extract_score(score_str):
|
374 |
+
# if isinstance(score_str, str):
|
375 |
+
# return float(score_str)
|
376 |
+
# return 0.0
|
377 |
+
|
378 |
+
# numeric_df = df_copy.applymap(extract_score)
|
379 |
+
|
380 |
+
# # Group by base method name and take the max
|
381 |
+
# aggregated_df = numeric_df.groupby(level=0).max().round(3)
|
382 |
+
|
383 |
+
# # Convert back to string format
|
384 |
+
# aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
|
385 |
+
|
386 |
+
# return aggregated_df
|
387 |
+
|
388 |
+
|
389 |
+
# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
|
390 |
+
# """
|
391 |
+
# Creates a DataFrame where columns are model_task and cells are averaged over interventions.
|
392 |
+
# """
|
393 |
+
# # Create a copy of the DataFrame
|
394 |
+
# df_copy = df.copy()
|
395 |
+
|
396 |
+
# # Remove the Average column if it exists
|
397 |
+
# if 'Average' in df_copy.columns:
|
398 |
+
# df_copy = df_copy.drop('Average', axis=1)
|
399 |
+
|
400 |
+
# # Function to extract score value from string
|
401 |
+
# def extract_score(score_str):
|
402 |
+
# if isinstance(score_str, str):
|
403 |
+
# return float(score_str)
|
404 |
+
# return 0.0
|
405 |
+
|
406 |
+
# # Convert all scores to numeric values
|
407 |
+
# numeric_df = df_copy.applymap(extract_score)
|
408 |
+
|
409 |
+
# # Group columns by model_task
|
410 |
+
# model_task_groups = {}
|
411 |
+
# for col in numeric_df.columns:
|
412 |
+
# model_task = '_'.join(col.split('_')[:2]) # Get model_task part
|
413 |
+
# if model_task not in model_task_groups:
|
414 |
+
# model_task_groups[model_task] = []
|
415 |
+
# model_task_groups[model_task].append(col)
|
416 |
+
|
417 |
+
# # Create new DataFrame with averaged intervention scores
|
418 |
+
# averaged_df = pd.DataFrame({
|
419 |
+
# model_task: numeric_df[cols].mean(axis=1).round(3)
|
420 |
+
# for model_task, cols in model_task_groups.items()
|
421 |
+
# })
|
422 |
+
|
423 |
+
# # Add overall average column
|
424 |
+
# averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
|
425 |
+
|
426 |
+
# # Sort by Average column
|
427 |
+
# averaged_df = averaged_df.sort_values('Average', ascending=False)
|
428 |
+
|
429 |
+
# return averaged_df
|
430 |
+
|
431 |
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates rows with the same base method name by taking the max value for each column.
    Works with Method as a regular column instead of index.

    A method's base name is its name with a single trailing ``_<digits>``
    component removed (e.g. ``"DAS_2"`` -> ``"DAS"``, ``"foo_bar_3"`` ->
    ``"foo_bar"``). Names without such a suffix are kept unchanged.

    Args:
        df: DataFrame with a 'Method' column of string names. Every other
            column is treated as a score column whose cells are numbers or
            strings parseable by ``float()``.

    Returns:
        A new DataFrame with one row per base method name: a 'Method' column
        followed by the score columns, each holding the per-group maximum
        formatted as a string with three decimals. The input is not modified.

    Raises:
        KeyError: If ``df`` has no 'Method' column.
        ValueError: If a string score cannot be converted with ``float()``.
    """

    def _base_name(name):
        # Strip only the final "_<digits>" component. Splitting on the first
        # underscore would truncate multi-word names ("foo_bar_2" -> "foo")
        # and wrongly merge unrelated methods that share a first token.
        head, sep, tail = name.rpartition('_')
        return head if sep and tail.isdigit() else name

    def _to_number(value):
        # Scores may arrive as formatted strings; non-strings pass through.
        return float(value) if isinstance(value, str) else value

    # Work on a copy so the caller's DataFrame is left untouched
    df_copy = df.copy()

    # Extract base method names (remove _2, _3, etc. suffixes)
    df_copy['base_method'] = [_base_name(name) for name in df_copy['Method']]

    # Convert scores to numeric values
    score_columns = [col for col in df_copy.columns if col not in ('Method', 'base_method')]
    for col in score_columns:
        df_copy[col] = df_copy[col].apply(_to_number)

    # Group by base method name and take the max
    aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)

    # Reset index to make base_method a regular column and rename it to Method
    aggregated_df = aggregated_df.reset_index().rename(columns={'base_method': 'Method'})

    # Convert back to string format
    for col in score_columns:
        aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")

    return aggregated_df
|
461 |
|
|
|
462 |
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
|
463 |
"""
|
464 |
Creates a DataFrame where columns are model_task and cells are averaged over interventions.
|
465 |
+
Works with Method as a regular column.
|
466 |
"""
|
467 |
# Create a copy of the DataFrame
|
468 |
df_copy = df.copy()
|
|
|
471 |
if 'Average' in df_copy.columns:
|
472 |
df_copy = df_copy.drop('Average', axis=1)
|
473 |
|
474 |
+
# Get score columns (excluding Method)
|
475 |
+
score_columns = [col for col in df_copy.columns if col != 'Method']
|
|
|
|
|
|
|
476 |
|
477 |
# Convert all scores to numeric values
|
478 |
+
for col in score_columns:
|
479 |
+
df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
|
480 |
|
481 |
# Group columns by model_task
|
482 |
model_task_groups = {}
|
483 |
+
for col in score_columns:
|
484 |
model_task = '_'.join(col.split('_')[:2]) # Get model_task part
|
485 |
if model_task not in model_task_groups:
|
486 |
model_task_groups[model_task] = []
|
487 |
model_task_groups[model_task].append(col)
|
488 |
|
489 |
+
# Create new DataFrame with Method column and averaged intervention scores
|
490 |
+
averaged_data = []
|
491 |
+
for _, row in df_copy.iterrows():
|
492 |
+
averaged_row = {'Method': row['Method']}
|
493 |
+
for model_task, cols in model_task_groups.items():
|
494 |
+
averaged_row[model_task] = np.mean([row[col] for col in cols]).round(3)
|
495 |
+
averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(3)
|
496 |
+
averaged_data.append(averaged_row)
|
497 |
|
498 |
+
averaged_df = pd.DataFrame(averaged_data)
|
|
|
499 |
|
500 |
# Sort by Average column
|
501 |
averaged_df = averaged_df.sort_values('Average', ascending=False)
|