Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
e1a39f1
1
Parent(s):
85f4717
causal-track debug
Browse files
src/leaderboard/read_evals.py
CHANGED
@@ -358,89 +358,20 @@ class EvalResult_MIB_CAUSALGRAPH:
|
|
358 |
return data_dict
|
359 |
|
360 |
|
361 |
-
# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
|
362 |
-
# """
|
363 |
-
# Aggregates rows with the same base method name by taking the max value for each column.
|
364 |
-
# """
|
365 |
-
# # Create a copy of the DataFrame
|
366 |
-
# df_copy = df.copy()
|
367 |
-
|
368 |
-
# # Extract base method names (remove _2, _3, etc. suffixes)
|
369 |
-
# base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
|
370 |
-
# else name for name in df_copy.index]
|
371 |
-
# df_copy.index = base_methods
|
372 |
-
|
373 |
-
# # Convert scores to numeric values
|
374 |
-
# def extract_score(score_str):
|
375 |
-
# if isinstance(score_str, str):
|
376 |
-
# return float(score_str)
|
377 |
-
# return 0.0
|
378 |
-
|
379 |
-
# numeric_df = df_copy.applymap(extract_score)
|
380 |
-
|
381 |
-
# # Group by base method name and take the max
|
382 |
-
# aggregated_df = numeric_df.groupby(level=0).max().round(3)
|
383 |
-
|
384 |
-
# # Convert back to string format
|
385 |
-
# aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
|
386 |
-
|
387 |
-
# return aggregated_df
|
388 |
|
389 |
|
390 |
-
# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
|
391 |
-
# """
|
392 |
-
# Creates a DataFrame where columns are model_task and cells are averaged over interventions.
|
393 |
-
# """
|
394 |
-
# # Create a copy of the DataFrame
|
395 |
-
# df_copy = df.copy()
|
396 |
-
|
397 |
-
# # Remove the Average column if it exists
|
398 |
-
# if 'Average' in df_copy.columns:
|
399 |
-
# df_copy = df_copy.drop('Average', axis=1)
|
400 |
-
|
401 |
-
# # Function to extract score value from string
|
402 |
-
# def extract_score(score_str):
|
403 |
-
# if isinstance(score_str, str):
|
404 |
-
# return float(score_str)
|
405 |
-
# return 0.0
|
406 |
-
|
407 |
-
# # Convert all scores to numeric values
|
408 |
-
# numeric_df = df_copy.applymap(extract_score)
|
409 |
-
|
410 |
-
# # Group columns by model_task
|
411 |
-
# model_task_groups = {}
|
412 |
-
# for col in numeric_df.columns:
|
413 |
-
# model_task = '_'.join(col.split('_')[:2]) # Get model_task part
|
414 |
-
# if model_task not in model_task_groups:
|
415 |
-
# model_task_groups[model_task] = []
|
416 |
-
# model_task_groups[model_task].append(col)
|
417 |
-
|
418 |
-
# # Create new DataFrame with averaged intervention scores
|
419 |
-
# averaged_df = pd.DataFrame({
|
420 |
-
# model_task: numeric_df[cols].mean(axis=1).round(3)
|
421 |
-
# for model_task, cols in model_task_groups.items()
|
422 |
-
# })
|
423 |
-
|
424 |
-
# # Add overall average column
|
425 |
-
# averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
|
426 |
-
|
427 |
-
# # Sort by Average column
|
428 |
-
# averaged_df = averaged_df.sort_values('Average', ascending=False)
|
429 |
-
|
430 |
-
# return averaged_df
|
431 |
-
|
432 |
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
|
433 |
"""
|
434 |
Aggregates rows with the same base method name by taking the max value for each column.
|
435 |
Works with Method as a regular column instead of index.
|
436 |
"""
|
437 |
-
|
438 |
df_copy = df.copy()
|
439 |
-
|
440 |
-
# Extract base method names (remove _2, _3, etc. suffixes)
|
441 |
base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
|
442 |
else name for name in df_copy['Method']]
|
443 |
-
df_copy['
|
|
|
444 |
|
445 |
# Convert scores to numeric values
|
446 |
score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
|
@@ -551,6 +482,9 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
|
|
551 |
# if "eval_name" in detailed_df.columns:
|
552 |
# detailed_df.drop("eval_name", axis=1, inplace=True)
|
553 |
|
|
|
|
|
|
|
554 |
# Create aggregated DataFrame
|
555 |
aggregated_df = aggregate_methods(detailed_df)
|
556 |
|
|
|
358 |
return data_dict
|
359 |
|
360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
|
362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
|
364 |
"""
|
365 |
Aggregates rows with the same base method name by taking the max value for each column.
|
366 |
Works with Method as a regular column instead of index.
|
367 |
"""
|
368 |
+
|
369 |
df_copy = df.copy()
|
370 |
+
print("\nBase methods extraction:")
|
|
|
371 |
base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
|
372 |
else name for name in df_copy['Method']]
|
373 |
+
print(f"Original methods: {df_copy['Method'].tolist()}")
|
374 |
+
print(f"Base methods: {base_methods}")
|
375 |
|
376 |
# Convert scores to numeric values
|
377 |
score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
|
|
|
482 |
# if "eval_name" in detailed_df.columns:
|
483 |
# detailed_df.drop("eval_name", axis=1, inplace=True)
|
484 |
|
485 |
+
print("Before aggregation:")
|
486 |
+
print(detailed_df)
|
487 |
+
|
488 |
# Create aggregated DataFrame
|
489 |
aggregated_df = aggregate_methods(detailed_df)
|
490 |
|