jasonshaoshun committed · Commit 4780a48 · 1 Parent: 1d8e193

caulsal-track debug

Files changed:
- app.py +1 -277
- caulsal_metric.py +5 -0
- src/about.py +38 -70
- src/display/utils.py +23 -200
- src/leaderboard/read_evals.py +141 -250
- src/populate.py +21 -140
app.py
CHANGED
@@ -23,10 +23,8 @@ from src.display.utils import (
     BENCHMARK_COLS,
     BENCHMARK_COLS_MULTIMODAL,
     BENCHMARK_COLS_MIB_SUBGRAPH,
-    BENCHMARK_COLS_MIB_CAUSALGRAPH,
     COLS,
     COLS_MIB_SUBGRAPH,
-    COLS_MIB_CAUSALGRAPH,
     COLS_MULTIMODAL,
     EVAL_COLS,
     EVAL_TYPES,

@@ -281,9 +279,7 @@ LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_M
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
     EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
-    EVAL_REQUESTS_PATH
-    COLS_MIB_CAUSALGRAPH,
-    BENCHMARK_COLS_MIB_CAUSALGRAPH
+    EVAL_REQUESTS_PATH
 )


@@ -300,95 +296,6 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT



-# def init_leaderboard_mib_subgraph(dataframe, track):
-#     # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
-#     if dataframe is None or dataframe.empty:
-#         raise ValueError("Leaderboard DataFrame is empty or None.")
[... the rest of this removed, commented-out draft (a Leaderboard built with SelectColumns over the AutoEvalColumn_mib_subgraph fields) is omitted here ...]
-
-# def init_leaderboard_mib_subgraph(dataframe, track):
-#     """Initialize the subgraph leaderboard with grouped column selection by benchmark."""
-#     if dataframe is None or dataframe.empty:
-#         raise ValueError("Leaderboard DataFrame is empty or None.")
[... the rest of this removed, commented-out draft (benchmark_groups / model_groups construction and a Leaderboard whose default_selection is the flattened groups) is omitted here ...]
-
 def init_leaderboard_mib_subgraph(dataframe, track):
     """Initialize the subgraph leaderboard with display names for better readability."""
     if dataframe is None or dataframe.empty:

@@ -478,189 +385,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):



-    # # Complete column groups for both benchmarks and models
-    # # Define keywords for filtering
-    # benchmark_keywords = ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
-    # model_keywords = ["qwen2_5", "gpt2", "gemma2", "llama3"]
[... the rest of this removed, commented-out SmartSelectColumns experiment (display-name mappings such as "ioi_llama3": "IOI (LLaMA-3)", a SmartSelectColumns instance, and a Leaderboard built on renamed_df) is omitted here ...]
-
-    # # Define simple keywords for filtering
-    # benchmark_keywords = ["ioi", "mcqa", "arithmetic", "arc"]
-    # model_keywords = ["qwen2_5", "gpt2", "gemma2", "llama3"]
[... the rest of this second removed SmartSelectColumns experiment (smart_columns.get_filtered_groups plus a try/except around Leaderboard creation) is omitted here ...]
-
-# def init_leaderboard_mib_subgraph(dataframe, track):
-#     """Initialize the subgraph leaderboard with group-based column selection."""
[... the rest of this removed, commented-out group-based draft (a selection_mapping from "Benchmark: ..." / "Model: ..." group names to column lists, fed into SelectColumns) is omitted here ...]
-
-# def init_leaderboard_mib_causalgraph(dataframe, track):
-#     # print("Debugging column issues:")
[... the rest of this removed, commented-out earlier draft of init_leaderboard_mib_causalgraph (expected/missing column checks and a Leaderboard over the AutoEvalColumn_mib_causalgraph fields) is omitted here ...]
-
 def init_leaderboard_mib_causalgraph(dataframe, track):
     # print("Debugging column issues:")
     # print("\nActual DataFrame columns:")
caulsal_metric.py
CHANGED
@@ -135,6 +135,11 @@ def create_summary_dataframe(json_files: List[Dict[str, Any]]) -> pd.DataFrame:
 
     return df
 
+
+# averaged_cf = average_counterfactuals(json_files)
+# layer_averaged = find_layer_averages(averaged_cf)
+# detailed_df = create_summary_dataframe(layer_averaged)
+
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
src/about.py
CHANGED
@@ -61,33 +61,27 @@ class TasksMib_Subgraph(Enum):
         return sorted(list(models))
 
 
+
+
 # @dataclass
 # class TaskMIB_Causalgraph:
-#     benchmark: str
-#     models: list[str]
-#     layers: list[str]
-#     col_name: str
-#     interventions: list[str]
-#     counterfactuals: list[str]
-#     metrics: list[str]
+#     benchmark: str
+#     models: list[str]
+#     layers: dict[str, list[str]]  # Different layers for each model
+#     col_name: str
+#     interventions: list[str]
+#     counterfactuals: list[str]
+#     metrics: list[str]
 
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "MCQA",
-#         ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],  # Updated model list
-#         [str(i) for i in range(32)],  # 0-31 layers
-#         "mcqa",
-#         ["output_token", "output_location"],
-#         ["symbol_counterfactual", "randomLetter_counterfactual",
-#          "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
-#         ["score"]
-#     )
 
 # class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "
-#
-#
+#     task0 = TaskMIB_Causalgraph("MCQA",
+#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],
+#         {
+#             "qwen2forcausallm": [str(i) for i in range(24)],   # 0-23
+#             "gemma2forcausallm": [str(i) for i in range(26)],  # 0-25
+#             "llamaforcausallm": [str(i) for i in range(32)]    # 0-31
+#         },
 #     "mcqa",
 #     ["output_token", "output_location"],
 #     ["randomLetter_counterfactual", "answerPosition_counterfactual",

@@ -95,58 +89,32 @@ class TasksMib_Subgraph(Enum):
 #     ["score"]
 # )
 
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "MCQA",
-#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],  # Use lowercase consistently
-#         [str(i) for i in range(32)],
-#         "mcqa",
-#         ["output_token", "output_location"],
-#         ["randomLetter_counterfactual", "answerPosition_counterfactual",
-#          "answerPosition_randomLetter_counterfactual"],
-#         ["score"]
-#     )
 
 @dataclass
 class TaskMIB_Causalgraph:
-    benchmark: str
-    models: list[str]
-    interventions: list[str]
-    counterfactuals: list[str]
-    metrics: list[str]
[... two further removed field lines here are garbled in the page extraction ...]
+    benchmark: str        # task name in json (ioi/arithmetic)
+    models: list[str]     # list of models to show as sub-columns
+    col_name: str         # display name in leaderboard
+    metrics: list[str]    # metrics to store (average_score)
 
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "MCQA",
-#         ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],
-#         {
-#             "Qwen2ForCausalLM": [str(i) for i in range(24)],   # 0-23
-#             "Gemma2ForCausalLM": [str(i) for i in range(26)],  # 0-25
-#             "LlamaForCausalLM": [str(i) for i in range(32)]    # 0-31
-#         },
-#         "mcqa",
-#         ["output_token", "output_location"],
-#         ["randomLetter_counterfactual", "answerPosition_counterfactual",
-#          "answerPosition_randomLetter_counterfactual"],
-#         ["score"]
-#     )
 class TasksMib_Causalgraph(Enum):
-    task0 = 
[... the rest of the removed enum body is garbled in the page extraction ...]
+    task0 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
+
+    @classmethod
+    def get_all_tasks(cls):
+        """Returns a list of all task benchmarks"""
+        return [task.value.benchmark for task in cls]
+
+    @classmethod
+    def get_all_models(cls):
+        """Returns a list of all unique models across all tasks"""
+        models = set()
+        for task in cls:
+            models.update(task.value.models)
+        return sorted(list(models))
+
+
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
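Note: in the new enum above, task0 is constructed with TaskMIB_Subgraph rather than the TaskMIB_Causalgraph dataclass defined just before it. A self-contained sketch of how the two new classmethods behave, restated here with a local stand-in dataclass rather than the repo's own module (illustration only):

from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Subgraph:  # local stand-in with the four fields task0 passes below
    benchmark: str
    models: list
    col_name: str
    metrics: list

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Subgraph("mcqa",
                             ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],
                             "MCQA", ["average_score"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(list(models))

print(TasksMib_Causalgraph.get_all_tasks())   # ['mcqa']
print(TasksMib_Causalgraph.get_all_models())  # ['Gemma2ForCausalLM', 'LlamaForCausalLM', 'Qwen2ForCausalLM']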
src/display/utils.py
CHANGED
@@ -58,64 +58,6 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_
 
 
 
-##############################################################################################################
-# Version 1
-
-# auto_eval_column_dict_mib_subgraph = []
-
-# # Method name column
-# auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
[... the rest of the removed, commented-out "Version 1" and "Version 2" subgraph column definitions (per task/model ColumnContent entries, Task/Model filter columns, and an Average column) is omitted here ...]
-
 
 ##############################################################################################################
 # Version 3

@@ -185,133 +127,6 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-
-# # Initialize the MIB causal graph columns
-# auto_eval_column_dict_mib_causalgraph = []
-
-# # Method name column
-# auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
[... the rest of several removed, commented-out attempts at building auto_eval_column_dict_mib_causalgraph (per model/task/intervention, per layer/counterfactual, lowercase and exact-case variants, with and without an eval_name column) is omitted here ...]
-
 auto_eval_column_dict_mib_causalgraph = []
 
 # Only include Method column as required

@@ -320,15 +135,17 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
 # For each model-task-intervention-counterfactual combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:  # model will be lowercase
-        [... nine removed lines of the old per-layer/intervention/counterfactual column loop are garbled in the page extraction ...]
+        col_name = f"{task.value.benchmark}_{model}"
+        auto_eval_column_dict_mib_causalgraph.append([
+            col_name,
+            ColumnContent,
+            ColumnContent(col_name, "number", True)
+        ])
+
+# Add the Average column
+auto_eval_column_dict_mib_causalgraph.append(
+    ["average_score", ColumnContent, ColumnContent("Average", "number", True)]
+)
 
 # Create the dataclass
 AutoEvalColumn_mib_causalgraph = make_dataclass(

@@ -337,14 +154,20 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
     frozen=True
 )
 
-# Column selection for display
-COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
 
 
+
+
+# # Column selection for display
+# COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+
+
+# BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
+#                                   for task in TasksMib_Causalgraph
+#                                   for model in task.value.models
+#                                   for intervention in task.value.interventions]
 
 
 
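For orientation, a small sketch of the column names the new loop yields when TasksMib_Causalgraph contains the single MCQA task with the three models listed in src/about.py (the task data is restated inline here rather than imported):

# Stand-in for TasksMib_Causalgraph: (benchmark, models) pairs as defined in src/about.py.
tasks = [("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"])]

columns = ["Method"]               # the Method column is appended first, as in the diff
for benchmark, models in tasks:
    for model in models:
        columns.append(f"{benchmark}_{model}")   # e.g. "mcqa_Qwen2ForCausalLM"
columns.append("Average")          # the added "average_score" column displays as "Average"

print(columns)
# ['Method', 'mcqa_Qwen2ForCausalLM', 'mcqa_Gemma2ForCausalLM', 'mcqa_LlamaForCausalLM', 'Average']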
src/leaderboard/read_evals.py
CHANGED
@@ -12,8 +12,10 @@ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, Tasks, T
 from src.submission.check_validity import is_model_on_hub
 from src.about import TasksMib_Subgraph
 
-from typing import List, Dict
+from typing import List, Dict, Any
 from collections import defaultdict
+import pandas as pd
+
 
 
 def compute_area(edge_counts, faithfulnesses, log_scale=True):

@@ -65,21 +67,8 @@ class EvalResult_MIB_SUBGRAPH:
         for model_result in data.get("results", []):
             model_id = model_result.get("model_id", "")
 
-            # if "/" in model_id:
-            #     org = model_id.split("/")[0]
-            #     if org == "meta-llama":
-            #         model_name = "llama3"
-            #     elif org == "Qwen":
-            #         model_name = "qwen2_5"
-            #     elif "gpt" in model_id.lower():
-            #         model_name = "gpt2"
-            #     elif org == "google":
-            #         model_name = "gemma2"
-            #     else:
-            #         model_name = model_id.replace(".", "_")
             model_name = model_id.replace(".", "_")
-
             # Keep exact scores structure from JSON
             scores = model_result.get("scores", {})
 

@@ -108,16 +97,7 @@ class EvalResult_MIB_SUBGRAPH:
 
         # Initialize all possible columns with '-'
         expected_models = TasksMib_Subgraph.get_all_models()
-        expected_tasks = TasksMib_Subgraph.get_all_tasks()
-        # for task in expected_tasks:
-        #     for model in task.value.models:
-        #         # if model == "gpt2" and task != "ioi":
-        #         #     continue
-        #         # if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
-        #         #     continue
-        #         # if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
-        #         #     continue
-        #         data_dict[f"{task}_{model}"] = '-'
+        # expected_tasks = TasksMib_Subgraph.get_all_tasks()
 
         for task in TasksMib_Subgraph:
             for model in task.value.models:

@@ -145,23 +125,6 @@ class EvalResult_MIB_SUBGRAPH:
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
 
-        # All entries must be present for average
-        # required_entries = [
-        #     data_dict['ioi_llama3'] != '-',
-        #     data_dict['ioi_qwen2_5'] != '-',
-        #     data_dict['ioi_gpt2'] != '-',
-        #     data_dict['ioi_gemma2'] != '-',
-        #     data_dict['mcqa_llama3'] != '-',
-        #     data_dict['mcqa_qwen2_5'] != '-',
-        #     data_dict['mcqa_gemma2'] != '-',
-        #     data_dict['arithmetic_addition_llama3'] != '-',
-        #     data_dict['arithmetic_subtraction_llama3'] != '-',
-        #     data_dict['arc_easy_gemma2'] != '-',
-        #     data_dict['arc_easy_llama3'] != '-',
-        #     data_dict['arc_challenge_llama3'] != '-'
-        # ]
-
-        # data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
         data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
         return data_dict
 

@@ -207,9 +170,63 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 
 
 
+def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
+    """
+    Process a single JSON file and convert it to a DataFrame.
+
+    Args:
+        json_file: Dictionary containing the analysis results
+        method_counter: Counter for handling duplicate method names
+
+    Returns:
+        pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
+    """
+    method_name = json_file['method_name']
+    unique_method_name = f"{method_name}_{method_counter}"
+    method_scores = []
+
+    for result in json_file['results']:
+        model = result['model_id']
+
+        for task, scores in result['task_scores'].items():
+            # Process each layer's data
+            intervention_scores = defaultdict(list)
+
+            for layer_data in scores:
+                for intervention_data in layer_data['layer_scores']:
+                    # Calculate average score for counterfactuals
+                    avg_cf_score = np.mean([
+                        cf['score']
+                        for cf in intervention_data['counterfactual_scores']
+                    ])
+
+                    if np.isnan(avg_cf_score):
+                        avg_cf_score = 0.0
+
+                    # Group scores by intervention
+                    intervention_key = '_'.join(intervention_data['intervention'])
+                    intervention_scores[intervention_key].append(avg_cf_score)
+
+            # Average across layers for each intervention
+            for intervention, layer_scores in intervention_scores.items():
+                column = f"{model}_{task}_{intervention}"
+                avg_score = np.mean(layer_scores) if layer_scores else 0.0
+                method_scores.append((column, f"{avg_score:.3f}"))
+
+    # Sort by column names for consistency
+    method_scores.sort(key=lambda x: x[0])
+    data = {
+        unique_method_name: {
+            col: score for col, score in method_scores
+        }
+    }
+
+    return pd.DataFrame.from_dict(data, orient='index')
+
+
 # @dataclass
 # class EvalResult_MIB_CAUSALGRAPH:
-#     """Represents one full evaluation for a method in MIB causalgraph."""
 #     eval_name: str
 #     method_name: str
 #     results: Dict

@@ -222,31 +239,26 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 #         method_name = data.get("method_name")
 #         results = {}
 
+#         # Process each model's results
 #         for model_result in data.get("results", []):
-#             model_id = model_result.get("model_id", "")
+#             model_id = model_result.get("model_id", "")
 #             task_scores = model_result.get("task_scores", {})
 
-#             # Process MCQA
-#             mcqa_scores = {}
+#             # Process MCQA scores
 #             for layer_data in task_scores.get("MCQA", []):
 #                 layer = layer_data.get("layer")
[... a few removed lines of this commented-out draft are garbled in the page extraction ...]
-#                 for intervention_data in layer_scores:
-#                     intervention = intervention_data["intervention"][0]
-#                     counterfactual_scores = intervention_data["counterfactual_scores"]
-#                     for cf_score in counterfactual_scores:
+#                 for score_data in layer_data.get("layer_scores", []):
+#                     intervention = score_data["intervention"][0]
+#                     for cf_score in score_data["counterfactual_scores"]:
 #                         counterfactual = cf_score["counterfactual"][0]
 #                         score = cf_score["score"]
 
-#                         # Create key
-#                         key = f"
-#             results[model_id] = mcqa_scores
+#                         # Create key matching the expected column format
+#                         key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
+#                         results[key] = score
 
 #         return EvalResult_MIB_CAUSALGRAPH(
 #             eval_name=method_name,

@@ -254,198 +266,70 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 #             results=results
 #         )
 
-#     def to_dict(self):
-#         """Converts the Eval Result to a dict for dataframe display"""
-#         data_dict = {
-#             "eval_name": self.eval_name,
-#             "Method": self.method_name,
-#         }
[... the rest of this removed, commented-out to_dict (layer-averaged columns named f"{model_id}_{task}_{intervention}".lower()) is omitted here, along with several more partially garbled commented-out variants, the previous uncommented EvalResult_MIB_CAUSALGRAPH dataclass whose init_from_json_file built keys of the form f"{model_id}_layer{layer}_{intervention}_{counterfactual}", and a commented-out earlier get_raw_eval_results_mib_causalgraph ...]
+# data = {}
+# method_counters = defaultdict(int)
+
+# for json_file in json_files:
+#     # Handle method name and duplicates
+#     method_name = json_file['method_name']
+#     method_counters[method_name] += 1
+#     unique_method_name = f"{method_name}_{method_counters[method_name]}"
[... the rest of this added, commented-out block repeats the same per-intervention / per-layer averaging loop as process_single_json above, ending in `# return pd.DataFrame.from_dict(data, orient='index')` ...]
+
+# def to_dict(self):
+#     """Converts the Eval Result to a dict for dataframe display"""
+#     data_dict = {
+#         "eval_name": self.eval_name,
+#         "Method": self.method_name,
+#     }
+
+#     # Add all results directly
+#     data_dict.update(self.results)
+
+#     return data_dict
 
 
 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
     model_result_filepaths = []
 
@@ -466,23 +350,30 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
 
     # print(f"Found json files: {model_result_filepaths}")
 
-    [... the removed body of the old per-file loop is garbled in the page extraction ...]
+    method_counters = defaultdict(int)
+    dataframes = []
+
+    for json_file in model_result_filepaths:
         try:
-            [... removed lines garbled in the page extraction ...]
+            with open(filepath, 'r') as f:
+                json_data = json.load(f)
+            method_name = json_data['method_name']
+            method_counters[method_name] += 1
+
+            # Process single JSON file
+            df = process_single_json(json_data, method_counters[method_name])
+            dataframes.append(df)
         except Exception as e:
-            print(f"Error processing {
+            print(f"Error processing {json_file}: {e}")
             continue
-
+
+    # # Concatenate all DataFrames
+    # if dataframes:
+    #     final_df = pd.concat(dataframes, axis=0)
+    #     return final_df
+    # else:
+    #     return pd.DataFrame()
+    return dataframes
 
 
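A toy walk-through of the aggregation the new process_single_json performs (the scores below are invented; only the dictionary structure mirrors what the function reads): counterfactual scores are averaged within each layer, and those per-layer means are then averaged per intervention into one MODEL_TASK_INTERVENTION column.

import numpy as np
import pandas as pd
from collections import defaultdict

# Hypothetical method result in the shape process_single_json expects.
toy = {
    "method_name": "DAS",
    "results": [{
        "model_id": "Qwen2ForCausalLM",
        "task_scores": {
            "MCQA": [
                {"layer": 0, "layer_scores": [
                    {"intervention": ["output_token"],
                     "counterfactual_scores": [
                         {"counterfactual": ["randomLetter_counterfactual"], "score": 0.40},
                         {"counterfactual": ["answerPosition_counterfactual"], "score": 0.60}]}]},
                {"layer": 1, "layer_scores": [
                    {"intervention": ["output_token"],
                     "counterfactual_scores": [
                         {"counterfactual": ["randomLetter_counterfactual"], "score": 0.80},
                         {"counterfactual": ["answerPosition_counterfactual"], "score": 1.00}]}]},
            ]
        }
    }]
}

# Same two averaging steps as process_single_json.
scores = defaultdict(list)
for result in toy["results"]:
    for task, layers in result["task_scores"].items():
        for layer_data in layers:
            for iv in layer_data["layer_scores"]:
                per_layer = np.mean([cf["score"] for cf in iv["counterfactual_scores"]])
                scores[f"{result['model_id']}_{task}_{'_'.join(iv['intervention'])}"].append(per_layer)

row = {col: f"{np.mean(vals):.3f}" for col, vals in scores.items()}
print(pd.DataFrame.from_dict({"DAS_1": row}, orient="index"))
# one row "DAS_1" with column "Qwen2ForCausalLM_MCQA_output_token" = "0.700"
# (layer 0 mean 0.50, layer 1 mean 0.90, averaged to 0.70)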
src/populate.py
CHANGED
@@ -29,14 +29,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-    # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
-    # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
-
-    # print(f"df is {df}")
-
-    # df = df[cols].round(decimals=1)
-
-    # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 

@@ -69,23 +61,6 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
 
 
 
-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """Aggregates rows with the same base method name by taking the max value for each column"""
-#     df_copy = df.copy()
-
-#     # Extract base method names (remove _2, _3, etc. suffixes)
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                     else name for name in df_copy.index]
-#     df_copy.index = base_methods
-
-#     # Convert scores to numeric values
-#     numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
-
-#     # Group by base method name and take the max
-#     aggregated_df = numeric_df.groupby(level=0).max().round(3)
-
-#     return aggregated_df
-
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """Aggregates rows with the same base method name by taking the max value for each column"""
     df_copy = df.copy()

@@ -111,63 +86,6 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
 
     return aggregated_df
 
-# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
-#     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
-#     df_copy = df.copy()
[... the rest of two removed, commented-out drafts of create_intervention_averaged_df (grouping columns by model_task, averaging intervention scores, optionally re-inserting the Method column) is omitted here ...]
-
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""

@@ -203,69 +121,32 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 
     return averaged_df
 
-# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-#     """Creates a dataframe from all the MIB causal graph experiment results"""
[... the rest of this removed, commented-out draft (builds detailed_df from raw results, prints the intermediate views, and returns only detailed_df) is omitted here ...]
-
-# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-#     print(f"results_path is {results_path}, requests_path is {requests_path}")
[... the rest of this second removed, commented-out draft (returns detailed_df, aggregated_df, intervention_averaged_df) is omitted here ...]
-
-def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str
+def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     # print(f"results_path is {results_path}, requests_path is {requests_path}")
-    [... several removed lines of the old function body are garbled in the page extraction ...]
+
+    # raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+    # all_data_json = [v.to_dict() for v in raw_data]
+    # detailed_df = pd.DataFrame.from_records(all_data_json)
+
+    detailed_df = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
 
     # Print the actual columns for debugging
-    # Rename columns to match schema
-    column_mapping = {}
-    for col in detailed_df.columns:
-        [... the removed body of the old renaming loop is garbled in the page extraction ...]
-    detailed_df = detailed_df.rename(columns=column_mapping)
+    print("Original columns:", detailed_df.columns.tolist())
+
+    # # Rename columns to match schema
+    # column_mapping = {}
+    # for col in detailed_df.columns:
+    #     if col in ['eval_name', 'Method']:
+    #         continue
+    #     # Ensure consistent casing for the column names
+    #     new_col = col.replace('Qwen2ForCausalLM', 'qwen2forcausallm') \
+    #         .replace('Gemma2ForCausalLM', 'gemma2forcausallm') \
+    #         .replace('LlamaForCausalLM', 'llamaforcausallm')
+    #     column_mapping[col] = new_col
+
+    # detailed_df = detailed_df.rename(columns=column_mapping)
 
     # Create aggregated df
     aggregated_df = aggregate_methods(detailed_df)
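The aggregation step above (aggregate_methods) collapses rows that share a base method name by taking the column-wise maximum. A toy illustration with invented method names and scores; the base-name extraction is simplified here relative to the repo's version:

import pandas as pd

detailed_df = pd.DataFrame(
    {"Qwen2ForCausalLM_MCQA_output_token": [0.61, 0.70, 0.55]},
    index=["DAS_1", "DAS_2", "HyperDAS_1"],   # per-file rows, suffixed to keep duplicates apart
)

# Strip a trailing numeric suffix to recover the base method name.
base = [name.rsplit("_", 1)[0] if name.split("_")[-1].isdigit() else name
        for name in detailed_df.index]

aggregated_df = detailed_df.groupby(base).max().round(3)
print(aggregated_df)
# DAS      -> 0.70   (max of the two DAS rows)
# HyperDAS -> 0.55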