Spaces:
Running
Running
jasonshaoshun
committed on
Commit
·
a2e0e8f
1
Parent(s):
2ba536b
Update causal graph page to display separate views for average and highest values
Browse files
- app.py +35 -7
- src/leaderboard/read_evals.py +1 -1
app.py
CHANGED
@@ -681,12 +681,12 @@ with demo:
|
|
681 |
# Then modify the Causal Graph tab section
|
682 |
with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
|
683 |
with gr.Tabs() as causalgraph_tabs:
|
684 |
-
with gr.TabItem("Detailed View", id=0):
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
with gr.TabItem("
|
690 |
gr.Markdown("""
|
691 |
### Filtering Options
|
692 |
Use the dropdown menus below to filter results by specific tasks or models.
|
@@ -718,11 +718,39 @@ with demo:
|
|
718 |
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
719 |
outputs=leaderboard_aggregated
|
720 |
)
|
721 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
722 |
leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
|
723 |
LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
|
724 |
"Causal Graph"
|
725 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
726 |
|
727 |
with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
|
728 |
# Track selection
|
|
|
681 |
# Then modify the Causal Graph tab section
|
682 |
with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
|
683 |
with gr.Tabs() as causalgraph_tabs:
|
684 |
+
# with gr.TabItem("Detailed View", id=0):
|
685 |
+
# leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
|
686 |
+
# LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
|
687 |
+
# "Causal Graph"
|
688 |
+
# )
|
689 |
+
with gr.TabItem("Highest View", id=0):
|
690 |
gr.Markdown("""
|
691 |
### Filtering Options
|
692 |
Use the dropdown menus below to filter results by specific tasks or models.
|
|
|
718 |
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
719 |
outputs=leaderboard_aggregated
|
720 |
)
|
721 |
+
with gr.TabItem("Averaged View", id=1):
|
722 |
+
|
723 |
+
task_substring_checkbox = gr.CheckboxGroup(
|
724 |
+
choices=TASK_SUBSTRINGS,
|
725 |
+
label="View tasks:",
|
726 |
+
value=TASK_SUBSTRINGS, # Default to all substrings selected
|
727 |
+
)
|
728 |
+
model_substring_checkbox = gr.CheckboxGroup(
|
729 |
+
choices = MODEL_SUBSTRINGS,
|
730 |
+
label = "View models:",
|
731 |
+
value = MODEL_SUBSTRINGS
|
732 |
+
)
|
733 |
+
|
734 |
leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
|
735 |
LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
|
736 |
"Causal Graph"
|
737 |
)
|
738 |
+
original_leaderboard = gr.State(value=data)
|
739 |
+
task_substring_checkbox.change(
|
740 |
+
fn=update_leaderboard,
|
741 |
+
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
742 |
+
outputs=leaderboard_averaged
|
743 |
+
)
|
744 |
+
model_substring_checkbox.change(
|
745 |
+
fn=update_leaderboard,
|
746 |
+
inputs=[original_leaderboard, task_substring_checkbox, model_substring_checkbox],
|
747 |
+
outputs=leaderboard_averaged
|
748 |
+
)
|
749 |
+
|
750 |
+
# leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
|
751 |
+
# LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
|
752 |
+
# "Causal Graph"
|
753 |
+
# )
|
754 |
|
755 |
with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
|
756 |
# Track selection
|
src/leaderboard/read_evals.py
CHANGED
@@ -508,7 +508,7 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
|
|
508 |
# intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
|
509 |
|
510 |
# return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
|
511 |
-
return detailed_df_highest,
|
512 |
|
513 |
|
514 |
|
|
|
508 |
# intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
|
509 |
|
510 |
# return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
|
511 |
+
return detailed_df_highest, detailed_df_highest, detailed_df_mean
|
512 |
|
513 |
|
514 |
|