Spaces:
Running
Running
benediktstroebl
committed on
Commit
·
3d3d857
1
Parent(s):
c27a759
added cost and heatmap explanation
Browse files
app.py
CHANGED
@@ -30,6 +30,8 @@ from datetime import datetime
|
|
30 |
|
31 |
abs_path = Path(__file__).parent
|
32 |
|
|
|
|
|
33 |
def restart_space():
|
34 |
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
35 |
|
@@ -603,7 +605,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
603 |
)
|
604 |
|
605 |
with gr.Row():
|
606 |
-
gr.Markdown(
|
607 |
with gr.Row():
|
608 |
scatter_plot = gr.Plot(create_scatter_plot(
|
609 |
parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False),
|
@@ -615,7 +617,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
615 |
))
|
616 |
|
617 |
gr.HTML('<div style="height: 30px;"></div>')
|
618 |
-
gr.Markdown(
|
619 |
with gr.Row():
|
620 |
task_success_heatmap = gr.Plot()
|
621 |
demo.load(
|
@@ -642,12 +644,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
642 |
datatype='markdown'
|
643 |
)
|
644 |
with gr.Row():
|
645 |
-
gr.Markdown(
|
646 |
with gr.Row():
|
647 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
648 |
|
649 |
gr.HTML('<div style="height: 30px;"></div>')
|
650 |
-
gr.Markdown(
|
651 |
with gr.Row():
|
652 |
task_success_heatmap = gr.Plot()
|
653 |
demo.load(
|
@@ -685,13 +687,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
685 |
)
|
686 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
687 |
with gr.Row():
|
688 |
-
gr.Markdown(
|
689 |
with gr.Row():
|
690 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
691 |
|
692 |
gr.HTML('<div style="height: 30px;"></div>')
|
693 |
-
gr.Markdown(
|
694 |
-
gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
|
695 |
with gr.Row():
|
696 |
task_success_heatmap = gr.Plot()
|
697 |
demo.load(
|
@@ -724,13 +725,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
724 |
)
|
725 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
726 |
with gr.Row():
|
727 |
-
gr.Markdown(
|
728 |
with gr.Row():
|
729 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
730 |
|
731 |
gr.HTML('<div style="height: 30px;"></div>')
|
732 |
-
gr.Markdown(
|
733 |
-
gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
|
734 |
with gr.Row():
|
735 |
task_success_heatmap = gr.Plot()
|
736 |
demo.load(
|
@@ -763,13 +763,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
763 |
)
|
764 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
765 |
with gr.Row():
|
766 |
-
gr.Markdown(
|
767 |
with gr.Row():
|
768 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
769 |
|
770 |
gr.HTML('<div style="height: 30px;"></div>')
|
771 |
-
gr.Markdown(
|
772 |
-
gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
|
773 |
with gr.Row():
|
774 |
task_success_heatmap = gr.Plot()
|
775 |
demo.load(
|
@@ -823,7 +822,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
823 |
)
|
824 |
|
825 |
with gr.Row():
|
826 |
-
gr.Markdown(
|
827 |
with gr.Row():
|
828 |
cybench_scatter_plot = gr.Plot()
|
829 |
|
@@ -862,7 +861,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
862 |
|
863 |
# Keep the existing heatmap section
|
864 |
gr.HTML('<div style="height: 30px;"></div>')
|
865 |
-
gr.Markdown(
|
866 |
with gr.Row():
|
867 |
task_success_heatmap = gr.Plot()
|
868 |
demo.load(
|
@@ -917,7 +916,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
917 |
)
|
918 |
|
919 |
with gr.Row():
|
920 |
-
gr.Markdown(
|
921 |
with gr.Row():
|
922 |
gaia_scatter_plot = gr.Plot()
|
923 |
|
@@ -956,7 +955,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
956 |
|
957 |
# Keep the existing heatmap section
|
958 |
gr.HTML('<div style="height: 30px;"></div>')
|
959 |
-
gr.Markdown(
|
960 |
with gr.Row():
|
961 |
task_success_heatmap = gr.Plot()
|
962 |
demo.load(
|
@@ -1010,12 +1009,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
1010 |
)
|
1011 |
|
1012 |
with gr.Row():
|
1013 |
-
gr.Markdown(
|
1014 |
with gr.Row():
|
1015 |
swebench_scatter_plot = gr.Plot()
|
1016 |
|
1017 |
gr.HTML('<div style="height: 30px;"></div>')
|
1018 |
-
gr.Markdown(
|
1019 |
with gr.Row():
|
1020 |
task_success_heatmap = gr.Plot()
|
1021 |
demo.load(
|
@@ -1078,12 +1077,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
1078 |
)
|
1079 |
gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
1080 |
with gr.Row():
|
1081 |
-
gr.Markdown(
|
1082 |
with gr.Row():
|
1083 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
1084 |
|
1085 |
gr.HTML('<div style="height: 30px;"></div>')
|
1086 |
-
gr.Markdown(
|
1087 |
with gr.Row():
|
1088 |
task_success_heatmap = gr.Plot()
|
1089 |
demo.load(
|
@@ -1111,13 +1110,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
1111 |
)
|
1112 |
gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
1113 |
with gr.Row():
|
1114 |
-
gr.Markdown(
|
1115 |
with gr.Row():
|
1116 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
1117 |
|
1118 |
gr.HTML('<div style="height: 30px;"></div>')
|
1119 |
-
gr.Markdown(
|
1120 |
-
gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
|
1121 |
with gr.Row():
|
1122 |
task_success_heatmap = gr.Plot()
|
1123 |
demo.load(
|
|
|
30 |
|
31 |
abs_path = Path(__file__).parent
|
32 |
|
33 |
+
heatmap_explanation = (Path(__file__).parent / "heatmap_explanation.md").read_text()
|
34 |
+
cost_explanation = (Path(__file__).parent / "cost_explanation.md").read_text()
|
35 |
def restart_space():
|
36 |
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
37 |
|
|
|
605 |
)
|
606 |
|
607 |
with gr.Row():
|
608 |
+
gr.Markdown(cost_explanation)
|
609 |
with gr.Row():
|
610 |
scatter_plot = gr.Plot(create_scatter_plot(
|
611 |
parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False),
|
|
|
617 |
))
|
618 |
|
619 |
gr.HTML('<div style="height: 30px;"></div>')
|
620 |
+
gr.Markdown(heatmap_explanation)
|
621 |
with gr.Row():
|
622 |
task_success_heatmap = gr.Plot()
|
623 |
demo.load(
|
|
|
644 |
datatype='markdown'
|
645 |
)
|
646 |
with gr.Row():
|
647 |
+
gr.Markdown(cost_explanation)
|
648 |
with gr.Row():
|
649 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
650 |
|
651 |
gr.HTML('<div style="height: 30px;"></div>')
|
652 |
+
gr.Markdown(heatmap_explanation)
|
653 |
with gr.Row():
|
654 |
task_success_heatmap = gr.Plot()
|
655 |
demo.load(
|
|
|
687 |
)
|
688 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
689 |
with gr.Row():
|
690 |
+
gr.Markdown(cost_explanation)
|
691 |
with gr.Row():
|
692 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
693 |
|
694 |
gr.HTML('<div style="height: 30px;"></div>')
|
695 |
+
gr.Markdown(heatmap_explanation)
|
|
|
696 |
with gr.Row():
|
697 |
task_success_heatmap = gr.Plot()
|
698 |
demo.load(
|
|
|
725 |
)
|
726 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
727 |
with gr.Row():
|
728 |
+
gr.Markdown(cost_explanation)
|
729 |
with gr.Row():
|
730 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
731 |
|
732 |
gr.HTML('<div style="height: 30px;"></div>')
|
733 |
+
gr.Markdown(heatmap_explanation)
|
|
|
734 |
with gr.Row():
|
735 |
task_success_heatmap = gr.Plot()
|
736 |
demo.load(
|
|
|
763 |
)
|
764 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
765 |
with gr.Row():
|
766 |
+
gr.Markdown(cost_explanation)
|
767 |
with gr.Row():
|
768 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
769 |
|
770 |
gr.HTML('<div style="height: 30px;"></div>')
|
771 |
+
gr.Markdown(heatmap_explanation)
|
|
|
772 |
with gr.Row():
|
773 |
task_success_heatmap = gr.Plot()
|
774 |
demo.load(
|
|
|
822 |
)
|
823 |
|
824 |
with gr.Row():
|
825 |
+
gr.Markdown(cost_explanation)
|
826 |
with gr.Row():
|
827 |
cybench_scatter_plot = gr.Plot()
|
828 |
|
|
|
861 |
|
862 |
# Keep the existing heatmap section
|
863 |
gr.HTML('<div style="height: 30px;"></div>')
|
864 |
+
gr.Markdown(heatmap_explanation)
|
865 |
with gr.Row():
|
866 |
task_success_heatmap = gr.Plot()
|
867 |
demo.load(
|
|
|
916 |
)
|
917 |
|
918 |
with gr.Row():
|
919 |
+
gr.Markdown(cost_explanation)
|
920 |
with gr.Row():
|
921 |
gaia_scatter_plot = gr.Plot()
|
922 |
|
|
|
955 |
|
956 |
# Keep the existing heatmap section
|
957 |
gr.HTML('<div style="height: 30px;"></div>')
|
958 |
+
gr.Markdown(heatmap_explanation)
|
959 |
with gr.Row():
|
960 |
task_success_heatmap = gr.Plot()
|
961 |
demo.load(
|
|
|
1009 |
)
|
1010 |
|
1011 |
with gr.Row():
|
1012 |
+
gr.Markdown(cost_explanation)
|
1013 |
with gr.Row():
|
1014 |
swebench_scatter_plot = gr.Plot()
|
1015 |
|
1016 |
gr.HTML('<div style="height: 30px;"></div>')
|
1017 |
+
gr.Markdown(heatmap_explanation)
|
1018 |
with gr.Row():
|
1019 |
task_success_heatmap = gr.Plot()
|
1020 |
demo.load(
|
|
|
1077 |
)
|
1078 |
gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
1079 |
with gr.Row():
|
1080 |
+
gr.Markdown(cost_explanation)
|
1081 |
with gr.Row():
|
1082 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
1083 |
|
1084 |
gr.HTML('<div style="height: 30px;"></div>')
|
1085 |
+
gr.Markdown(heatmap_explanation)
|
1086 |
with gr.Row():
|
1087 |
task_success_heatmap = gr.Plot()
|
1088 |
demo.load(
|
|
|
1110 |
)
|
1111 |
gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
1112 |
with gr.Row():
|
1113 |
+
gr.Markdown(cost_explanation)
|
1114 |
with gr.Row():
|
1115 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
1116 |
|
1117 |
gr.HTML('<div style="height: 30px;"></div>')
|
1118 |
+
gr.Markdown(heatmap_explanation)
|
|
|
1119 |
with gr.Row():
|
1120 |
task_success_heatmap = gr.Plot()
|
1121 |
demo.load(
|