benediktstroebl committed on
Commit
3d3d857
·
1 Parent(s): c27a759

added cost and heatmap explanation

Browse files
Files changed (1) hide show
  1. app.py +22 -24
app.py CHANGED
@@ -30,6 +30,8 @@ from datetime import datetime
30
 
31
  abs_path = Path(__file__).parent
32
 
 
 
33
  def restart_space():
34
  API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
35
 
@@ -603,7 +605,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
603
  )
604
 
605
  with gr.Row():
606
- gr.Markdown("### Accuracy vs. Cost on AppWorld")
607
  with gr.Row():
608
  scatter_plot = gr.Plot(create_scatter_plot(
609
  parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False),
@@ -615,7 +617,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
615
  ))
616
 
617
  gr.HTML('<div style="height: 30px;"></div>')
618
- gr.Markdown("## Task success heatmap")
619
  with gr.Row():
620
  task_success_heatmap = gr.Plot()
621
  demo.load(
@@ -642,12 +644,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
642
  datatype='markdown'
643
  )
644
  with gr.Row():
645
- gr.Markdown("### Accuracy vs. Cost on AppWorld")
646
  with gr.Row():
647
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
648
 
649
  gr.HTML('<div style="height: 30px;"></div>')
650
- gr.Markdown("## Task success heatmap")
651
  with gr.Row():
652
  task_success_heatmap = gr.Plot()
653
  demo.load(
@@ -685,13 +687,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
685
  )
686
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
687
  with gr.Row():
688
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
689
  with gr.Row():
690
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
691
 
692
  gr.HTML('<div style="height: 30px;"></div>')
693
- gr.Markdown("## Task success heatmap")
694
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
695
  with gr.Row():
696
  task_success_heatmap = gr.Plot()
697
  demo.load(
@@ -724,13 +725,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
724
  )
725
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
726
  with gr.Row():
727
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
728
  with gr.Row():
729
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
730
 
731
  gr.HTML('<div style="height: 30px;"></div>')
732
- gr.Markdown("## Task success heatmap")
733
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
734
  with gr.Row():
735
  task_success_heatmap = gr.Plot()
736
  demo.load(
@@ -763,13 +763,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
763
  )
764
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
765
  with gr.Row():
766
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
767
  with gr.Row():
768
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
769
 
770
  gr.HTML('<div style="height: 30px;"></div>')
771
- gr.Markdown("## Task success heatmap")
772
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
773
  with gr.Row():
774
  task_success_heatmap = gr.Plot()
775
  demo.load(
@@ -823,7 +822,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
823
  )
824
 
825
  with gr.Row():
826
- gr.Markdown("### Accuracy vs. Cost on Cybench")
827
  with gr.Row():
828
  cybench_scatter_plot = gr.Plot()
829
 
@@ -862,7 +861,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
862
 
863
  # Keep the existing heatmap section
864
  gr.HTML('<div style="height: 30px;"></div>')
865
- gr.Markdown("## Task success heatmap")
866
  with gr.Row():
867
  task_success_heatmap = gr.Plot()
868
  demo.load(
@@ -917,7 +916,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
917
  )
918
 
919
  with gr.Row():
920
- gr.Markdown("### Accuracy vs. Cost on GAIA")
921
  with gr.Row():
922
  gaia_scatter_plot = gr.Plot()
923
 
@@ -956,7 +955,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
956
 
957
  # Keep the existing heatmap section
958
  gr.HTML('<div style="height: 30px;"></div>')
959
- gr.Markdown("## Task success heatmap")
960
  with gr.Row():
961
  task_success_heatmap = gr.Plot()
962
  demo.load(
@@ -1010,12 +1009,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1010
  )
1011
 
1012
  with gr.Row():
1013
- gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
1014
  with gr.Row():
1015
  swebench_scatter_plot = gr.Plot()
1016
 
1017
  gr.HTML('<div style="height: 30px;"></div>')
1018
- gr.Markdown("## Task success heatmap")
1019
  with gr.Row():
1020
  task_success_heatmap = gr.Plot()
1021
  demo.load(
@@ -1078,12 +1077,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1078
  )
1079
  gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
1080
  with gr.Row():
1081
- gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
1082
  with gr.Row():
1083
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1084
 
1085
  gr.HTML('<div style="height: 30px;"></div>')
1086
- gr.Markdown("## Task success heatmap")
1087
  with gr.Row():
1088
  task_success_heatmap = gr.Plot()
1089
  demo.load(
@@ -1111,13 +1110,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1111
  )
1112
  gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
1113
  with gr.Row():
1114
- gr.Markdown("### Accuracy vs. Cost for USACO agents")
1115
  with gr.Row():
1116
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1117
 
1118
  gr.HTML('<div style="height: 30px;"></div>')
1119
- gr.Markdown("## Task success heatmap")
1120
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
1121
  with gr.Row():
1122
  task_success_heatmap = gr.Plot()
1123
  demo.load(
 
30
 
31
  abs_path = Path(__file__).parent
32
 
33
+ heatmap_explanation = (Path(__file__).parent / "heatmap_explanation.md").read_text()
34
+ cost_explanation = (Path(__file__).parent / "cost_explanation.md").read_text()
35
  def restart_space():
36
  API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
37
 
 
605
  )
606
 
607
  with gr.Row():
608
+ gr.Markdown(cost_explanation)
609
  with gr.Row():
610
  scatter_plot = gr.Plot(create_scatter_plot(
611
  parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False),
 
617
  ))
618
 
619
  gr.HTML('<div style="height: 30px;"></div>')
620
+ gr.Markdown(heatmap_explanation)
621
  with gr.Row():
622
  task_success_heatmap = gr.Plot()
623
  demo.load(
 
644
  datatype='markdown'
645
  )
646
  with gr.Row():
647
+ gr.Markdown(cost_explanation)
648
  with gr.Row():
649
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
650
 
651
  gr.HTML('<div style="height: 30px;"></div>')
652
+ gr.Markdown(heatmap_explanation)
653
  with gr.Row():
654
  task_success_heatmap = gr.Plot()
655
  demo.load(
 
687
  )
688
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
689
  with gr.Row():
690
+ gr.Markdown(cost_explanation)
691
  with gr.Row():
692
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
693
 
694
  gr.HTML('<div style="height: 30px;"></div>')
695
+ gr.Markdown(heatmap_explanation)
 
696
  with gr.Row():
697
  task_success_heatmap = gr.Plot()
698
  demo.load(
 
725
  )
726
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
727
  with gr.Row():
728
+ gr.Markdown(cost_explanation)
729
  with gr.Row():
730
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
731
 
732
  gr.HTML('<div style="height: 30px;"></div>')
733
+ gr.Markdown(heatmap_explanation)
 
734
  with gr.Row():
735
  task_success_heatmap = gr.Plot()
736
  demo.load(
 
763
  )
764
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
765
  with gr.Row():
766
+ gr.Markdown(cost_explanation)
767
  with gr.Row():
768
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
769
 
770
  gr.HTML('<div style="height: 30px;"></div>')
771
+ gr.Markdown(heatmap_explanation)
 
772
  with gr.Row():
773
  task_success_heatmap = gr.Plot()
774
  demo.load(
 
822
  )
823
 
824
  with gr.Row():
825
+ gr.Markdown(cost_explanation)
826
  with gr.Row():
827
  cybench_scatter_plot = gr.Plot()
828
 
 
861
 
862
  # Keep the existing heatmap section
863
  gr.HTML('<div style="height: 30px;"></div>')
864
+ gr.Markdown(heatmap_explanation)
865
  with gr.Row():
866
  task_success_heatmap = gr.Plot()
867
  demo.load(
 
916
  )
917
 
918
  with gr.Row():
919
+ gr.Markdown(cost_explanation)
920
  with gr.Row():
921
  gaia_scatter_plot = gr.Plot()
922
 
 
955
 
956
  # Keep the existing heatmap section
957
  gr.HTML('<div style="height: 30px;"></div>')
958
+ gr.Markdown(heatmap_explanation)
959
  with gr.Row():
960
  task_success_heatmap = gr.Plot()
961
  demo.load(
 
1009
  )
1010
 
1011
  with gr.Row():
1012
+ gr.Markdown(cost_explanation)
1013
  with gr.Row():
1014
  swebench_scatter_plot = gr.Plot()
1015
 
1016
  gr.HTML('<div style="height: 30px;"></div>')
1017
+ gr.Markdown(heatmap_explanation)
1018
  with gr.Row():
1019
  task_success_heatmap = gr.Plot()
1020
  demo.load(
 
1077
  )
1078
  gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
1079
  with gr.Row():
1080
+ gr.Markdown(cost_explanation)
1081
  with gr.Row():
1082
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1083
 
1084
  gr.HTML('<div style="height: 30px;"></div>')
1085
+ gr.Markdown(heatmap_explanation)
1086
  with gr.Row():
1087
  task_success_heatmap = gr.Plot()
1088
  demo.load(
 
1110
  )
1111
  gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
1112
  with gr.Row():
1113
+ gr.Markdown(cost_explanation)
1114
  with gr.Row():
1115
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1116
 
1117
  gr.HTML('<div style="height: 30px;"></div>')
1118
+ gr.Markdown(heatmap_explanation)
 
1119
  with gr.Row():
1120
  task_success_heatmap = gr.Plot()
1121
  demo.load(