Spaces:

agent-evals
/

leaderboard

Running

App Files Files Community

benediktstroebl committed on Dec 4, 2024

Commit

3d3d857

1 Parent(s): c27a759

added cost and heatmap explanation

Browse files

Files changed (1) hide show

app.py +22 -24

app.py CHANGED Viewed

@@ -30,6 +30,8 @@ from datetime import datetime
 abs_path = Path(__file__).parent
 def restart_space():
     """Restart the Space by calling ``API.restart_space`` for ``REPO_ID``.

     Takes no arguments and returns nothing; the call is authenticated with
     ``HF_TOKEN``.
     """
     # NOTE(review): API, REPO_ID and HF_TOKEN are defined outside this view --
     # presumably a Hugging Face Hub client and its credentials; confirm.
     API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
@@ -603,7 +605,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on AppWorld")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(
                     parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False),
@@ -615,7 +617,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                 ))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -642,12 +644,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                         datatype='markdown'
                     )
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on AppWorld")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -685,13 +687,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
-            gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -724,13 +725,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
-            gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -763,13 +763,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
-            gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -823,7 +822,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on Cybench")
             with gr.Row():
                 cybench_scatter_plot = gr.Plot()
@@ -862,7 +861,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
             # Keep the existing heatmap section
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -917,7 +916,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost on GAIA")
             with gr.Row():
                 gaia_scatter_plot = gr.Plot()
@@ -956,7 +955,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
             # Keep the existing heatmap section
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -1010,12 +1009,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
             with gr.Row():
                 swebench_scatter_plot = gr.Plot()
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -1078,12 +1077,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
                     gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
@@ -1111,13 +1110,12 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
                     )
                     gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
-                gr.Markdown("### Accuracy vs. Cost for USACO agents")
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
-            gr.Markdown("## Task success heatmap")
-            gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(

 abs_path = Path(__file__).parent
+heatmap_explanation = (Path(__file__).parent / "heatmap_explanation.md").read_text()
+cost_explanation = (Path(__file__).parent / "cost_explanation.md").read_text()
 def restart_space():
     """Restart the Space by calling ``API.restart_space`` for ``REPO_ID``.

     Takes no arguments and returns nothing; the call is authenticated with
     ``HF_TOKEN``.
     """
     # NOTE(review): API, REPO_ID and HF_TOKEN are defined outside this view --
     # presumably a Hugging Face Hub client and its credentials; confirm.
     API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
                     )
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(
                     parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False),
                 ))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                         datatype='markdown'
                     )
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 cybench_scatter_plot = gr.Plot()
             # Keep the existing heatmap section
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 gaia_scatter_plot = gr.Plot()
             # Keep the existing heatmap section
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 swebench_scatter_plot = gr.Plot()
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
                     gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(
                     )
                     gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
+                gr.Markdown(cost_explanation)
             with gr.Row():
                 scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
             gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown(heatmap_explanation)
             with gr.Row():
                 task_success_heatmap = gr.Plot()
             demo.load(