benediktstroebl committed on
Commit
201da5d
·
1 Parent(s): 4bb1605

added AppWorld, GAIA, and Cybench benchmarks.

Files changed (8)
  1. app.py +239 -113
  2. config.py +34 -4
  3. scratch.ipynb +0 -0
  4. scratch.py +0 -38
  5. utils/data.py +0 -57
  6. utils/db.py +19 -5
  7. utils/processing.py +13 -6
  8. verified_agents.yaml +27 -1
app.py CHANGED
@@ -49,6 +49,10 @@ def get_failure_report(agent_name, benchmark_name):
49
  return preprocessor.get_failure_report(agent_name, benchmark_name)
50
 
51
  def parse_json_files(folder_path, benchmark_name, aggregate=True):
 
 
 
 
52
  return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate)
53
 
54
  def update_agent_dropdown(benchmark_name, metric):
@@ -459,120 +463,121 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
459
  """)
460
 
461
  with gr.Tabs() as tabs:
462
- with gr.Tab("CORE-Bench"):
463
  gr.HTML("""
464
  <p>
465
  CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
466
  </p>
467
  """)
468
- with gr.Tab("CORE-Bench-Hard"):
469
- gr.HTML("""
470
- <p>
471
- <i><b>CORE-Bench-Hard:</b></i> The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
472
- </p>
473
- """)
474
- with gr.Row():
475
- with gr.Column(scale=2):
476
- Leaderboard(
477
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
478
- select_columns=SelectColumns(
479
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
480
- cant_deselect=["Agent Name"],
481
- label="Select Columns to Display:",
482
- ),
483
- hide_columns=config.COREBENCH_HIDE_COLUMNS,
484
- search_columns=config.COREBENCH_SEARCH_COLUMNS,
485
- )
486
- # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
487
- with gr.Row():
488
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
489
- with gr.Row():
490
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
491
-
492
- gr.HTML('<div style="height: 30px;"></div>')
493
- gr.Markdown("## Task success heatmap")
494
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
495
- with gr.Row():
496
- task_success_heatmap = gr.Plot()
497
- demo.load(
498
- lambda: create_task_success_heatmap(
499
- preprocessor.get_task_success_data('corebench_hard'),
500
- 'CORE-Bench-Hard'
501
- ),
502
- outputs=[task_success_heatmap]
503
- )
504
- with gr.Tab("CORE-Bench-Medium"):
505
- gr.HTML("""
506
- <p>
507
- <i><b>CORE-Bench-Medium:</b></i> The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents' ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
508
- </p>
509
- """)
510
- with gr.Row():
511
- with gr.Column(scale=2):
512
- Leaderboard(
513
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
514
- select_columns=SelectColumns(
515
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
516
- cant_deselect=["Agent Name"],
517
- label="Select Columns to Display:",
518
- ),
519
- hide_columns=config.COREBENCH_HIDE_COLUMNS,
520
- search_columns=config.COREBENCH_SEARCH_COLUMNS,
521
- )
522
- # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
523
- with gr.Row():
524
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
525
- with gr.Row():
526
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
527
-
528
- gr.HTML('<div style="height: 30px;"></div>')
529
- gr.Markdown("## Task success heatmap")
530
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
531
- with gr.Row():
532
- task_success_heatmap = gr.Plot()
533
- demo.load(
534
- lambda: create_task_success_heatmap(
535
- preprocessor.get_task_success_data('corebench_medium'),
536
- 'CORE-Bench-Medium'
537
- ),
538
- outputs=[task_success_heatmap]
539
- )
540
- with gr.Tab("CORE-Bench-Easy"):
541
- gr.HTML("""
542
- <p>
543
- <i><b>CORE-Bench-Easy:</b></i> The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
544
- </p>
545
- """)
546
- with gr.Row():
547
- with gr.Column(scale=2):
548
- Leaderboard(
549
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
550
- select_columns=SelectColumns(
551
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
552
- cant_deselect=["Agent Name"],
553
- label="Select Columns to Display:",
554
- ),
555
- hide_columns=config.COREBENCH_HIDE_COLUMNS,
556
- search_columns=config.COREBENCH_SEARCH_COLUMNS,
557
- )
558
- # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
559
- with gr.Row():
560
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
561
- with gr.Row():
562
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
563
 
564
- gr.HTML('<div style="height: 30px;"></div>')
565
- gr.Markdown("## Task success heatmap")
566
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
567
- with gr.Row():
568
- task_success_heatmap = gr.Plot()
569
- demo.load(
570
- lambda: create_task_success_heatmap(
571
- preprocessor.get_task_success_data('corebench_easy'),
572
- 'CORE-Bench-Easy'
573
- ),
574
- outputs=[task_success_heatmap]
575
- )
576
 
577
  gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
578
  with gr.Tab("USACO"):
@@ -1411,6 +1416,130 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1411
  # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1412
  # outputs=[raw_call_details])
1413
 
1414
  with gr.Tab("About"):
1415
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
1416
 
@@ -1429,9 +1558,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1429
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
1430
  gr.Markdown("""Coming soon...""")
1431
 
1432
-
1433
-
1434
-
1435
 
1436
  async def main():
1437
  # Preprocess traces
@@ -1442,7 +1569,7 @@ async def main():
1442
  # Download the results from the Hugging Face Hub
1443
  # await asyncio.to_thread(download_latest_results)
1444
 
1445
- # # Check for new uploads and process them
1446
  # await check_and_process_uploads()
1447
 
1448
  scheduler = AsyncIOScheduler()
@@ -1454,5 +1581,4 @@ async def main():
1454
  await demo.launch(favicon_path="hal.png")
1455
 
1456
  if __name__ == "__main__":
1457
- # weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
1458
  asyncio.run(main())
 
49
  return preprocessor.get_failure_report(agent_name, benchmark_name)
50
 
51
  def parse_json_files(folder_path, benchmark_name, aggregate=True):
52
+ # Handle inspect_evals prefix
53
+ if benchmark_name.startswith('inspect_evals/'):
54
+ actual_benchmark = benchmark_name.split('/')[-1]
55
+ return preprocessor.get_parsed_results(actual_benchmark, aggregate=aggregate)
56
  return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate)
57
 
58
  def update_agent_dropdown(benchmark_name, metric):
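For reference, the `inspect_evals/` handling added to `parse_json_files` reduces to a small name-normalization step. A minimal standalone sketch (the `normalize_benchmark_name` helper is illustrative only, not part of the commit):

```python
def normalize_benchmark_name(benchmark_name: str) -> str:
    """Strip an 'inspect_evals/' prefix so results are looked up by the bare benchmark name."""
    if benchmark_name.startswith('inspect_evals/'):
        return benchmark_name.split('/')[-1]
    return benchmark_name

# e.g. 'inspect_evals/gaia' -> 'gaia'; 'corebench_hard' is returned unchanged
assert normalize_benchmark_name('inspect_evals/gaia') == 'gaia'
assert normalize_benchmark_name('corebench_hard') == 'corebench_hard'
```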
 
463
  """)
464
 
465
  with gr.Tabs() as tabs:
466
+ with gr.Tab("CORE-Bench-Hard"):
467
  gr.HTML("""
468
  <p>
469
  CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
470
  </p>
471
  """)
472
+ gr.HTML("""
473
+ <p>
474
+ <i><b>CORE-Bench-Hard:</b></i> The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
475
+ </p>
476
+ """)
477
+ with gr.Row():
478
+ with gr.Column(scale=2):
479
+ Leaderboard(
480
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
481
+ select_columns=SelectColumns(
482
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
483
+ cant_deselect=["Agent Name"],
484
+ label="Select Columns to Display:",
485
+ ),
486
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
487
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
488
+ )
489
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
490
+ with gr.Row():
491
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
492
+ with gr.Row():
493
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
494
+
495
+ gr.HTML('<div style="height: 30px;"></div>')
496
+ gr.Markdown("## Task success heatmap")
497
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
498
+ with gr.Row():
499
+ task_success_heatmap = gr.Plot()
500
+ demo.load(
501
+ lambda: create_task_success_heatmap(
502
+ preprocessor.get_task_success_data('corebench_hard'),
503
+ 'CORE-Bench-Hard'
504
+ ),
505
+ outputs=[task_success_heatmap]
506
+ )
507
+ gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
508
+ with gr.Tab("CORE-Bench-Medium"):
509
+ gr.HTML("""
510
+ <p>
511
+ <i><b>CORE-Bench-Medium:</b></i> The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
512
+ </p>
513
+ """)
514
+ with gr.Row():
515
+ with gr.Column(scale=2):
516
+ Leaderboard(
517
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
518
+ select_columns=SelectColumns(
519
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
520
+ cant_deselect=["Agent Name"],
521
+ label="Select Columns to Display:",
522
+ ),
523
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
524
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
525
+ )
526
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
527
+ with gr.Row():
528
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
529
+ with gr.Row():
530
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
531
+
532
+ gr.HTML('<div style="height: 30px;"></div>')
533
+ gr.Markdown("## Task success heatmap")
534
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
535
+ with gr.Row():
536
+ task_success_heatmap = gr.Plot()
537
+ demo.load(
538
+ lambda: create_task_success_heatmap(
539
+ preprocessor.get_task_success_data('corebench_medium'),
540
+ 'CORE-Bench-Medium'
541
+ ),
542
+ outputs=[task_success_heatmap]
543
+ )
544
+ gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
545
+ with gr.Tab("CORE-Bench-Easy"):
546
+ gr.HTML("""
547
+ <p>
548
+ <i><b>CORE-Bench-Easy:</b></i> The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
549
+ </p>
550
+ """)
551
+ with gr.Row():
552
+ with gr.Column(scale=2):
553
+ Leaderboard(
554
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
555
+ select_columns=SelectColumns(
556
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
557
+ cant_deselect=["Agent Name"],
558
+ label="Select Columns to Display:",
559
+ ),
560
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
561
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
562
+ )
563
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
564
+ with gr.Row():
565
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
566
+ with gr.Row():
567
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
568
 
569
+ gr.HTML('<div style="height: 30px;"></div>')
570
+ gr.Markdown("## Task success heatmap")
571
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
572
+ with gr.Row():
573
+ task_success_heatmap = gr.Plot()
574
+ demo.load(
575
+ lambda: create_task_success_heatmap(
576
+ preprocessor.get_task_success_data('corebench_easy'),
577
+ 'CORE-Bench-Easy'
578
+ ),
579
+ outputs=[task_success_heatmap]
580
+ )
581
 
582
  gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
583
  with gr.Tab("USACO"):
 
1416
  # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1417
  # outputs=[raw_call_details])
1418
 
1419
+ with gr.Tab("AppWorld Normal"):
1420
+ gr.Markdown("""AppWorld Normal is a benchmark suite containing standard programming tasks that represent typical real-world development scenarios. These tasks evaluate a language model's ability to handle common coding challenges across different domains. Even at this baseline level, current state-of-the-art models like GPT-4 achieve only a 49% success rate, demonstrating that even "normal" software development tasks remain challenging for AI systems.""")
1421
+ with gr.Row():
1422
+ with gr.Column(scale=2):
1423
+ Leaderboard(
1424
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_normal'), ci_metrics=["Accuracy", "Total Cost"]),
1425
+ select_columns=SelectColumns(
1426
+ default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
1427
+ cant_deselect=["Agent Name"],
1428
+ label="Select Columns to Display:",
1429
+ ),
1430
+ hide_columns=config.APPWORLD_HIDE_COLUMNS,
1431
+ search_columns=config.APPWORLD_SEARCH_COLUMNS,
1432
+ )
1433
+ with gr.Row():
1434
+ gr.Markdown("### Accuracy vs. Cost on AppWorld")
1435
+ with gr.Row():
1436
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1437
+
1438
+ gr.HTML('<div style="height: 30px;"></div>')
1439
+ gr.Markdown("## Task success heatmap")
1440
+ with gr.Row():
1441
+ task_success_heatmap = gr.Plot()
1442
+ demo.load(
1443
+ lambda: create_task_success_heatmap(
1444
+ preprocessor.get_task_success_data('appworld test_normal'),
1445
+ 'AppWorld'
1446
+ ),
1447
+ outputs=[task_success_heatmap]
1448
+ )
1449
+
1450
+ with gr.Tab("AppWorld Challenge"):
1451
+ gr.Markdown("""AppWorld Challenge is a more demanding subset of tasks designed to push the boundaries of AI coding capabilities. These tasks feature increased complexity, requiring sophisticated problem-solving and deeper understanding of programming concepts. The significant drop in performance (to 30% success rate for GPT-4) on these challenge tasks indicates they effectively test the upper limits of current AI coding abilities.""")
1452
+ with gr.Row():
1453
+ with gr.Column(scale=2):
1454
+ Leaderboard(
1455
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_challenge'), ci_metrics=["Accuracy", "Total Cost"]),
1456
+ select_columns=SelectColumns(
1457
+ default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
1458
+ cant_deselect=["Agent Name"],
1459
+ label="Select Columns to Display:",
1460
+ ),
1461
+ hide_columns=config.APPWORLD_HIDE_COLUMNS,
1462
+ search_columns=config.APPWORLD_SEARCH_COLUMNS,
1463
+ )
1464
+ with gr.Row():
1465
+ gr.Markdown("### Accuracy vs. Cost on AppWorld")
1466
+ with gr.Row():
1467
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_challenge', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1468
+
1469
+ gr.HTML('<div style="height: 30px;"></div>')
1470
+ gr.Markdown("## Task success heatmap")
1471
+ with gr.Row():
1472
+ task_success_heatmap = gr.Plot()
1473
+ demo.load(
1474
+ lambda: create_task_success_heatmap(
1475
+ preprocessor.get_task_success_data('appworld test_challenge'),
1476
+ 'AppWorld'
1477
+ ),
1478
+ outputs=[task_success_heatmap]
1479
+ )
1480
+
1481
+ with gr.Tab("Cybench"):
1482
+ gr.Markdown("""Cybench is a benchmark for evaluating the cybersecurity capabilities and risks of language models. Cybench includes 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. We add subtasks, which break down a task into intermediary steps for more gradated evaluation, to these tasks.""")
1483
+ with gr.Row():
1484
+ with gr.Column(scale=2):
1485
+ Leaderboard(
1486
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'cybench'), ci_metrics=["Accuracy", "Total Cost"]),
1487
+ select_columns=SelectColumns(
1488
+ default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified"],
1489
+ cant_deselect=["Agent Name"],
1490
+ label="Select Columns to Display:",
1491
+ ),
1492
+ hide_columns=config.CYBENCH_HIDE_COLUMNS,
1493
+ search_columns=config.CYBENCH_SEARCH_COLUMNS,
1494
+ )
1495
+ with gr.Row():
1496
+ gr.Markdown("### Accuracy vs. Cost on Cybench")
1497
+ with gr.Row():
1498
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'cybench', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1499
+
1500
+ gr.HTML('<div style="height: 30px;"></div>')
1501
+ gr.Markdown("## Task success heatmap")
1502
+ with gr.Row():
1503
+ task_success_heatmap = gr.Plot()
1504
+ demo.load(
1505
+ lambda: create_task_success_heatmap(
1506
+ preprocessor.get_task_success_data('cybench'),
1507
+ 'Cybench'
1508
+ ),
1509
+ outputs=[task_success_heatmap]
1510
+ )
1511
+
1512
+ with gr.Tab("GAIA"):
1513
+ gr.Markdown("""GAIA is a benchmark for General AI Assistants. GAIA proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and general tool-use proficiency. GAIA is made of more than 450 non-trivial questions with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided into 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicates a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.""")
1514
+ with gr.Row():
1515
+ with gr.Column(scale=2):
1516
+ Leaderboard(
1517
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'gaia'), ci_metrics=["Accuracy", "Total Cost"]),
1518
+ select_columns=SelectColumns(
1519
+ default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified"],
1520
+ cant_deselect=["Agent Name"],
1521
+ label="Select Columns to Display:",
1522
+ ),
1523
+ hide_columns=config.GAIA_HIDE_COLUMNS,
1524
+ search_columns=config.GAIA_SEARCH_COLUMNS,
1525
+ )
1526
+ with gr.Row():
1527
+ gr.Markdown("### Accuracy vs. Cost on GAIA")
1528
+ with gr.Row():
1529
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'gaia', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1530
+
1531
+ gr.HTML('<div style="height: 30px;"></div>')
1532
+ gr.Markdown("## Task success heatmap")
1533
+ with gr.Row():
1534
+ task_success_heatmap = gr.Plot()
1535
+ demo.load(
1536
+ lambda: create_task_success_heatmap(
1537
+ preprocessor.get_task_success_data('gaia'),
1538
+ 'GAIA'
1539
+ ),
1540
+ outputs=[task_success_heatmap]
1541
+ )
1542
+
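The new benchmark tabs above all repeat the same layout: a Leaderboard, an accuracy-vs-cost scatter plot, and a task success heatmap. As a hedged sketch only (not part of the commit), that pattern could be factored into a helper; this assumes the imports and helpers already used in app.py (`gr`, `Leaderboard`, `SelectColumns`, `create_leaderboard`, `parse_json_files`, `create_scatter_plot`, `create_task_success_heatmap`, `preprocessor`, `demo`, `config`, `abs_path`) and must be called inside the existing `gr.Blocks`/`gr.Tab` context:

```python
# Hypothetical helper; mirrors the calls the AppWorld/Cybench/GAIA tabs make verbatim.
def render_benchmark_tab(benchmark_id, display_name, on_load_cols, hide_cols, search_cols):
    results_dir = os.path.join(abs_path, "evals_live")
    with gr.Row():
        with gr.Column(scale=2):
            Leaderboard(
                value=create_leaderboard(
                    parse_json_files(results_dir, benchmark_id),
                    ci_metrics=["Accuracy", "Total Cost"],
                ),
                select_columns=SelectColumns(
                    default_selection=on_load_cols + ["Verified"],
                    cant_deselect=["Agent Name"],
                    label="Select Columns to Display:",
                ),
                hide_columns=hide_cols,
                search_columns=search_cols,
            )
    with gr.Row():
        gr.Markdown(f"### Accuracy vs. Cost on {display_name}")
    with gr.Row():
        gr.Plot(create_scatter_plot(
            parse_json_files(results_dir, benchmark_id, aggregate=False),
            "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"],
        ))
    gr.HTML('<div style="height: 30px;"></div>')
    gr.Markdown("## Task success heatmap")
    with gr.Row():
        heatmap = gr.Plot()
    demo.load(
        lambda: create_task_success_heatmap(
            preprocessor.get_task_success_data(benchmark_id), display_name
        ),
        outputs=[heatmap],
    )

# The GAIA tab body would then reduce to:
# render_benchmark_tab('gaia', 'GAIA', config.GAIA_ON_LOAD_COLUMNS,
#                      config.GAIA_HIDE_COLUMNS, config.GAIA_SEARCH_COLUMNS)
```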
1543
  with gr.Tab("About"):
1544
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
1545
 
 
1558
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
1559
  gr.Markdown("""Coming soon...""")
1560
 
1561
+
 
 
1562
 
1563
  async def main():
1564
  # Preprocess traces
 
1569
  # Download the results from the Hugging Face Hub
1570
  # await asyncio.to_thread(download_latest_results)
1571
 
1572
+ # Check for new uploads and process them
1573
  # await check_and_process_uploads()
1574
 
1575
  scheduler = AsyncIOScheduler()
 
1581
  await demo.launch(favicon_path="hal.png")
1582
 
1583
  if __name__ == "__main__":
 
1584
  asyncio.run(main())
config.py CHANGED
@@ -13,7 +13,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
13
  "Runs",
14
  ]
15
  SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
16
- SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
17
 
18
  USACO_ON_LOAD_COLUMNS = [
19
  "Agent Name",
@@ -22,7 +22,7 @@ USACO_ON_LOAD_COLUMNS = [
22
  "Runs",
23
  ]
24
  USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
25
- USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
26
 
27
  COREBENCH_ON_LOAD_COLUMNS = [
28
  "Agent Name",
@@ -31,7 +31,7 @@ COREBENCH_ON_LOAD_COLUMNS = [
31
  "Runs",
32
  ]
33
  COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
34
- COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
35
 
36
 
37
 
@@ -53,4 +53,34 @@ NUMERIC_INTERVALS = {
53
  "~35": pd.Interval(20, 45, closed="right"),
54
  "~60": pd.Interval(45, 70, closed="right"),
55
  "70+": pd.Interval(70, 10000, closed="right"),
56
- }
13
  "Runs",
14
  ]
15
  SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
16
+ SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
17
 
18
  USACO_ON_LOAD_COLUMNS = [
19
  "Agent Name",
 
22
  "Runs",
23
  ]
24
  USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
25
+ USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
26
 
27
  COREBENCH_ON_LOAD_COLUMNS = [
28
  "Agent Name",
 
31
  "Runs",
32
  ]
33
  COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
34
+ COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
35
 
36
 
37
 
 
53
  "~35": pd.Interval(20, 45, closed="right"),
54
  "~60": pd.Interval(45, 70, closed="right"),
55
  "70+": pd.Interval(70, 10000, closed="right"),
56
+ }
57
+
58
+ CYBENCH_ON_LOAD_COLUMNS = [
59
+ "Agent Name",
60
+ "Accuracy",
61
+ "Total Cost",
62
+ "Runs",
63
+ ]
64
+ CYBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
65
+ CYBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
66
+
67
+ APPWORLD_ON_LOAD_COLUMNS = [
68
+ "Agent Name",
69
+ "Accuracy",
70
+ "Total Cost",
71
+ "Runs",
72
+ ]
73
+ APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
74
+ APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
75
+
76
+ GAIA_ON_LOAD_COLUMNS = [
77
+ "Agent Name",
78
+ "Accuracy",
79
+ "Level 1 Accuracy",
80
+ "Level 2 Accuracy",
81
+ "Level 3 Accuracy",
82
+ "Total Cost",
83
+ "Runs",
84
+ ]
85
+ GAIA_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
86
+ GAIA_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
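The new `*_ON_LOAD_COLUMNS` / `*_HIDE_COLUMNS` / `*_SEARCH_COLUMNS` constants for Cybench, AppWorld, and GAIA repeat the same base hide list. A possible consolidation, shown only as a sketch (the `_BASE_HIDE_COLUMNS` and `_LEVEL_COLUMNS` names are hypothetical, not in the commit):

```python
# Hypothetical consolidation of the hide lists defined above.
_BASE_HIDE_COLUMNS = [
    "F1 Score", "AUC", "Precision", "Recall", "benchmark_name",
    "Overall Score", "Vectorization Score", "Fathomnet Score", "Feedback Score",
    "House Price Score", "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score", "IMDB Score",
]
_LEVEL_COLUMNS = ["Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]

CYBENCH_HIDE_COLUMNS = _BASE_HIDE_COLUMNS + _LEVEL_COLUMNS
APPWORLD_HIDE_COLUMNS = _BASE_HIDE_COLUMNS + _LEVEL_COLUMNS
# GAIA keeps its per-level accuracies visible, so only the base set is hidden.
GAIA_HIDE_COLUMNS = list(_BASE_HIDE_COLUMNS)
```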
scratch.ipynb DELETED
File without changes
scratch.py DELETED
@@ -1,38 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- def process_json_files(directory, suffix="_updated"):
6
- # Iterate through all JSON files in the directory
7
- for filename in os.listdir(directory):
8
- if filename.endswith(".json") and "USACO" in filename:
9
- file_path = os.path.join(directory, filename)
10
-
11
- # Read the JSON file
12
- with open(file_path, 'r') as f:
13
- data = json.load(f)
14
-
15
- # Extract sdict from raw_eval_results
16
- sdict = data['raw_eval_results']['sdict']
17
-
18
- # Calculate successful_tasks and failed_tasks
19
- successful_tasks = [key for key in sdict if float(sdict[key][0]['result']['fraction_passed']) == 1]
20
- failed_tasks = [key for key in sdict if float(sdict[key][0]['result']['fraction_passed']) < 1]
21
-
22
- # Add new key-value pairs to the results
23
- data['results']['successful_tasks'] = successful_tasks
24
- data['results']['failed_tasks'] = failed_tasks
25
-
26
- # Create new filename with suffix
27
- new_filename = f"{Path(filename).stem}{suffix}{Path(filename).suffix}"
28
- new_file_path = os.path.join(directory, new_filename)
29
-
30
- # Write updated data to new file
31
- with open(new_file_path, 'w') as f:
32
- json.dump(data, f, indent=4)
33
-
34
- print(f"Processed {filename} and saved as {new_filename}")
35
-
36
- # Usage
37
- directory_path = "/Users/benediktstroebl/Documents/GitHub/leaderboard/evals_live"
38
- process_json_files(directory_path)
utils/data.py CHANGED
@@ -6,63 +6,6 @@ from utils.pareto import Agent, compute_pareto_frontier
6
  import plotly.graph_objects as go
7
  import textwrap
8
 
9
- # def parse_json_files(folder_path, benchmark_name):
10
- # # Convert folder path to Path object
11
- # folder = Path(folder_path)
12
-
13
- # # List to store data from each file
14
- # data_list = []
15
-
16
- # # Iterate through all JSON files in the folder
17
- # for json_file in folder.glob('*.json'):
18
- # try:
19
- # with open(json_file, 'r') as file:
20
- # data = json.load(file)
21
-
22
- # # Extract config and results
23
- # config = data['config']
24
- # results = data['results']
25
-
26
- # # Combine config and results into a single dictionary
27
- # combined_data = {
28
- # 'agent_name': config['agent_name'],
29
- # 'benchmark_name': config['benchmark_name'],
30
- # 'date': config['date']
31
- # }
32
-
33
- # # Add results with 'results_' prefix
34
- # for key, value in results.items():
35
- # combined_data[f'results_{key}'] = value
36
-
37
- # data_list.append(combined_data)
38
- # except Exception as e:
39
- # print(f"Error processing {json_file}: {e}. Skipping!")
40
-
41
- # # Create DataFrame from the list of dictionaries
42
- # df = pd.DataFrame(data_list)
43
- # df = df[df['benchmark_name'] == benchmark_name]
44
-
45
- # # sort df by descending accuracy
46
- # df = df.sort_values(by='results_accuracy', ascending=False)
47
-
48
- # # round all float columns to 2 decimal places
49
- # for column in df.select_dtypes(include='float').columns:
50
- # df[column] = df[column].round(3)
51
-
52
- # # Rename columns
53
- # df = df.rename(columns={
54
- # 'agent_name': 'Agent Name',
55
- # 'results_total_cost': 'Total Cost',
56
- # 'results_accuracy': 'Accuracy',
57
- # 'results_precision': 'Precision',
58
- # 'results_recall': 'Recall',
59
- # 'results_f1_score': 'F1 Score',
60
- # 'results_auc': 'AUC',
61
- # })
62
-
63
- # return df
64
-
65
-
66
  def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
67
  agents = [Agent(row.results_total_cost, row.results_accuracy) for row in df.itertuples()]
68
  pareto_frontier = compute_pareto_frontier(agents)
 
6
  import plotly.graph_objects as go
7
  import textwrap
8
 
9
  def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
10
  agents = [Agent(row.results_total_cost, row.results_accuracy) for row in df.itertuples()]
11
  pareto_frontier = compute_pareto_frontier(agents)
utils/db.py CHANGED
@@ -65,6 +65,9 @@ class TracePreprocessor:
65
  amp_parkinsons_disease_progression_prediction_score REAL,
66
  cifar10_score REAL,
67
  imdb_score REAL,
 
 
 
68
  PRIMARY KEY (benchmark_name, agent_name, run_id)
69
  )
70
  ''')
@@ -77,6 +80,8 @@ class TracePreprocessor:
77
  data = json.load(f)
78
  agent_name = data['config']['agent_name']
79
  benchmark_name = data['config']['benchmark_name']
 
 
80
  date = data['config']['date']
81
  config = data['config']
82
 
@@ -108,8 +113,8 @@ class TracePreprocessor:
108
  with self.get_conn() as conn:
109
  conn.execute('''
110
  INSERT INTO parsed_results
111
- (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
112
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
113
  ''', (
114
  benchmark_name,
115
  agent_name,
@@ -131,7 +136,10 @@ class TracePreprocessor:
131
  results.get('spaceship-titanic_score'),
132
  results.get('amp-parkinsons-disease-progression-prediction_score'),
133
  results.get('cifar10_score'),
134
- results.get('imdb_score')
 
 
 
135
  ))
136
  except Exception as e:
137
  print(f"Error preprocessing parsed results in {file}: {e}")
@@ -257,12 +265,15 @@ class TracePreprocessor:
257
  'overall_score': 'mean',
258
  'vectorization_score': 'mean',
259
  'fathomnet_score': 'mean',
260
- 'feedback_score': 'mean',
261
  'house_price_score': 'mean',
262
  'spaceship_titanic_score': 'mean',
263
  'amp_parkinsons_disease_progression_prediction_score': 'mean',
264
  'cifar10_score': 'mean',
265
  'imdb_score': 'mean',
 
 
 
266
  'Verified': 'first',
267
  'Runs': 'first',
268
  'acc_ci': 'first',
@@ -270,7 +281,7 @@ class TracePreprocessor:
270
  }).reset_index()
271
 
272
  # Round float columns to 3 decimal places
273
- float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
274
  for column in float_columns:
275
  if column in df.columns:
276
  df[column] = df[column].round(3)
@@ -297,6 +308,9 @@ class TracePreprocessor:
297
  'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
298
  'cifar10_score': 'CIFAR10 Score',
299
  'imdb_score': 'IMDB Score',
 
 
 
300
  'acc_ci': 'Accuracy CI',
301
  'cost_ci': 'Total Cost CI'
302
  })
 
65
  amp_parkinsons_disease_progression_prediction_score REAL,
66
  cifar10_score REAL,
67
  imdb_score REAL,
68
+ level_1_accuracy REAL,
69
+ level_2_accuracy REAL,
70
+ level_3_accuracy REAL,
71
  PRIMARY KEY (benchmark_name, agent_name, run_id)
72
  )
73
  ''')
 
80
  data = json.load(f)
81
  agent_name = data['config']['agent_name']
82
  benchmark_name = data['config']['benchmark_name']
83
+ if "inspect" in benchmark_name:
84
+ benchmark_name = benchmark_name.split("/")[-1]
85
  date = data['config']['date']
86
  config = data['config']
87
 
 
113
  with self.get_conn() as conn:
114
  conn.execute('''
115
  INSERT INTO parsed_results
116
+ (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score, level_1_accuracy, level_2_accuracy, level_3_accuracy)
117
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
118
  ''', (
119
  benchmark_name,
120
  agent_name,
 
136
  results.get('spaceship-titanic_score'),
137
  results.get('amp-parkinsons-disease-progression-prediction_score'),
138
  results.get('cifar10_score'),
139
+ results.get('imdb_score'),
140
+ results.get('level_1_accuracy'),
141
+ results.get('level_2_accuracy'),
142
+ results.get('level_3_accuracy')
143
  ))
144
  except Exception as e:
145
  print(f"Error preprocessing parsed results in {file}: {e}")
 
265
  'overall_score': 'mean',
266
  'vectorization_score': 'mean',
267
  'fathomnet_score': 'mean',
268
+ 'feedback_score': 'mean',
269
  'house_price_score': 'mean',
270
  'spaceship_titanic_score': 'mean',
271
  'amp_parkinsons_disease_progression_prediction_score': 'mean',
272
  'cifar10_score': 'mean',
273
  'imdb_score': 'mean',
274
+ 'level_1_accuracy': 'mean',
275
+ 'level_2_accuracy': 'mean',
276
+ 'level_3_accuracy': 'mean',
277
  'Verified': 'first',
278
  'Runs': 'first',
279
  'acc_ci': 'first',
 
281
  }).reset_index()
282
 
283
  # Round float columns to 3 decimal places
284
+ float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
285
  for column in float_columns:
286
  if column in df.columns:
287
  df[column] = df[column].round(3)
 
308
  'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
309
  'cifar10_score': 'CIFAR10 Score',
310
  'imdb_score': 'IMDB Score',
311
+ 'level_1_accuracy': 'Level 1 Accuracy',
312
+ 'level_2_accuracy': 'Level 2 Accuracy',
313
+ 'level_3_accuracy': 'Level 3 Accuracy',
314
  'acc_ci': 'Accuracy CI',
315
  'cost_ci': 'Total Cost CI'
316
  })
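For reference, the utils/db.py changes add three nullable per-level accuracy columns (`level_1_accuracy`, `level_2_accuracy`, `level_3_accuracy`) and thread them through the INSERT, the aggregation, and the column renaming. A minimal standalone SQLite sketch of that flow (the in-memory table and values are illustrative, not the app's actual schema or data):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE parsed_results (
        benchmark_name TEXT,
        agent_name TEXT,
        accuracy REAL,
        level_1_accuracy REAL,
        level_2_accuracy REAL,
        level_3_accuracy REAL,
        PRIMARY KEY (benchmark_name, agent_name)
    )
""")

# results.get(...) returns None for levels a run did not report, which lands as NULL.
results = {"accuracy": 0.41, "level_1_accuracy": 0.62, "level_2_accuracy": 0.38}
conn.execute(
    "INSERT INTO parsed_results VALUES (?, ?, ?, ?, ?, ?)",
    (
        "gaia",
        "Inspect Default Agent (gpt-4o-2024-11-20)",
        results.get("accuracy"),
        results.get("level_1_accuracy"),
        results.get("level_2_accuracy"),
        results.get("level_3_accuracy"),
    ),
)
print(conn.execute("SELECT level_3_accuracy FROM parsed_results").fetchone())  # (None,)
```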
utils/processing.py CHANGED
@@ -93,12 +93,13 @@ async def check_upload_structure(file_path):
93
  return {'is_valid': False, 'message': f"Missing required keys: {', '.join(missing_keys)}"}
94
 
95
  # Check for specific structure in raw_logging_results
96
- if not isinstance(data['raw_logging_results'], list):
97
  return {'is_valid': False, 'message': "raw_logging_results should be a list"}
98
 
99
- for item in data['raw_logging_results']:
100
- if not all(key in item for key in ['weave_task_id', 'inputs', 'outputs']):
101
- return {'is_valid': False, 'message': "Each item in raw_logging_results should have weave_task_id, inputs, and outputs"}
 
102
 
103
  return {'is_valid': True, 'message': "File structure is valid"}
104
 
@@ -115,10 +116,16 @@ async def process_upload(input_path, output_path):
115
  data = json.loads(f.read())
116
 
117
  assert 'raw_logging_results' in data, "raw_logging_results key not found in the file"
118
- openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
119
 
120
  try:
121
- processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
122
  # failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
123
 
124
  data['raw_logging_results'] = processed_calls
 
93
  return {'is_valid': False, 'message': f"Missing required keys: {', '.join(missing_keys)}"}
94
 
95
  # Check for specific structure in raw_logging_results
96
+ if not isinstance(data['raw_logging_results'], list) and not "inspect" in data['config']['benchmark_name']:
97
  return {'is_valid': False, 'message': "raw_logging_results should be a list"}
98
 
99
+ if "inspect" not in data['config']['benchmark_name']:
100
+ for item in data['raw_logging_results']:
101
+ if not all(key in item for key in ['weave_task_id', 'inputs', 'outputs']):
102
+ return {'is_valid': False, 'message': "Each item in raw_logging_results should have weave_task_id, inputs, and outputs"}
103
 
104
  return {'is_valid': True, 'message': "File structure is valid"}
105
 
 
116
  data = json.loads(f.read())
117
 
118
  assert 'raw_logging_results' in data, "raw_logging_results key not found in the file"
 
119
 
120
  try:
121
+ if isinstance(data['raw_logging_results'], list):
122
+ openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
123
+ processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
124
+ else:
125
+ processed_calls = data['raw_logging_results']
126
+
127
+
128
+ # # experimental
129
  # failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
130
 
131
  data['raw_logging_results'] = processed_calls
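For reference, the utils/processing.py changes relax validation and processing for Inspect-based benchmarks: `raw_logging_results` no longer has to be a list of weave calls when `"inspect"` appears in the benchmark name, and `analyze_agent_steps` is only run when it is a list. A simplified standalone sketch of the validation branch (the `validate_raw_logging_results` helper is hypothetical):

```python
def validate_raw_logging_results(data: dict) -> dict:
    """Simplified mirror of the check_upload_structure logic after this commit."""
    benchmark_name = data['config']['benchmark_name']
    raw = data['raw_logging_results']

    if "inspect" in benchmark_name:
        # Inspect logs are accepted as-is; no per-call structure is enforced.
        return {'is_valid': True, 'message': "File structure is valid"}

    if not isinstance(raw, list):
        return {'is_valid': False, 'message': "raw_logging_results should be a list"}

    for item in raw:
        if not all(key in item for key in ['weave_task_id', 'inputs', 'outputs']):
            return {'is_valid': False,
                    'message': "Each item in raw_logging_results should have weave_task_id, inputs, and outputs"}

    return {'is_valid': True, 'message': "File structure is valid"}
```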
verified_agents.yaml CHANGED
@@ -66,6 +66,8 @@ swebench_verified_mini:
66
  swebench_verified:
67
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
68
  verification_date: 2024-10-30
 
 
69
 
70
  mlagentbench:
71
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
@@ -100,4 +102,28 @@ corebench_hard:
100
  - agent_name: "CORE-Agent (GPT-4o)"
101
  verification_date: 2024-09-28
102
  - agent_name: "CORE-Agent (GPT-4o-mini)"
103
- verification_date: 2024-09-28
66
  swebench_verified:
67
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
68
  verification_date: 2024-10-30
69
+ - agent_name: "Agentless (o1-mini-2024-09-12)"
70
+ verification_date: 2024-10-30
71
 
72
  mlagentbench:
73
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
 
102
  - agent_name: "CORE-Agent (GPT-4o)"
103
  verification_date: 2024-09-28
104
  - agent_name: "CORE-Agent (GPT-4o-mini)"
105
+ verification_date: 2024-09-28
106
+
107
+ gaia:
108
+ - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
109
+ verification_date: 2024-11-30
110
+ - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
111
+ verification_date: 2024-11-30
112
+ - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
113
+ verification_date: 2024-11-30
114
+ - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
115
+ verification_date: 2024-11-30
116
+
117
+ cybench:
118
+ - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
119
+ verification_date: 2024-11-30
120
+ - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
121
+ verification_date: 2024-11-30
122
+ - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
123
+ verification_date: 2024-11-30
124
+ - agent_name: "Inspect Default Agent (o1-mini-2024-09-12)"
125
+ verification_date: 2024-11-30
126
+ - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
127
+ verification_date: 2024-11-30
128
+
129
+