benediktstroebl committed on
Commit
201da5d
·
1 Parent(s): 4bb1605

added AppWorld, GAIA, and Cybench benchmarks.

Files changed (8)
  1. app.py +239 -113
  2. config.py +34 -4
  3. scratch.ipynb +0 -0
  4. scratch.py +0 -38
  5. utils/data.py +0 -57
  6. utils/db.py +19 -5
  7. utils/processing.py +13 -6
  8. verified_agents.yaml +27 -1
app.py CHANGED
@@ -49,6 +49,10 @@ def get_failure_report(agent_name, benchmark_name):
49
  return preprocessor.get_failure_report(agent_name, benchmark_name)
50
 
51
  def parse_json_files(folder_path, benchmark_name, aggregate=True):
 
 
 
 
52
  return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate)
53
 
54
  def update_agent_dropdown(benchmark_name, metric):
@@ -459,120 +463,121 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
459
  """)
460
 
461
  with gr.Tabs() as tabs:
462
- with gr.Tab("CORE-Bench"):
463
  gr.HTML("""
464
  <p>
465
  CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
466
  </p>
467
  """)
468
- with gr.Tab("CORE-Bench-Hard"):
469
- gr.HTML("""
470
- <p>
471
- <i><b>CORE-Bench-Hard:</b></i> The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
472
- </p>
473
- """)
474
- with gr.Row():
475
- with gr.Column(scale=2):
476
- Leaderboard(
477
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
478
- select_columns=SelectColumns(
479
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
480
- cant_deselect=["Agent Name"],
481
- label="Select Columns to Display:",
482
- ),
483
- hide_columns=config.COREBENCH_HIDE_COLUMNS,
484
- search_columns=config.COREBENCH_SEARCH_COLUMNS,
485
- )
486
- # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
487
- with gr.Row():
488
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
489
- with gr.Row():
490
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
491
-
492
- gr.HTML('<div style="height: 30px;"></div>')
493
- gr.Markdown("## Task success heatmap")
494
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
495
- with gr.Row():
496
- task_success_heatmap = gr.Plot()
497
- demo.load(
498
- lambda: create_task_success_heatmap(
499
- preprocessor.get_task_success_data('corebench_hard'),
500
- 'CORE-Bench-Hard'
501
- ),
502
- outputs=[task_success_heatmap]
503
- )
504
- with gr.Tab("CORE-Bench-Medium"):
505
- gr.HTML("""
506
- <p>
507
- <i><b>CORE-Bench-Medium:</b></i> The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents' ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
508
- </p>
509
- """)
510
- with gr.Row():
511
- with gr.Column(scale=2):
512
- Leaderboard(
513
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
514
- select_columns=SelectColumns(
515
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
516
- cant_deselect=["Agent Name"],
517
- label="Select Columns to Display:",
518
- ),
519
- hide_columns=config.COREBENCH_HIDE_COLUMNS,
520
- search_columns=config.COREBENCH_SEARCH_COLUMNS,
521
- )
522
- # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
523
- with gr.Row():
524
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
525
- with gr.Row():
526
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
527
-
528
- gr.HTML('<div style="height: 30px;"></div>')
529
- gr.Markdown("## Task success heatmap")
530
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
531
- with gr.Row():
532
- task_success_heatmap = gr.Plot()
533
- demo.load(
534
- lambda: create_task_success_heatmap(
535
- preprocessor.get_task_success_data('corebench_medium'),
536
- 'CORE-Bench-Medium'
537
- ),
538
- outputs=[task_success_heatmap]
539
- )
540
- with gr.Tab("CORE-Bench-Easy"):
541
- gr.HTML("""
542
- <p>
543
- <i><b>CORE-Bench-Easy:</b></i> The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
544
- </p>
545
- """)
546
- with gr.Row():
547
- with gr.Column(scale=2):
548
- Leaderboard(
549
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
550
- select_columns=SelectColumns(
551
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
552
- cant_deselect=["Agent Name"],
553
- label="Select Columns to Display:",
554
- ),
555
- hide_columns=config.COREBENCH_HIDE_COLUMNS,
556
- search_columns=config.COREBENCH_SEARCH_COLUMNS,
557
- )
558
- # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
559
- with gr.Row():
560
- gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
561
- with gr.Row():
562
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
563
 
564
- gr.HTML('<div style="height: 30px;"></div>')
565
- gr.Markdown("## Task success heatmap")
566
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
567
- with gr.Row():
568
- task_success_heatmap = gr.Plot()
569
- demo.load(
570
- lambda: create_task_success_heatmap(
571
- preprocessor.get_task_success_data('corebench_easy'),
572
- 'CORE-Bench-Easy'
573
- ),
574
- outputs=[task_success_heatmap]
575
- )
576
 
577
  gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
578
  with gr.Tab("USACO"):
@@ -1411,6 +1416,130 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1411
  # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1412
  # outputs=[raw_call_details])
1413
 
1414
  with gr.Tab("About"):
1415
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
1416
 
@@ -1429,9 +1558,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1429
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
1430
  gr.Markdown("""Coming soon...""")
1431
 
1432
-
1433
-
1434
-
1435
 
1436
  async def main():
1437
  # Preprocess traces
@@ -1442,7 +1569,7 @@ async def main():
1442
  # Download the results from the Hugging Face Hub
1443
  # await asyncio.to_thread(download_latest_results)
1444
 
1445
- # # Check for new uploads and process them
1446
  # await check_and_process_uploads()
1447
 
1448
  scheduler = AsyncIOScheduler()
@@ -1454,5 +1581,4 @@ async def main():
1454
  await demo.launch(favicon_path="hal.png")
1455
 
1456
  if __name__ == "__main__":
1457
- # weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
1458
  asyncio.run(main())
 
49
  return preprocessor.get_failure_report(agent_name, benchmark_name)
50
 
51
  def parse_json_files(folder_path, benchmark_name, aggregate=True):
52
+ # Handle inspect_evals prefix
53
+ if benchmark_name.startswith('inspect_evals/'):
54
+ actual_benchmark = benchmark_name.split('/')[-1]
55
+ return preprocessor.get_parsed_results(actual_benchmark, aggregate=aggregate)
56
  return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate)
57
 
58
  def update_agent_dropdown(benchmark_name, metric):
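For reference, the `inspect_evals/` handling added to `parse_json_files` reduces to a small name-normalization step. A minimal standalone sketch (the `normalize_benchmark_name` helper is illustrative only, not part of the commit):

```python
def normalize_benchmark_name(benchmark_name: str) -> str:
    """Strip an 'inspect_evals/' prefix so results are looked up by the bare benchmark name."""
    if benchmark_name.startswith('inspect_evals/'):
        return benchmark_name.split('/')[-1]
    return benchmark_name

# e.g. 'inspect_evals/gaia' -> 'gaia'; 'corebench_hard' is returned unchanged
assert normalize_benchmark_name('inspect_evals/gaia') == 'gaia'
assert normalize_benchmark_name('corebench_hard') == 'corebench_hard'
```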
 
463
  """)
464
 
465
  with gr.Tabs() as tabs:
466
+ with gr.Tab("CORE-Bench-Hard"):
467
  gr.HTML("""
468
  <p>
469
  CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
470
  </p>
471
  """)
472
+ gr.HTML("""
473
+ <p>
474
+ <i><b>CORE-Bench-Hard:</b></i> The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
475
+ </p>
476
+ """)
477
+ with gr.Row():
478
+ with gr.Column(scale=2):
479
+ Leaderboard(
480
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
481
+ select_columns=SelectColumns(
482
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
483
+ cant_deselect=["Agent Name"],
484
+ label="Select Columns to Display:",
485
+ ),
486
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
487
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
488
+ )
489
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
490
+ with gr.Row():
491
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
492
+ with gr.Row():
493
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
494
+
495
+ gr.HTML('<div style="height: 30px;"></div>')
496
+ gr.Markdown("## Task success heatmap")
497
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
498
+ with gr.Row():
499
+ task_success_heatmap = gr.Plot()
500
+ demo.load(
501
+ lambda: create_task_success_heatmap(
502
+ preprocessor.get_task_success_data('corebench_hard'),
503
+ 'CORE-Bench-Hard'
504
+ ),
505
+ outputs=[task_success_heatmap]
506
+ )
507
+ gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
508
+ with gr.Tab("CORE-Bench-Medium"):
509
+ gr.HTML("""
510
+ <p>
511
+ <i><b>CORE-Bench-Medium:</b></i> The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
512
+ </p>
513
+ """)
514
+ with gr.Row():
515
+ with gr.Column(scale=2):
516
+ Leaderboard(
517
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
518
+ select_columns=SelectColumns(
519
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
520
+ cant_deselect=["Agent Name"],
521
+ label="Select Columns to Display:",
522
+ ),
523
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
524
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
525
+ )
526
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
527
+ with gr.Row():
528
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Medium")
529
+ with gr.Row():
530
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
531
+
532
+ gr.HTML('<div style="height: 30px;"></div>')
533
+ gr.Markdown("## Task success heatmap")
534
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
535
+ with gr.Row():
536
+ task_success_heatmap = gr.Plot()
537
+ demo.load(
538
+ lambda: create_task_success_heatmap(
539
+ preprocessor.get_task_success_data('corebench_medium'),
540
+ 'CORE-Bench-Medium'
541
+ ),
542
+ outputs=[task_success_heatmap]
543
+ )
544
+ gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
545
+ with gr.Tab("CORE-Bench-Easy"):
546
+ gr.HTML("""
547
+ <p>
548
+ <i><b>CORE-Bench-Easy:</b></i> The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
549
+ </p>
550
+ """)
551
+ with gr.Row():
552
+ with gr.Column(scale=2):
553
+ Leaderboard(
554
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
555
+ select_columns=SelectColumns(
556
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
557
+ cant_deselect=["Agent Name"],
558
+ label="Select Columns to Display:",
559
+ ),
560
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
561
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
562
+ )
563
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
564
+ with gr.Row():
565
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy")
566
+ with gr.Row():
567
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
568
 
569
+ gr.HTML('<div style="height: 30px;"></div>')
570
+ gr.Markdown("## Task success heatmap")
571
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
572
+ with gr.Row():
573
+ task_success_heatmap = gr.Plot()
574
+ demo.load(
575
+ lambda: create_task_success_heatmap(
576
+ preprocessor.get_task_success_data('corebench_easy'),
577
+ 'CORE-Bench-Easy'
578
+ ),
579
+ outputs=[task_success_heatmap]
580
+ )
581
 
582
  gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text())
583
  with gr.Tab("USACO"):
 
1416
  # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1417
  # outputs=[raw_call_details])
1418
 
1419
+ with gr.Tab("AppWorld Normal"):
1420
+ gr.Markdown("""AppWorld Normal is a benchmark suite containing standard programming tasks that represent typical real-world development scenarios. These tasks evaluate a language model's ability to handle common coding challenges across different domains. Even at this baseline level, current state-of-the-art models like GPT-4 achieve only a 49% success rate, demonstrating that even "normal" software development tasks remain challenging for AI systems.""")
1421
+ with gr.Row():
1422
+ with gr.Column(scale=2):
1423
+ Leaderboard(
1424
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_normal'), ci_metrics=["Accuracy", "Total Cost"]),
1425
+ select_columns=SelectColumns(
1426
+ default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
1427
+ cant_deselect=["Agent Name"],
1428
+ label="Select Columns to Display:",
1429
+ ),
1430
+ hide_columns=config.APPWORLD_HIDE_COLUMNS,
1431
+ search_columns=config.APPWORLD_SEARCH_COLUMNS,
1432
+ )
1433
+ with gr.Row():
1434
+ gr.Markdown("### Accuracy vs. Cost on AppWorld")
1435
+ with gr.Row():
1436
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1437
+
1438
+ gr.HTML('<div style="height: 30px;"></div>')
1439
+ gr.Markdown("## Task success heatmap")
1440
+ with gr.Row():
1441
+ task_success_heatmap = gr.Plot()
1442
+ demo.load(
1443
+ lambda: create_task_success_heatmap(
1444
+ preprocessor.get_task_success_data('appworld test_normal'),
1445
+ 'AppWorld'
1446
+ ),
1447
+ outputs=[task_success_heatmap]
1448
+ )
1449
+
1450
+ with gr.Tab("AppWorld Challenge"):
1451
+ gr.Markdown("""AppWorld Challenge is a more demanding subset of tasks designed to push the boundaries of AI coding capabilities. These tasks feature increased complexity, requiring sophisticated problem-solving and deeper understanding of programming concepts. The significant drop in performance (to 30% success rate for GPT-4) on these challenge tasks indicates they effectively test the upper limits of current AI coding abilities.""")
1452
+ with gr.Row():
1453
+ with gr.Column(scale=2):
1454
+ Leaderboard(
1455
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_challenge'), ci_metrics=["Accuracy", "Total Cost"]),
1456
+ select_columns=SelectColumns(
1457
+ default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
1458
+ cant_deselect=["Agent Name"],
1459
+ label="Select Columns to Display:",
1460
+ ),
1461
+ hide_columns=config.APPWORLD_HIDE_COLUMNS,
1462
+ search_columns=config.APPWORLD_SEARCH_COLUMNS,
1463
+ )
1464
+ with gr.Row():
1465
+ gr.Markdown("### Accuracy vs. Cost on AppWorld")
1466
+ with gr.Row():
1467
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_challenge', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1468
+
1469
+ gr.HTML('<div style="height: 30px;"></div>')
1470
+ gr.Markdown("## Task success heatmap")
1471
+ with gr.Row():
1472
+ task_success_heatmap = gr.Plot()
1473
+ demo.load(
1474
+ lambda: create_task_success_heatmap(
1475
+ preprocessor.get_task_success_data('appworld test_challenge'),
1476
+ 'AppWorld'
1477
+ ),
1478
+ outputs=[task_success_heatmap]
1479
+ )
1480
+
1481
+ with gr.Tab("Cybench"):
1482
+ gr.Markdown("""Cybench is a benchmark for evaluating the cybersecurity capabilities and risks of language models. Cybench includes 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. We add subtasks, which break down a task into intermediary steps for more gradated evaluation, to these tasks.""")
1483
+ with gr.Row():
1484
+ with gr.Column(scale=2):
1485
+ Leaderboard(
1486
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'cybench'), ci_metrics=["Accuracy", "Total Cost"]),
1487
+ select_columns=SelectColumns(
1488
+ default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified"],
1489
+ cant_deselect=["Agent Name"],
1490
+ label="Select Columns to Display:",
1491
+ ),
1492
+ hide_columns=config.CYBENCH_HIDE_COLUMNS,
1493
+ search_columns=config.CYBENCH_SEARCH_COLUMNS,
1494
+ )
1495
+ with gr.Row():
1496
+ gr.Markdown("### Accuracy vs. Cost on Cybench")
1497
+ with gr.Row():
1498
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'cybench', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1499
+
1500
+ gr.HTML('<div style="height: 30px;"></div>')
1501
+ gr.Markdown("## Task success heatmap")
1502
+ with gr.Row():
1503
+ task_success_heatmap = gr.Plot()
1504
+ demo.load(
1505
+ lambda: create_task_success_heatmap(
1506
+ preprocessor.get_task_success_data('cybench'),
1507
+ 'Cybench'
1508
+ ),
1509
+ outputs=[task_success_heatmap]
1510
+ )
1511
+
1512
+ with gr.Tab("GAIA"):
1513
+ gr.Markdown("""GAIA is a benchmark for General AI Assistants. GAIA proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and general tool-use proficiency. GAIA is made of more than 450 non-trivial questions with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided into 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicates a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.""")
1514
+ with gr.Row():
1515
+ with gr.Column(scale=2):
1516
+ Leaderboard(
1517
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'gaia'), ci_metrics=["Accuracy", "Total Cost"]),
1518
+ select_columns=SelectColumns(
1519
+ default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified"],
1520
+ cant_deselect=["Agent Name"],
1521
+ label="Select Columns to Display:",
1522
+ ),
1523
+ hide_columns=config.GAIA_HIDE_COLUMNS,
1524
+ search_columns=config.GAIA_SEARCH_COLUMNS,
1525
+ )
1526
+ with gr.Row():
1527
+ gr.Markdown("### Accuracy vs. Cost on GAIA")
1528
+ with gr.Row():
1529
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'gaia', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1530
+
1531
+ gr.HTML('<div style="height: 30px;"></div>')
1532
+ gr.Markdown("## Task success heatmap")
1533
+ with gr.Row():
1534
+ task_success_heatmap = gr.Plot()
1535
+ demo.load(
1536
+ lambda: create_task_success_heatmap(
1537
+ preprocessor.get_task_success_data('gaia'),
1538
+ 'GAIA'
1539
+ ),
1540
+ outputs=[task_success_heatmap]
1541
+ )
1542
+
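The new benchmark tabs above all repeat the same layout: a Leaderboard, an accuracy-vs-cost scatter plot, and a task success heatmap. As a hedged sketch only (not part of the commit), that pattern could be factored into a helper; this assumes the imports and helpers already used in app.py (`gr`, `Leaderboard`, `SelectColumns`, `create_leaderboard`, `parse_json_files`, `create_scatter_plot`, `create_task_success_heatmap`, `preprocessor`, `demo`, `config`, `abs_path`) and must be called inside the existing `gr.Blocks`/`gr.Tab` context:

```python
# Hypothetical helper; mirrors the calls the AppWorld/Cybench/GAIA tabs make verbatim.
def render_benchmark_tab(benchmark_id, display_name, on_load_cols, hide_cols, search_cols):
    results_dir = os.path.join(abs_path, "evals_live")
    with gr.Row():
        with gr.Column(scale=2):
            Leaderboard(
                value=create_leaderboard(
                    parse_json_files(results_dir, benchmark_id),
                    ci_metrics=["Accuracy", "Total Cost"],
                ),
                select_columns=SelectColumns(
                    default_selection=on_load_cols + ["Verified"],
                    cant_deselect=["Agent Name"],
                    label="Select Columns to Display:",
                ),
                hide_columns=hide_cols,
                search_columns=search_cols,
            )
    with gr.Row():
        gr.Markdown(f"### Accuracy vs. Cost on {display_name}")
    with gr.Row():
        gr.Plot(create_scatter_plot(
            parse_json_files(results_dir, benchmark_id, aggregate=False),
            "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"],
        ))
    gr.HTML('<div style="height: 30px;"></div>')
    gr.Markdown("## Task success heatmap")
    with gr.Row():
        heatmap = gr.Plot()
    demo.load(
        lambda: create_task_success_heatmap(
            preprocessor.get_task_success_data(benchmark_id), display_name
        ),
        outputs=[heatmap],
    )

# The GAIA tab body would then reduce to:
# render_benchmark_tab('gaia', 'GAIA', config.GAIA_ON_LOAD_COLUMNS,
#                      config.GAIA_HIDE_COLUMNS, config.GAIA_SEARCH_COLUMNS)
```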
1543
  with gr.Tab("About"):
1544
  gr.Markdown((Path(__file__).parent / "about.md").read_text())
1545
 
 
1558
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
1559
  gr.Markdown("""Coming soon...""")
1560
 
1561
+
 
 
1562
 
1563
  async def main():
1564
  # Preprocess traces
 
1569
  # Download the results from the Hugging Face Hub
1570
  # await asyncio.to_thread(download_latest_results)
1571
 
1572
+ # Check for new uploads and process them
1573
  # await check_and_process_uploads()
1574
 
1575
  scheduler = AsyncIOScheduler()
 
1581
  await demo.launch(favicon_path="hal.png")
1582
 
1583
  if __name__ == "__main__":
 
1584
  asyncio.run(main())
config.py CHANGED
@@ -13,7 +13,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
13
  "Runs",
14
  ]
15
  SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
16
- SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
17
 
18
  USACO_ON_LOAD_COLUMNS = [
19
  "Agent Name",
@@ -22,7 +22,7 @@ USACO_ON_LOAD_COLUMNS = [
22
  "Runs",
23
  ]
24
  USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
25
- USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
26
 
27
  COREBENCH_ON_LOAD_COLUMNS = [
28
  "Agent Name",
@@ -31,7 +31,7 @@ COREBENCH_ON_LOAD_COLUMNS = [
31
  "Runs",
32
  ]
33
  COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
34
- COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
35
 
36
 
37
 
@@ -53,4 +53,34 @@ NUMERIC_INTERVALS = {
53
  "~35": pd.Interval(20, 45, closed="right"),
54
  "~60": pd.Interval(45, 70, closed="right"),
55
  "70+": pd.Interval(70, 10000, closed="right"),
56
- }
13
  "Runs",
14
  ]
15
  SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
16
+ SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
17
 
18
  USACO_ON_LOAD_COLUMNS = [
19
  "Agent Name",
 
22
  "Runs",
23
  ]
24
  USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
25
+ USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
26
 
27
  COREBENCH_ON_LOAD_COLUMNS = [
28
  "Agent Name",
 
31
  "Runs",
32
  ]
33
  COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
34
+ COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
35
 
36
 
37
 
 
53
  "~35": pd.Interval(20, 45, closed="right"),
54
  "~60": pd.Interval(45, 70, closed="right"),
55
  "70+": pd.Interval(70, 10000, closed="right"),
56
+ }
57
+
58
+ CYBENCH_ON_LOAD_COLUMNS = [
59
+ "Agent Name",
60
+ "Accuracy",
61
+ "Total Cost",
62
+ "Runs",
63
+ ]
64
+ CYBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
65
+ CYBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
66
+
67
+ APPWORLD_ON_LOAD_COLUMNS = [
68
+ "Agent Name",
69
+ "Accuracy",
70
+ "Total Cost",
71
+ "Runs",
72
+ ]
73
+ APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
74
+ APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
75
+
76
+ GAIA_ON_LOAD_COLUMNS = [
77
+ "Agent Name",
78
+ "Accuracy",
79
+ "Level 1 Accuracy",
80
+ "Level 2 Accuracy",
81
+ "Level 3 Accuracy",
82
+ "Total Cost",
83
+ "Runs",
84
+ ]
85
+ GAIA_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
86
+ GAIA_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
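The new `*_ON_LOAD_COLUMNS` / `*_HIDE_COLUMNS` / `*_SEARCH_COLUMNS` constants for Cybench, AppWorld, and GAIA repeat the same base hide list. A possible consolidation, shown only as a sketch (the `_BASE_HIDE_COLUMNS` and `_LEVEL_COLUMNS` names are hypothetical, not in the commit):

```python
# Hypothetical consolidation of the hide lists defined above.
_BASE_HIDE_COLUMNS = [
    "F1 Score", "AUC", "Precision", "Recall", "benchmark_name",
    "Overall Score", "Vectorization Score", "Fathomnet Score", "Feedback Score",
    "House Price Score", "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score", "IMDB Score",
]
_LEVEL_COLUMNS = ["Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]

CYBENCH_HIDE_COLUMNS = _BASE_HIDE_COLUMNS + _LEVEL_COLUMNS
APPWORLD_HIDE_COLUMNS = _BASE_HIDE_COLUMNS + _LEVEL_COLUMNS
# GAIA keeps its per-level accuracies visible, so only the base set is hidden.
GAIA_HIDE_COLUMNS = list(_BASE_HIDE_COLUMNS)
```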
scratch.ipynb DELETED
File without changes
scratch.py DELETED
@@ -1,38 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- def process_json_files(directory, suffix="_updated"):
6
- # Iterate through all JSON files in the directory
7
- for filename in os.listdir(directory):
8
- if filename.endswith(".json") and "USACO" in filename:
9
- file_path = os.path.join(directory, filename)
10
-
11
- # Read the JSON file
12
- with open(file_path, 'r') as f:
13
- data = json.load(f)
14
-
15
- # Extract sdict from raw_eval_results
16
- sdict = data['raw_eval_results']['sdict']
17
-
18
- # Calculate successful_tasks and failed_tasks
19
- successful_tasks = [key for key in sdict if float(sdict[key][0]['result']['fraction_passed']) == 1]
20
- failed_tasks = [key for key in sdict if float(sdict[key][0]['result']['fraction_passed']) < 1]
21
-
22
- # Add new key-value pairs to the results
23
- data['results']['successful_tasks'] = successful_tasks
24
- data['results']['failed_tasks'] = failed_tasks
25
-
26
- # Create new filename with suffix
27
- new_filename = f"{Path(filename).stem}{suffix}{Path(filename).suffix}"
28
- new_file_path = os.path.join(directory, new_filename)
29
-
30
- # Write updated data to new file
31
- with open(new_file_path, 'w') as f:
32
- json.dump(data, f, indent=4)
33
-
34
- print(f"Processed {filename} and saved as {new_filename}")
35
-
36
- # Usage
37
- directory_path = "/Users/benediktstroebl/Documents/GitHub/leaderboard/evals_live"
38
- process_json_files(directory_path)
utils/data.py CHANGED
@@ -6,63 +6,6 @@ from utils.pareto import Agent, compute_pareto_frontier
6
  import plotly.graph_objects as go
7
  import textwrap
8
 
9
- # def parse_json_files(folder_path, benchmark_name):
10
- # # Convert folder path to Path object
11
- # folder = Path(folder_path)
12
-
13
- # # List to store data from each file
14
- # data_list = []
15
-
16
- # # Iterate through all JSON files in the folder
17
- # for json_file in folder.glob('*.json'):
18
- # try:
19
- # with open(json_file, 'r') as file:
20
- # data = json.load(file)
21
-
22
- # # Extract config and results
23
- # config = data['config']
24
- # results = data['results']
25
-
26
- # # Combine config and results into a single dictionary
27
- # combined_data = {
28
- # 'agent_name': config['agent_name'],
29
- # 'benchmark_name': config['benchmark_name'],
30
- # 'date': config['date']
31
- # }
32
-
33
- # # Add results with 'results_' prefix
34
- # for key, value in results.items():
35
- # combined_data[f'results_{key}'] = value
36
-
37
- # data_list.append(combined_data)
38
- # except Exception as e:
39
- # print(f"Error processing {json_file}: {e}. Skipping!")
40
-
41
- # # Create DataFrame from the list of dictionaries
42
- # df = pd.DataFrame(data_list)
43
- # df = df[df['benchmark_name'] == benchmark_name]
44
-
45
- # # sort df by descending accuracy
46
- # df = df.sort_values(by='results_accuracy', ascending=False)
47
-
48
- # # round all float columns to 2 decimal places
49
- # for column in df.select_dtypes(include='float').columns:
50
- # df[column] = df[column].round(3)
51
-
52
- # # Rename columns
53
- # df = df.rename(columns={
54
- # 'agent_name': 'Agent Name',
55
- # 'results_total_cost': 'Total Cost',
56
- # 'results_accuracy': 'Accuracy',
57
- # 'results_precision': 'Precision',
58
- # 'results_recall': 'Recall',
59
- # 'results_f1_score': 'F1 Score',
60
- # 'results_auc': 'AUC',
61
- # })
62
-
63
- # return df
64
-
65
-
66
  def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
67
  agents = [Agent(row.results_total_cost, row.results_accuracy) for row in df.itertuples()]
68
  pareto_frontier = compute_pareto_frontier(agents)
 
6
  import plotly.graph_objects as go
7
  import textwrap
8
 
9
  def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
10
  agents = [Agent(row.results_total_cost, row.results_accuracy) for row in df.itertuples()]
11
  pareto_frontier = compute_pareto_frontier(agents)
utils/db.py CHANGED
@@ -65,6 +65,9 @@ class TracePreprocessor:
65
  amp_parkinsons_disease_progression_prediction_score REAL,
66
  cifar10_score REAL,
67
  imdb_score REAL,
 
 
 
68
  PRIMARY KEY (benchmark_name, agent_name, run_id)
69
  )
70
  ''')
@@ -77,6 +80,8 @@ class TracePreprocessor:
77
  data = json.load(f)
78
  agent_name = data['config']['agent_name']
79
  benchmark_name = data['config']['benchmark_name']
 
 
80
  date = data['config']['date']
81
  config = data['config']
82
 
@@ -108,8 +113,8 @@ class TracePreprocessor:
108
  with self.get_conn() as conn:
109
  conn.execute('''
110
  INSERT INTO parsed_results
111
- (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
112
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
113
  ''', (
114
  benchmark_name,
115
  agent_name,
@@ -131,7 +136,10 @@ class TracePreprocessor:
131
  results.get('spaceship-titanic_score'),
132
  results.get('amp-parkinsons-disease-progression-prediction_score'),
133
  results.get('cifar10_score'),
134
- results.get('imdb_score')
 
 
 
135
  ))
136
  except Exception as e:
137
  print(f"Error preprocessing parsed results in {file}: {e}")
@@ -257,12 +265,15 @@ class TracePreprocessor:
257
  'overall_score': 'mean',
258
  'vectorization_score': 'mean',
259
  'fathomnet_score': 'mean',
260
- 'feedback_score': 'mean',
261
  'house_price_score': 'mean',
262
  'spaceship_titanic_score': 'mean',
263
  'amp_parkinsons_disease_progression_prediction_score': 'mean',
264
  'cifar10_score': 'mean',
265
  'imdb_score': 'mean',
 
 
 
266
  'Verified': 'first',
267
  'Runs': 'first',
268
  'acc_ci': 'first',
@@ -270,7 +281,7 @@ class TracePreprocessor:
270
  }).reset_index()
271
 
272
  # Round float columns to 3 decimal places
273
- float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
274
  for column in float_columns:
275
  if column in df.columns:
276
  df[column] = df[column].round(3)
@@ -297,6 +308,9 @@ class TracePreprocessor:
297
  'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
298
  'cifar10_score': 'CIFAR10 Score',
299
  'imdb_score': 'IMDB Score',
 
 
 
300
  'acc_ci': 'Accuracy CI',
301
  'cost_ci': 'Total Cost CI'
302
  })
 
65
  amp_parkinsons_disease_progression_prediction_score REAL,
66
  cifar10_score REAL,
67
  imdb_score REAL,
68
+ level_1_accuracy REAL,
69
+ level_2_accuracy REAL,
70
+ level_3_accuracy REAL,
71
  PRIMARY KEY (benchmark_name, agent_name, run_id)
72
  )
73
  ''')
 
80
  data = json.load(f)
81
  agent_name = data['config']['agent_name']
82
  benchmark_name = data['config']['benchmark_name']
83
+ if "inspect" in benchmark_name:
84
+ benchmark_name = benchmark_name.split("/")[-1]
85
  date = data['config']['date']
86
  config = data['config']
87
 
 
113
  with self.get_conn() as conn:
114
  conn.execute('''
115
  INSERT INTO parsed_results
116
+ (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score, level_1_accuracy, level_2_accuracy, level_3_accuracy)
117
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
118
  ''', (
119
  benchmark_name,
120
  agent_name,
 
136
  results.get('spaceship-titanic_score'),
137
  results.get('amp-parkinsons-disease-progression-prediction_score'),
138
  results.get('cifar10_score'),
139
+ results.get('imdb_score'),
140
+ results.get('level_1_accuracy'),
141
+ results.get('level_2_accuracy'),
142
+ results.get('level_3_accuracy')
143
  ))
144
  except Exception as e:
145
  print(f"Error preprocessing parsed results in {file}: {e}")
 
265
  'overall_score': 'mean',
266
  'vectorization_score': 'mean',
267
  'fathomnet_score': 'mean',
268
+ 'feedback_score': 'mean',
269
  'house_price_score': 'mean',
270
  'spaceship_titanic_score': 'mean',
271
  'amp_parkinsons_disease_progression_prediction_score': 'mean',
272
  'cifar10_score': 'mean',
273
  'imdb_score': 'mean',
274
+ 'level_1_accuracy': 'mean',
275
+ 'level_2_accuracy': 'mean',
276
+ 'level_3_accuracy': 'mean',
277
  'Verified': 'first',
278
  'Runs': 'first',
279
  'acc_ci': 'first',
 
281
  }).reset_index()
282
 
283
  # Round float columns to 3 decimal places
284
+ float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
285
  for column in float_columns:
286
  if column in df.columns:
287
  df[column] = df[column].round(3)
 
308
  'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
309
  'cifar10_score': 'CIFAR10 Score',
310
  'imdb_score': 'IMDB Score',
311
+ 'level_1_accuracy': 'Level 1 Accuracy',
312
+ 'level_2_accuracy': 'Level 2 Accuracy',
313
+ 'level_3_accuracy': 'Level 3 Accuracy',
314
  'acc_ci': 'Accuracy CI',
315
  'cost_ci': 'Total Cost CI'
316
  })
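For reference, the utils/db.py changes add three nullable per-level accuracy columns (`level_1_accuracy`, `level_2_accuracy`, `level_3_accuracy`) and thread them through the INSERT, the aggregation, and the column renaming. A minimal standalone SQLite sketch of that flow (the in-memory table and values are illustrative, not the app's actual schema or data):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE parsed_results (
        benchmark_name TEXT,
        agent_name TEXT,
        accuracy REAL,
        level_1_accuracy REAL,
        level_2_accuracy REAL,
        level_3_accuracy REAL,
        PRIMARY KEY (benchmark_name, agent_name)
    )
""")

# results.get(...) returns None for levels a run did not report, which lands as NULL.
results = {"accuracy": 0.41, "level_1_accuracy": 0.62, "level_2_accuracy": 0.38}
conn.execute(
    "INSERT INTO parsed_results VALUES (?, ?, ?, ?, ?, ?)",
    (
        "gaia",
        "Inspect Default Agent (gpt-4o-2024-11-20)",
        results.get("accuracy"),
        results.get("level_1_accuracy"),
        results.get("level_2_accuracy"),
        results.get("level_3_accuracy"),
    ),
)
print(conn.execute("SELECT level_3_accuracy FROM parsed_results").fetchone())  # (None,)
```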
utils/processing.py CHANGED
@@ -93,12 +93,13 @@ async def check_upload_structure(file_path):
93
  return {'is_valid': False, 'message': f"Missing required keys: {', '.join(missing_keys)}"}
94
 
95
  # Check for specific structure in raw_logging_results
96
- if not isinstance(data['raw_logging_results'], list):
97
  return {'is_valid': False, 'message': "raw_logging_results should be a list"}
98
 
99
- for item in data['raw_logging_results']:
100
- if not all(key in item for key in ['weave_task_id', 'inputs', 'outputs']):
101
- return {'is_valid': False, 'message': "Each item in raw_logging_results should have weave_task_id, inputs, and outputs"}
 
102
 
103
  return {'is_valid': True, 'message': "File structure is valid"}
104
 
@@ -115,10 +116,16 @@ async def process_upload(input_path, output_path):
115
  data = json.loads(f.read())
116
 
117
  assert 'raw_logging_results' in data, "raw_logging_results key not found in the file"
118
- openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
119
 
120
  try:
121
- processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
122
  # failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
123
 
124
  data['raw_logging_results'] = processed_calls
 
93
  return {'is_valid': False, 'message': f"Missing required keys: {', '.join(missing_keys)}"}
94
 
95
  # Check for specific structure in raw_logging_results
96
+ if not isinstance(data['raw_logging_results'], list) and not "inspect" in data['config']['benchmark_name']:
97
  return {'is_valid': False, 'message': "raw_logging_results should be a list"}
98
 
99
+ if "inspect" not in data['config']['benchmark_name']:
100
+ for item in data['raw_logging_results']:
101
+ if not all(key in item for key in ['weave_task_id', 'inputs', 'outputs']):
102
+ return {'is_valid': False, 'message': "Each item in raw_logging_results should have weave_task_id, inputs, and outputs"}
103
 
104
  return {'is_valid': True, 'message': "File structure is valid"}
105
 
 
116
  data = json.loads(f.read())
117
 
118
  assert 'raw_logging_results' in data, "raw_logging_results key not found in the file"
 
119
 
120
  try:
121
+ if isinstance(data['raw_logging_results'], list):
122
+ openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
123
+ processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
124
+ else:
125
+ processed_calls = data['raw_logging_results']
126
+
127
+
128
+ # # experimental
129
  # failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
130
 
131
  data['raw_logging_results'] = processed_calls
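For reference, the utils/processing.py changes relax validation and processing for Inspect-based benchmarks: `raw_logging_results` no longer has to be a list of weave calls when `"inspect"` appears in the benchmark name, and `analyze_agent_steps` is only run when it is a list. A simplified standalone sketch of the validation branch (the `validate_raw_logging_results` helper is hypothetical):

```python
def validate_raw_logging_results(data: dict) -> dict:
    """Simplified mirror of the check_upload_structure logic after this commit."""
    benchmark_name = data['config']['benchmark_name']
    raw = data['raw_logging_results']

    if "inspect" in benchmark_name:
        # Inspect logs are accepted as-is; no per-call structure is enforced.
        return {'is_valid': True, 'message': "File structure is valid"}

    if not isinstance(raw, list):
        return {'is_valid': False, 'message': "raw_logging_results should be a list"}

    for item in raw:
        if not all(key in item for key in ['weave_task_id', 'inputs', 'outputs']):
            return {'is_valid': False,
                    'message': "Each item in raw_logging_results should have weave_task_id, inputs, and outputs"}

    return {'is_valid': True, 'message': "File structure is valid"}
```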
verified_agents.yaml CHANGED
@@ -66,6 +66,8 @@ swebench_verified_mini:
66
  swebench_verified:
67
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
68
  verification_date: 2024-10-30
 
 
69
 
70
  mlagentbench:
71
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
@@ -100,4 +102,28 @@ corebench_hard:
100
  - agent_name: "CORE-Agent (GPT-4o)"
101
  verification_date: 2024-09-28
102
  - agent_name: "CORE-Agent (GPT-4o-mini)"
103
- verification_date: 2024-09-28
66
  swebench_verified:
67
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
68
  verification_date: 2024-10-30
69
+ - agent_name: "Agentless (o1-mini-2024-09-12)"
70
+ verification_date: 2024-10-30
71
 
72
  mlagentbench:
73
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
 
102
  - agent_name: "CORE-Agent (GPT-4o)"
103
  verification_date: 2024-09-28
104
  - agent_name: "CORE-Agent (GPT-4o-mini)"
105
+ verification_date: 2024-09-28
106
+
107
+ gaia:
108
+ - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
109
+ verification_date: 2024-11-30
110
+ - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
111
+ verification_date: 2024-11-30
112
+ - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
113
+ verification_date: 2024-11-30
114
+ - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
115
+ verification_date: 2024-11-30
116
+
117
+ cybench:
118
+ - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
119
+ verification_date: 2024-11-30
120
+ - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
121
+ verification_date: 2024-11-30
122
+ - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
123
+ verification_date: 2024-11-30
124
+ - agent_name: "Inspect Default Agent (o1-mini-2024-09-12)"
125
+ verification_date: 2024-11-30
126
+ - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
127
+ verification_date: 2024-11-30
128
+
129
+