diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -19,6 +19,10 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
# import weave
from utils.db import TracePreprocessor
from gradio.themes.soft import Soft
+from utils.db import DEFAULT_PRICING
+from dotenv import load_dotenv
+
+load_dotenv()
preprocessor = TracePreprocessor()
@@ -40,6 +44,17 @@ def download_latest_results():
max_workers=4,
)
print("Download complete.")
+
+def download_db_files():
+    snapshot_download(RESULTS_REPO_ID,
+                      local_dir="preprocessed_traces",
+                      repo_type='dataset',
+                      tqdm_class=None,
+                      etag_timeout=30,
+                      max_workers=4,
+                      allow_patterns="*.db",
+                      )
+    print("Database download complete.")
def get_analyzed_traces(agent_name, benchmark_name):
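A minimal sketch of how the new `download_db_files` helper could be invoked at startup without blocking the event loop, mirroring the commented-out `asyncio.to_thread(download_db_files)` call in `main()` at the bottom of this diff. The wrapper name and the scheduler wiring in the trailing comment are assumptions, not part of the patch:

```python
import asyncio

async def refresh_preprocessed_dbs():
    # snapshot_download is blocking, so run it in a worker thread
    await asyncio.to_thread(download_db_files)

# e.g. asyncio.run(refresh_preprocessed_dbs())
# or register it on the existing AsyncIOScheduler instance (hypothetical wiring)
```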
@@ -251,8 +266,32 @@ class MyTheme(Soft):
my_theme = MyTheme()
+# Recompute the leaderboard and scatter plot whenever the pricing configuration changes
+def update_visualizations(benchmark_name, pricing_config):
+ """Update leaderboard and scatter plot with new pricing"""
+ # Get updated results with new pricing
+ results_df = preprocessor.get_parsed_results_with_costs(benchmark_name, pricing_config)
+
+ # Create updated leaderboard
+ leaderboard_df = create_leaderboard(
+ results_df,
+ ci_metrics=["Accuracy", "Total Cost"]
+ )
+
+ # Create updated scatter plot
+ scatter_fig = create_scatter_plot(
+ results_df,
+ "Total Cost",
+ "Accuracy",
+ "Total Cost (in USD)",
+ "Accuracy",
+ ["Agent Name"]
+ )
+
+ return leaderboard_df, scatter_fig
+
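For reference, a standalone sketch of the conversion the per-benchmark callbacks further down this diff (e.g. `update_cybench`, `update_gaia`) perform before calling `update_visualizations`: the flat list of `gr.Number` values is zipped back into the nested pricing dict. The function name and example values below are illustrative, not part of the patch:

```python
def build_pricing_config(models_used, pricing_values, default_pricing):
    # Pair up (input, output) prices for each model the benchmark actually uses,
    # in the same order the pricing inputs were created (DEFAULT_PRICING order).
    values = iter(pricing_values)
    return {
        model: {"prompt_tokens": next(values), "completion_tokens": next(values)}
        for model in default_pricing
        if model in models_used
    }

# Example (placeholder model names and prices, in $ per 1M tokens):
# build_pricing_config({"model-a", "model-b"}, [2.5, 10.0, 0.15, 0.6],
#                      {"model-a": {}, "model-b": {}})
```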
with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderboard") as demo:
- # gr.Markdown((Path(__file__).parent / "header.md").read_text(), elem_classes=["text-large"])
+ # First add the header HTML
gr.HTML("""
- A standardized, cost-aware, and third-party leaderboard for evaluating agents.
+ The standardized, cost-aware, and third-party leaderboard for evaluating agents.
Holistic Agent Leaderboard (HAL)
 Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.
+For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.
+Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.
+We see HAL being useful for four types of users:
+Customers looking to deploy agents can get visibility into existing benchmarks, learn which developers are building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.
+Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.
+HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.
+Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.
+
+Select a benchmark to see the agent leaderboard. Verified results have been run by the HAL team:
-    """)
+ """) + # Then continue with the tabs, but remove the "About" tab with gr.Tabs() as tabs: + # Remove the About tab and continue with other tabs + with gr.Tab("AppWorld Challenge"): + gr.Markdown("""AppWorld Challenge is a more demanding subset of tasks designed to push the boundaries of AI coding capabilities. These tasks feature increased complexity, requiring sophisticated problem-solving and deeper understanding of programming concepts. The significant drop in performance (to 30% success rate for GPT-4) on these challenge tasks indicates they effectively test the upper limits of current AI coding abilities.""") + + with gr.Row(): + with gr.Column(scale=2): + Leaderboard( + value=create_leaderboard( + parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge'), + ci_metrics=["Accuracy", "Total Cost"] + ), + select_columns=SelectColumns( + default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"], + cant_deselect=["Agent Name"], + label="Select Columns to Display:", + ), + hide_columns=config.APPWORLD_HIDE_COLUMNS, + search_columns=config.APPWORLD_SEARCH_COLUMNS, + datatype='markdown' + ) + + with gr.Row(): + gr.Markdown("### Accuracy vs. Cost on AppWorld") + with gr.Row(): + scatter_plot = gr.Plot(create_scatter_plot( + parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_challenge', aggregate=False), + "Total Cost", + "Accuracy", + "Total Cost (in USD)", + "Accuracy", + ["Agent Name"] + )) + + gr.HTML('') + gr.Markdown("## Task success heatmap") + with gr.Row(): + task_success_heatmap = gr.Plot() + demo.load( + lambda: create_task_success_heatmap( + preprocessor.get_task_success_data('appworld_test_challenge'), + 'AppWorld' + ), + outputs=[task_success_heatmap] + ) + + with gr.Tab("AppWorld Normal"): + gr.Markdown("""AppWorld Normal is a benchmark suite containing standard programming tasks that represent typical real-world development scenarios. These tasks evaluate a language model's ability to handle common coding challenges across different domains. Even at this baseline level, current state-of-the-art models like GPT-4 achieve only a 49% success rate, demonstrating that even "normal" software development tasks remain challenging for AI systems.""") + with gr.Row(): + with gr.Column(scale=2): + Leaderboard( + value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal'), ci_metrics=["Accuracy", "Total Cost"]), + select_columns=SelectColumns( + default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"], + cant_deselect=["Agent Name"], + label="Select Columns to Display:", + ), + hide_columns=config.APPWORLD_HIDE_COLUMNS, + search_columns=config.APPWORLD_SEARCH_COLUMNS, + datatype='markdown' + ) + with gr.Row(): + gr.Markdown("### Accuracy vs. Cost on AppWorld") + with gr.Row(): + scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) + + gr.HTML('') + gr.Markdown("## Task success heatmap") + with gr.Row(): + task_success_heatmap = gr.Plot() + demo.load( + lambda: create_task_success_heatmap( + preprocessor.get_task_success_data('appworld_test_normal'), + 'AppWorld' + ), + outputs=[task_success_heatmap] + ) + + with gr.Tab("CORE-Bench-Easy"): + gr.HTML(""" ++ CORE-Bench-Easy: The agent is given the output of the code and must answer questions about the output without running any code. 
To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code. +
+ """) + with gr.Row(): + with gr.Column(scale=2): + Leaderboard( + value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]), + select_columns=SelectColumns( + default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"], + cant_deselect=["Agent Name"], + label="Select Columns to Display:", + ), + hide_columns=config.COREBENCH_HIDE_COLUMNS, + search_columns=config.COREBENCH_SEARCH_COLUMNS, + datatype='markdown' + ) + # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) + with gr.Row(): + gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy") + with gr.Row(): + scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) + + gr.HTML('') + gr.Markdown("## Task success heatmap") + gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.") + with gr.Row(): + task_success_heatmap = gr.Plot() + demo.load( + lambda: create_task_success_heatmap( + preprocessor.get_task_success_data('corebench_easy'), + 'CORE-Bench-Easy' + ), + outputs=[task_success_heatmap] + ) + + gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text()) with gr.Tab("CORE-Bench-Hard"): gr.HTML("""@@ -485,6 +653,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb ), hide_columns=config.COREBENCH_HIDE_COLUMNS, search_columns=config.COREBENCH_SEARCH_COLUMNS, + datatype='markdown' ) # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): @@ -522,6 +691,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb ), hide_columns=config.COREBENCH_HIDE_COLUMNS, search_columns=config.COREBENCH_SEARCH_COLUMNS, + datatype='markdown' ) # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): @@ -542,88 +712,364 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb outputs=[task_success_heatmap] ) gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text()) - with gr.Tab("CORE-Bench-Easy"): - gr.HTML(""" -
- CORE-Bench-Easy: The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code. -
- """) + with gr.Tab("Cybench"): + gr.Markdown("""Cybench is a benchmark for evaluating the cybersecurity capabilities and risks of language models. Cybench includes 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. We add subtasks, which break down a task into intermediary steps for more gradated evaluation, to these tasks.""") + + # Get models used in Cybench benchmark + cybench_models = preprocessor.get_models_for_benchmark('cybench') + + # Initialize pricing inputs dictionary + cybench_pricing_inputs = {} + + with gr.Accordion("Token Pricing Configuration", open=False): + for model in DEFAULT_PRICING.keys(): + if model in cybench_models: + with gr.Row(): + gr.Markdown(f"**{model}**") + cybench_pricing_inputs[f"{model}_input"] = gr.Number( + value=DEFAULT_PRICING[model]["prompt_tokens"], + label="Input ($/1M tokens)", + step=0.01 + ) + cybench_pricing_inputs[f"{model}_output"] = gr.Number( + value=DEFAULT_PRICING[model]["completion_tokens"], + label="Output ($/1M tokens)", + step=0.01 + ) + with gr.Row(): with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]), + cybench_leaderboard = Leaderboard( + value=create_leaderboard( + preprocessor.get_parsed_results_with_costs('cybench'), + ci_metrics=["Accuracy", "Total Cost"] + ), select_columns=SelectColumns( - default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"], + default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), - hide_columns=config.COREBENCH_HIDE_COLUMNS, - search_columns=config.COREBENCH_SEARCH_COLUMNS, + hide_columns=config.CYBENCH_HIDE_COLUMNS, + search_columns=config.CYBENCH_SEARCH_COLUMNS, + datatype='markdown' ) - # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) + with gr.Row(): - gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Easy") + gr.Markdown("### Accuracy vs. Cost on Cybench") with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) + cybench_scatter_plot = gr.Plot() + + def update_cybench(*pricing_values): + pricing_config = {} + values_iter = iter(pricing_values) + for model in DEFAULT_PRICING.keys(): + if model in cybench_models: + pricing_config[model] = { + "prompt_tokens": next(values_iter), + "completion_tokens": next(values_iter) + } + return update_visualizations('cybench', pricing_config) + + # Connect all pricing inputs to the update function + cybench_pricing_input_list = [v for v in cybench_pricing_inputs.values()] + for input_component in cybench_pricing_input_list: + input_component.change( + fn=update_cybench, + inputs=cybench_pricing_input_list, + outputs=[cybench_leaderboard, cybench_scatter_plot] + ) + + # Initialize the scatter plot + demo.load( + fn=lambda: create_scatter_plot( + preprocessor.get_parsed_results_with_costs('cybench'), + "Total Cost", + "Accuracy", + "Total Cost (in USD)", + "Accuracy", + ["Agent Name"] + ), + outputs=cybench_scatter_plot + ) + # Keep the existing heatmap section gr.HTML('') gr.Markdown("## Task success heatmap") - gr.Markdown("The task success heatmap shows which agent can solve which tasks. 
Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.") with gr.Row(): task_success_heatmap = gr.Plot() demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('corebench_easy'), - 'CORE-Bench-Easy' - ), - outputs=[task_success_heatmap] + lambda: create_task_success_heatmap( + preprocessor.get_task_success_data('cybench'), + 'Cybench' + ), + outputs=[task_success_heatmap] ) - - gr.Markdown((Path(__file__).parent / "agent_submission_core.md").read_text()) - with gr.Tab("USACO"): - gr.Markdown("""The USA Computing Olympiad (USACO) is a computer programming competition for pre-college students. This benchmark evaluates the performance of AI agents on a set of 307 USACO tasks. The agents are evaluated based on the number of tasks correctly solved.""") + + with gr.Tab("GAIA"): + gr.Markdown("""GAIA is a benchmark for General AI Assistants. GAIA proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency. GAIA is made of more than 450 non-trivial question with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided in 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicate a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.""") + + # Get models used in GAIA benchmark + gaia_models = preprocessor.get_models_for_benchmark('gaia') + + # Initialize pricing inputs dictionary + gaia_pricing_inputs = {} + + with gr.Accordion("Token Pricing Configuration", open=False): + for model in DEFAULT_PRICING.keys(): + # Only show pricing inputs for models used in GAIA + if model in gaia_models: + with gr.Row(): + gr.Markdown(f"**{model}**") + gaia_pricing_inputs[f"{model}_input"] = gr.Number( + value=DEFAULT_PRICING[model]["prompt_tokens"], + label="Input ($/1M tokens)", + step=0.01 + ) + gaia_pricing_inputs[f"{model}_output"] = gr.Number( + value=DEFAULT_PRICING[model]["completion_tokens"], + label="Output ($/1M tokens)", + step=0.01 + ) + with gr.Row(): with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]), + gaia_leaderboard = Leaderboard( + value=create_leaderboard( + preprocessor.get_parsed_results_with_costs('gaia'), + ci_metrics=["Accuracy", "Total Cost"] + ), select_columns=SelectColumns( - default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"], + default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), - hide_columns=config.USACO_HIDE_COLUMNS, - search_columns=config.USACO_SEARCH_COLUMNS, + hide_columns=config.GAIA_HIDE_COLUMNS, + search_columns=config.GAIA_SEARCH_COLUMNS, + datatype='markdown' ) - gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) + with gr.Row(): - gr.Markdown("### Accuracy vs. Cost for USACO agents") + gr.Markdown("### Accuracy vs. 
Cost on GAIA") with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) + gaia_scatter_plot = gr.Plot() + + def update_gaia(*pricing_values): + pricing_config = {} + values_iter = iter(pricing_values) + for model in DEFAULT_PRICING.keys(): + if model in gaia_models: # Only include used models + pricing_config[model] = { + "prompt_tokens": next(values_iter), + "completion_tokens": next(values_iter) + } + return update_visualizations('gaia', pricing_config) + # Connect all pricing inputs to the update function + gaia_pricing_input_list = [v for v in gaia_pricing_inputs.values()] + for input_component in gaia_pricing_input_list: + input_component.change( + fn=update_gaia, + inputs=gaia_pricing_input_list, + outputs=[gaia_leaderboard, gaia_scatter_plot] + ) + + # Initialize the scatter plot + demo.load( + fn=lambda: create_scatter_plot( + preprocessor.get_parsed_results_with_costs('gaia'), + "Total Cost", + "Accuracy", + "Total Cost (in USD)", + "Accuracy", + ["Agent Name"] + ), + outputs=gaia_scatter_plot + ) + + # Keep the existing heatmap section gr.HTML('') gr.Markdown("## Task success heatmap") - gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.") with gr.Row(): task_success_heatmap = gr.Plot() demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('usaco'), - 'USACO' - ), - outputs=[task_success_heatmap] + lambda: create_task_success_heatmap( + preprocessor.get_task_success_data('gaia'), + 'GAIA' + ), + outputs=[task_success_heatmap] ) - - gr.HTML(""" - - # """) - # with gr.Group(elem_classes=["grouped-section"]): - # gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor") - - # gr.HTML('') - # gr.Markdown("## Failure report for each agent") - # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. 
Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.') - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # failure_categories_overview = gr.Markdown() - - # with gr.Column(scale=1): - # failure_categories_chart = gr.Plot() - - # # Initialize the failure report agent dropdown with all agents - # demo.load(update_agent_dropdown, - # inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], - # outputs=[failure_report_agent_dropdown]) - - # # Update failure report when agent is selected - # failure_report_agent_dropdown.change(update_failure_report, - # inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)], - # outputs=[failure_categories_overview, failure_categories_chart]) - - # gr.HTML('') - # gr.Markdown("## Task overview") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # agent_dropdown = gr.Dropdown(label="Select Agent") - # with gr.Column(scale=1): - # task_dropdown = gr.Dropdown(label="Select SWE-bench Verified Task") - # gr.HTML('') - # with gr.Row(): - # task_overview = gr.Markdown() - # with gr.Row(): - # flow_chart = gr.Plot(label="Task Flow") - - # # Initialize the agent dropdown with the best agent - # demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown]) - # demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - - # agent_dropdown.change(update_task_analysis, - # inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], - # outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - # task_dropdown.change(update_task_details, - # inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown], - # outputs=[task_overview, flow_chart, gr.Textbox(visible=False)]) - - gr.Markdown("## Raw predictions") - gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.') - with gr.Accordion("Expand to inspect raw predictions of agents...", open=False): - with gr.Row(): - with gr.Column(scale=1): - raw_agent_dropdown = gr.Dropdown(label="Select Agent") - with gr.Column(scale=1): - raw_task_dropdown = gr.Dropdown(label="Select Task") - with gr.Column(scale=1): - raw_step_dropdown = gr.Dropdown(label="Select Step") - with gr.Row(): - raw_call_details = gr.HTML() - - def update_raw_task_dropdown(agent_name): - analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified_mini") - if not analyzed_traces: - return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}." 
- task_ids = list(analyzed_traces.keys()) - steps = analyzed_traces[task_ids[0]]['steps'] - return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_verified_mini")[task_ids[0]]['steps'][0], 0) - - def update_raw_step_dropdown(agent_name, task_id): - analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified_mini") - if not analyzed_traces or task_id not in analyzed_traces: - return gr.Dropdown(choices=[], label="Select Step", value="No data available.") - steps = analyzed_traces[task_id]['steps'] - return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0) - - def update_raw_call_details(agent_name, task_id, step_index): - analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified_mini") - if not analyzed_traces or task_id not in analyzed_traces: - return "No data available for this selection." - steps = analyzed_traces[task_id]['steps'] - if step_index is None: - return "Invalid step selection." - step = steps[step_index] - return format_call_info(step, step_index) - - # Initialize the raw agent dropdown with all agents - demo.load(update_agent_dropdown, - inputs=[gr.Textbox(value="swebench_verified_mini", visible=False), gr.Textbox(value="Accuracy", visible=False)], - outputs=[raw_agent_dropdown]) - demo.load(update_raw_task_dropdown, - inputs=[raw_agent_dropdown], - outputs=[raw_task_dropdown, raw_step_dropdown]) - demo.load(update_raw_call_details, - inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - outputs=[raw_call_details]) - - raw_agent_dropdown.change(update_raw_task_dropdown, - inputs=[raw_agent_dropdown], - outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details]) - raw_task_dropdown.change(update_raw_step_dropdown, - inputs=[raw_agent_dropdown, raw_task_dropdown], - outputs=[raw_step_dropdown, raw_call_details]) - raw_step_dropdown.change(update_raw_call_details, - inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - outputs=[raw_call_details]) - - - with gr.Tab("SWE-bench Verified"): - gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The We are currently actively developing this platform and this benchmark is not fully implemented yet.""") - with gr.Row(): - with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), ci_metrics=["Accuracy", "Total Cost"]), - select_columns=SelectColumns( - default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"], - cant_deselect=["Agent Name"], - label="Select Columns to Display:", - ), - hide_columns=config.SWEBENCH_HIDE_COLUMNS, - search_columns=config.SWEBENCH_SEARCH_COLUMNS, - ) - gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) - with gr.Row(): - gr.Markdown("### Accuracy vs. 
Cost for SWE-bench agents") - with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) - - gr.HTML('') - gr.Markdown("## Task success heatmap") - gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.") - with gr.Row(): - task_success_heatmap = gr.Plot() - demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('swebench_verified'), - 'SWE-bench Verified' - ), - outputs=[task_success_heatmap] - ) - - # gr.HTML(""" - # - # """) - # with gr.Group(elem_classes=["grouped-section"]): - # gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor") - - # gr.HTML('') - # gr.Markdown("## Failure report for each agent") - # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.') - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # failure_categories_overview = gr.Markdown() - - # with gr.Column(scale=1): - # failure_categories_chart = gr.Plot() - - # # Initialize the failure report agent dropdown with all agents - # demo.load(update_agent_dropdown, - # inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], - # outputs=[failure_report_agent_dropdown]) - - # # Update failure report when agent is selected - # failure_report_agent_dropdown.change(update_failure_report, - # inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)], - # outputs=[failure_categories_overview, failure_categories_chart]) - - # gr.HTML('') - # gr.Markdown("## Task overview") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # agent_dropdown = gr.Dropdown(label="Select Agent") - # with gr.Column(scale=1): - # task_dropdown = gr.Dropdown(label="Select SWE-bench Verified Task") - # gr.HTML('') - # with gr.Row(): - # task_overview = gr.Markdown() - # with gr.Row(): - # flow_chart = gr.Plot(label="Task Flow") - - # # Initialize the agent dropdown with the best agent - # demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown]) - # demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - - # agent_dropdown.change(update_task_analysis, - # inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], - # outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - # task_dropdown.change(update_task_details, - # inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown], - # outputs=[task_overview, flow_chart, 
gr.Textbox(visible=False)]) - - gr.Markdown("## Raw predictions") - gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.') - with gr.Accordion("Expand to inspect raw predictions of agents...", open=False): - with gr.Row(): - with gr.Column(scale=1): - raw_agent_dropdown = gr.Dropdown(label="Select Agent") - with gr.Column(scale=1): - raw_task_dropdown = gr.Dropdown(label="Select Task") - with gr.Column(scale=1): - raw_step_dropdown = gr.Dropdown(label="Select Step") - with gr.Row(): - raw_call_details = gr.HTML() - - def update_raw_task_dropdown(agent_name): - analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified") - if not analyzed_traces: - return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}." - task_ids = list(analyzed_traces.keys()) - steps = analyzed_traces[task_ids[0]]['steps'] - return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_verified")[task_ids[0]]['steps'][0], 0) - - def update_raw_step_dropdown(agent_name, task_id): - analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified") - if not analyzed_traces or task_id not in analyzed_traces: - return gr.Dropdown(choices=[], label="Select Step", value="No data available.") - steps = analyzed_traces[task_id]['steps'] - return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0) - - def update_raw_call_details(agent_name, task_id, step_index): - analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified") - if not analyzed_traces or task_id not in analyzed_traces: - return "No data available for this selection." - steps = analyzed_traces[task_id]['steps'] - if step_index is None: - return "Invalid step selection." - step = steps[step_index] - return format_call_info(step, step_index) - - # Initialize the raw agent dropdown with all agents - demo.load(update_agent_dropdown, - inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], - outputs=[raw_agent_dropdown]) - demo.load(update_raw_task_dropdown, - inputs=[raw_agent_dropdown], - outputs=[raw_task_dropdown, raw_step_dropdown]) - demo.load(update_raw_call_details, - inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - outputs=[raw_call_details]) - - raw_agent_dropdown.change(update_raw_task_dropdown, - inputs=[raw_agent_dropdown], - outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details]) - raw_task_dropdown.change(update_raw_step_dropdown, - inputs=[raw_agent_dropdown, raw_task_dropdown], - outputs=[raw_step_dropdown, raw_call_details]) - raw_step_dropdown.change(update_raw_call_details, - inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - outputs=[raw_call_details]) - - - - # with gr.Tab("SWE-bench Lite"): - # gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. 
We are currently actively developing this platform and this benchmark is not fully implemented yet.""") - # with gr.Row(): - # with gr.Column(scale=2): - # Leaderboard( - # value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), ci_metrics=["Accuracy", "Total Cost"]), - # select_columns=SelectColumns( - # default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"], - # cant_deselect=["Agent Name"], - # label="Select Columns to Display:", - # ), - # hide_columns=config.SWEBENCH_HIDE_COLUMNS, - # search_columns=config.SWEBENCH_SEARCH_COLUMNS, - # ) - # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) - # with gr.Row(): - # gr.Markdown("### Accuracy vs. Cost for SWE-bench agents") - # with gr.Row(): - # scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) - - # gr.HTML('') - # gr.Markdown("## Task success heatmap") - # gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.") - # with gr.Row(): - # task_success_heatmap = gr.Plot() - # demo.load( - # lambda: create_task_success_heatmap( - # preprocessor.get_task_success_data('swebench_lite'), - # 'SWE-bench Lite' - # ), - # outputs=[task_success_heatmap] - # ) - - # gr.HTML(""" - # - # """) - # with gr.Group(elem_classes=["grouped-section"]): - # gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor") - - # gr.HTML('') - # gr.Markdown("## Failure report for each agent") - # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. 
Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.') - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # failure_categories_overview = gr.Markdown() - - # with gr.Column(scale=1): - # failure_categories_chart = gr.Plot() - - # # Initialize the failure report agent dropdown with all agents - # demo.load(update_agent_dropdown, - # inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], - # outputs=[failure_report_agent_dropdown]) - - # # Update failure report when agent is selected - # failure_report_agent_dropdown.change(update_failure_report, - # inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)], - # outputs=[failure_categories_overview, failure_categories_chart]) - - # gr.HTML('') - # gr.Markdown("## Task overview") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # agent_dropdown = gr.Dropdown(label="Select Agent") - # with gr.Column(scale=1): - # task_dropdown = gr.Dropdown(label="Select SWE-bench Lite Task") - # gr.HTML('') - # with gr.Row(): - # task_overview = gr.Markdown() - # with gr.Row(): - # flow_chart = gr.Plot(label="Task Flow") - - # # Initialize the agent dropdown with the best agent - # demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown]) - # demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - - # agent_dropdown.change(update_task_analysis, - # inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], - # outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - # task_dropdown.change(update_task_details, - # inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown], - # outputs=[task_overview, flow_chart, gr.Textbox(visible=False)]) - - # gr.Markdown("## Raw predictions") - # gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.') - # with gr.Accordion("Expand to inspect raw predictions of agents...", open=False): - # with gr.Row(): - # with gr.Column(scale=1): - # raw_agent_dropdown = gr.Dropdown(label="Select Agent") - # with gr.Column(scale=1): - # raw_task_dropdown = gr.Dropdown(label="Select Task") - # with gr.Column(scale=1): - # raw_step_dropdown = gr.Dropdown(label="Select Step") - # with gr.Row(): - # raw_call_details = gr.HTML() - - # def update_raw_task_dropdown(agent_name): - # analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite") - # if not analyzed_traces: - # return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}." 
- # task_ids = list(analyzed_traces.keys()) - # steps = analyzed_traces[task_ids[0]]['steps'] - # return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_lite")[task_ids[0]]['steps'][0], 0) - - # def update_raw_step_dropdown(agent_name, task_id): - # analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite") - # if not analyzed_traces or task_id not in analyzed_traces: - # return gr.Dropdown(choices=[], label="Select Step", value="No data available.") - # steps = analyzed_traces[task_id]['steps'] - # return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0) - - # def update_raw_call_details(agent_name, task_id, step_index): - # analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite") - # if not analyzed_traces or task_id not in analyzed_traces: - # return "No data available for this selection." - # steps = analyzed_traces[task_id]['steps'] - # if step_index is None: - # return "Invalid step selection." - # step = steps[step_index] - # return format_call_info(step, step_index) - - # # Initialize the raw agent dropdown with all agents - # demo.load(update_agent_dropdown, - # inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], - # outputs=[raw_agent_dropdown]) - # demo.load(update_raw_task_dropdown, - # inputs=[raw_agent_dropdown], - # outputs=[raw_task_dropdown, raw_step_dropdown]) - # demo.load(update_raw_call_details, - # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - # outputs=[raw_call_details]) - - # raw_agent_dropdown.change(update_raw_task_dropdown, - # inputs=[raw_agent_dropdown], - # outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details]) - # raw_task_dropdown.change(update_raw_step_dropdown, - # inputs=[raw_agent_dropdown, raw_task_dropdown], - # outputs=[raw_step_dropdown, raw_call_details]) - # raw_step_dropdown.change(update_raw_call_details, - # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - # outputs=[raw_call_details]) - - - # with gr.Tab("MLAgentBench"): - # gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""") - # with gr.Row(): - # with gr.Column(scale=2): - # Leaderboard( - # value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench')), - # select_columns=SelectColumns( - # default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"], - # cant_deselect=["Agent Name"], - # label="Select Columns to Display:", - # ), - # hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS, - # search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS, - # ) - # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) - # with gr.Row(): - # gr.Markdown("### Accuracy vs. 
Cost for MLAgentBench agents") - # with gr.Row(): - # scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench', aggregate=False), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"])) - - # # gr.HTML('') - # # gr.Markdown("## Task success heatmap") - # # gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.") - # # with gr.Row(): - # # task_success_heatmap = gr.Plot() - # # demo.load( - # # lambda: create_task_success_heatmap( - # # preprocessor.get_task_success_data('usaco'), - # # 'USACO' - # # ), - # # outputs=[task_success_heatmap] - # # ) - - # gr.HTML(""" - # - # """) - # with gr.Group(elem_classes=["grouped-section"]): - # gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor") - - # # gr.HTML('') - # # gr.Markdown("## Failure report for each agent") - # # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.') - # # gr.HTML('') - # # with gr.Row(): - # # with gr.Column(scale=1): - # # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report") - # # gr.HTML('') - # # with gr.Row(): - # # with gr.Column(scale=1): - # # failure_categories_overview = gr.Markdown() - - # # with gr.Column(scale=1): - # # failure_categories_chart = gr.Plot() - - # # # Initialize the failure report agent dropdown with all agents - # # demo.load(update_agent_dropdown, - # # inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], - # # outputs=[failure_report_agent_dropdown]) - - # # # Update failure report when agent is selected - # # failure_report_agent_dropdown.change(update_failure_report, - # # inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)], - # # outputs=[failure_categories_overview, failure_categories_chart]) - - # gr.HTML('') - # gr.Markdown("## Task overview") - # gr.HTML('') - # with gr.Row(): - # with gr.Column(scale=1): - # agent_dropdown = gr.Dropdown(label="Select Agent") - # with gr.Column(scale=1): - # task_dropdown = gr.Dropdown(label="Select MLAgentBench Task") - # gr.HTML('') - # with gr.Row(): - # task_overview = gr.Markdown() - # with gr.Row(): - # flow_chart = gr.Plot(label="Task Flow") - - # # Initialize the agent dropdown with the best agent - # demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown]) - # demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - - # agent_dropdown.change(update_task_analysis, - # inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], - # outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) - # task_dropdown.change(update_task_details, - # inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown], - # 
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)]) - - # gr.Markdown("## Raw predictions") - # gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.') - # with gr.Accordion("Expand to inspect raw predictions of agents...", open=False): - # with gr.Row(): - # with gr.Column(scale=1): - # raw_agent_dropdown = gr.Dropdown(label="Select Agent") - # with gr.Column(scale=1): - # raw_task_dropdown = gr.Dropdown(label="Select Task") - # with gr.Column(scale=1): - # raw_step_dropdown = gr.Dropdown(label="Select Step") - # with gr.Row(): - # raw_call_details = gr.HTML() - - # def update_raw_task_dropdown(agent_name): - # analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench") - # if not analyzed_traces: - # return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}." - # task_ids = list(analyzed_traces.keys()) - # steps = analyzed_traces[task_ids[0]]['steps'] - # return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "mlagentbench")[task_ids[0]]['steps'][0], 0) - - # def update_raw_step_dropdown(agent_name, task_id): - # analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench") - # if not analyzed_traces or task_id not in analyzed_traces: - # return gr.Dropdown(choices=[], label="Select Step", value="No data available.") - # steps = analyzed_traces[task_id]['steps'] - # return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0) - - # def update_raw_call_details(agent_name, task_id, step_index): - # analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench") - # if not analyzed_traces or task_id not in analyzed_traces: - # return "No data available for this selection." - # steps = analyzed_traces[task_id]['steps'] - # if step_index is None: - # return "Invalid step selection." - # step = steps[step_index] - # return format_call_info(step, step_index) - - # # Initialize the raw agent dropdown with all agents - # demo.load(update_agent_dropdown, - # inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], - # outputs=[raw_agent_dropdown]) - # demo.load(update_raw_task_dropdown, - # inputs=[raw_agent_dropdown], - # outputs=[raw_task_dropdown, raw_step_dropdown]) - # demo.load(update_raw_call_details, - # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - # outputs=[raw_call_details]) - - # raw_agent_dropdown.change(update_raw_task_dropdown, - # inputs=[raw_agent_dropdown], - # outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details]) - # raw_task_dropdown.change(update_raw_step_dropdown, - # inputs=[raw_agent_dropdown, raw_task_dropdown], - # outputs=[raw_step_dropdown, raw_call_details]) - # raw_step_dropdown.change(update_raw_call_details, - # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], - # outputs=[raw_call_details]) - - with gr.Tab("AppWorld Normal"): - gr.Markdown("""AppWorld Normal is a benchmark suite containing standard programming tasks that represent typical real-world development scenarios. These tasks evaluate a language model's ability to handle common coding challenges across different domains. 
Even at this baseline level, current state-of-the-art models like GPT-4 achieve only a 49% success rate, demonstrating that even "normal" software development tasks remain challenging for AI systems.""") - with gr.Row(): - with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_normal'), ci_metrics=["Accuracy", "Total Cost"]), - select_columns=SelectColumns( - default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"], - cant_deselect=["Agent Name"], - label="Select Columns to Display:", - ), - hide_columns=config.APPWORLD_HIDE_COLUMNS, - search_columns=config.APPWORLD_SEARCH_COLUMNS, - ) - with gr.Row(): - gr.Markdown("### Accuracy vs. Cost on AppWorld") - with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_normal', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) - - gr.HTML('') - gr.Markdown("## Task success heatmap") - with gr.Row(): - task_success_heatmap = gr.Plot() - demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('appworld test_normal'), - 'AppWorld' - ), - outputs=[task_success_heatmap] - ) - - with gr.Tab("AppWorld Challenge"): - gr.Markdown("""AppWorld Challenge is a more demanding subset of tasks designed to push the boundaries of AI coding capabilities. These tasks feature increased complexity, requiring sophisticated problem-solving and deeper understanding of programming concepts. The significant drop in performance (to 30% success rate for GPT-4) on these challenge tasks indicates they effectively test the upper limits of current AI coding abilities.""") - with gr.Row(): - with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_challenge'), ci_metrics=["Accuracy", "Total Cost"]), - select_columns=SelectColumns( - default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"], - cant_deselect=["Agent Name"], - label="Select Columns to Display:", - ), - hide_columns=config.APPWORLD_HIDE_COLUMNS, - search_columns=config.APPWORLD_SEARCH_COLUMNS, - ) - with gr.Row(): - gr.Markdown("### Accuracy vs. Cost on AppWorld") - with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld test_challenge', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) - - gr.HTML('') - gr.Markdown("## Task success heatmap") - with gr.Row(): - task_success_heatmap = gr.Plot() - demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('appworld test_challenge'), - 'AppWorld' - ), - outputs=[task_success_heatmap] - ) - - with gr.Tab("Cybench"): - gr.Markdown("""Cybench is a benchmark for evaluating the cybersecurity capabilities and risks of language models. Cybench includes 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. 
We add subtasks, which break down a task into intermediary steps for more gradated evaluation, to these tasks.""") - with gr.Row(): - with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'cybench'), ci_metrics=["Accuracy", "Total Cost"]), - select_columns=SelectColumns( - default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified"], - cant_deselect=["Agent Name"], - label="Select Columns to Display:", - ), - hide_columns=config.CYBENCH_HIDE_COLUMNS, - search_columns=config.CYBENCH_SEARCH_COLUMNS, - ) - with gr.Row(): - gr.Markdown("### Accuracy vs. Cost on Cybench") - with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'cybench', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) - - gr.HTML('') - gr.Markdown("## Task success heatmap") - with gr.Row(): - task_success_heatmap = gr.Plot() - demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('cybench'), - 'Cybench' - ), - outputs=[task_success_heatmap] - ) - - with gr.Tab("GAIA"): - gr.Markdown("""GAIA is a benchmark for General AI Assistants. GAIA proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency. GAIA is made of more than 450 non-trivial question with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided in 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicate a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.""") - with gr.Row(): - with gr.Column(scale=2): - Leaderboard( - value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'gaia'), ci_metrics=["Accuracy", "Total Cost"]), - select_columns=SelectColumns( - default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified"], - cant_deselect=["Agent Name"], - label="Select Columns to Display:", - ), - hide_columns=config.GAIA_HIDE_COLUMNS, - search_columns=config.GAIA_SEARCH_COLUMNS, - ) - with gr.Row(): - gr.Markdown("### Accuracy vs. Cost on GAIA") - with gr.Row(): - scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'gaia', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) - - gr.HTML('') - gr.Markdown("## Task success heatmap") - with gr.Row(): - task_success_heatmap = gr.Plot() - demo.load( - lambda: create_task_success_heatmap( - preprocessor.get_task_success_data('gaia'), - 'GAIA' - ), - outputs=[task_success_heatmap] - ) - - with gr.Tab("About"): - gr.Markdown((Path(__file__).parent / "about.md").read_text()) # Will trigger autoscaling of plots when tabs are switched tabs.select(fn=None, inputs=None, outputs=None, js=""" @@ -1561,6 +1211,9 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb async def main(): + # download db files + # await asyncio.to_thread(download_db_files) + # Preprocess traces # preprocessor = TracePreprocessor() # preprocessor.preprocess_traces('evals_live')
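For reference only: the pricing widgets above read `DEFAULT_PRICING[model]["prompt_tokens"]` and `["completion_tokens"]` and label them as $/1M tokens, so the imported `utils.db.DEFAULT_PRICING` is assumed to have the shape sketched below. Model names and numbers here are placeholders, not the values shipped in `utils/db.py`:

```python
# Assumed shape of utils.db.DEFAULT_PRICING (placeholder entries):
DEFAULT_PRICING_EXAMPLE = {
    "example-model-small": {"prompt_tokens": 0.15, "completion_tokens": 0.60},   # $ per 1M tokens
    "example-model-large": {"prompt_tokens": 2.50, "completion_tokens": 10.00},  # $ per 1M tokens
}
```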