import gradio as gr from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter import config from envs import RESULTS_REPO_ID, REPO_ID, API, HF_TOKEN from pathlib import Path import pandas as pd import os import json from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap, create_leaderboard from utils.processing import check_and_process_uploads from huggingface_hub import snapshot_download from apscheduler.schedulers.background import BackgroundScheduler from datetime import datetime import json import re import markdown import asyncio from apscheduler.schedulers.asyncio import AsyncIOScheduler # import weave from utils.db import TracePreprocessor from gradio.themes.soft import Soft from utils.db import DEFAULT_PRICING from dotenv import load_dotenv load_dotenv() preprocessor = TracePreprocessor() from datetime import datetime abs_path = Path(__file__).parent heatmap_explanation = (Path(__file__).parent / "heatmap_explanation.md").read_text() cost_explanation = (Path(__file__).parent / "cost_explanation.md").read_text() def restart_space(): API.restart_space(repo_id=REPO_ID, token=HF_TOKEN) # New function to download results def download_latest_results(): print("Downloading latest results...") snapshot_download(RESULTS_REPO_ID, local_dir= "evals_upload", repo_type='dataset', tqdm_class=None, etag_timeout=30, max_workers=4, ) print("Download complete.") def download_db_files(): snapshot_download(RESULTS_REPO_ID, local_dir= "preprocessed_traces", repo_type='dataset', tqdm_class=None, etag_timeout=30, max_workers=4, allow_patterns="*.db", ) print("Download complete.") def get_analyzed_traces(agent_name, benchmark_name): return preprocessor.get_analyzed_traces(agent_name, benchmark_name) def get_failure_report(agent_name, benchmark_name): return preprocessor.get_failure_report(agent_name, benchmark_name) def parse_json_files(folder_path, benchmark_name, aggregate=True): # Handle inspect_evals prefix if 
benchmark_name.startswith('inspect_evals/'): actual_benchmark = benchmark_name.split('/')[-1] return preprocessor.get_parsed_results(actual_benchmark, aggregate=aggregate) return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate) def update_agent_dropdown(benchmark_name, metric): df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name) agents = df['Agent Name'].tolist() best_agent = get_best_agent(benchmark_name, metric) return gr.Dropdown(choices=agents, value=best_agent, label="Select Agent") def get_best_agent(benchmark_name, metric): df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name) return df.loc[df[metric].idxmax()]['Agent Name'] def update_task_analysis(benchmark_name, agent_name): if not agent_name: return "Please select an agent.", None, None, "" analyzed_traces = get_analyzed_traces(agent_name, benchmark_name) if not analyzed_traces: return f"No analysis available for agent: {agent_name}", None, None, "" task_ids = list(analyzed_traces.keys()) overview, flow_chart, _ = update_task_details(benchmark_name, agent_name, task_ids[0]) return overview, flow_chart, gr.Dropdown(choices=task_ids, value=task_ids[0], label="Select Task"), "" def update_task_details(benchmark_name, agent_name, task_id): if not task_id: return "Please select a task.", None, "" analyzed_traces = get_analyzed_traces(agent_name, benchmark_name) if not analyzed_traces or task_id not in analyzed_traces: return f"No analysis available for task: {task_id}", None, "" analysis = analyzed_traces[task_id] summary = analysis.get('task_analysis', {}) overview = f"### Summary\n\n{summary.get('overview', 'No overview available.')}\n\n" # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n" # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n" # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n" if 
summary.get('overview', 'No overview available.') != "Not available": flow_chart = create_flow_chart(analysis['steps']) else: flow_chart = None return overview, flow_chart, "" def format_call_info(step, step_index): call_data = step['call_data'] analysis = step['analysis'] def format_json(obj): # if isinstance(obj, dict) and 'choices' in obj: # # Special handling for message content # formatted_content = format_message_content(obj['choices'][0]) # return f'
' # else: json_str = json.dumps(obj, indent=2) json_str = json_str.replace(' ', ' ') json_str = json_str.replace('\n', '{m.group(1)}', html_content, flags=re.DOTALL) return html_content formatted_info = f"""
The standardized, cost-aware, and third-party leaderboard for evaluating agents.
Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.
For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.
Agent developers have a conflict of interest when reporting their own accuracy: they want to claim state-of-the-art performance.
We see HAL being useful for four types of users:
Customers looking to deploy agents can get visibility into existing benchmarks, learn which developers are building useful agents, and identify the state of the art in both cost and accuracy for their tasks of interest.
Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.
HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.
Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.
CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
""") gr.HTML("""CORE-Bench-Hard: The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
""") with gr.Row(): with gr.Column(scale=2): Leaderboard( value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]), select_columns=SelectColumns( default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.COREBENCH_HIDE_COLUMNS, search_columns=config.COREBENCH_SEARCH_COLUMNS, datatype='markdown' ) # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('corebench_hard'), 'CORE-Bench-Hard' ), outputs=[task_success_heatmap] ) with gr.Tab("CORE-Bench-Medium"): gr.HTML("""CORE-Bench-Medium: The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
""") with gr.Row(): with gr.Column(scale=2): Leaderboard( value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]), select_columns=SelectColumns( default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.COREBENCH_HIDE_COLUMNS, search_columns=config.COREBENCH_SEARCH_COLUMNS, datatype='markdown' ) # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('corebench_medium'), 'CORE-Bench-Medium' ), outputs=[task_success_heatmap] ) with gr.Tab("CORE-Bench-Easy"): gr.HTML("""CORE-Bench-Easy: The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
""") with gr.Row(): with gr.Column(scale=2): Leaderboard( value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]), select_columns=SelectColumns( default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.COREBENCH_HIDE_COLUMNS, search_columns=config.COREBENCH_SEARCH_COLUMNS, datatype='markdown' ) # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('corebench_easy'), 'CORE-Bench-Easy' ), outputs=[task_success_heatmap] ) with gr.Tab("Cybench"): gr.Markdown("""Cybench is a benchmark for evaluating the cybersecurity capabilities and risks of language models. Cybench includes 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. 
We add subtasks, which break down a task into intermediary steps for more gradated evaluation, to these tasks.""") # Get models used in Cybench benchmark cybench_models = preprocessor.get_models_for_benchmark('cybench') # Initialize pricing inputs dictionary cybench_pricing_inputs = {} with gr.Accordion("Token Pricing Configuration", open=False): for model in DEFAULT_PRICING.keys(): if model in cybench_models: with gr.Row(): gr.Markdown(f"**{model}**") cybench_pricing_inputs[f"{model}_input"] = gr.Number( value=DEFAULT_PRICING[model]["prompt_tokens"], label="Input ($/1M tokens)", step=0.01 ) cybench_pricing_inputs[f"{model}_output"] = gr.Number( value=DEFAULT_PRICING[model]["completion_tokens"], label="Output ($/1M tokens)", step=0.01 ) with gr.Row(): with gr.Column(scale=2): cybench_leaderboard = Leaderboard( value=create_leaderboard( preprocessor.get_parsed_results_with_costs('cybench'), ci_metrics=["Accuracy", "Total Cost"] ), select_columns=SelectColumns( default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.CYBENCH_HIDE_COLUMNS, search_columns=config.CYBENCH_SEARCH_COLUMNS, datatype='markdown' ) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): cybench_scatter_plot = gr.Plot() def update_cybench(*pricing_values): pricing_config = {} values_iter = iter(pricing_values) for model in DEFAULT_PRICING.keys(): if model in cybench_models: pricing_config[model] = { "prompt_tokens": next(values_iter), "completion_tokens": next(values_iter) } return update_visualizations('cybench', pricing_config) # Connect all pricing inputs to the update function cybench_pricing_input_list = [v for v in cybench_pricing_inputs.values()] for input_component in cybench_pricing_input_list: input_component.change( fn=update_cybench, inputs=cybench_pricing_input_list, outputs=[cybench_leaderboard, cybench_scatter_plot] ) # Initialize the scatter plot demo.load( fn=lambda: 
create_scatter_plot( preprocessor.get_parsed_results_with_costs('cybench'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"] ), outputs=cybench_scatter_plot ) # Keep the existing heatmap section gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('cybench'), 'Cybench' ), outputs=[task_success_heatmap] ) with gr.Tab("GAIA"): gr.Markdown("""GAIA is a benchmark for General AI Assistants. GAIA proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency. GAIA is made of more than 450 non-trivial question with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided in 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicate a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.""") # Get models used in GAIA benchmark gaia_models = preprocessor.get_models_for_benchmark('gaia') # Initialize pricing inputs dictionary gaia_pricing_inputs = {} with gr.Accordion("Token Pricing Configuration", open=False): for model in DEFAULT_PRICING.keys(): # Only show pricing inputs for models used in GAIA if model in gaia_models: with gr.Row(): gr.Markdown(f"**{model}**") gaia_pricing_inputs[f"{model}_input"] = gr.Number( value=DEFAULT_PRICING[model]["prompt_tokens"], label="Input ($/1M tokens)", step=0.01 ) gaia_pricing_inputs[f"{model}_output"] = gr.Number( value=DEFAULT_PRICING[model]["completion_tokens"], label="Output ($/1M tokens)", step=0.01 ) with gr.Row(): with gr.Column(scale=2): gaia_leaderboard = Leaderboard( value=create_leaderboard( preprocessor.get_parsed_results_with_costs('gaia'), ci_metrics=["Accuracy", "Total Cost"] ), 
select_columns=SelectColumns( default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.GAIA_HIDE_COLUMNS, search_columns=config.GAIA_SEARCH_COLUMNS, datatype='markdown' ) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): gaia_scatter_plot = gr.Plot() def update_gaia(*pricing_values): pricing_config = {} values_iter = iter(pricing_values) for model in DEFAULT_PRICING.keys(): if model in gaia_models: # Only include used models pricing_config[model] = { "prompt_tokens": next(values_iter), "completion_tokens": next(values_iter) } return update_visualizations('gaia', pricing_config) # Connect all pricing inputs to the update function gaia_pricing_input_list = [v for v in gaia_pricing_inputs.values()] for input_component in gaia_pricing_input_list: input_component.change( fn=update_gaia, inputs=gaia_pricing_input_list, outputs=[gaia_leaderboard, gaia_scatter_plot] ) # Initialize the scatter plot demo.load( fn=lambda: create_scatter_plot( preprocessor.get_parsed_results_with_costs('gaia'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"] ), outputs=gaia_scatter_plot ) # Keep the existing heatmap section gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('gaia'), 'GAIA' ), outputs=[task_success_heatmap] ) with gr.Tab("SWE-bench Verified"): gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. 
We are currently actively developing this platform and this benchmark is not fully implemented yet.""") # Get models used in SWE-bench benchmark swebench_models = preprocessor.get_models_for_benchmark('swebench_verified') # Initialize pricing inputs dictionary swebench_pricing_inputs = {} with gr.Accordion("Token Pricing Configuration", open=False): for model in DEFAULT_PRICING.keys(): if model in swebench_models: with gr.Row(): gr.Markdown(f"**{model}**") swebench_pricing_inputs[f"{model}_input"] = gr.Number( value=DEFAULT_PRICING[model]["prompt_tokens"], label="Input ($/1M tokens)", step=0.01 ) swebench_pricing_inputs[f"{model}_output"] = gr.Number( value=DEFAULT_PRICING[model]["completion_tokens"], label="Output ($/1M tokens)", step=0.01 ) with gr.Row(): with gr.Column(scale=2): swebench_leaderboard = Leaderboard( value=create_leaderboard( preprocessor.get_parsed_results_with_costs('swebench_verified'), ci_metrics=["Accuracy", "Total Cost"] ), select_columns=SelectColumns( default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.SWEBENCH_HIDE_COLUMNS, search_columns=config.SWEBENCH_SEARCH_COLUMNS, datatype='markdown' ) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): swebench_scatter_plot = gr.Plot() gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('swebench_verified'), 'SWE-bench Verified' ), outputs=[task_success_heatmap] ) def update_swebench(*pricing_values): pricing_config = {} values_iter = iter(pricing_values) for model in DEFAULT_PRICING.keys(): if model in swebench_models: pricing_config[model] = { "prompt_tokens": next(values_iter), "completion_tokens": next(values_iter) } return update_visualizations('swebench_verified', pricing_config) # Connect all pricing inputs to the update function 
swebench_pricing_input_list = [v for v in swebench_pricing_inputs.values()] for input_component in swebench_pricing_input_list: input_component.change( fn=update_swebench, inputs=swebench_pricing_input_list, outputs=[swebench_leaderboard, swebench_scatter_plot] ) # Initialize the scatter plot demo.load( fn=lambda: create_scatter_plot( preprocessor.get_parsed_results_with_costs('swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"] ), outputs=swebench_scatter_plot ) # Keep the rest of the SWE-bench tab content... with gr.Tab("SWE-bench Verified (Mini)"): gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The Mini version is a smaller subset of tasks - you can find the specific task IDs included in this subset [here](https://github.com/benediktstroebl/agent-eval-harness/blob/7b231a952828022a43977f21acfd452adda5088c/agent_eval_harness/benchmarks/swebench_verified_mini_task_ids.txt). 
We are currently actively developing this platform and this benchmark is not fully implemented yet.""") with gr.Row(): with gr.Column(scale=2): Leaderboard( value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini'), ci_metrics=["Accuracy", "Total Cost"]), select_columns=SelectColumns( default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.SWEBENCH_HIDE_COLUMNS, search_columns=config.SWEBENCH_SEARCH_COLUMNS, datatype='markdown' ) gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('swebench_verified_mini'), 'SWE-bench Verified (Mini)' ), outputs=[task_success_heatmap] ) with gr.Tab("USACO"): gr.Markdown("""The USA Computing Olympiad (USACO) is a computer programming competition for pre-college students. This benchmark evaluates the performance of AI agents on a set of 307 USACO tasks. 
The agents are evaluated based on the number of tasks correctly solved.""") with gr.Row(): with gr.Column(scale=2): Leaderboard( value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]), select_columns=SelectColumns( default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified", "Traces"], cant_deselect=["Agent Name"], label="Select Columns to Display:", ), hide_columns=config.USACO_HIDE_COLUMNS, search_columns=config.USACO_SEARCH_COLUMNS, datatype='markdown' ) gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"]) with gr.Row(): gr.Markdown(cost_explanation) with gr.Row(): scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"])) gr.HTML('') gr.Markdown(heatmap_explanation) with gr.Row(): task_success_heatmap = gr.Plot() demo.load( lambda: create_task_success_heatmap( preprocessor.get_task_success_data('usaco'), 'USACO' ), outputs=[task_success_heatmap] ) # gr.HTML(""" # # """) # with gr.Group(elem_classes=["grouped-section"]): # gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor") # gr.Markdown('The agent monitor provides an overview of the recurring errors an agent makes as well as a summary of the steps the agent takes to solve a task. It currently consists of two main components:') # gr.HTML('') # gr.Markdown("## Failure report for each agent") # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. 
Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.') # gr.HTML('') # with gr.Row(): # with gr.Column(scale=1): # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report") # gr.HTML('') # with gr.Row(): # with gr.Column(scale=1): # failure_categories_overview = gr.Markdown() # with gr.Column(scale=1): # failure_categories_chart = gr.Plot() # # Initialize the failure report agent dropdown with all agents # demo.load(update_agent_dropdown, # inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], # outputs=[failure_report_agent_dropdown]) # # Update failure report when agent is selected # failure_report_agent_dropdown.change(update_failure_report, # inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)], # outputs=[failure_categories_overview, failure_categories_chart]) # gr.HTML('') # gr.Markdown("## Task overview") # gr.HTML('') # with gr.Row(): # with gr.Column(scale=1): # agent_dropdown = gr.Dropdown(label="Select Agent") # with gr.Column(scale=1): # task_dropdown = gr.Dropdown(label="Select USACO Task") # gr.HTML('') # with gr.Row(): # task_overview = gr.Markdown() # with gr.Row(): # flow_chart = gr.Plot(label="Task Flow") # # Initialize the agent dropdown with the best agent # demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown]) # demo.load(update_task_analysis, inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) # agent_dropdown.change(update_task_analysis, # inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], # outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)]) # task_dropdown.change(update_task_details, # inputs=[gr.Textbox(value="usaco", 
visible=False), agent_dropdown, task_dropdown], # outputs=[task_overview, flow_chart, gr.Textbox(visible=False)]) # gr.Markdown("## Raw predictions") # gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.') # with gr.Accordion("Expand to inspect raw predictions of agents...", open=False): # with gr.Row(): # with gr.Column(scale=1): # raw_agent_dropdown = gr.Dropdown(label="Select Agent") # with gr.Column(scale=1): # raw_task_dropdown = gr.Dropdown(label="Select Task") # with gr.Column(scale=1): # raw_step_dropdown = gr.Dropdown(label="Select Step") # with gr.Row(): # raw_call_details = gr.HTML() # def update_raw_task_dropdown(agent_name): # analyzed_traces = get_analyzed_traces(agent_name, "usaco") # if not analyzed_traces: # return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}." # task_ids = list(analyzed_traces.keys()) # steps = analyzed_traces[task_ids[0]]['steps'] # return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0) # def update_raw_step_dropdown(agent_name, task_id): # analyzed_traces = get_analyzed_traces(agent_name, "usaco") # if not analyzed_traces or task_id not in analyzed_traces: # return gr.Dropdown(choices=[], label="Select Step", value="No data available.") # steps = analyzed_traces[task_id]['steps'] # return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0) # def update_raw_call_details(agent_name, task_id, step_index): # analyzed_traces = get_analyzed_traces(agent_name, "usaco") # if not analyzed_traces or task_id not in analyzed_traces: # return "No data available 
for this selection." # steps = analyzed_traces[task_id]['steps'] # if step_index is None: # return "Invalid step selection." # step = steps[step_index] # return format_call_info(step, step_index) # # Initialize the raw agent dropdown with all agents # demo.load(update_agent_dropdown, # inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], # outputs=[raw_agent_dropdown]) # demo.load(update_raw_task_dropdown, # inputs=[raw_agent_dropdown], # outputs=[raw_task_dropdown, raw_step_dropdown]) # demo.load(update_raw_call_details, # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], # outputs=[raw_call_details]) # raw_agent_dropdown.change(update_raw_task_dropdown, # inputs=[raw_agent_dropdown], # outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details]) # raw_task_dropdown.change(update_raw_step_dropdown, # inputs=[raw_agent_dropdown, raw_task_dropdown], # outputs=[raw_step_dropdown, raw_call_details]) # raw_step_dropdown.change(update_raw_call_details, # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], # outputs=[raw_call_details]) # Will trigger autoscaling of plots when tabs are switched tabs.select(fn=None, inputs=None, outputs=None, js=""" function() { setTimeout(function() { window.dispatchEvent(new Event('resize')); }, 100); } """) # gr.HTML("""