diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -19,6 +19,10 @@
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 # import weave
 from utils.db import TracePreprocessor
 from gradio.themes.soft import Soft
+from utils.db import DEFAULT_PRICING
+from dotenv import load_dotenv
+
+load_dotenv()
 
 preprocessor = TracePreprocessor()
@@ -40,6 +44,17 @@ def download_latest_results():
         max_workers=4,
     )
     print("Download complete.")
+
+def download_db_files():
+    snapshot_download(RESULTS_REPO_ID,
+                      local_dir="preprocessed_traces",
+                      repo_type='dataset',
+                      tqdm_class=None,
+                      etag_timeout=30,
+                      max_workers=4,
+                      allow_patterns="*.db",
+                      )
+    print("Download complete.")
 
 
 def get_analyzed_traces(agent_name, benchmark_name):
@@ -251,8 +266,32 @@ class MyTheme(Soft):
 
 my_theme = MyTheme()
 
+# Add after the other helper functions, before the UI code
+def update_visualizations(benchmark_name, pricing_config):
+    """Update leaderboard and scatter plot with new pricing"""
+    # Get updated results with new pricing
+    results_df = preprocessor.get_parsed_results_with_costs(benchmark_name, pricing_config)
+
+    # Create updated leaderboard
+    leaderboard_df = create_leaderboard(
+        results_df,
+        ci_metrics=["Accuracy", "Total Cost"]
+    )
+
+    # Create updated scatter plot
+    scatter_fig = create_scatter_plot(
+        results_df,
+        "Total Cost",
+        "Accuracy",
+        "Total Cost (in USD)",
+        "Accuracy",
+        ["Agent Name"]
+    )
+
+    return leaderboard_df, scatter_fig
+
 with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderboard") as demo:
-    # gr.Markdown((Path(__file__).parent / "header.md").read_text(), elem_classes=["text-large"])
+    # First add the header HTML
     gr.HTML("""
         Holistic Agent Leaderboard (HAL)
-        A standardized, cost-aware, and third-party leaderboard for evaluating agents.
+        The standardized, cost-aware, and third-party leaderboard for evaluating agents.
""") + + # Add the about section as an accordion + with gr.Accordion("About HAL", open=False, elem_classes=["about-section"]): + gr.Markdown((Path(__file__).parent / "about.md").read_text()) + + # Continue with the features HTML gr.HTML(""" - + -
         Standardized
         Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.
 
         Cost-controlled
         For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.
 
         Third-party
         Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.
 
         Who is it for?
         We see HAL being useful for four types of users:
 
         Downstream Users & Procurers
         Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.