# Spaces: agent-evals/leaderboard (Running)
# File size: 3,804 Bytes

import pandas as pd

# Positional type tags: one "str" entry followed by two "number" entries.
# NOTE(review): presumably per-column datatypes for a leaderboard table —
# confirm against the code that consumes TYPES.
TYPES = ["str", "number", "number"]

# Column configuration for the SWEBENCH leaderboard.
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
SWEBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]

SWEBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

SWEBENCH_HIDE_COLUMNS = [
    # Metric columns.
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    # Score columns.
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    # Per-level accuracy columns.
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column configuration for the USACO leaderboard.
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
USACO_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]

USACO_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

USACO_HIDE_COLUMNS = [
    # Metric columns.
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    # Score columns.
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    # Per-level accuracy columns.
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column configuration for the COREBENCH leaderboard.
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
COREBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]

COREBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

COREBENCH_HIDE_COLUMNS = [
    # Metric columns.
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    # Score columns.
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    # Per-level accuracy columns.
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]



# Column configuration for the MLAGENTBENCH leaderboard. Unlike the
# accuracy-based lists in this file, the on-load list uses "Overall Score"
# and the hide list contains "Accuracy".
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
MLAGENTBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Overall Score", "Total Cost"]

MLAGENTBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

MLAGENTBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Accuracy",
]


# Maps a display label to a right-closed pandas Interval, i.e. (lower, upper].
# Consecutive buckets share an endpoint, so together they cover (-1, 10000]
# with no gaps or overlaps; dict order follows ascending bounds.
# NOTE(review): what quantity these buckets measure is not visible in this
# file — confirm with the caller before relying on the labels.
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in (
        ("?", -1, 0),
        ("~1.5", 0, 2),
        ("~3", 2, 4),
        ("~7", 4, 9),
        ("~13", 9, 20),
        ("~35", 20, 45),
        ("~60", 45, 70),
        ("70+", 70, 10000),
    )
}

# Column configuration for the CYBENCH leaderboard.
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
CYBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]

CYBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

CYBENCH_HIDE_COLUMNS = [
    # Metric columns.
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    # Score columns.
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    # Per-level accuracy columns.
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column configuration for the APPWORLD leaderboard. The on-load list carries
# one extra column, "Scenario Goal Completion", after the usual four.
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
APPWORLD_ON_LOAD_COLUMNS = [
    "Agent Name", "Accuracy", "Total Cost", "Runs",
    "Scenario Goal Completion",
]

APPWORLD_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

APPWORLD_HIDE_COLUMNS = [
    # Metric columns.
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    # Score columns.
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    # Per-level accuracy columns.
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column configuration for the GAIA leaderboard. The three "Level N Accuracy"
# columns appear in the on-load list here and are therefore absent from the
# hide list.
# NOTE(review): judging by the names only, ON_LOAD lists the columns shown
# initially, SEARCH the columns used for searching, and HIDE the columns kept
# out of the view — confirm against the code that consumes these lists.
GAIA_ON_LOAD_COLUMNS = [
    "Agent Name",
    "Accuracy",
    # Per-level accuracy columns.
    "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy",
    "Total Cost",
    "Runs",
]

GAIA_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]

GAIA_HIDE_COLUMNS = [
    # Metric columns.
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    # Score columns.
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
]