File size: 3,804 Bytes
7c691e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201da5d
7c691e6
 
 
 
 
 
 
 
201da5d
7c691e6
 
 
 
 
 
 
 
201da5d
7c691e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201da5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56a86ce
201da5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd

# Three type tags: one "str" followed by two "number" entries.
# NOTE(review): presumably per-column datatypes for a table widget — confirm
# against the code that consumes TYPES.
TYPES = ["str", "number", "number"]

# Column-name lists for the SWEBENCH table.
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
SWEBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
SWEBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
SWEBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column-name lists for the USACO table.
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
USACO_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
USACO_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
USACO_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column-name lists for the COREBENCH table.
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
COREBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
COREBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
COREBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]



# Column-name lists for the MLAGENTBENCH table.  Unlike the accuracy-based
# tables in this file, the on-load list carries "Overall Score" and the hide
# list carries "Accuracy".
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
MLAGENTBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Overall Score", "Total Cost"]
MLAGENTBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
MLAGENTBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Accuracy",
]


# Maps a display label to a right-closed interval ``(lower, upper]``.
# Consecutive bins share an endpoint, so together they tile (-1, 10000]
# with no gaps and no overlaps.
# NOTE(review): the unit of these bounds is not stated in this file — confirm
# with the code that consumes NUMERIC_INTERVALS.
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in (
        ("?", -1, 0),
        ("~1.5", 0, 2),
        ("~3", 2, 4),
        ("~7", 4, 9),
        ("~13", 9, 20),
        ("~35", 20, 45),
        ("~60", 45, 70),
        ("70+", 70, 10000),
    )
}

# Column-name lists for the CYBENCH table.
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
CYBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
CYBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
CYBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column-name lists for the APPWORLD table.  The on-load list has one entry
# beyond the four shared with the other accuracy-based tables in this file:
# "Scenario Goal Completion".
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
APPWORLD_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs", "Scenario Goal Completion"]
APPWORLD_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
APPWORLD_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]

# Column-name lists for the GAIA table.  The three "Level N Accuracy" columns
# appear in the on-load list and are absent from the hide list.
# NOTE(review): the ON_LOAD / SEARCH / HIDE roles are inferred from the
# constant names only; the consuming code is not visible in this file.
GAIA_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy", "Total Cost", "Runs"]
GAIA_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
GAIA_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
]