Spaces:
Running
Running
Zachary Siegel
committed on
Commit
·
fd01afd
1
Parent(s):
46c98e2
update corebench results
Browse files
- .gitignore +0 -2
- evals_live/{corebench_easy_autogpt_gpt4o-mini.json → corebench_easy_autogpt_gpt-4o-mini_20241117.json} +2 -2
- evals_live/{corebench_easy_autogpt_gpt4o.json → corebench_easy_autogpt_gpt-4o_20241117.json} +2 -2
- evals_live/{corebench_hard_coreagent_o1-mini.json → corebench_easy_core-agent_gpt-4o-mini_20241117.json} +1 -1
- evals_live/{corebench_easy_coreagent_gpt4o-mini.json → corebench_easy_core-agent_gpt-4o_20241117.json} +2 -2
- evals_live/{corebench_easy_coreagent_gpt4o.json → corebench_hard_autogpt_gpt-4o-mini_20241116.json} +2 -2
- evals_live/corebench_hard_autogpt_gpt-4o_20241116.json +3 -0
- evals_live/corebench_hard_autogpt_gpt4o-mini.json +0 -3
- evals_live/corebench_hard_autogpt_gpt4o.json +0 -3
- evals_live/corebench_hard_core-agent_claude-3_5-sonnet_20241116.json +3 -0
- evals_live/corebench_hard_core-agent_gpt-4o-mini_20241116.json +3 -0
- evals_live/corebench_hard_core-agent_gpt-4o_20241116.json +3 -0
- evals_live/corebench_hard_core-agent_o1-mini_20241116.json +3 -0
- evals_live/corebench_hard_coreagent_claude-35-sonnet.json +0 -3
- evals_live/corebench_hard_coreagent_gpt4o-mini.json +0 -3
- evals_live/corebench_hard_coreagent_gpt4o.json +0 -3
- evals_live/corebench_medium_autogpt_gpt-4o-mini_20241117.json +3 -0
- evals_live/corebench_medium_autogpt_gpt-4o_20241117.json +3 -0
- evals_live/corebench_medium_autogpt_gpt4o-mini.json +0 -3
- evals_live/corebench_medium_autogpt_gpt4o.json +0 -3
- evals_live/corebench_medium_core-agent_gpt-4o-mini_20241117.json +3 -0
- evals_live/corebench_medium_core-agent_gpt-4o_20241117.json +3 -0
- evals_live/corebench_medium_coreagent_gpt4o-mini.json +0 -3
- evals_live/corebench_medium_coreagent_gpt4o.json +0 -3
- evals_live/inspect_evalsswe_bench_1729180046_UPLOAD (1).json +3 -0
- evals_live/usaco_USACO_Episodic_gpt-4o-mini-2024-07-18_172342962.json +3 -0
.gitignore
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
__pycache__/**
|
2 |
/evals_processed
|
3 |
/evals_upload
|
4 |
-
/evals_live
|
5 |
/utils/__pycache__
|
6 |
/agent_monitor/__pycache__
|
7 |
preprocessed_traces.db
|
8 |
-
/evals_live
|
|
|
1 |
__pycache__/**
|
2 |
/evals_processed
|
3 |
/evals_upload
|
|
|
4 |
/utils/__pycache__
|
5 |
/agent_monitor/__pycache__
|
6 |
preprocessed_traces.db
|
|
evals_live/{corebench_easy_autogpt_gpt4o-mini.json → corebench_easy_autogpt_gpt-4o-mini_20241117.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:388d35e786f38b7cbee4f30c91c4b747abb5376c5b53d501c4e38b0c7dce62d8
|
3 |
+
size 1448
|
evals_live/{corebench_easy_autogpt_gpt4o.json → corebench_easy_autogpt_gpt-4o_20241117.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd831e7b4b6f25b72918ca5ac5383ac406a2ade6893fb9250c552f7ea4973c31
|
3 |
+
size 1445
|
evals_live/{corebench_hard_coreagent_o1-mini.json → corebench_easy_core-agent_gpt-4o-mini_20241117.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1461
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba6975a6b8fd8c21652b3e6a0b0753edd4277980fc0d436ceed860289a9a6660
|
3 |
size 1461
|
evals_live/{corebench_easy_coreagent_gpt4o-mini.json → corebench_easy_core-agent_gpt-4o_20241117.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff864519fce95e441bba6a40ad2ef318cd09fc068d5d828d05b840109530fd82
|
3 |
+
size 1451
|
evals_live/{corebench_easy_coreagent_gpt4o.json → corebench_hard_autogpt_gpt-4o-mini_20241116.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:277730926ab9f8e32374d3e8d4c0e1dcc34abb8c44a5102dc0b8e0b7c8b6e1c2
|
3 |
+
size 1456
|
evals_live/corebench_hard_autogpt_gpt-4o_20241116.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38e45af6eb350539935b6274881a15417f355742c1ed218fab9283eb46122734
|
3 |
+
size 1438
|
evals_live/corebench_hard_autogpt_gpt4o-mini.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7db6d6a712796f7853a71fd23f0e680db726902485eb104a205d9f91ff2bb1ca
|
3 |
-
size 1555
|
|
|
|
|
|
|
|
evals_live/corebench_hard_autogpt_gpt4o.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b25c527daa532f3a724b3dffb97e6d3e1749185353203083b5d37000565824e0
|
3 |
-
size 1543
|
|
|
|
|
|
|
|
evals_live/corebench_hard_core-agent_claude-3_5-sonnet_20241116.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2aed8e6a9228b2ccfdfbfb31690d0a6892f274f2bca7981a097ad2985fc2dd0d
|
3 |
+
size 1474
|
evals_live/corebench_hard_core-agent_gpt-4o-mini_20241116.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:502d2787bc0ed86175a22e37146425c7caf2e6aa80181cda780b68ababc8fe30
|
3 |
+
size 1461
|
evals_live/corebench_hard_core-agent_gpt-4o_20241116.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f033c04d67cf33164e2ff140a7cad7fe918284cbdbcfa889d5de3eb58971818
|
3 |
+
size 1452
|
evals_live/corebench_hard_core-agent_o1-mini_20241116.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a3ef73cfe040f4e14066e51b8c9389ea8979568cd14c9855556322138e041df8
|
3 |
+
size 1454
|
evals_live/corebench_hard_coreagent_claude-35-sonnet.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:ec0018f552500ca07bfb6a1451f848b4d009be35f0e016c0e0461467373b8fb7
|
3 |
-
size 1450
|
|
|
|
|
|
|
|
evals_live/corebench_hard_coreagent_gpt4o-mini.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:580936859a1d1e49be19658c85e1203b6343d6dd466a3f629d48202a697d3102
|
3 |
-
size 1559
|
|
|
|
|
|
|
|
evals_live/corebench_hard_coreagent_gpt4o.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:185d3f7edda2cb66bdb43e6369a4210a4247b5680ce74d7474a116097286c5f1
|
3 |
-
size 1549
|
|
|
|
|
|
|
|
evals_live/corebench_medium_autogpt_gpt-4o-mini_20241117.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac8182bae5d5232772aa41db3ff01af64abd8d7887f7eed299d3857e53d0ff52
|
3 |
+
size 1460
|
evals_live/corebench_medium_autogpt_gpt-4o_20241117.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f66dc99fc03fbf39071ca80d5c3442c33e611ef6a11f7b306382f6ab41157548
|
3 |
+
size 1449
|
evals_live/corebench_medium_autogpt_gpt4o-mini.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:6ecf9980643ac88ff0ce3abf2a3abbd18819e4e1db13e1538f49026acf8c01c8
|
3 |
-
size 1561
|
|
|
|
|
|
|
|
evals_live/corebench_medium_autogpt_gpt4o.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:63b801c2d998515a03115b7aaf3f5e68a537f7bd2499fe350eea6f1423810ab5
|
3 |
-
size 1550
|
|
|
|
|
|
|
|
evals_live/corebench_medium_core-agent_gpt-4o-mini_20241117.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a55797f20e87a7445c32f24860d4de495b49d3fe74d3f747eadaa39eef2e665
|
3 |
+
size 1466
|
evals_live/corebench_medium_core-agent_gpt-4o_20241117.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f1e61f3d7d5140973ddb9bccf17bac86c6e18db2b6332a4ba61b776a716877e
|
3 |
+
size 1456
|
evals_live/corebench_medium_coreagent_gpt4o-mini.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:988848a4cdf07facf6e1fe74721e2f4973136e0773fa3a61eb0e6c86273c52ea
|
3 |
-
size 1566
|
|
|
|
|
|
|
|
evals_live/corebench_medium_coreagent_gpt4o.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:a2cc16c05d31212d2c59a9ac2bafb30081620ff7d61814b4c9e5005b52ec911c
|
3 |
-
size 1556
|
|
|
|
|
|
|
|
evals_live/inspect_evalsswe_bench_1729180046_UPLOAD (1).json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9751c8ce22bbfb3e1f7003097c089ac19576f5a3796acd056687adc56ec85c93
|
3 |
+
size 25215800
|
evals_live/usaco_USACO_Episodic_gpt-4o-mini-2024-07-18_172342962.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a7fdc0fa5f1a34caa83aaeea58d25c90b4c2f5f149ecfc3f6c4a606cb20791f
|
3 |
+
size 435042276
|