Zachary Siegel committed on
Commit
fd01afd
·
1 Parent(s): 46c98e2

update corebench results

Browse files
Files changed (26) hide show
  1. .gitignore +0 -2
  2. evals_live/{corebench_easy_autogpt_gpt4o-mini.json → corebench_easy_autogpt_gpt-4o-mini_20241117.json} +2 -2
  3. evals_live/{corebench_easy_autogpt_gpt4o.json → corebench_easy_autogpt_gpt-4o_20241117.json} +2 -2
  4. evals_live/{corebench_hard_coreagent_o1-mini.json → corebench_easy_core-agent_gpt-4o-mini_20241117.json} +1 -1
  5. evals_live/{corebench_easy_coreagent_gpt4o-mini.json → corebench_easy_core-agent_gpt-4o_20241117.json} +2 -2
  6. evals_live/{corebench_easy_coreagent_gpt4o.json → corebench_hard_autogpt_gpt-4o-mini_20241116.json} +2 -2
  7. evals_live/corebench_hard_autogpt_gpt-4o_20241116.json +3 -0
  8. evals_live/corebench_hard_autogpt_gpt4o-mini.json +0 -3
  9. evals_live/corebench_hard_autogpt_gpt4o.json +0 -3
  10. evals_live/corebench_hard_core-agent_claude-3_5-sonnet_20241116.json +3 -0
  11. evals_live/corebench_hard_core-agent_gpt-4o-mini_20241116.json +3 -0
  12. evals_live/corebench_hard_core-agent_gpt-4o_20241116.json +3 -0
  13. evals_live/corebench_hard_core-agent_o1-mini_20241116.json +3 -0
  14. evals_live/corebench_hard_coreagent_claude-35-sonnet.json +0 -3
  15. evals_live/corebench_hard_coreagent_gpt4o-mini.json +0 -3
  16. evals_live/corebench_hard_coreagent_gpt4o.json +0 -3
  17. evals_live/corebench_medium_autogpt_gpt-4o-mini_20241117.json +3 -0
  18. evals_live/corebench_medium_autogpt_gpt-4o_20241117.json +3 -0
  19. evals_live/corebench_medium_autogpt_gpt4o-mini.json +0 -3
  20. evals_live/corebench_medium_autogpt_gpt4o.json +0 -3
  21. evals_live/corebench_medium_core-agent_gpt-4o-mini_20241117.json +3 -0
  22. evals_live/corebench_medium_core-agent_gpt-4o_20241117.json +3 -0
  23. evals_live/corebench_medium_coreagent_gpt4o-mini.json +0 -3
  24. evals_live/corebench_medium_coreagent_gpt4o.json +0 -3
  25. evals_live/inspect_evalsswe_bench_1729180046_UPLOAD (1).json +3 -0
  26. evals_live/usaco_USACO_Episodic_gpt-4o-mini-2024-07-18_172342962.json +3 -0
.gitignore CHANGED
@@ -1,8 +1,6 @@
1
  __pycache__/**
2
  /evals_processed
3
  /evals_upload
4
- /evals_live
5
  /utils/__pycache__
6
  /agent_monitor/__pycache__
7
  preprocessed_traces.db
8
- /evals_live
 
1
  __pycache__/**
2
  /evals_processed
3
  /evals_upload
 
4
  /utils/__pycache__
5
  /agent_monitor/__pycache__
6
  preprocessed_traces.db
 
evals_live/{corebench_easy_autogpt_gpt4o-mini.json → corebench_easy_autogpt_gpt-4o-mini_20241117.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7c5c25d4b26a725ba5dd52dfc34df003ca6b6beb1136814b86bc89416fcc18f
3
- size 1554
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:388d35e786f38b7cbee4f30c91c4b747abb5376c5b53d501c4e38b0c7dce62d8
3
+ size 1448
evals_live/{corebench_easy_autogpt_gpt4o.json → corebench_easy_autogpt_gpt-4o_20241117.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67194b74b04e00aef5ab5c4ec79aae6308242a64ff777a04c553fa19517189b2
3
- size 1544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd831e7b4b6f25b72918ca5ac5383ac406a2ade6893fb9250c552f7ea4973c31
3
+ size 1445
evals_live/{corebench_hard_coreagent_o1-mini.json → corebench_easy_core-agent_gpt-4o-mini_20241117.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bc1c5617b4a667754c3da757af96dbaef52be140e2a516c4327b2c7c60df7e8
3
  size 1461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba6975a6b8fd8c21652b3e6a0b0753edd4277980fc0d436ceed860289a9a6660
3
  size 1461
evals_live/{corebench_easy_coreagent_gpt4o-mini.json → corebench_easy_core-agent_gpt-4o_20241117.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37bf82f1a3d8e942088194e54d6789e0d69a088874372511ad32c4b5a4b2bc1e
3
- size 1559
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff864519fce95e441bba6a40ad2ef318cd09fc068d5d828d05b840109530fd82
3
+ size 1451
evals_live/{corebench_easy_coreagent_gpt4o.json → corebench_hard_autogpt_gpt-4o-mini_20241116.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91f26a427d4b27415aa1b077d61a192791cbe1c4f285d562b445118e38238232
3
- size 1549
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:277730926ab9f8e32374d3e8d4c0e1dcc34abb8c44a5102dc0b8e0b7c8b6e1c2
3
+ size 1456
evals_live/corebench_hard_autogpt_gpt-4o_20241116.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38e45af6eb350539935b6274881a15417f355742c1ed218fab9283eb46122734
3
+ size 1438
evals_live/corebench_hard_autogpt_gpt4o-mini.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7db6d6a712796f7853a71fd23f0e680db726902485eb104a205d9f91ff2bb1ca
3
- size 1555
 
 
 
 
evals_live/corebench_hard_autogpt_gpt4o.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b25c527daa532f3a724b3dffb97e6d3e1749185353203083b5d37000565824e0
3
- size 1543
 
 
 
 
evals_live/corebench_hard_core-agent_claude-3_5-sonnet_20241116.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2aed8e6a9228b2ccfdfbfb31690d0a6892f274f2bca7981a097ad2985fc2dd0d
3
+ size 1474
evals_live/corebench_hard_core-agent_gpt-4o-mini_20241116.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:502d2787bc0ed86175a22e37146425c7caf2e6aa80181cda780b68ababc8fe30
3
+ size 1461
evals_live/corebench_hard_core-agent_gpt-4o_20241116.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f033c04d67cf33164e2ff140a7cad7fe918284cbdbcfa889d5de3eb58971818
3
+ size 1452
evals_live/corebench_hard_core-agent_o1-mini_20241116.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3ef73cfe040f4e14066e51b8c9389ea8979568cd14c9855556322138e041df8
3
+ size 1454
evals_live/corebench_hard_coreagent_claude-35-sonnet.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec0018f552500ca07bfb6a1451f848b4d009be35f0e016c0e0461467373b8fb7
3
- size 1450
 
 
 
 
evals_live/corebench_hard_coreagent_gpt4o-mini.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:580936859a1d1e49be19658c85e1203b6343d6dd466a3f629d48202a697d3102
3
- size 1559
 
 
 
 
evals_live/corebench_hard_coreagent_gpt4o.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:185d3f7edda2cb66bdb43e6369a4210a4247b5680ce74d7474a116097286c5f1
3
- size 1549
 
 
 
 
evals_live/corebench_medium_autogpt_gpt-4o-mini_20241117.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac8182bae5d5232772aa41db3ff01af64abd8d7887f7eed299d3857e53d0ff52
3
+ size 1460
evals_live/corebench_medium_autogpt_gpt-4o_20241117.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f66dc99fc03fbf39071ca80d5c3442c33e611ef6a11f7b306382f6ab41157548
3
+ size 1449
evals_live/corebench_medium_autogpt_gpt4o-mini.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ecf9980643ac88ff0ce3abf2a3abbd18819e4e1db13e1538f49026acf8c01c8
3
- size 1561
 
 
 
 
evals_live/corebench_medium_autogpt_gpt4o.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:63b801c2d998515a03115b7aaf3f5e68a537f7bd2499fe350eea6f1423810ab5
3
- size 1550
 
 
 
 
evals_live/corebench_medium_core-agent_gpt-4o-mini_20241117.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a55797f20e87a7445c32f24860d4de495b49d3fe74d3f747eadaa39eef2e665
3
+ size 1466
evals_live/corebench_medium_core-agent_gpt-4o_20241117.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1e61f3d7d5140973ddb9bccf17bac86c6e18db2b6332a4ba61b776a716877e
3
+ size 1456
evals_live/corebench_medium_coreagent_gpt4o-mini.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:988848a4cdf07facf6e1fe74721e2f4973136e0773fa3a61eb0e6c86273c52ea
3
- size 1566
 
 
 
 
evals_live/corebench_medium_coreagent_gpt4o.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2cc16c05d31212d2c59a9ac2bafb30081620ff7d61814b4c9e5005b52ec911c
3
- size 1556
 
 
 
 
evals_live/inspect_evalsswe_bench_1729180046_UPLOAD (1).json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9751c8ce22bbfb3e1f7003097c089ac19576f5a3796acd056687adc56ec85c93
3
+ size 25215800
evals_live/usaco_USACO_Episodic_gpt-4o-mini-2024-07-18_172342962.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a7fdc0fa5f1a34caa83aaeea58d25c90b4c2f5f149ecfc3f6c4a606cb20791f
3
+ size 435042276