Spaces:
Running
Running
Yuxuan-Zhang-Dexter
committed on
Commit
·
dafeb92
1
Parent(s):
bbddc27
update model names
Browse files
- assets/model_color.json +2 -2
- rank_data_03_25_2025.json +8 -8
assets/model_color.json
CHANGED
@@ -13,8 +13,8 @@
|
|
13 |
"o1-2024-12-17": "#4DB6AC",
|
14 |
"o1-mini-2024-09-12": "#26A69A",
|
15 |
"o3-mini-2025-01-31(medium)": "#80CBC4",
|
16 |
-
"o3": "#26C6DA",
|
17 |
-
"o4-mini": "#00ACC1",
|
18 |
"grok-3-beta": "#FF7043",
|
19 |
"deepseek-v3": "#FFC107",
|
20 |
"deepseek-r1": "#FFA000",
|
|
|
13 |
"o1-2024-12-17": "#4DB6AC",
|
14 |
"o1-mini-2024-09-12": "#26A69A",
|
15 |
"o3-mini-2025-01-31(medium)": "#80CBC4",
|
16 |
+
"o3-2025-04-16": "#26C6DA",
|
17 |
+
"o4-mini-2025-04-16": "#00ACC1",
|
18 |
"grok-3-beta": "#FF7043",
|
19 |
"deepseek-v3": "#FFC107",
|
20 |
"deepseek-r1": "#FFA000",
|
rank_data_03_25_2025.json
CHANGED
@@ -50,7 +50,7 @@
|
|
50 |
"runs": 1,
|
51 |
"results": [
|
52 |
{
|
53 |
-
"model": "o3",
|
54 |
"score": 256,
|
55 |
"steps": 108,
|
56 |
"time": "58:09",
|
@@ -120,7 +120,7 @@
|
|
120 |
"rank": 10
|
121 |
},
|
122 |
{
|
123 |
-
"model": "o4-mini",
|
124 |
"score": 128,
|
125 |
"steps": "",
|
126 |
"time": "",
|
@@ -218,14 +218,14 @@
|
|
218 |
"runs": 3,
|
219 |
"results": [
|
220 |
{
|
221 |
-
"model": "o4-mini",
|
222 |
"score_runs": "123,131",
|
223 |
"average_score": 127,
|
224 |
"steps": 25,
|
225 |
"rank": 1
|
226 |
},
|
227 |
{
|
228 |
-
"model": "o3",
|
229 |
"score_runs": "115, 122",
|
230 |
"average_score": 118.5,
|
231 |
"steps": 25,
|
@@ -321,7 +321,7 @@
|
|
321 |
"runs": 3,
|
322 |
"results": [
|
323 |
{
|
324 |
-
"model": "o3",
|
325 |
"levels_cracked": "5",
|
326 |
"steps": "[16, 40, 59, 110]",
|
327 |
"rank": 1
|
@@ -345,7 +345,7 @@
|
|
345 |
"rank": 4
|
346 |
},
|
347 |
{
|
348 |
-
"model": "o4-mini",
|
349 |
"levels_cracked": "2",
|
350 |
"steps": "",
|
351 |
"rank": 5
|
@@ -434,7 +434,7 @@
|
|
434 |
"note": "stuck at the end not present evidence"
|
435 |
},
|
436 |
{
|
437 |
-
"model": "o3",
|
438 |
"levels_cracked": "3",
|
439 |
"lives_left": "[5, 3, 3, 0]",
|
440 |
"cracked_details": "4: 4/8",
|
@@ -506,7 +506,7 @@
|
|
506 |
"note": "stuck in the 3rd evidence present"
|
507 |
},
|
508 |
{
|
509 |
-
"model": "o4-mini",
|
510 |
"levels_cracked": "0",
|
511 |
"lives_left": "0",
|
512 |
"cracked_details": "1:1/5",
|
|
|
50 |
"runs": 1,
|
51 |
"results": [
|
52 |
{
|
53 |
+
"model": "o3-2025-04-16",
|
54 |
"score": 256,
|
55 |
"steps": 108,
|
56 |
"time": "58:09",
|
|
|
120 |
"rank": 10
|
121 |
},
|
122 |
{
|
123 |
+
"model": "o4-mini-2025-04-16",
|
124 |
"score": 128,
|
125 |
"steps": "",
|
126 |
"time": "",
|
|
|
218 |
"runs": 3,
|
219 |
"results": [
|
220 |
{
|
221 |
+
"model": "o4-mini-2025-04-16",
|
222 |
"score_runs": "123,131",
|
223 |
"average_score": 127,
|
224 |
"steps": 25,
|
225 |
"rank": 1
|
226 |
},
|
227 |
{
|
228 |
+
"model": "o3-2025-04-16",
|
229 |
"score_runs": "115, 122",
|
230 |
"average_score": 118.5,
|
231 |
"steps": 25,
|
|
|
321 |
"runs": 3,
|
322 |
"results": [
|
323 |
{
|
324 |
+
"model": "o3-2025-04-16",
|
325 |
"levels_cracked": "5",
|
326 |
"steps": "[16, 40, 59, 110]",
|
327 |
"rank": 1
|
|
|
345 |
"rank": 4
|
346 |
},
|
347 |
{
|
348 |
+
"model": "o4-mini-2025-04-16",
|
349 |
"levels_cracked": "2",
|
350 |
"steps": "",
|
351 |
"rank": 5
|
|
|
434 |
"note": "stuck at the end not present evidence"
|
435 |
},
|
436 |
{
|
437 |
+
"model": "o3-2025-04-16",
|
438 |
"levels_cracked": "3",
|
439 |
"lives_left": "[5, 3, 3, 0]",
|
440 |
"cracked_details": "4: 4/8",
|
|
|
506 |
"note": "stuck in the 3rd evidence present"
|
507 |
},
|
508 |
{
|
509 |
+
"model": "o4-mini-2025-04-16",
|
510 |
"levels_cracked": "0",
|
511 |
"lives_left": "0",
|
512 |
"cracked_details": "1:1/5",
|