Presidentlin committed on
Commit
2336e04
·
1 Parent(s): e955c4c
Files changed (1) hide show
  1. src/lib/benchmarks/openai.ts +151 -12
src/lib/benchmarks/openai.ts CHANGED
@@ -71,15 +71,34 @@ export const openaiBenchmarks: Benchmark[] = [
71
  provider: "OpenAI",
72
  inputPrice: 2.0,
73
  outputPrice: 8.0,
74
- source: "https://github.com/openai/simple-evals",
75
  benchmark: {
76
  mmlu: 90.2,
77
  gpqa: 66.3,
 
78
  humaneval: 94.5,
79
  simpleqa: 41.6,
80
- // math: 82.1,
81
- // mgsm: 86.9,
82
- // drop: 79.4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  },
84
  },
85
  {
@@ -87,15 +106,21 @@ export const openaiBenchmarks: Benchmark[] = [
87
  provider: "OpenAI",
88
  inputPrice: 0.4,
89
  outputPrice: 1.6,
90
- source: "https://github.com/openai/simple-evals",
91
  benchmark: {
92
  mmlu: 87.5,
93
  gpqa: 65.0,
 
94
  humaneval: 93.8,
95
  simpleqa: 16.8,
96
- // math: 81.4,
97
- // mgsm: 88.2,
98
- // drop: 81.0,
 
 
 
 
 
99
  },
100
  },
101
  {
@@ -103,17 +128,23 @@ export const openaiBenchmarks: Benchmark[] = [
103
  provider: "OpenAI",
104
  inputPrice: 0.1,
105
  outputPrice: 0.4,
106
- source: "https://github.com/openai/simple-evals",
107
  benchmark: {
108
  mmlu: 80.1,
109
  gpqa: 50.3,
 
110
  humaneval: 87.0,
111
  simpleqa: 7.6,
112
- // math: 62.3,
113
- // mgsm: 73.0,
114
- // drop: 82.2,
 
 
 
 
115
  },
116
  },
 
117
  {
118
  model: "GPT-4.5-preview-2025-02-27",
119
  provider: "OpenAI",
@@ -178,4 +209,112 @@ export const openaiBenchmarks: Benchmark[] = [
178
  // drop: 83.2,
179
  },
180
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  ];
 
71
  provider: "OpenAI",
72
  inputPrice: 2.0,
73
  outputPrice: 8.0,
74
+ source: "https://openai.com/index/gpt-4-1/",
75
  benchmark: {
76
  mmlu: 90.2,
77
  gpqa: 66.3,
78
+ gpqa_diamond: 66.3,
79
  humaneval: 94.5,
80
  simpleqa: 41.6,
81
+ swe_bench_verified: 54.6,
82
+ aider_polyglot: 52.9,
83
+ mmmlu: 90.2,
84
+ video_mme: 72.0,
85
+ // Not yet in BenchmarkMetric
86
+ aime_24: 48.1,
87
+ // aime_2025: undefined,
88
+ // mmlu_pro: undefined,
89
+ // egoschema: undefined,
90
+ // loft: undefined,
91
+ // lcb: undefined,
92
+ // bigcodebench: undefined,
93
+ // mbpp: undefined,
94
+ // livecodebench_v6: undefined,
95
+ // lbpp_v2: undefined,
96
+ // bigbench_extra_hard: undefined,
97
+ // global_mmlu_lite: undefined,
98
+ // facts_grounding: undefined,
99
+ // humanitys_last_exam: undefined,
100
+ mrcr_v2_avg_128k: 57.2,
101
+ mrcr_v2_pointwise_1m: 46.3,
102
  },
103
  },
104
  {
 
106
  provider: "OpenAI",
107
  inputPrice: 0.4,
108
  outputPrice: 1.6,
109
+ source: "https://openai.com/index/gpt-4-1/",
110
  benchmark: {
111
  mmlu: 87.5,
112
  gpqa: 65.0,
113
+ gpqa_diamond: 65.0,
114
  humaneval: 93.8,
115
  simpleqa: 16.8,
116
+ swe_bench_verified: 23.6,
117
+ aider_polyglot: 31.6,
118
+ mmmlu: 87.5,
119
+
120
+ aime_24: 49.6,
121
+ mrcr_v2_avg_128k: 47.2,
122
+ mrcr_v2_pointwise_1m: 33.3,
123
+ // video_mme: undefined,
124
  },
125
  },
126
  {
 
128
  provider: "OpenAI",
129
  inputPrice: 0.1,
130
  outputPrice: 0.4,
131
+ source: "https://openai.com/index/gpt-4-1/",
132
  benchmark: {
133
  mmlu: 80.1,
134
  gpqa: 50.3,
135
+ gpqa_diamond: 50.3,
136
  humaneval: 87.0,
137
  simpleqa: 7.6,
138
+ swe_bench_verified: 9.8,
139
+ aider_polyglot: 6.2,
140
+ mmmlu: 80.1,
141
+ aime_24: 29.4,
142
+ mrcr_v2_avg_128k: 36.6,
143
+ mrcr_v2_pointwise_1m: 12.0,
144
+ // video_mme: undefined,
145
  },
146
  },
147
+
148
  {
149
  model: "GPT-4.5-preview-2025-02-27",
150
  provider: "OpenAI",
 
209
  // drop: 83.2,
210
  },
211
  },
212
+ {
213
+ model: "OpenAI o3",
214
+ provider: "OpenAI",
215
+ inputPrice: 2.0,
216
+ outputPrice: 8.0,
217
+ source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
218
+ benchmark: {
219
+ aime_24: 91.6, // "o3 (no tools)"
220
+ aime_2025: 88.9, // "o3 (no tools)"
221
+ //codeforces: 2706, // "o3 (with terminal)"
222
+ gpqa_diamond: 83.3, // "o3 (no tools)"
223
+ humanitys_last_exam: 20.32, // "o3 (no tools)"
224
+ mmmu: 82.9,
225
+ //mathvista: 86.8,
226
+ //charxiv_reasoning: 78.6,
227
+ //swe_lancer_ic_swe_diamond: 65250, // "o3-high"
228
+ swe_bench_verified: 69.1,
229
+ aider_polyglot: 81.3, // "(whole)"
230
+ //scale_multichallenge: 56.51,
231
+ //browsecomp: 8.35, // "o3 with python +browsing*"
232
+ //tau_bench: 52.0, // "(Airline)"
233
+ // tau_bench_retail: 73.9, // "(Retail)"
234
+ },
235
+ },
236
+ {
237
+ model: "OpenAI o3-pro",
238
+ provider: "OpenAI",
239
+ inputPrice: 20.0,
240
+ outputPrice: 80.0,
241
+ source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
242
+ benchmark: {
243
+ // Benchmarks for o3-pro are not explicitly listed, but it's described as "designed to think longer and provide the most reliable responses."
244
+ // Assuming similar or slightly better performance than o3 in relevant areas.
245
+ gpqa_diamond: 83.3, // Placeholder, likely similar or slightly better than o3
246
+ humanitys_last_exam: 24.90, // "o3 (python + browsing**tools)" - this is likely the "pro" version's capability
247
+ },
248
+ },
249
+ {
250
+ model: "OpenAI o4-mini",
251
+ provider: "OpenAI",
252
+ inputPrice: 1.10,
253
+ outputPrice: 4.40,
254
+
255
+ source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
256
+ benchmark: {
257
+ aime_24: 93.4, // "o4-mini (no tools)"
258
+ aime_2025: 92.7, // "o4-mini (no tools)"
259
+ //codeforces: 2719, // "o4-mini (with terminal)"
260
+ gpqa_diamond: 81.4, // "o4-mini (no tools)"
261
+ humanitys_last_exam: 14.28, // "o4-mini (no tools)"
262
+ mmmu: 81.6,
263
+ //mathvista: 84.3,
264
+ //charxiv_reasoning: 72.0,
265
+ //swe_lancer_ic_swe_diamond: 56375, // "o4-mini-high"
266
+ swe_bench_verified: 68.1,
267
+ aider_polyglot: 68.9, // "(whole)"
268
+ //scale_multichallenge: 42.99,
269
+ //browsecomp: 1.5, // "o4-mini with python +browsing** tools"
270
+ //tau_bench: 49.2, // "(Airline)"
271
+ //tau_bench_retail: 71.8, // "(Retail)"
272
+ },
273
+ },
274
+ {
275
+ model: "OpenAI o1",
276
+ provider: "OpenAI",
277
+ inputPrice: 15.0,
278
+ outputPrice: 60.0,
279
+ source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
280
+ benchmark: {
281
+ aime_24: 74.3,
282
+ aime_2025: 79.2,
283
+ //codeforces: 189,
284
+ gpqa_diamond: 78.0,
285
+ humanitys_last_exam: 8.12, // "o1-pro"
286
+ mmmu: 77.6,
287
+ //mathvista: 71.8,
288
+ //charxiv_reasoning: 55.1,
289
+ //swe_lancer_ic_swe_diamond: 28500, // "o1-high"
290
+ swe_bench_verified: 48.9,
291
+ aider_polyglot: 64.4, // "(whole)"
292
+ //scale_multichallenge: 44.93,
293
+ //browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
294
+ //tau_bench: 50.0, // "(Airline)"
295
+ //tau_bench_retail: 70.8, // "(Retail)"
296
+ },
297
+ },
298
+ {
299
+ model: "OpenAI o3-mini",
300
+ provider: "OpenAI",
301
+ inputPrice: 1.10,
302
+ outputPrice: 4.40,
303
+ source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
304
+ benchmark: {
305
+ aime_24: 87.3,
306
+ aime_2025: 86.5,
307
+ //codeforces: 1207,
308
+ gpqa_diamond: 77.0,
309
+ humanitys_last_exam: 13.40,
310
+ // MMMU, MathVista, CharXiv-Reasoning not explicitly listed for o3-mini, assuming lower than o4-mini
311
+ //swe_lancer_ic_swe_diamond: 17375, // "o3-mini-high"
312
+ swe_bench_verified: 49.3,
313
+ aider_polyglot: 61.7, // "(diff)"
314
+ //scale_multichallenge: 39.89,
315
+ // BrowseComp not explicitly listed for o3-mini
316
+ //tau_bench: 32.4, // "(Airline)"
317
+ //tau_bench_retail: 57.6, // "(Retail)"
318
+ },
319
+ },
320
  ];