Presidentlin committed on
Commit
de1d6b5
·
1 Parent(s): 85d193b
Files changed (1) hide show
  1. src/lib/benchmarks/google.ts +272 -195
src/lib/benchmarks/google.ts CHANGED
@@ -1,208 +1,285 @@
1
  import { Benchmark } from "./types";
2
 
3
  export const googleBenchmarks: Benchmark[] = [
4
- {
5
- model: "Gemini 2.5 Pro (Thinking-enabled, default)",
6
- provider: "Google",
7
- inputPrice: 2.5,
8
- outputPrice: 15.0,
9
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
10
- benchmark: {
11
- livecodebench_v6: 69.0,
12
- aider_polyglot: 82.2,
13
- swe_bench_verified: 67.2,
14
- gpqa_diamond: 86.4,
15
- aime_2025: 88.0,
16
- humanitys_last_exam: 21.6,
17
- simpleqa: 54.0,
18
- facts_grounding: 87.8,
19
- global_mmlu_lite: 89.2,
20
- mrcr_v2_avg_128k: 58.0,
21
- mrcr_v2_pointwise_1m: 16.4,
22
- mmmu: 82.0,
23
- // loft_128k: 87.0,
24
- // loft_1m: 69.8,
25
  },
26
- },
27
- {
28
- model: "Gemini 2.5 Flash (Thinking-enabled, default)",
29
- provider: "Google",
30
- inputPrice: 0.15,
31
- outputPrice: 3.5,
32
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
33
- benchmark: {
34
- livecodebench_v6: 55.4,
35
- aider_polyglot: 56.7,
36
- swe_bench_verified: 60.3,
37
- gpqa_diamond: 82.8,
38
- aime_2025: 72.0,
39
- humanitys_last_exam: 11.0,
40
- simpleqa: 26.9,
41
- facts_grounding: 85.3,
42
- global_mmlu_lite: 88.4,
43
- mrcr_v2_avg_128k: 54.3,
44
- mrcr_v2_pointwise_1m: 21.0,
45
- mmmu: 79.7,
46
- // loft_128k: 82.1,
47
- // loft_1m: 58.9,
48
  },
49
- },
50
- {
51
- model: "Gemini 2.5 Flash (Non-Thinking)",
52
- provider: "Google",
53
- inputPrice: 0.30,
54
- outputPrice: 2.50,
55
- source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
56
- benchmark: {
57
- humanitys_last_exam: 8.4,
58
- gpqa_diamond: 78.3,
59
- aime_2025: 61.6,
60
- livecodebench_v6: 41.1,
61
- aider_polyglot: 44.0,
62
- swe_bench_verified: 50.0,
63
- simpleqa: 25.8,
64
- facts_grounding: 83.4,
65
- mmmu: 76.9,
66
- // vibe_eval: 66.2,
67
- mrcr_v2_avg_128k: 34.1,
68
- mrcr_v2_pointwise_1m: 16.8,
69
- global_mmlu_lite: 85.8,
70
- // loft_128k: 76.2,
71
- // loft_1m: 49.5,
72
  },
73
- },
74
- {
75
- model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
76
- provider: "Google",
77
- inputPrice: 0.10,
78
- outputPrice: 0.40,
79
- source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
80
- benchmark: {
81
- humanitys_last_exam: 5.1,
82
- gpqa_diamond: 64.6,
83
- aime_2025: 49.8,
84
- livecodebench_v6: 33.7,
85
- aider_polyglot: 26.7,
86
- swe_bench_verified: 42.6,
87
- simpleqa: 10.7,
88
- facts_grounding: 84.1,
89
- mmmu: 72.9,
90
- // vibe_eval: 51.3,
91
- mrcr_v2_avg_128k: 16.6,
92
- mrcr_v2_pointwise_1m: 4.1,
93
- global_mmlu_lite: 81.1,
94
- // loft_128k: 65.7,
95
- // loft_1m: 31.1,
96
  },
97
- },
98
- {
99
- model: "Gemini 2.5 Flash-Lite (Thinking)",
100
- provider: "Google",
101
- inputPrice: 0.10,
102
- outputPrice: 0.40,
103
- source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
104
- benchmark: {
105
- humanitys_last_exam: 6.9,
106
- gpqa_diamond: 66.7,
107
- aime_2025: 63.1,
108
- livecodebench_v6: 34.3,
109
- aider_polyglot: 27.1,
110
- swe_bench_verified: 44.9,
111
- simpleqa: 13.0,
112
- facts_grounding: 86.8,
113
- mmmu: 72.9,
114
- // vibe_eval: 57.5,
115
- mrcr_v2_avg_128k: 30.6,
116
- mrcr_v2_pointwise_1m: 5.4,
117
- global_mmlu_lite: 84.5,
118
- // loft_128k: 67.3,
119
- // loft_1m: 38.4,
120
  },
121
- },
122
- {
123
- model: "Gemini 2.0 Flash-Lite",
124
- provider: "Google",
125
- inputPrice: 0.10,
126
- outputPrice: 0.40,
127
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
128
- benchmark: {
129
- livecodebench_v6: 29.1,
130
- aider_polyglot: 10.5,
131
- swe_bench_verified: 23.1,
132
- gpqa_diamond: 50.5,
133
- aime_2025: 23.8,
134
- humanitys_last_exam: 4.6,
135
- simpleqa: 16.5,
136
- facts_grounding: 82.4,
137
- global_mmlu_lite: 78.0,
138
- // loft_128k: 50.7,
139
- // loft_1m: 7.6,
 
 
 
 
140
  },
141
- },
142
- {
143
- model: "Gemini 2.0 Flash",
144
- provider: "Google",
145
- inputPrice: 0.1,
146
- outputPrice: 0.4,
147
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
148
- benchmark: {
149
- aime_2025: 29.7,
150
- gpqa_diamond: 65.2,
151
- simpleqa: 29.9,
152
- global_mmlu_lite: 83.4,
153
- livecodebench_v6: 29.1,
154
- mmmu: 69.3,
155
- facts_grounding: 84.6,
156
- humanitys_last_exam: 5.1,
157
- mrcr_v2_avg_128k: 19.0,
158
- mrcr_v2_pointwise_1m: 5.3,
159
- // loft_128k: 58.0,
160
- // loft_1m: 7.6,
161
  },
162
- },
163
- {
164
- model: "Gemini 1.5 Pro",
165
- provider: "Google",
166
- inputPrice: 0.015,
167
- outputPrice: 0.075,
168
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
169
- benchmark: {
170
- livecodebench_v6: 29.7,
171
- aider_polyglot: 16.9,
172
- swe_bench_verified: 34.2,
173
- gpqa_diamond: 58.1,
174
- aime_2025: 17.5,
175
- humanitys_last_exam: 4.6,
176
- simpleqa: 24.9,
177
- facts_grounding: 80.0,
178
- global_mmlu_lite: 80.8,
179
- mrcr_v2_avg_128k: 26.2,
180
- mrcr_v2_pointwise_1m: 12.1,
181
- mmmu: 67.7,
182
- // loft_128k: 75.9,
183
- // loft_1m: 47.1,
184
  },
185
- },
186
- {
187
- model: "Gemini 1.5 Flash",
188
- provider: "Google",
189
- inputPrice: 0.0025,
190
- outputPrice: 0.0075,
191
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
192
- benchmark: {
193
- livecodebench_v6: 30.3,
194
- aider_polyglot: 2.8,
195
- swe_bench_verified: 19.7,
196
- gpqa_diamond: 50.0,
197
- aime_2025: 14.7,
198
- simpleqa: 8.6,
199
- facts_grounding: 82.9,
200
- global_mmlu_lite: 72.5,
201
- mrcr_v2_avg_128k: 18.4,
202
- mrcr_v2_pointwise_1m: 10.2,
203
- mmmu: 58.3,
204
- // loft_128k: 67.3,
205
- // loft_1m: 36.7,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  },
207
- },
208
  ];
 
1
  import { Benchmark } from "./types";
2
 
3
  export const googleBenchmarks: Benchmark[] = [
4
+ {
5
+ model: "Gemini 2.5 Pro (Thinking-enabled, <=200k context)",
6
+ provider: "Google",
7
+ inputPrice: 1.25,
8
+ outputPrice: 10.0,
9
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
10
+ benchmark: {
11
+ livecodebench_v6: 69.0,
12
+ aider_polyglot: 82.2,
13
+ swe_bench_verified: 67.2,
14
+ gpqa_diamond: 86.4,
15
+ aime_2025: 88.0,
16
+ humanitys_last_exam: 21.6,
17
+ simpleqa: 54.0,
18
+ facts_grounding: 87.8,
19
+ global_mmlu_lite: 89.2,
20
+ mrcr_v2_avg_128k: 58.0,
21
+ mrcr_v2_pointwise_1m: 16.4,
22
+ mmmu: 82.0,
23
+ },
 
24
  },
25
+ {
26
+ model: "Gemini 2.5 Pro (Thinking-enabled, >200k context)",
27
+ provider: "Google",
28
+ inputPrice: 2.5,
29
+ outputPrice: 15.0,
30
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
31
+ benchmark: {
32
+ livecodebench_v6: 69.0,
33
+ aider_polyglot: 82.2,
34
+ swe_bench_verified: 67.2,
35
+ gpqa_diamond: 86.4,
36
+ aime_2025: 88.0,
37
+ humanitys_last_exam: 21.6,
38
+ simpleqa: 54.0,
39
+ facts_grounding: 87.8,
40
+ global_mmlu_lite: 89.2,
41
+ mrcr_v2_avg_128k: 58.0,
42
+ mrcr_v2_pointwise_1m: 16.4,
43
+ mmmu: 82.0,
44
+ },
 
 
45
  },
46
+ {
47
+ model: "Gemini 2.5 Flash (Thinking-enabled, default)",
48
+ provider: "Google",
49
+ inputPrice: 0.15,
50
+ outputPrice: 3.5,
51
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
52
+ benchmark: {
53
+ livecodebench_v6: 55.4,
54
+ aider_polyglot: 56.7,
55
+ swe_bench_verified: 60.3,
56
+ gpqa_diamond: 82.8,
57
+ aime_2025: 72.0,
58
+ humanitys_last_exam: 11.0,
59
+ simpleqa: 26.9,
60
+ facts_grounding: 85.3,
61
+ global_mmlu_lite: 88.4,
62
+ mrcr_v2_avg_128k: 54.3,
63
+ mrcr_v2_pointwise_1m: 21.0,
64
+ mmmu: 79.7,
65
+ // loft_128k: 82.1,
66
+ // loft_1m: 58.9,
67
+ },
 
68
  },
69
+ {
70
+ model: "Gemini 2.5 Flash (Non-Thinking)",
71
+ provider: "Google",
72
+ inputPrice: 0.30,
73
+ outputPrice: 2.50,
74
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
75
+ benchmark: {
76
+ humanitys_last_exam: 8.4,
77
+ gpqa_diamond: 78.3,
78
+ aime_2025: 61.6,
79
+ livecodebench_v6: 41.1,
80
+ aider_polyglot: 44.0,
81
+ swe_bench_verified: 50.0,
82
+ simpleqa: 25.8,
83
+ facts_grounding: 83.4,
84
+ mmmu: 76.9,
85
+ // vibe_eval: 66.2,
86
+ mrcr_v2_avg_128k: 34.1,
87
+ mrcr_v2_pointwise_1m: 16.8,
88
+ global_mmlu_lite: 85.8,
89
+ // loft_128k: 76.2,
90
+ // loft_1m: 49.5,
91
+ },
92
  },
93
+ {
94
+ model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
95
+ provider: "Google",
96
+ inputPrice: 0.10,
97
+ outputPrice: 0.40,
98
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
99
+ benchmark: {
100
+ humanitys_last_exam: 5.1,
101
+ gpqa_diamond: 64.6,
102
+ aime_2025: 49.8,
103
+ livecodebench_v6: 33.7,
104
+ aider_polyglot: 26.7,
105
+ swe_bench_verified: 42.6,
106
+ simpleqa: 10.7,
107
+ facts_grounding: 84.1,
108
+ mmmu: 72.9,
109
+ // vibe_eval: 51.3,
110
+ mrcr_v2_avg_128k: 16.6,
111
+ mrcr_v2_pointwise_1m: 4.1,
112
+ global_mmlu_lite: 81.1,
113
+ // loft_128k: 65.7,
114
+ // loft_1m: 31.1,
115
+ },
116
  },
117
+ {
118
+ model: "Gemini 2.5 Flash-Lite (Thinking)",
119
+ provider: "Google",
120
+ inputPrice: 0.10,
121
+ outputPrice: 0.40,
122
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
123
+ benchmark: {
124
+ humanitys_last_exam: 6.9,
125
+ gpqa_diamond: 66.7,
126
+ aime_2025: 63.1,
127
+ livecodebench_v6: 34.3,
128
+ aider_polyglot: 27.1,
129
+ swe_bench_verified: 44.9,
130
+ simpleqa: 13.0,
131
+ facts_grounding: 86.8,
132
+ mmmu: 72.9,
133
+ // vibe_eval: 57.5,
134
+ mrcr_v2_avg_128k: 30.6,
135
+ mrcr_v2_pointwise_1m: 5.4,
136
+ global_mmlu_lite: 84.5,
137
+ // loft_128k: 67.3,
138
+ // loft_1m: 38.4,
139
+ },
140
  },
141
+ {
142
+ model: "Gemini 2.0 Flash-Lite",
143
+ provider: "Google",
144
+ inputPrice: 0.10,
145
+ outputPrice: 0.40,
146
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
147
+ benchmark: {
148
+ livecodebench_v6: 29.1,
149
+ aider_polyglot: 10.5,
150
+ swe_bench_verified: 23.1,
151
+ gpqa_diamond: 50.5,
152
+ aime_2025: 23.8,
153
+ humanitys_last_exam: 4.6,
154
+ simpleqa: 16.5,
155
+ facts_grounding: 82.4,
156
+ global_mmlu_lite: 78.0,
157
+ // loft_128k: 50.7,
158
+ // loft_1m: 7.6,
159
+ },
 
160
  },
161
+ {
162
+ model: "Gemini Diffusion",
163
+ provider: "Google",
164
+ inputPrice: 0,
165
+ outputPrice: 0,
166
+ source: "https://deepmind.google/models/gemini-diffusion/",
167
+ benchmark: {
168
+ livecodebench_v6: 30.9,
169
+ bigcodebench: 45.4,
170
+ lbpp_v2: 56.8,
171
+ swe_bench_verified: 22.9,
172
+ humaneval: 89.6,
173
+ mbpp: 76.0,
174
+ gpqa_diamond: 40.4,
175
+ aime_2025: 23.3,
176
+ bigbench_extra_hard: 15.0,
177
+ global_mmlu_lite: 69.1,
178
+ },
 
 
 
 
179
  },
180
+
181
+ {
182
+ model: "Gemini 2.0 Flash",
183
+ provider: "Google",
184
+ inputPrice: 0.1,
185
+ outputPrice: 0.4,
186
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
187
+ benchmark: {
188
+ aime_2025: 29.7,
189
+ gpqa_diamond: 65.2,
190
+ simpleqa: 29.9,
191
+ global_mmlu_lite: 83.4,
192
+ livecodebench_v6: 29.1,
193
+ mmmu: 69.3,
194
+ facts_grounding: 84.6,
195
+ humanitys_last_exam: 5.1,
196
+ mrcr_v2_avg_128k: 19.0,
197
+ mrcr_v2_pointwise_1m: 5.3,
198
+ // loft_128k: 58.0,
199
+ // loft_1m: 7.6,
200
+ },
201
+ },
202
+ {
203
+ model: "Gemini 1.5 Pro (<=128k context)",
204
+ provider: "Google",
205
+ inputPrice: 1.25,
206
+ outputPrice: 5.00,
207
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
208
+ benchmark: {
209
+ livecodebench_v6: 29.7,
210
+ aider_polyglot: 16.9,
211
+ swe_bench_verified: 34.2,
212
+ gpqa_diamond: 58.1,
213
+ aime_2025: 17.5,
214
+ humanitys_last_exam: 4.6,
215
+ simpleqa: 24.9,
216
+ facts_grounding: 80.0,
217
+ global_mmlu_lite: 80.8,
218
+ mrcr_v2_avg_128k: 26.2,
219
+ mrcr_v2_pointwise_1m: 12.1,
220
+ mmmu: 67.7,
221
+ },
222
+ },
223
+ {
224
+ model: "Gemini 1.5 Pro (>128k context)",
225
+ provider: "Google",
226
+ inputPrice: 2.50,
227
+ outputPrice: 10.00,
228
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
229
+ benchmark: {
230
+ livecodebench_v6: 29.7,
231
+ aider_polyglot: 16.9,
232
+ swe_bench_verified: 34.2,
233
+ gpqa_diamond: 58.1,
234
+ aime_2025: 17.5,
235
+ humanitys_last_exam: 4.6,
236
+ simpleqa: 24.9,
237
+ facts_grounding: 80.0,
238
+ global_mmlu_lite: 80.8,
239
+ mrcr_v2_avg_128k: 26.2,
240
+ mrcr_v2_pointwise_1m: 12.1,
241
+ mmmu: 67.7,
242
+ },
243
+ },
244
+ // Gemini 1.5 Flash
245
+ {
246
+ model: "Gemini 1.5 Flash (<=128k context)",
247
+ provider: "Google",
248
+ inputPrice: 0.075,
249
+ outputPrice: 0.30,
250
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
251
+ benchmark: {
252
+ livecodebench_v6: 30.3,
253
+ aider_polyglot: 2.8,
254
+ swe_bench_verified: 19.7,
255
+ gpqa_diamond: 50.0,
256
+ aime_2025: 14.7,
257
+ simpleqa: 8.6,
258
+ facts_grounding: 82.9,
259
+ global_mmlu_lite: 72.5,
260
+ mrcr_v2_avg_128k: 18.4,
261
+ mrcr_v2_pointwise_1m: 10.2,
262
+ mmmu: 58.3,
263
+ },
264
+ },
265
+ {
266
+ model: "Gemini 1.5 Flash (>128k context)",
267
+ provider: "Google",
268
+ inputPrice: 0.15,
269
+ outputPrice: 0.60,
270
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
271
+ benchmark: {
272
+ livecodebench_v6: 30.3,
273
+ aider_polyglot: 2.8,
274
+ swe_bench_verified: 19.7,
275
+ gpqa_diamond: 50.0,
276
+ aime_2025: 14.7,
277
+ simpleqa: 8.6,
278
+ facts_grounding: 82.9,
279
+ global_mmlu_lite: 72.5,
280
+ mrcr_v2_avg_128k: 18.4,
281
+ mrcr_v2_pointwise_1m: 10.2,
282
+ mmmu: 58.3,
283
+ },
284
  },
 
285
  ];