src/App.tsx CHANGED
@@ -104,26 +104,21 @@ const App: React.FC = () => {
104
 
105
 
106
 
107
- const filteredBenchmarkedModels = useMemo(() => {
108
- return benchmarkedModels.filter((model) => {
109
- const providerMatch =
110
- selectedBenchmarkProviders.length === 0 || selectedBenchmarkProviders.includes(model.provider);
111
- const modelMatch =
112
- selectedBenchmarkModels.length === 0 || selectedBenchmarkModels.includes(model.name);
113
-
114
- // When not linking, allow filtering by either
115
- if (selectedBenchmarkProviders.length > 0 && selectedBenchmarkModels.length > 0) {
116
- return providerMatch || modelMatch;
117
- }
118
-
119
- return providerMatch && modelMatch; // this handles the case where one or both are empty
120
- });
121
- }, [
122
- benchmarkedModels,
123
- selectedBenchmarkProviders,
124
- selectedBenchmarkModels,
125
 
126
- ]);
127
 
128
  const sortedBenchmarkedModels = useMemo(() => {
129
  if (!benchmarkSortConfig) return filteredBenchmarkedModels;
 
104
 
105
 
106
 
107
+ const filteredBenchmarkedModels = useMemo(() => { // rows of benchmarkedModels that survive the provider and model selections
108
+ return benchmarkedModels.filter((model) => {
109
+ const providerMatch = // true when no provider is selected, or when this row's provider is one of the selected ones
110
+ selectedBenchmarkProviders.length === 0 || selectedBenchmarkProviders.includes(model.provider);
111
+ const modelMatch = // true when no model is selected, or when this row's name is one of the selected ones
112
+ selectedBenchmarkModels.length === 0 || selectedBenchmarkModels.includes(model.name);
113
+ // Both checks must pass (AND); an empty selection on either side never excludes a row.
114
+ return providerMatch && modelMatch;
115
+ });
116
+ }, [ // dependency list: the source rows plus the two selections read inside the callback
117
+ benchmarkedModels,
118
+ selectedBenchmarkProviders,
119
+ selectedBenchmarkModels,
120
+ ]);
 
 
 
 
121
 
 
122
 
123
  const sortedBenchmarkedModels = useMemo(() => {
124
  if (!benchmarkSortConfig) return filteredBenchmarkedModels;
src/components/BenchmarkTable.tsx CHANGED
@@ -64,9 +64,9 @@ export const BenchmarkTable: React.FC<BenchmarkTableProps> = ({
64
  }))
65
  );
66
 
67
- const filtered = flat.filter((m) =>
68
- !selectedProviders.length || selectedProviders.includes(m.provider)
69
- );
70
 
71
 
72
  return Array.from(new Map(filtered.map((m) => [m.value, m])).values());
 
64
  }))
65
  );
66
 
67
+ const filtered = flat.filter((m) => // with no provider selected every entry is kept; otherwise only entries whose provider is selected
68
+ !selectedProviders.length || selectedProviders.includes(m.provider)
69
+ );
70
 
71
 
72
  return Array.from(new Map(filtered.map((m) => [m.value, m])).values());
src/lib/benchmarks/google.ts CHANGED
@@ -1,153 +1,285 @@
1
  import { Benchmark } from "./types";
2
 
3
-
4
  export const googleBenchmarks: Benchmark[] = [
5
  {
6
- model: "Gemini Diffusion",
7
  provider: "Google",
8
- inputPrice: 0,
9
- outputPrice: 0,
 
10
  benchmark: {
11
- livecodebench_v6: 30.9,
12
- bigcodebench: 45.4,
13
- lbpp_v2: 56.8,
14
- swe_bench_verified: 22.9,
15
- humaneval: 89.6,
16
- mbpp: 76.0,
17
- gpqa_diamond: 40.4,
18
- aime_2025: 23.3,
19
- bigbench_extra_hard: 15.0,
20
- global_mmlu_lite: 69.1,
 
 
21
  },
22
- source: "https://deepmind.google/models/gemini-diffusion/",
23
  },
24
  {
25
- model: "Gemini 2.0 Flash-Lite",
26
  provider: "Google",
27
- inputPrice: 0.10,
28
- outputPrice: 0.40,
 
29
  benchmark: {
30
- livecodebench_v6: 28.5,
31
- bigcodebench: 45.8,
32
- lbpp_v2: 56.0,
33
- swe_bench_verified: 28.5,
34
- humaneval: 90.2,
35
- mbpp: 75.8,
36
- gpqa_diamond: 56.5,
37
- aime_2025: 20.0,
38
- bigbench_extra_hard: 21.0,
39
- global_mmlu_lite: 79.0,
 
 
40
  },
41
- source: "https://deepmind.google/models/gemini-diffusion/",
42
  },
43
-
44
  {
45
- model: "Gemini 2.5 Flash Preview (05-20)",
46
  provider: "Google",
47
  inputPrice: 0.15,
48
  outputPrice: 3.5,
49
- source: "https://ai.google.dev/gemini-api/docs/thinking",
50
  benchmark: {
51
- aime_2025: 72.0,
 
 
52
  gpqa_diamond: 82.8,
 
 
53
  simpleqa: 26.9,
 
54
  global_mmlu_lite: 88.4,
55
- swe_bench_verified: 60.4,
56
- livecodebench_v6: 63.9,
57
  mmmu: 79.7,
58
- lbpp_v2: 61.9,
59
- bigcodebench: 56.7,
60
- facts_grounding: 85.3,
61
- humanitys_last_exam: 11.0,
62
- mrcr_v2_avg_128k: 74.0,
63
- mrcr_v2_pointwise_1m: 32.0,
64
-
65
  },
66
  },
67
  {
68
- model: "Gemini 2.5 Flash Preview (04-17) Thinking",
69
  provider: "Google",
70
- inputPrice: 0.15,
71
- outputPrice: 3.5,
72
- source: "https://ai.google.dev/gemini-api/docs/thinking",
73
  benchmark: {
74
- aime_2025: 78.0,
75
  gpqa_diamond: 78.3,
76
- simpleqa: 29.7,
77
- global_mmlu_lite: 88.4,
78
- livecodebench_v6: 63.5,
79
- lbpp_v2: 51.1,
80
- bigcodebench: 44.2,
81
- mmmu: 76.7,
82
- humanitys_last_exam: 12.1
83
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  },
85
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  {
87
  model: "Gemini 2.0 Flash",
88
  provider: "Google",
89
  inputPrice: 0.1,
90
  outputPrice: 0.4,
91
- source: "https://ai.google.dev/gemini-api/docs/thinking",
92
  benchmark: {
93
- aime_2025: 27.5,
94
- gpqa_diamond: 60.1,
95
  simpleqa: 29.9,
96
  global_mmlu_lite: 83.4,
97
- livecodebench_v6: 34.5,
98
- lbpp_v2: 22.2,
99
- mmmu: 71.7,
100
  facts_grounding: 84.6,
101
  humanitys_last_exam: 5.1,
102
- mrcr_v2_avg_128k: 36.0,
103
- mrcr_v2_pointwise_1m: 6.0,
104
-
 
105
  },
106
  },
107
  {
108
- model: "Gemini 2.5 Pro Preview (05-06)",
109
  provider: "Google",
110
- inputPrice: 2.5,
111
- outputPrice: 15.0,
112
- source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
113
  benchmark: {
114
- humanitys_last_exam: 17.8,
115
- gpqa_diamond: 83.0,
116
- aime_2025: 83.0,
117
- livecodebench_v6: 75.6,
118
- lbpp_v2: 76.5,
119
- bigcodebench: 72.7,
120
- swe_bench_verified: 63.2,
121
- simpleqa: 50.8,
122
- mmmu: 79.6,
123
-
124
- video_mme: 84.8,
125
- mrcr_v2_avg_128k: 93.0,
126
- mrcr_v2_pointwise_1m: 82.9,
127
- global_mmlu_lite: 88.6,
128
  },
129
  },
130
  {
131
- model: "Gemini 2.5 Pro Experimental (03-25)",
132
  provider: "Google",
133
- inputPrice: 2.5,
134
- outputPrice: 15.0,
135
- source: "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
136
  benchmark: {
137
- humanitys_last_exam: 18.8,
138
- gpqa_diamond: 84.0,
139
- aime_2025: 86.7,
140
- livecodebench_v6: 70.4,
141
- lbpp_v2: 74.0,
142
- bigcodebench: 68.6,
143
- swe_bench_verified: 63.8,
144
- simpleqa: 52.9,
145
- mmmu: 81.7,
146
- mrcr_v2_avg_128k: 94.5,
147
- mrcr_v2_pointwise_1m: 83.1,
148
- global_mmlu_lite: 89.8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  },
150
  },
151
-
152
-
153
  ];
 
1
  import { Benchmark } from "./types";
2
 
 
3
  export const googleBenchmarks: Benchmark[] = [
4
  {
5
+ model: "Gemini 2.5 Pro (Thinking-enabled, <=200k context)",
6
  provider: "Google",
7
+ inputPrice: 1.25,
8
+ outputPrice: 10.0,
9
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
10
  benchmark: {
11
+ livecodebench_v6: 69.0,
12
+ aider_polyglot: 82.2,
13
+ swe_bench_verified: 67.2,
14
+ gpqa_diamond: 86.4,
15
+ aime_2025: 88.0,
16
+ humanitys_last_exam: 21.6,
17
+ simpleqa: 54.0,
18
+ facts_grounding: 87.8,
19
+ global_mmlu_lite: 89.2,
20
+ mrcr_v2_avg_128k: 58.0,
21
+ mrcr_v2_pointwise_1m: 16.4,
22
+ mmmu: 82.0,
23
  },
 
24
  },
25
  {
26
+ model: "Gemini 2.5 Pro (Thinking-enabled, >200k context)",
27
  provider: "Google",
28
+ inputPrice: 2.5,
29
+ outputPrice: 15.0,
30
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
31
  benchmark: {
32
+ livecodebench_v6: 69.0,
33
+ aider_polyglot: 82.2,
34
+ swe_bench_verified: 67.2,
35
+ gpqa_diamond: 86.4,
36
+ aime_2025: 88.0,
37
+ humanitys_last_exam: 21.6,
38
+ simpleqa: 54.0,
39
+ facts_grounding: 87.8,
40
+ global_mmlu_lite: 89.2,
41
+ mrcr_v2_avg_128k: 58.0,
42
+ mrcr_v2_pointwise_1m: 16.4,
43
+ mmmu: 82.0,
44
  },
 
45
  },
 
46
  {
47
+ model: "Gemini 2.5 Flash (Thinking-enabled, default)",
48
  provider: "Google",
49
  inputPrice: 0.15,
50
  outputPrice: 3.5,
51
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
52
  benchmark: {
53
+ livecodebench_v6: 55.4,
54
+ aider_polyglot: 56.7,
55
+ swe_bench_verified: 60.3,
56
  gpqa_diamond: 82.8,
57
+ aime_2025: 72.0,
58
+ humanitys_last_exam: 11.0,
59
  simpleqa: 26.9,
60
+ facts_grounding: 85.3,
61
  global_mmlu_lite: 88.4,
62
+ mrcr_v2_avg_128k: 54.3,
63
+ mrcr_v2_pointwise_1m: 21.0,
64
  mmmu: 79.7,
65
+ // loft_128k: 82.1,
66
+ // loft_1m: 58.9,
 
 
 
 
 
67
  },
68
  },
69
  {
70
+ model: "Gemini 2.5 Flash (Non-Thinking)",
71
  provider: "Google",
72
+ inputPrice: 0.30,
73
+ outputPrice: 2.50,
74
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
75
  benchmark: {
76
+ humanitys_last_exam: 8.4,
77
  gpqa_diamond: 78.3,
78
+ aime_2025: 61.6,
79
+ livecodebench_v6: 41.1,
80
+ aider_polyglot: 44.0,
81
+ swe_bench_verified: 50.0,
82
+ simpleqa: 25.8,
83
+ facts_grounding: 83.4,
84
+ mmmu: 76.9,
85
+ // vibe_eval: 66.2,
86
+ mrcr_v2_avg_128k: 34.1,
87
+ mrcr_v2_pointwise_1m: 16.8,
88
+ global_mmlu_lite: 85.8,
89
+ // loft_128k: 76.2,
90
+ // loft_1m: 49.5,
91
+ },
92
+ },
93
+ {
94
+ model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
95
+ provider: "Google",
96
+ inputPrice: 0.10,
97
+ outputPrice: 0.40,
98
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
99
+ benchmark: {
100
+ humanitys_last_exam: 5.1,
101
+ gpqa_diamond: 64.6,
102
+ aime_2025: 49.8,
103
+ livecodebench_v6: 33.7,
104
+ aider_polyglot: 26.7,
105
+ swe_bench_verified: 42.6,
106
+ simpleqa: 10.7,
107
+ facts_grounding: 84.1,
108
+ mmmu: 72.9,
109
+ // vibe_eval: 51.3,
110
+ mrcr_v2_avg_128k: 16.6,
111
+ mrcr_v2_pointwise_1m: 4.1,
112
+ global_mmlu_lite: 81.1,
113
+ // loft_128k: 65.7,
114
+ // loft_1m: 31.1,
115
+ },
116
+ },
117
+ {
118
+ model: "Gemini 2.5 Flash-Lite (Thinking)",
119
+ provider: "Google",
120
+ inputPrice: 0.10,
121
+ outputPrice: 0.40,
122
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
123
+ benchmark: {
124
+ humanitys_last_exam: 6.9,
125
+ gpqa_diamond: 66.7,
126
+ aime_2025: 63.1,
127
+ livecodebench_v6: 34.3,
128
+ aider_polyglot: 27.1,
129
+ swe_bench_verified: 44.9,
130
+ simpleqa: 13.0,
131
+ facts_grounding: 86.8,
132
+ mmmu: 72.9,
133
+ // vibe_eval: 57.5,
134
+ mrcr_v2_avg_128k: 30.6,
135
+ mrcr_v2_pointwise_1m: 5.4,
136
+ global_mmlu_lite: 84.5,
137
+ // loft_128k: 67.3,
138
+ // loft_1m: 38.4,
139
  },
140
  },
141
+ {
142
+ model: "Gemini 2.0 Flash-Lite",
143
+ provider: "Google",
144
+ inputPrice: 0.10,
145
+ outputPrice: 0.40,
146
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
147
+ benchmark: {
148
+ livecodebench_v6: 29.1,
149
+ aider_polyglot: 10.5,
150
+ swe_bench_verified: 23.1,
151
+ gpqa_diamond: 50.5,
152
+ aime_2025: 23.8,
153
+ humanitys_last_exam: 4.6,
154
+ simpleqa: 16.5,
155
+ facts_grounding: 82.4,
156
+ global_mmlu_lite: 78.0,
157
+ // loft_128k: 50.7,
158
+ // loft_1m: 7.6,
159
+ },
160
+ },
161
+ {
162
+ model: "Gemini Diffusion",
163
+ provider: "Google",
164
+ inputPrice: 0,
165
+ outputPrice: 0,
166
+ source: "https://deepmind.google/models/gemini-diffusion/",
167
+ benchmark: {
168
+ livecodebench_v6: 30.9,
169
+ bigcodebench: 45.4,
170
+ lbpp_v2: 56.8,
171
+ swe_bench_verified: 22.9,
172
+ humaneval: 89.6,
173
+ mbpp: 76.0,
174
+ gpqa_diamond: 40.4,
175
+ aime_2025: 23.3,
176
+ bigbench_extra_hard: 15.0,
177
+ global_mmlu_lite: 69.1,
178
+ },
179
+ },
180
+
181
  {
182
  model: "Gemini 2.0 Flash",
183
  provider: "Google",
184
  inputPrice: 0.1,
185
  outputPrice: 0.4,
186
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
187
  benchmark: {
188
+ aime_2025: 29.7,
189
+ gpqa_diamond: 65.2,
190
  simpleqa: 29.9,
191
  global_mmlu_lite: 83.4,
192
+ livecodebench_v6: 29.1,
193
+ mmmu: 69.3,
 
194
  facts_grounding: 84.6,
195
  humanitys_last_exam: 5.1,
196
+ mrcr_v2_avg_128k: 19.0,
197
+ mrcr_v2_pointwise_1m: 5.3,
198
+ // loft_128k: 58.0,
199
+ // loft_1m: 7.6,
200
  },
201
  },
202
  {
203
+ model: "Gemini 1.5 Pro (<=128k context)",
204
  provider: "Google",
205
+ inputPrice: 1.25,
206
+ outputPrice: 5.00,
207
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
208
  benchmark: {
209
+ livecodebench_v6: 29.7,
210
+ aider_polyglot: 16.9,
211
+ swe_bench_verified: 34.2,
212
+ gpqa_diamond: 58.1,
213
+ aime_2025: 17.5,
214
+ humanitys_last_exam: 4.6,
215
+ simpleqa: 24.9,
216
+ facts_grounding: 80.0,
217
+ global_mmlu_lite: 80.8,
218
+ mrcr_v2_avg_128k: 26.2,
219
+ mrcr_v2_pointwise_1m: 12.1,
220
+ mmmu: 67.7,
 
 
221
  },
222
  },
223
  {
224
+ model: "Gemini 1.5 Pro (>128k context)",
225
  provider: "Google",
226
+ inputPrice: 2.50,
227
+ outputPrice: 10.00,
228
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
229
  benchmark: {
230
+ livecodebench_v6: 29.7,
231
+ aider_polyglot: 16.9,
232
+ swe_bench_verified: 34.2,
233
+ gpqa_diamond: 58.1,
234
+ aime_2025: 17.5,
235
+ humanitys_last_exam: 4.6,
236
+ simpleqa: 24.9,
237
+ facts_grounding: 80.0,
238
+ global_mmlu_lite: 80.8,
239
+ mrcr_v2_avg_128k: 26.2,
240
+ mrcr_v2_pointwise_1m: 12.1,
241
+ mmmu: 67.7,
242
+ },
243
+ },
244
+ // Gemini 1.5 Flash
245
+ {
246
+ model: "Gemini 1.5 Flash (<=128k context)",
247
+ provider: "Google",
248
+ inputPrice: 0.075,
249
+ outputPrice: 0.30,
250
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
251
+ benchmark: {
252
+ livecodebench_v6: 30.3,
253
+ aider_polyglot: 2.8,
254
+ swe_bench_verified: 19.7,
255
+ gpqa_diamond: 50.0,
256
+ aime_2025: 14.7,
257
+ simpleqa: 8.6,
258
+ facts_grounding: 82.9,
259
+ global_mmlu_lite: 72.5,
260
+ mrcr_v2_avg_128k: 18.4,
261
+ mrcr_v2_pointwise_1m: 10.2,
262
+ mmmu: 58.3,
263
+ },
264
+ },
265
+ {
266
+ model: "Gemini 1.5 Flash (>128k context)",
267
+ provider: "Google",
268
+ inputPrice: 0.15,
269
+ outputPrice: 0.60,
270
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
271
+ benchmark: {
272
+ livecodebench_v6: 30.3,
273
+ aider_polyglot: 2.8,
274
+ swe_bench_verified: 19.7,
275
+ gpqa_diamond: 50.0,
276
+ aime_2025: 14.7,
277
+ simpleqa: 8.6,
278
+ facts_grounding: 82.9,
279
+ global_mmlu_lite: 72.5,
280
+ mrcr_v2_avg_128k: 18.4,
281
+ mrcr_v2_pointwise_1m: 10.2,
282
+ mmmu: 58.3,
283
  },
284
  },
 
 
285
  ];
src/lib/benchmarks/types.ts CHANGED
@@ -13,6 +13,7 @@ export type BenchmarkMetric =
13
 
14
  // Code benchmarks (frequent)
15
  | "humaneval"
 
16
  | "mbpp"
17
  | "bigcodebench"
18
  | "livecodebench_v6"
@@ -54,6 +55,7 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
54
  "aime_24",
55
  "aime_2025",
56
  "gpqa_diamond",
 
57
 
58
  // // Code benchmarks (frequent)
59
  // "humaneval",
 
13
 
14
  // Code benchmarks (frequent)
15
  | "humaneval"
16
+ | "aider_polyglot"
17
  | "mbpp"
18
  | "bigcodebench"
19
  | "livecodebench_v6"
 
55
  "aime_24",
56
  "aime_2025",
57
  "gpqa_diamond",
58
+ "aider_polyglot", // trailing comma added to match every sibling element, so any entry that follows stays syntactically valid
59
 
60
  // // Code benchmarks (frequent)
61
  // "humaneval",