Presidentlin committed on
Commit
a4caafd
·
1 Parent(s): 829c0ca
src/lib/benchmarks/google.ts CHANGED
@@ -1,153 +1,176 @@
1
  import { Benchmark } from "./types";
2
 
3
-
4
  export const googleBenchmarks: Benchmark[] = [
5
  {
6
- model: "Gemini Diffusion",
7
- provider: "Google",
8
- inputPrice: 0,
9
- outputPrice: 0,
10
- benchmark: {
11
- livecodebench_v6: 30.9,
12
- bigcodebench: 45.4,
13
- lbpp_v2: 56.8,
14
- swe_bench_verified: 22.9,
15
- humaneval: 89.6,
16
- mbpp: 76.0,
17
- gpqa_diamond: 40.4,
18
- aime_2025: 23.3,
19
- bigbench_extra_hard: 15.0,
20
- global_mmlu_lite: 69.1,
21
- },
22
- source: "https://deepmind.google/models/gemini-diffusion/",
23
- },
24
- {
25
- model: "Gemini 2.0 Flash-Lite",
26
  provider: "Google",
27
- inputPrice: 0.10,
28
- outputPrice: 0.40,
 
29
  benchmark: {
30
- livecodebench_v6: 28.5,
31
- bigcodebench: 45.8,
32
- lbpp_v2: 56.0,
33
- swe_bench_verified: 28.5,
34
- humaneval: 90.2,
35
- mbpp: 75.8,
36
- gpqa_diamond: 56.5,
37
- aime_2025: 20.0,
38
- bigbench_extra_hard: 21.0,
39
- global_mmlu_lite: 79.0,
 
 
40
  },
41
- source: "https://deepmind.google/models/gemini-diffusion/",
42
  },
43
-
44
  {
45
- model: "Gemini 2.5 Flash Preview (05-20)",
46
  provider: "Google",
47
  inputPrice: 0.15,
48
  outputPrice: 3.5,
49
- source: "https://ai.google.dev/gemini-api/docs/thinking",
50
  benchmark: {
51
- aime_2025: 72.0,
 
 
52
  gpqa_diamond: 82.8,
 
 
53
  simpleqa: 26.9,
 
54
  global_mmlu_lite: 88.4,
55
- swe_bench_verified: 60.4,
56
- livecodebench_v6: 63.9,
57
  mmmu: 79.7,
58
- lbpp_v2: 61.9,
59
- bigcodebench: 56.7,
60
- facts_grounding: 85.3,
61
- humanitys_last_exam: 11.0,
62
- mrcr_v2_avg_128k: 74.0,
63
- mrcr_v2_pointwise_1m: 32.0,
64
-
65
  },
66
  },
67
  {
68
- model: "Gemini 2.5 Flash Preview (04-17) Thinking",
69
  provider: "Google",
70
- inputPrice: 0.15,
71
- outputPrice: 3.5,
72
- source: "https://ai.google.dev/gemini-api/docs/thinking",
73
  benchmark: {
74
- aime_2025: 78.0,
75
  gpqa_diamond: 78.3,
76
- simpleqa: 29.7,
77
- global_mmlu_lite: 88.4,
78
- livecodebench_v6: 63.5,
79
- lbpp_v2: 51.1,
80
- bigcodebench: 44.2,
81
- mmmu: 76.7,
82
- humanitys_last_exam: 12.1
 
 
 
 
 
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  },
85
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  {
87
  model: "Gemini 2.0 Flash",
88
  provider: "Google",
89
  inputPrice: 0.1,
90
  outputPrice: 0.4,
91
- source: "https://ai.google.dev/gemini-api/docs/thinking",
92
  benchmark: {
93
- aime_2025: 27.5,
94
- gpqa_diamond: 60.1,
95
  simpleqa: 29.9,
96
  global_mmlu_lite: 83.4,
97
- livecodebench_v6: 34.5,
98
- lbpp_v2: 22.2,
99
- mmmu: 71.7,
100
  facts_grounding: 84.6,
101
  humanitys_last_exam: 5.1,
102
- mrcr_v2_avg_128k: 36.0,
103
- mrcr_v2_pointwise_1m: 6.0,
104
-
105
  },
106
  },
 
 
107
  {
108
- model: "Gemini 2.5 Pro Preview (05-06)",
109
  provider: "Google",
110
- inputPrice: 2.5,
111
- outputPrice: 15.0,
112
- source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
113
  benchmark: {
114
- humanitys_last_exam: 17.8,
115
- gpqa_diamond: 83.0,
116
- aime_2025: 83.0,
117
- livecodebench_v6: 75.6,
118
- lbpp_v2: 76.5,
119
- bigcodebench: 72.7,
120
- swe_bench_verified: 63.2,
121
- simpleqa: 50.8,
122
- mmmu: 79.6,
123
-
124
- video_mme: 84.8,
125
- mrcr_v2_avg_128k: 93.0,
126
- mrcr_v2_pointwise_1m: 82.9,
127
- global_mmlu_lite: 88.6,
128
  },
129
  },
130
  {
131
- model: "Gemini 2.5 Pro Experimental (03-25)",
132
  provider: "Google",
133
- inputPrice: 2.5,
134
- outputPrice: 15.0,
135
- source: "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
136
  benchmark: {
137
- humanitys_last_exam: 18.8,
138
- gpqa_diamond: 84.0,
139
- aime_2025: 86.7,
140
- livecodebench_v6: 70.4,
141
- lbpp_v2: 74.0,
142
- bigcodebench: 68.6,
143
- swe_bench_verified: 63.8,
144
- simpleqa: 52.9,
145
- mmmu: 81.7,
146
- mrcr_v2_avg_128k: 94.5,
147
- mrcr_v2_pointwise_1m: 83.1,
148
- global_mmlu_lite: 89.8,
149
  },
150
  },
151
-
152
-
153
  ];
 
1
  import { Benchmark } from "./types";
2
 
 
3
  export const googleBenchmarks: Benchmark[] = [
4
  {
5
+ model: "Gemini 2.5 Pro (Thinking-enabled, default)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  provider: "Google",
7
+ inputPrice: 2.5,
8
+ outputPrice: 15.0,
9
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
10
  benchmark: {
11
+ livecodebench_v6: 69.0,
12
+ aider_polyglot: 82.2,
13
+ swe_bench_verified: 67.2,
14
+ gpqa_diamond: 86.4,
15
+ aime_2025: 88.0,
16
+ humanitys_last_exam: 21.6,
17
+ simpleqa: 54.0,
18
+ facts_grounding: 87.8,
19
+ global_mmlu_lite: 89.2,
20
+ mrcr_v2_avg_128k: 58.0,
21
+ mrcr_v2_pointwise_1m: 16.4,
22
+ mmmu: 82.0,
23
  },
 
24
  },
 
25
  {
26
+ model: "Gemini 2.5 Flash (Thinking-enabled, default)",
27
  provider: "Google",
28
  inputPrice: 0.15,
29
  outputPrice: 3.5,
30
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
31
  benchmark: {
32
+ livecodebench_v6: 55.4,
33
+ aider_polyglot: 56.7,
34
+ swe_bench_verified: 60.3,
35
  gpqa_diamond: 82.8,
36
+ aime_2025: 72.0,
37
+ humanitys_last_exam: 11.0,
38
  simpleqa: 26.9,
39
+ facts_grounding: 85.3,
40
  global_mmlu_lite: 88.4,
41
+ mrcr_v2_avg_128k: 54.3,
42
+ mrcr_v2_pointwise_1m: 21.0,
43
  mmmu: 79.7,
 
 
 
 
 
 
 
44
  },
45
  },
46
  {
47
+ model: "Gemini 2.5 Flash (Non-Thinking)",
48
  provider: "Google",
49
+ inputPrice: 0.30,
50
+ outputPrice: 2.50,
51
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
52
  benchmark: {
53
+ humanitys_last_exam: 8.4,
54
  gpqa_diamond: 78.3,
55
+ aime_2025: 61.6,
56
+ livecodebench_v6: 41.1,
57
+ aider_polyglot: 44.0,
58
+ swe_bench_verified: 50.0,
59
+ simpleqa: 25.8,
60
+ facts_grounding: 83.4,
61
+ mmmu: 76.9,
62
+ //vibe_eval: 66.2,
63
+ mrcr_v2_avg_128k: 34.1,
64
+ mrcr_v2_pointwise_1m: 16.8,
65
+ global_mmlu_lite: 85.8,
66
+ },
67
+ },
68
 
69
+ {
70
+ model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
71
+ provider: "Google",
72
+ inputPrice: 0.10,
73
+ outputPrice: 0.40,
74
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
75
+ benchmark: {
76
+ humanitys_last_exam: 5.1,
77
+ gpqa_diamond: 64.6,
78
+ aime_2025: 49.8,
79
+ livecodebench_v6: 33.7,
80
+ aider_polyglot: 26.7,
81
+ swe_bench_verified: 42.6,
82
+ simpleqa: 10.7,
83
+ facts_grounding: 84.1,
84
+ mmmu: 72.9,
85
+ // vibe_eval: 51.3,
86
+ mrcr_v2_avg_128k: 16.6,
87
+ mrcr_v2_pointwise_1m: 4.1,
88
+ global_mmlu_lite: 81.1,
89
  },
90
  },
91
+ {
92
+ model: "Gemini 2.5 Flash-Lite (Thinking)",
93
+ provider: "Google",
94
+ inputPrice: 0.10,
95
+ outputPrice: 0.40,
96
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
97
+ benchmark: {
98
+ humanitys_last_exam: 6.9,
99
+ gpqa_diamond: 66.7,
100
+ aime_2025: 63.1,
101
+ livecodebench_v6: 34.3,
102
+ aider_polyglot: 27.1,
103
+ swe_bench_verified: 44.9,
104
+ simpleqa: 13.0,
105
+ facts_grounding: 86.8,
106
+ mmmu: 72.9,
107
+ //vibe_eval: 57.5,
108
+ mrcr_v2_avg_128k: 30.6,
109
+ mrcr_v2_pointwise_1m: 5.4,
110
+ global_mmlu_lite: 84.5,
111
+ },
112
+ },
113
+
114
  {
115
  model: "Gemini 2.0 Flash",
116
  provider: "Google",
117
  inputPrice: 0.1,
118
  outputPrice: 0.4,
119
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
120
  benchmark: {
121
+ aime_2025: 29.7,
122
+ gpqa_diamond: 65.2,
123
  simpleqa: 29.9,
124
  global_mmlu_lite: 83.4,
125
+ livecodebench_v6: 29.1,
126
+ mmmu: 69.3,
 
127
  facts_grounding: 84.6,
128
  humanitys_last_exam: 5.1,
129
+ mrcr_v2_avg_128k: 19.0,
130
+ mrcr_v2_pointwise_1m: 5.3,
 
131
  },
132
  },
133
+
134
+
135
  {
136
+ model: "Gemini 1.5 Pro",
137
  provider: "Google",
138
+ inputPrice: 0.015,
139
+ outputPrice: 0.075,
140
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
141
  benchmark: {
142
+ livecodebench_v6: 29.7,
143
+ aider_polyglot: 16.9,
144
+ swe_bench_verified: 34.2,
145
+ gpqa_diamond: 58.1,
146
+ aime_2025: 17.5,
147
+ humanitys_last_exam: 4.6,
148
+ simpleqa: 24.9,
149
+ facts_grounding: 80.0,
150
+ global_mmlu_lite: 80.8,
151
+ mrcr_v2_avg_128k: 26.2,
152
+ mrcr_v2_pointwise_1m: 12.1,
153
+ mmmu: 67.7,
 
 
154
  },
155
  },
156
  {
157
+ model: "Gemini 1.5 Flash",
158
  provider: "Google",
159
+ inputPrice: 0.0025,
160
+ outputPrice: 0.0075,
161
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
162
  benchmark: {
163
+ livecodebench_v6: 30.3,
164
+ aider_polyglot: 2.8,
165
+ swe_bench_verified: 19.7,
166
+ gpqa_diamond: 50.0,
167
+ aime_2025: 14.7,
168
+ simpleqa: 8.6,
169
+ facts_grounding: 82.9,
170
+ global_mmlu_lite: 72.5,
171
+ mrcr_v2_avg_128k: 18.4,
172
+ mrcr_v2_pointwise_1m: 10.2,
173
+ mmmu: 58.3,
 
174
  },
175
  },
 
 
176
  ];
src/lib/benchmarks/types.ts CHANGED
@@ -13,6 +13,7 @@ export type BenchmarkMetric =
13
 
14
  // Code benchmarks (frequent)
15
  | "humaneval"
 
16
  | "mbpp"
17
  | "bigcodebench"
18
  | "livecodebench_v6"
@@ -54,6 +55,7 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
54
  "aime_24",
55
  "aime_2025",
56
  "gpqa_diamond",
 
57
 
58
  // // Code benchmarks (frequent)
59
  // "humaneval",
 
13
 
14
  // Code benchmarks (frequent)
15
  | "humaneval"
16
+ | "aider_polyglot"
17
  | "mbpp"
18
  | "bigcodebench"
19
  | "livecodebench_v6"
 
55
  "aime_24",
56
  "aime_2025",
57
  "gpqa_diamond",
58
+ "aider_polyglot"
59
 
60
  // // Code benchmarks (frequent)
61
  // "humaneval",