Commit
·
713e157
1
Parent(s):
b29bfe7
- src/lib/benchmarks/index.ts +2 -0
- src/lib/benchmarks/deepseek.ts +111 -0
src/lib/benchmarks/index.ts
CHANGED
@@ -3,10 +3,12 @@ import { xaiBenchmarks } from "./xai";
|
|
3 |
import { googleBenchmarks } from "./google";
|
4 |
import { anthropicBenchmarks } from "./anthropic";
|
5 |
import { openaiBenchmarks } from "./openai";
|
|
|
6 |
|
7 |
export const benchmarkData: Benchmark[] = [
|
8 |
...xaiBenchmarks,
|
9 |
...googleBenchmarks,
|
10 |
...anthropicBenchmarks,
|
11 |
...openaiBenchmarks,
|
|
|
12 |
];
|
|
|
3 |
import { googleBenchmarks } from "./google";
|
4 |
import { anthropicBenchmarks } from "./anthropic";
|
5 |
import { openaiBenchmarks } from "./openai";
|
6 |
+
import { deepseekBenchmarks } from "./deepseek";
|
7 |
|
8 |
export const benchmarkData: Benchmark[] = [
|
9 |
...xaiBenchmarks,
|
10 |
...googleBenchmarks,
|
11 |
...anthropicBenchmarks,
|
12 |
...openaiBenchmarks,
|
13 |
+
...deepseekBenchmarks
|
14 |
];
|
src/lib/benchmarks/deepseek.ts
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { Benchmark } from "./types";
|
2 |
+
|
3 |
+
export const deepseekBenchmarks: Benchmark[] = [
|
4 |
+
{
|
5 |
+
model: "DeepSeek-R1-0528",
|
6 |
+
provider: "DeepSeek",
|
7 |
+
inputPrice: 0.55, // Placeholder, update if pricing becomes available
|
8 |
+
outputPrice: 2.19,
|
9 |
+
benchmark: {
|
10 |
+
aime_24: 91.4,
|
11 |
+
aime_2025: 87.5,
|
12 |
+
gpqa_diamond: 81.0,
|
13 |
+
gpqa: 81.0, // For compatibility; can remove if you want to only use gpqa_diamond
|
14 |
+
mmlu_pro: 85.0,
|
15 |
+
mmlu: 93.4, // MMLU-Redux assumed to be "mmlu"
|
16 |
+
simpleqa: 27.8,
|
17 |
+
lcb: 73.3, // LiveCodeBench
|
18 |
+
aider_polyglot: 71.6,
|
19 |
+
swe_bench_verified: 57.6,
|
20 |
+
// Optional or less frequent benchmarks:
|
21 |
+
humanitys_last_exam: 17.7,
|
22 |
+
// Not in BenchmarkMetric, but useful (commented for type safety):
|
23 |
+
// codeforces_div1: 1930,
|
24 |
+
// frames: 83.0,
|
25 |
+
// tau_bench_airline: 53.5,
|
26 |
+
// tau_bench_retail: 63.9,
|
27 |
+
// bfcl_v3_multiturn: 37.0,
|
28 |
+
// cnmo_2024: 86.9,
|
29 |
+
// hmmt_2025: 79.4,
|
30 |
+
},
|
31 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
|
32 |
+
},
|
33 |
+
|
34 |
+
{
|
35 |
+
model: "DeepSeek-V3-0324",
|
36 |
+
provider: "DeepSeek",
|
37 |
+
inputPrice: 0.27, // Placeholder — adjust if actual pricing becomes available
|
38 |
+
outputPrice: 1.10,
|
39 |
+
benchmark: {
|
40 |
+
mmlu: 87.1, // From original DeepSeek-V3
|
41 |
+
mmlu_pro: 81.2, // Updated in V3-0324
|
42 |
+
gpqa: 68.4, // Updated in V3-0324
|
43 |
+
gpqa_diamond: 59.1, // From V3
|
44 |
+
aime_24: 59.4, // Updated in V3-0324
|
45 |
+
lcb: 49.2, // Updated LiveCodeBench
|
46 |
+
simpleqa: 24.9, // From V3
|
47 |
+
aider_polyglot: 49.6, // From V3
|
48 |
+
swe_bench_verified: 42.0 // From V3
|
49 |
+
},
|
50 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
|
51 |
+
},
|
52 |
+
{
|
53 |
+
model: "DeepSeek-V3",
|
54 |
+
provider: "DeepSeek",
|
55 |
+
inputPrice: 0.27, // Placeholder — update if real pricing is known
|
56 |
+
outputPrice: 1.10,
|
57 |
+
benchmark: {
|
58 |
+
mmlu: 87.1,
|
59 |
+
mmlu_pro: 64.4,
|
60 |
+
// mmlu_redux: 86.2, // Commented: not in BenchmarkMetric
|
61 |
+
gpqa_diamond: 59.1,
|
62 |
+
simpleqa: 24.9,
|
63 |
+
aime_24: 39.2,
|
64 |
+
lcb: 37.6, // LiveCodeBench (Pass@1)
|
65 |
+
aider_polyglot: 49.6,
|
66 |
+
swe_bench_verified: 42.0,
|
67 |
+
|
68 |
+
// Optional or not yet in your schema:
|
69 |
+
// humanitys_last_exam: undefined,
|
70 |
+
// codeforces: 51.6,
|
71 |
+
// drop: 89.0,
|
72 |
+
// gsm8k: 89.3,
|
73 |
+
// math_em: 61.6,
|
74 |
+
// mgsm: 79.8,
|
75 |
+
// cmath: 90.7,
|
76 |
+
// cruxeval_i: 67.3,
|
77 |
+
// cruxeval_o: 69.8,
|
78 |
+
// triviaqa: 82.9,
|
79 |
+
// naturalquestions: 40.0,
|
80 |
+
// agieval: 79.6,
|
81 |
+
// hellaSwag: 88.9,
|
82 |
+
// piqa: 84.7,
|
83 |
+
// winogrande: 84.9,
|
84 |
+
},
|
85 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
|
86 |
+
},
|
87 |
+
{
|
88 |
+
model: "DeepSeek-R1",
|
89 |
+
provider: "DeepSeek",
|
90 |
+
inputPrice: 0.60, // Placeholder — update if actual pricing is available
|
91 |
+
outputPrice: 1.20,
|
92 |
+
benchmark: {
|
93 |
+
mmlu: 90.8,
|
94 |
+
mmlu_pro: 84.0,
|
95 |
+
gpqa_diamond: 71.5,
|
96 |
+
simpleqa: 30.1,
|
97 |
+
lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
|
98 |
+
swe_bench_verified: 49.2,
|
99 |
+
aider_polyglot: 53.3,
|
100 |
+
aime_24: 79.8,
|
101 |
+
// aime_2025: undefined, // not provided
|
102 |
+
// gpqa: undefined, // use gpqa_diamond
|
103 |
+
// egoschema: undefined,
|
104 |
+
// mmmu: undefined,
|
105 |
+
// loft: undefined,
|
106 |
+
// humanitys_last_exam: undefined, // optional
|
107 |
+
},
|
108 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
|
109 |
+
},
|
110 |
+
];
|
111 |
+
|