Presidentlin committed on
Commit
713e157
·
1 Parent(s): b29bfe7
src/lib/benchmarks/index.ts CHANGED
@@ -3,10 +3,12 @@ import { xaiBenchmarks } from "./xai";
3
  import { googleBenchmarks } from "./google";
4
  import { anthropicBenchmarks } from "./anthropic";
5
  import { openaiBenchmarks } from "./openai";
 
6
 
7
  export const benchmarkData: Benchmark[] = [ // Single combined list: each provider's benchmark array is spread in, in the order listed below
8
  ...xaiBenchmarks,
9
  ...googleBenchmarks,
10
  ...anthropicBenchmarks,
11
  ...openaiBenchmarks,
 
12
  ];
 
3
  import { googleBenchmarks } from "./google";
4
  import { anthropicBenchmarks } from "./anthropic";
5
  import { openaiBenchmarks } from "./openai";
6
+ import { deepseekBenchmarks } from "./deepseek";
7
 
8
  export const benchmarkData: Benchmark[] = [ // Single combined list: each provider's benchmark array is spread in, in the order listed below
9
  ...xaiBenchmarks,
10
  ...googleBenchmarks,
11
  ...anthropicBenchmarks,
12
  ...openaiBenchmarks,
13
+ ...deepseekBenchmarks // DeepSeek entries come last, after the four existing providers
14
  ];
src/lib/benchmarks/deepseek.ts ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Benchmark } from "./types";
2
+
3
+ export const deepseekBenchmarks: Benchmark[] = [ // Benchmark entries for four DeepSeek models; each entry's `source` points at its Hugging Face model page
4
+ {
5
+ model: "DeepSeek-R1-0528",
6
+ provider: "DeepSeek",
7
+ inputPrice: 0.55, // Placeholder price; replace once confirmed. Unit is not shown in this file (presumably USD per 1M tokens; TODO confirm against the Benchmark type)
8
+ outputPrice: 2.19,
9
+ benchmark: {
10
+ aime_24: 91.4,
11
+ aime_2025: 87.5,
12
+ gpqa_diamond: 81.0,
13
+ gpqa: 81.0, // Same value as gpqa_diamond, duplicated for compatibility with consumers that read `gpqa`; drop it if only gpqa_diamond is used
14
+ mmlu_pro: 85.0,
15
+ mmlu: 93.4, // This is the MMLU-Redux score stored under the `mmlu` key (assumption made by the author, not a plain MMLU result)
16
+ simpleqa: 27.8,
17
+ lcb: 73.3, // `lcb` = LiveCodeBench
18
+ aider_polyglot: 71.6,
19
+ swe_bench_verified: 57.6,
20
+ // Optional / less frequently reported benchmarks:
21
+ humanitys_last_exam: 17.7,
22
+ // Metrics with no matching key in BenchmarkMetric; kept commented out so the object still type-checks:
23
+ // codeforces_div1: 1930,
24
+ // frames: 83.0,
25
+ // tau_bench_airline: 53.5,
26
+ // tau_bench_retail: 63.9,
27
+ // bfcl_v3_multiturn: 37.0,
28
+ // cnmo_2024: 86.9,
29
+ // hmmt_2025: 79.4,
30
+ },
31
+ source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
32
+ },
33
+
34
+ {
35
+ model: "DeepSeek-V3-0324",
36
+ provider: "DeepSeek",
37
+ inputPrice: 0.27, // Placeholder price (same figures as the DeepSeek-V3 entry); adjust once actual pricing is confirmed
38
+ outputPrice: 1.10,
39
+ benchmark: {
40
+ mmlu: 87.1, // Value taken from the original DeepSeek-V3 entry (87.1), not a V3-0324-specific figure
41
+ mmlu_pro: 81.2, // V3-0324 figure
42
+ gpqa: 68.4, // V3-0324 figure
43
+ gpqa_diamond: 59.1, // Copied from the DeepSeek-V3 entry. NOTE(review): sits beside gpqa: 68.4 from V3-0324, so this entry mixes model versions; confirm which GPQA variant each number refers to
44
+ aime_24: 59.4, // V3-0324 figure
45
+ lcb: 49.2, // LiveCodeBench, V3-0324 figure
46
+ simpleqa: 24.9, // Copied from the DeepSeek-V3 entry
47
+ aider_polyglot: 49.6, // Copied from the DeepSeek-V3 entry
48
+ swe_bench_verified: 42.0 // Copied from the DeepSeek-V3 entry
49
+ },
50
+ source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
51
+ },
52
+ {
53
+ model: "DeepSeek-V3",
54
+ provider: "DeepSeek",
55
+ inputPrice: 0.27, // Placeholder price; replace once real pricing is confirmed (unit not shown in this file; TODO confirm)
56
+ outputPrice: 1.10,
57
+ benchmark: {
58
+ mmlu: 87.1,
59
+ mmlu_pro: 64.4,
60
+ // mmlu_redux: 86.2, // Left commented out: no such key in BenchmarkMetric
61
+ gpqa_diamond: 59.1,
62
+ simpleqa: 24.9,
63
+ aime_24: 39.2,
64
+ lcb: 37.6, // LiveCodeBench (Pass@1)
65
+ aider_polyglot: 49.6,
66
+ swe_bench_verified: 42.0,
67
+
68
+ // Not reported, or not yet representable in the schema; kept for reference only:
69
+ // humanitys_last_exam: undefined,
70
+ // codeforces: 51.6,
71
+ // drop: 89.0,
72
+ // gsm8k: 89.3,
73
+ // math_em: 61.6,
74
+ // mgsm: 79.8,
75
+ // cmath: 90.7,
76
+ // cruxeval_i: 67.3,
77
+ // cruxeval_o: 69.8,
78
+ // triviaqa: 82.9,
79
+ // naturalquestions: 40.0,
80
+ // agieval: 79.6,
81
+ // hellaSwag: 88.9,
82
+ // piqa: 84.7,
83
+ // winogrande: 84.9,
84
+ },
85
+ source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
86
+ },
87
+ {
88
+ model: "DeepSeek-R1",
89
+ provider: "DeepSeek",
90
+ inputPrice: 0.60, // Placeholder price; NOTE(review): differs from the R1-0528 entry (0.55 / 2.19), so confirm against actual pricing
91
+ outputPrice: 1.20,
92
+ benchmark: {
93
+ mmlu: 90.8,
94
+ mmlu_pro: 84.0,
95
+ gpqa_diamond: 71.5,
96
+ simpleqa: 30.1,
97
+ lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
98
+ swe_bench_verified: 49.2,
99
+ aider_polyglot: 53.3,
100
+ aime_24: 79.8,
101
+ // aime_2025: undefined, // no value provided for this model
102
+ // gpqa: undefined, // omitted here; gpqa_diamond carries the GPQA score
103
+ // egoschema: undefined,
104
+ // mmmu: undefined,
105
+ // loft: undefined,
106
+ // humanitys_last_exam: undefined, // optional metric, no value provided
107
+ },
108
+ source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
109
+ },
110
+ ];
111
+