import { Benchmark } from "./types";

export const anthropicBenchmarks: Benchmark[] = [
  {
    model: "Claude Opus 4",
    provider: "Anthropic",
    inputPrice: 15.0,
    outputPrice: 75.0,
    source: "https://www.anthropic.com/news/claude-4",
    benchmark: {
      swe_bench_verified: 72.5,
      // terminal_bench: 43.2,
      gpqa_diamond: 79.6,
      aime_2025: 75.5,
      mmmlu: 88.8,
      mmmu: 76.5,
      tau_bench_retail: 81.4,
      tau_bench_airline: 59.6,
    },
  },
  {
    model: "Claude Sonnet 4",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-4",
    benchmark: {
      swe_bench_verified: 72.7,
      // terminal_bench: 35.5,
      gpqa_diamond: 75.4,
      aime_2025: 70.5,
      mmmlu: 86.5,
      mmmu: 74.4,
      tau_bench_retail: 80.5,
      tau_bench_airline: 60.0,
    },
  },
  {
    model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
    benchmark: {
      gpqa_diamond: 78.2,
      tau_bench_retail: 81.2,
      tau_bench_airline: 58.4,
      mmmlu: 86.1,
      mmmu: 75.0,
      aime_24: 61.3,
    },
  },
  {
    model: "Claude 3.7 Sonnet (No Extended Thinking)",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
    benchmark: {
      gpqa_diamond: 68.0,
      swe_bench_verified: 62.3,
      mmmlu: 83.2,
      mmmu: 71.8,
      aime_24: 51.7, // using average of 23.3 & 80.0
    },
  },
  {
    model: "Claude 3.5 Sonnet (New)",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
    benchmark: {
      gpqa_diamond: 65.0,
      swe_bench_verified: 49.0,
      tau_bench_retail: 71.5,
      tau_bench_airline: 48.8,
      mmmlu: 82.1,
      mmmu: 70.4,
      aime_24: 16.0, // average of 16.0 & 65.4
    },
  },
  {
    model: "Claude 3.5 Haiku",
    provider: "Anthropic",
    inputPrice: 0.8,
    outputPrice: 4.0,
    source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
    benchmark: {
      gpqa_diamond: 41.6,
      swe_bench_verified: 40.6,
      tau_bench_retail: 51.0,
      tau_bench_airline: 22.8,
      humaneval: 88.1,
      mmmlu: 65.0,
      aime_24: 5.3,
    },
  },
  {
    model: "Claude 3 Opus",
    provider: "Anthropic",
    inputPrice: 15.0,
    outputPrice: 75.0,
    source: "https://www.anthropic.com/news/claude-3-family",
    benchmark: {
      gpqa_diamond: 50.4,
      mmmlu: 86.8,
      mmmu: 59.4,
      // gsm8k: 95.0,
      // math: 60.1,
      // mgsm: 90.7,
      // humaneval: 84.9,
      // drop: 83.1,
      // big_bench_hard: 86.8,
      // arc_challenge: 96.4,
      // hellaswag: 95.4,
      // mathvista: 50.5,
      // ai2d: 88.1,
      // chart_qa: 80.8,
      // docvqa_anls: 89.3,
    },
  },
  {
    model: "Claude 3 Sonnet",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-family",
    benchmark: {
      gpqa_diamond: 40.4,
      mmmlu: 79.0,
      mmmu: 53.1,
      // gsm8k: 92.3,
      // math: 43.1,
      // mgsm: 83.5,
      // humaneval: 73.0,
      // drop: 78.9,
      // big_bench_hard: 82.9,
      // arc_challenge: 93.2,
      // hellaswag: 89.0,
      // mathvista: 47.9,
      // ai2d: 88.7,
      // chart_qa: 81.1,
      // docvqa_anls: 89.5,
    },
  },
  {
    model: "Claude 3 Haiku",
    provider: "Anthropic",
    inputPrice: 0.25,
    outputPrice: 1.25,
    source: "https://www.anthropic.com/news/claude-3-family",
    benchmark: {
      gpqa_diamond: 33.3,
      mmmlu: 75.2,
      mmmu: 50.2,
      // gsm8k: 88.9,
      // math: 38.9,
      // mgsm: 75.1,
      // humaneval: 75.9,
      // drop: 78.4,
      // big_bench_hard: 73.7,
      // arc_challenge: 89.2,
      // hellaswag: 85.9,
      // mathvista: 46.4,
      // ai2d: 86.7,
      // chart_qa: 81.7,
      // docvqa_anls: 88.8,
    },
  },
];
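
// Usage sketch (illustrative, not part of the dataset): one way to consume
// this data is to rank models by a single benchmark metric while carrying the
// pricing fields along. This assumes only the fields visible in this file;
// the real `Benchmark` interface lives in "./types" and may be stricter, and
// `rankByMetric` / `BenchmarkLike` are hypothetical names.
type BenchmarkLike = {
  model: string;
  inputPrice: number;
  outputPrice: number;
  benchmark: Record<string, number | undefined>;
};

export function rankByMetric(entries: BenchmarkLike[], metric: string) {
  return entries
    // Keep only models that report the requested metric.
    .flatMap((e) => {
      const score = e.benchmark[metric];
      return score === undefined
        ? []
        : [
            {
              model: e.model,
              score,
              inputPrice: e.inputPrice,
              outputPrice: e.outputPrice,
            },
          ];
    })
    // Highest score first.
    .sort((a, b) => b.score - a.score);
}

// Example (assuming Benchmark is structurally assignable to BenchmarkLike):
//   rankByMetric(anthropicBenchmarks, "gpqa_diamond")[0]
//   // -> { model: "Claude Opus 4", score: 79.6, inputPrice: 15, outputPrice: 75 }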