Spaces:
Running
Running
File size: 28,346 Bytes
41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 973737d 41c6214 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
Benchmark,URL,year,cognitive levels,category,domain,task,image,multiple image,video,score,use,,,,,,,,,,,,,,,,,
RefCOCO,https://aclanthology.org/D14-1086.pdf,2014,Understanding,General,natural,grounding,1,,,"{""Seed1.5VL"": 91.3, ""Gemini2.5-Pro"": 74.6, ""Qwen2.5VL-72B"": 90.3}","seed1.5vl thinking 91.3, Gemini2.5-pro thinking 74.6, Qwen2.5vl-72B 90.3",,,,,,,,,,,,,,,,,
VQA-v2,https://openaccess.thecvf.com/content_cvpr_2017/papers/Goyal_Making_the_v_CVPR_2017_paper.pdf,2017,Understanding,General,natural,perception,1,,,"{""Gemini-Ultra"": 77.8, ""Gemini1.5-Pro"": 80.2}","Gemini-ultra 77.8, Gemini1.5-pro 80.2",,,,,,,,,,,,,,,,,
FSC-147 (mean absolute error↓),https://arxiv.org/pdf/2104.08391,2021,Understanding,General,natural,grounding,1,,,"{""Seed1.5VL"": 17.9, ""Gemini2.5-Pro"": 24.5, ""OpenAI GPT4o"": 46.8, ""Qwen2.5VL-72B"": 28.6}","seed1.5vl thinking 17.9, Gemini2.5-pro thinking 24.5, OpenAI GPT4o 46.8, Qwen2.5vl-72B 28.6",,,,,,,,,,,,,,,,,
CountBench,https://arxiv.org/pdf/2302.12066,2023,Understanding,General,"natural, synthetic",grounding,1,,,"{""Seed1.5VL"": 93.7, ""Gemini2.5-Pro"": 91.0, ""OpenAI GPT4o"": 85.7, ""Qwen2.5VL-72B"": 93.6}","seed1.5vl thinking 93.7, Gemini2.5-pro thinking 91.0, OpenAI GPT4o 85.7, Qwen2.5vl-72B 93.6",,,,,,,,,,,,,,,,,
POPE,https://arxiv.org/pdf/2305.10355,2023,Understanding,General,natural,grounding,1,,,{"InternVL3-78B": 90.3},InternVL3 78B 90.3,,,,,,,,,,,,,,,,,
HallusionBench,https://arxiv.org/pdf/2310.14566,2023,Understanding,General,"natural, synthetic",perception,1,1,1,"{""Seed1.5VL"": 60.3, ""Gemini2.5-Pro"": 63.7, ""OpenAI GPT4o"": 56.2, ""Qwen2.5VL-72B"": 55.2, ""InternVL3-78B"": 59.1}","seed1.5vl thinking 60.3, Gemini2.5-pro thinking 63.7, OpenAI GPT4o 56.2, Qwen2.5vl-72B 55.2, InternVL3 78B 59.1",,,,,,,,,,,,,,,,,
V* bench,https://arxiv.org/pdf/2312.14135,2023,Understanding,General,natural,perception,1,,,"{""Gemini1.5-Pro"": 71.7, ""OpenAI o3 high"": 95.7, ""Seed1.5VL"": 89.0, ""Gemini2.5-Pro"": 79.1, ""OpenAI GPT4o"": 73.9, ""Qwen2.5VL-72B"": 86.4}","Gemini1.5-pro 71.7, OpenAI o3 high 95.7, seed1.5vl thinking 89.0, Gemini2.5-pro thinking 79.1, OpenAI GPT4o 73.9, Qwen2.5vl-72B 86.4",,,,,,,,,,,,,,,,,
MMVP,https://arxiv.org/pdf/2401.06209,2024,Understanding,General,"natural, synthetic",perception,1,,,"{""Seed1.5VL"": 70.7, ""Gemini2.5-Pro"": 70.7, ""OpenAI GPT4o"": 70.7, ""Qwen2.5VL-72B"": 66.7}","seed1.5vl thinking 70.7, Gemini2.5-pro thinking 70.7, OpenAI GPT4o 70.7, Qwen2.5vl-72B 66.7",,,,,,,,,,,,,,,,,
CRPE,https://arxiv.org/pdf/2402.19474,2024,Understanding,General,natural,relation understanding,1,,,"{""Qwen2.5VL-72B"": 79.2, ""InternVL2.5-78B"": 78.8}","Qwen2.5vl-72B 79.2, InternVL2.5 78B 78.8",,,,,,,,,,,,,,,,,
Vibe-Eval (Reka),https://arxiv.org/pdf/2405.02287,2024,Understanding,General,"natural, synthetic",perception,1,,,{"Gemini2.5-Pro": 67.2},Gemini2.5-pro thinking 67.2,,,,,,,,,,,,,,,,,
MuirBench,https://arxiv.org/pdf/2406.09411,2024,Understanding,General,natural,"counting, attribute similarity, image-text matching, visual retrieval, geographic understanding, scene understanding, cartoon understanding, diagram understanding",,1,,"{""Qwen2.5VL-72B"": 70.7, ""InternVL2.5-78B"": 63.5}","Qwen2.5vl-72B 70.7, InternVL2.5 78B 63.5",,,,,,,,,,,,,,,,,
WildVision,https://arxiv.org/pdf/2406.11069,2024,Understanding,General,"natural, synthetic",alignment with human preference,1,,,{"InternVL3-78B": 73.6},InternVL3 78B 73.6,,,,,,,,,,,,,,,,,
VLMs are blind,https://arxiv.org/pdf/2407.06581,2024,Understanding,General,synthetic,perception,1,,,"{""OpenAI o3 high"": 90.1, ""Seed1.5VL"": 92.1, ""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4o"": 50.4, ""Qwen2.5VL-72B"": 69.0}","OpenAI o3 high 90.1, seed1.5vl thinking 92.1, Gemini2.5-pro thinking 84.3, OpenAI GPT4o 50.4, Qwen2.5vl-72B 69.0",,,,,,,,,,,,,,,,,
Realworld QA,https://huggingface.co/datasets/xai-org/RealworldQA,2024,Understanding,General,natural,perception,1,,,"{""Gemini1.5-Pro"": 70.4, ""Seed1.5VL"": 78.4, ""Gemini2.5-Pro"": 78.0, ""OpenAI GPT4o"": 76.2, ""Qwen2.5VL-72B"": 75.7, ""InternVL3-78B"": 78.0}","Gemini1.5-pro 70.4, seed1.5vl thinking 78.4, Gemini2.5-pro thinking 78.0, OpenAI GPT4o 76.2, Qwen2.5vl-72B 75.7, InternVL3 78B 78.0",,,,,,,,,,,,,,,,,
MME-Realworld,https://arxiv.org/pdf/2408.13257,2024,Understanding,General,"video monitoring, OCR, autonomous driving, diagram tables",perception,1,,,"{""Qwen2.5VL-72B"": 63.2, ""InternVL3-78B"": 65.4}","Qwen2.5vl-72B 63.2, InternVL3 78B 65.4",,,,,,,,,,,,,,,,,
MMIU,https://arxiv.org/pdf/2408.02718,2024,Understanding,General,"natural, synthetic","semantic understanding, spatial understanding, temporal understanding",,1,,{"InternVL3-78B": 60.4},InternVL3-78B 60.4,,,,,,,,,,,,,,,,,
R-Bench,https://arxiv.org/pdf/2410.05474,2024,Understanding,General,natural,robustness to corruption,1,,,{"InternVL3-78B": 77.4},InternVL3 78B 77.4,,,,,,,,,,,,,,,,,
SimpleVQA,https://arxiv.org/pdf/2502.13059,2025,Understanding,General,natural,knowledge memorization,1,,,"{""Seed1.5VL"": 63.4, ""Gemini2.5-Pro"": 62.0, ""OpenAI GPT4o"": 52.4, ""Qwen2.5VL-72B"": 52.4}","seed1.5vl thinking 63.4, Gemini2.5-pro thinking 62.0, OpenAI GPT4o 52.4, Qwen2.5vl-72B 52.4",,,,,,,,,,,,,,,,,
MMMU,https://arxiv.org/pdf/2311.16502,2023,Reasoning,General,multi-discipline,QA,1,,,"{""Gemini-Ultra"": 59.4, ""Gemini1.5-Pro"": 62.2, ""Gemini2.5-Pro"": 82.0, ""OpenAI o3 high"": 82.9, ""Claude4-Opus"": 76.5, ""Grok3-beta"": 76.0, ""Seed1.5VL"": 77.9, ""Qwen2.5VL-72B"": 70.2, ""InternVL3-78B"": 72.2}","Gemini-ultra 59.4, Gemini1.5-pro 62.2, Gemini2.5-pro thinking 82.0, OpenAI o3 high 82.9, Claude 4 Opus 76.5, Grok3 beta 76.0, seed1.5vl thinking 77.9, Qwen2.5vl-72B 70.2, InternVL3 78B 72.2",,,,,,,,,,,,,,,,,
ZeroBench,https://arxiv.org/pdf/2502.09696,2025,Reasoning,General,"natural, synthetic",QA,1,,,"{""Gemini2.5-Pro"": 4.5, ""Seed1.5VL"": 2.0, ""OpenAI GPT4o"": 0.0, ""Qwen2.5VL-72B"": 0.0}","Gemini2.5-pro thinking 4.5, seed1.5vl thinking 2.0, OpenAI GPT4o 0.0, Qwen2.5vl-72B 0.0",,,,,,,,,,,,,,,,,
MMBench,https://arxiv.org/pdf/2307.06281,2023,Comprehensive,General,"natural, synthetic","attribute reasoning, logic reasoning, relation reasoning, single-instance perception, cross-instance perception, coarse perception",1,,,"{""Seed1.5VL"": {""en"": 89.9, ""zh"": 89.1}, ""Gemini2.5-Pro"": {""en"": 90.1, ""zh"": 89.7}, ""OpenAI GPT4o"": {""en"": 84.3, ""zh"": 82.0}, ""Qwen2.5VL-72B"": {""en"": 88.6, ""zh"": 87.9}, ""InternVL3-78B"": {""en"": 89.0, ""zh"": 88.7}}","seed1.5vl thinking 89.9/89.1 (En/Zh), Gemini2.5-pro thinking 90.1/89.7 (En/Zh), OpenAI GPT4o 84.3/82.0 (En/Zh), Qwen2.5vl-72B 88.6/87.9 (En/Zh), InternVL3-78B 89.0/88.7 (En/Zh)",,,,,,,,,,,,,,,,,
MM-VET,https://arxiv.org/pdf/2308.02490,2023,Comprehensive,General,"natural, synthetic","OCR, math, recognition, spatial understanding, knowledge memorization",1,,,"{""Qwen2.5VL-72B"": 76.2, ""InternVL2.5-78B"": 72.3}","Qwen2.5vl-72B 76.2, InternVL2.5 78B 72.3",,,,,,,,,,,,,,,,,
MMStar,https://arxiv.org/pdf/2403.20330,2024,Comprehensive,General,multi-discipline,QA,1,,,"{""Seed1.5VL"": 77.8, ""Gemini2.5-Pro"": 77.5, ""OpenAI GPT4o"": 65.1, ""Qwen2.5VL-72B"": 70.8, ""InternVL3-78B"": 72.5}","seed1.5vl thinking 77.8, Gemini2.5-pro thinking 77.5, OpenAI GPT4o 65.1, Qwen2.5vl-72B 70.8, InternVL3 78B 72.5",,,,,,,,,,,,,,,,,
Blink,https://arxiv.org/pdf/2404.12390,2024,Comprehensive,General,"natural, synthetic","grounding, low-level pattern matching (e.g., visual correspondence) to mid-level spatial reasoning (e.g., relative depth), and up to high-level visual understanding (e.g., visual similarity)",1,1,,"{""Gemini1.5-Pro"": 61.4, ""Seed1.5VL"": 72.1, ""Gemini2.5-Pro"": 70.6, ""OpenAI GPT4o"": 65.9, ""Qwen2.5VL-72B"": 64.4, ""InternVL3-78B"": 66.3}","Gemini1.5-pro 61.4, seed1.5vl thinking 72.1, Gemini2.5-pro thinking 70.6, OpenAI GPT4o 65.9, Qwen2.5vl-72B 64.4, InternVL3-78B 66.3",,,,,,,,,,,,,,,,,
MMT-Bench,https://arxiv.org/pdf/2404.16006,2024,Comprehensive,General,"natural, synthetic",162 tasks,,1,,{"InternVL3-78B": 73.2},InternVL3-78B 73.2,,,,,,,,,,,,,,,,,
Mantis-Eval,https://arxiv.org/pdf/2405.01483,2024,Comprehensive,General,natural,"Co-reference, Comparison, Reasoning, Temporal understanding",,1,,{"InternVL3-78B": 79.3},InternVL3-78B 79.3,,,,,,,,,,,,,,,,,
MIRB ,https://arxiv.org/pdf/2406.12742,2024,Comprehensive,General,"natural, synthetic","perception, reasoning, knowledge, multi-hop",,1,,{"InternVL3-78B": 64.3},InternVL3-78B 64.3,,,,,,,,,,,,,,,,,
MegaBench,https://arxiv.org/pdf/2410.10563,2024,Comprehensive,General,"natural, synthetic","mathematics, information extraction, planning, coding, perception, metrics, scene, knowledge",1,1,,"{""Qwen2.5VL-72B"": 46.8, ""InternVL2.5-78B"": 45.6}","Qwen2.5vl-72B 46.8, InternVL2.5 78B 45.6",,,,,,,,,,,,,,,,,
Mathvista,https://arxiv.org/pdf/2310.02255,2023,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Gemini-Ultra"": 53.0, ""Gemini1.5-Pro"": 63.9, ""OpenAI o3 high"": 86.8, ""Seed1.5VL"": 85.6, ""Gemini2.5-Pro"": 82.7, ""OpenAI GPT4o"": 63.8, ""Qwen2.5VL-72B"": 74.8, ""InternVL3-78B"": 80.5, ""MiMo-VL-7B"": 81.5}","Gemini-ultra 53.0, Gemini1.5-pro 63.9, OpenAI o3 high 86.8, seed1.5vl thinking 85.6, Gemini2.5-pro thinking 82.7, OpenAI GPT4o 63.8, Qwen2.5vl-72B 74.8, InternVL3 78B 80.5, MiMo-VL 7B 81.5",,,,,,,,,,,,,,,,,
OlympiadBench,https://arxiv.org/pdf/2402.14008,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Seed1.5VL"": 65.0, ""Gemini2.5-Pro"": 69.8, ""OpenAI GPT4o"": 25.9, ""Qwen2.5VL-72B"": 35.9, ""MiMo-VL-7B"": 59.4}","seed1.5vl thinking 65.0, Gemini2.5-pro thinking 69.8, OpenAI GPT4o 25.9, Qwen2.5vl-72B 35.9, MiMo-VL 7B 59.4",,,,,,,,,,,,,,,,,
Mathvision,https://arxiv.org/abs/2402.14804,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Seed1.5VL"": 68.7, ""Gemini2.5-Pro"": 73.3, ""OpenAI GPT4o"": 31.2, ""Qwen2.5VL-72B"": 38.1, ""InternVL3-78B"": 40.8, ""MiMo-VL-7B"": 60.4}","seed1.5vl thinking 68.7, Gemini2.5-pro thinking 73.3, OpenAI GPT4o 31.2, Qwen2.5vl-72B 38.1, InternVL3 78B 40.8, MiMo-VL 7B 60.4, Gemini2.5-pro thinking 69.1",,,,,,,,,,,,,,,,,
MathVerse,https://arxiv.org/pdf/2403.14624,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Qwen2.5VL-72B"": 57.6, ""InternVL3-78B"": 54.2, ""MiMo-VL-7B"": 71.5, ""OpenAI GPT4o"": 49.9, ""Gemini2.5-Pro"": 76.7}","Qwen2.5vl-72B 57.6, InternVL3 78B 54.2, MiMo-VL 7B 71.5, OpenAI GPT4o 49.9, Gemini2.5-pro thinking 76.7",,,,,,,,,,,,,,,,,
We-Math,https://arxiv.org/pdf/2407.01284,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Qwen2.5VL-72B"": 49.1, ""InternVL3-78B"": 52.4, ""MiMo-VL-7B"": 66.3, ""OpenAI GPT4o"": 50.6, ""Gemini2.5-Pro"": 78.0}","Qwen2.5vl-72B 49.1, InternVL3 78B 52.4, MiMo-VL 7B 66.3, OpenAI GPT4o 50.6, Gemini2.5-pro thinking 78.0",,,,,,,,,,,,,,,,,
LogicVista,https://arxiv.org/pdf/2407.04973,2024,Reasoning,Math & Logic,logic,problem-solving,1,,,"{""Qwen2.5VL-72B"": 55.7, ""InternVL3-78B"": 57.9, ""MiMo-VL-7B"": 61.4, ""OpenAI GPT4o"": 64.4, ""Gemini2.5-Pro"": 73.8}","Qwen2.5vl-72B 55.7, InternVL3 78B 57.9, MiMo-VL 7B 61.4, OpenAI GPT4o 64.4, Gemini2.5-pro thinking 73.8",,,,,,,,,,,,,,,,,
DynaMath,https://arxiv.org/pdf/2411.00836,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Qwen2.5VL-72B"": 35.9, ""InternVL3-78B"": 37.3, ""MiMo-VL-7B"": 45.9, ""OpenAI GPT4o"": 48.5, ""Gemini2.5-Pro"": 56.3}","Qwen2.5vl-72B 35.9, InternVL3 78B 37.3, MiMo-VL 7B 45.9, OpenAI GPT4o 48.5, Gemini2.5-pro thinking 56.3",,,,,,,,,,,,,,,,,
VisuLogic,https://arxiv.org/pdf/2504.15279,2025,Reasoning,Math & Logic,logic,problem-solving,1,,,"{""Seed1.5VL"": 35.0, ""Gemini2.5-Pro"": 31.0, ""OpenAI GPT4o"": 26.3, ""Qwen2.5VL-72B"": 28.0}","seed1.5vl thinking 35.0, Gemini2.5-pro thinking 31.0, OpenAI GPT4o 26.3, Qwen2.5vl-72B 28.0",,,,,,,,,,,,,,,,,
ARC Challenge,https://arcprize.org/leaderboard,2025,Reasoning,Math & Logic,logic,problem-solving,,1,,"{""OpenAI o3 high"": 60.8, ""Gemini2.5-Pro"": 41.0, ""Claude4-Opus"": 40.0}","OpenAI o3 high 60.8, Gemini2.5-pro thinking 41.0, Claude Opus 4 thinking 40.0",,,,,,,,,,,,,,,,,
OCRBench,https://arxiv.org/pdf/2305.07895,2023,Understanding,OCR & Chart& Document,OCR,QA,1,,,"{""Seed1.5VL"": 861, ""Gemini2.5-Pro"": 866, ""OpenAI GPT4o"": 806, ""Qwen2.5VL-72B"": 885, ""InternVL3-78B"": 906}","seed1.5vl thinking 861, Gemini2.5-pro thinking 866, OpenAI GPT4o 806, Qwen2.5vl-72B 885, InternVL3 78B 906",,,,,,,,,,,,,,,,,
MTVQA,https://arxiv.org/pdf/2405.11985,2024,Understanding,OCR & Chart& Document,book,Multilingual QA,1,,,"{""Qwen2.5VL-72B"": 31.7, ""InternVL2.5-78B"": 31.9}","Qwen2.5vl-72B 31.7, InternVL2.5 78B 31.9",,,,,,,,,,,,,,,,,
AI2D,https://arxiv.org/pdf/1603.07396,2016,Reasoning,OCR & Chart& Document,diagram,QA,1,,,"{""Gemini-Ultra"": 79.5, ""Gemini1.5-Pro"": 94.4, ""Seed1.5VL"": 87.3, ""Gemini2.5-Pro"": 88.4, ""OpenAI GPT4o"": 84.9, ""Qwen2.5VL-72B"": 88.7, ""InternVL3-78B"": 89.7}","Gemini-ultra 79.5, Gemini1.5-pro 94.4, seed1.5vl thinking 87.3, Gemini2.5-pro thinking 88.4, OpenAI GPT4o 84.9, Qwen2.5vl-72B 88.7, InternVL3 78B 89.7",,,,,,,,,,,,,,,,,
TextVQA,https://arxiv.org/abs/1904.08920,2019,Reasoning,OCR & Chart& Document,OCR,QA,1,,,"{""Gemini-Ultra"": 82.3, ""Gemini1.5-Pro"": 78.7, ""Seed1.5VL"": 81.8, ""Gemini2.5-Pro"": 76.8, ""OpenAI GPT4o"": 81.4, ""Qwen2.5VL-72B"": 83.5}","Gemini-ultra 82.3, Gemini1.5-pro 78.7, seed1.5vl thinking 81.8, Gemini2.5-pro thinking 76.8, OpenAI GPT4o 81.4, Qwen2.5vl-72B 83.5",,,,,,,,,,,,,,,,,
DocVQA,https://arxiv.org/pdf/2007.00398,2020,Reasoning,OCR & Chart& Document,document,QA,1,,,"{""Gemini-Ultra"": 90.9, ""Gemini1.5-Pro"": 93.1, ""Seed1.5VL"": 96.9, ""Gemini2.5-Pro"": 94.0, ""OpenAI GPT4o"": 66.2, ""Qwen2.5VL-72B"": 96.4, ""InternVL3-78B"": 95.4}","Gemini-ultra 90.9, Gemini1.5-pro 93.1, seed1.5vl thinking 96.9, Gemini2.5-pro thinking 94.0, OpenAI GPT4o 66.2, Qwen2.5vl-72B 96.4, InternVL3 78B 95.4",,,,,,,,,,,,,,,,,
InfographicVQA ,https://arxiv.org/pdf/2104.12756,2021,Reasoning,OCR & Chart& Document,infographic,QA,1,,,"{""Gemini-Ultra"": 80.3, ""Gemini1.5-Pro"": 81.0, ""Seed1.5VL"": 91.2, ""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4o"": 79.2, ""Qwen2.5VL-72B"": 87.3, ""InternVL3-78B"": 85.2}","Gemini-ultra 80.3, Gemini1.5-pro 81.0, seed1.5vl thinking 91.2, Gemini2.5-pro thinking 84.3, OpenAI GPT4o 79.2, Qwen2.5vl-72B 87.3, InternVL3 78B 85.2",,,,,,,,,,,,,,,,,
ChartQA ,https://arxiv.org/pdf/2203.10244,2022,Reasoning,OCR & Chart& Document,chart,QA,1,,,"{""Gemini-Ultra"": 80.8, ""Gemini1.5-Pro"": 87.2, ""Seed1.5VL"": 89.1, ""Gemini2.5-Pro"": 83.3, ""OpenAI GPT4o"": 86.7, ""Qwen2.5VL-72B"": 89.5, ""InternVL3-78B"": 89.7}","Gemini-ultra 80.8, Gemini1.5-pro 87.2, seed1.5vl thinking 89.1, Gemini2.5-pro thinking 83.3, OpenAI GPT4o 86.7, Qwen2.5vl-72B 89.5, InternVL3 78B 89.7",,,,,,,,,,,,,,,,,
TAT-DQA,https://arxiv.org/pdf/2207.11871,2022,Reasoning,OCR & Chart& Document,document,QA,1,,,{"Gemini1.5-Pro": 37.8},Gemini1.5-pro 37.8,,,,,,,,,,,,,,,,,
DUDE ,https://arxiv.org/pdf/2305.08455,2023,Reasoning,OCR & Chart& Document,document,QA,,1,,{"Gemini1.5-Pro": 46.0},Gemini1.5-pro 46.0,,,,,,,,,,,,,,,,,
SEED-Bench-2-Plus,https://arxiv.org/pdf/2404.16790,2024,Reasoning,OCR & Chart& Document,"chart, web page, map",QA,1,,,"{""Qwen2.5VL-72B"": 73.0, ""InternVL2.5-78B"": 71.3, ""Gemini1.5-Pro"": 70.8}","Qwen2.5vl-72B 73.0, InternVL2.5 78B 71.3, Gemini 1.5 pro 70.8",,,,,,,,,,,,,,,,,
CharXiv reasoning/description,https://arxiv.org/pdf/2406.18521,2024,Reasoning,OCR & Chart& Document,chart,QA,1,,,"{""OpenAI o3 high"": {""score1"": 78.6, ""score2"": 95.0}, ""Seed1.5VL"": {""score1"": 60.2, ""score2"": 92.6}, ""Gemini2.5-Pro"": {""score1"": 69.9, ""score2"": 94.4}, ""OpenAI GPT4o"": {""score1"": 52.0, ""score2"": 86.5}, ""Qwen2.5VL-72B"": {""score1"": 49.7, ""score2"": 87.4}}","OpenAI o3 high 78.6/95.0, seed1.5vl thinking 60.2/92.6, Gemini2.5-pro thinking 69.9/94.4, OpenAI GPT4o 52.0/86.5, Qwen2.5vl-72B 49.7/87.4",,,,,,,,,,,,,,,,,
MMLongBench-DOC,https://arxiv.org/pdf/2407.01523,2024,Reasoning,OCR & Chart& Document,long document,QA,,1,,"{""Kimi-VL-A3B-Thinking-2506"": 42.1, ""OpenAI GPT4o"": 42.8, ""Qwen2.5VL-72B"": 38.8}",,,,,,,,,,,,,,,,,,
VisualWebBench ,https://arxiv.org/pdf/2404.05955,2024,Comprehensive,OCR & Chart& Document,web page,grounding,1,,,"{""Seed1.5VL"": 87.3, ""Gemini2.5-Pro"": 87.3, ""OpenAI GPT4o"": 80.2, ""Qwen2.5VL-72B"": 82.3}","seed1.5vl thinking 87.3, Gemini2.5-pro thinking 87.3, OpenAI GPT4o 80.2, Qwen2.5vl-72B 82.3",,,,,,,,,,,,,,,,,
QVHighlights,https://arxiv.org/pdf/2107.09609,2021,Understanding,Short Video,natural,moment retrieval,,,1,"{""Gemini2.5-Pro"": 75.0, ""OpenAI GPT4.1"": 71.4}","Gemini2.5-pro thinking 75.0, OpenAI GPT4.1 71.4",,,,,,,,,,,,,,,,,
TACoS ,https://arxiv.org/pdf/1403.6173,2014,Understanding,Short Video,natural,grounding,,,1,{"Seed1.5VL": 49.6},Seed1.5-VL thinking 49.6,,,,,,,,,,,,,,,,,
Charades-STA,https://arxiv.org/pdf/1705.02101,2017,Understanding,Short Video,natural,grounding,,,1,"{""Seed1.5VL"": 64.0, ""Qwen2.5VL-72B"": 50.9}","Seed1.5-VL thinking 64.0, Qwen2.5-VL 72B 50.9",,,,,,,,,,,,,,,,,50.9
YouCook2 ,https://arxiv.org/pdf/1703.09788,2017,Understanding,Short Video,cooking,perception,,,1,"{""Gemini-Ultra"": 135.4, ""Gemini1.5-Pro"": 106.5, ""Gemini2.5-Pro"": 188.3, ""OpenAI GPT4.1"": 127.6}","Gemini-ultra 135.4, Gemini1.5-pro 106.5, Gemini2.5-pro thinking 188.3, OpenAI GPT4.1 127.6",,,,,,,,,,,,,,,,,
VATEX ,https://arxiv.org/pdf/1904.03493,2019,Understanding,Short Video,natural,perception,,,1,"{""Gemini-Ultra"": {""en"": 62.7, ""zh"": 51.3}, ""Gemini1.5-Pro"": {""en"": 64.6, ""zh"": 55.3}, ""Gemini2.5-Pro"": {""en"": 71.3, ""zh"": 59.7}, ""OpenAI GPT4.1"": {""en"": 64.1, ""zh"": 48.7}}","Gemini-ultra 62.7/51.3 (En/Zh), Gemini1.5-pro 64.6/55.3 (En/Zh), Gemini2.5-pro thinking 71.3/59.7 (En/Zh), OpenAI GPT4.1 64.1/48.7 (En/Zh)",,,,,,,,,,,,,,,,,
EgoSchema,https://arxiv.org/pdf/2308.09126,2023,Understanding,Short Video,natural,perception,,,1,"{""Gemini1.5-Pro"": 72.2, ""Qwen2.5VL-72B"": 76.2}","Gemini1.5-pro 72.2, Qwen2.5-VL 72B 76.2",,,,,,,,,,,,,,,,,
TemporalBench,https://arxiv.org/pdf/2410.10818,2024,Understanding,Short Video,natural,captioning,,,1,"{""Seed1.5VL"": 79.8, ""OpenAI GPT4o"": 73.3}","Seed1.5-VL thinking 79.8, OpenAI GPT4o 73.3",,,,,,,,,,,,,,,,,
Dream-1K,https://arxiv.org/pdf/2407.00634,2024,Understanding,Short Video,natural,captioning,,,1,{"Seed1.5VL": 43.9},Seed1.5-VL thinking 43.9,,,,,,,,,,,,,,,,,
MotionBench,https://arxiv.org/pdf/2501.02955,2025,Understanding,Short Video,natural,motion understanding,,,1,{"Seed1.5VL": 68.4},Seed1.5-VL thinking 68.4,,,,,,,,,,,,,,,,,
MVBench,https://arxiv.org/pdf/2311.17005,2023,Understanding,Short Video,natural,"temporal understanding, spatial understanding",,,1,"{""Seed1.5VL"": 74.4, ""InternVL2.5-78B"": 76.4, ""Qwen2.5VL-72B"": 70.4}","Seed1.5-VL thinking 74.4, InternVL-2.5 76.4, Qwen2.5-VL 72B 70.4",,,,,,,,,,,,,,,,,70.4
ActivityNet-QA,https://arxiv.org/pdf/1906.02467,2019,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 52.2, ""Gemini1.5-Pro"": 57.5, ""Gemini2.5-Pro"": 66.6, ""OpenAI GPT4.1"": 60.4}","Gemini-ultra 52.2, Gemini1.5-pro 57.5, Gemini2.5-pro thinking 66.6, OpenAI GPT4.1 60.4",,,,,,,,,,,,,,,,,
NextQA ,https://arxiv.org/pdf/2105.08276,2021,Reasoning,Short Video,natural,QA,,,1,{"Gemini-Ultra": 29.9},Gemini-ultra 29.9,,,,,,,,,,,,,,,,,
Perception Test MCQA,https://proceedings.neurips.cc/paper_files/paper/2023/file/8540fba4abdc7f9f7a7b1cc6cd60e409-Paper-Datasets_and_Benchmarks.pdf,2023,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 54.7, ""Gemini2.5-Pro"": 78.4, ""OpenAI GPT4.1"": 64.8, ""Qwen2.5VL-72B"": 73.2}","Gemini-ultra 54.7, Gemini2.5-pro thinking 78.4, OpenAI GPT4.1 64.8, Qwen2.5-VL 72B 73.2",,,,,,,,,,,,,,,,,
MMVU,https://arxiv.org/pdf/2501.12380,2025,Reasoning,Short Video,multi-discipline,QA,,,1,"{""Gemini2.5-Pro"": 75.8, ""Seed1.5VL"": 70.1, ""Qwen2.5VL-72B"": 62.9}","Gemini2.5-pro thinking 75.8, seed1.5vl thinking 70.1, Qwen2.5-VL 72B 62.9",,,,,,,,,,,,,,,,,
VideoMMMU,https://arxiv.org/pdf/2501.13826,2025,Reasoning,Short Video,multi-discipline,QA,,,1,"{""Gemini2.5-Pro"": 83.6, ""OpenAI GPT4.1"": 60.9, ""Seed1.5VL"": 81.4, ""Kimi k1.6"": 76.7, ""Qwen2.5VL-72B"": 60.2}","Gemini2.5-pro thinking 83.6, OpenAI GPT4.1 60.9, seed1.5vl thinking 81.4, Kimi k1.6 76.7, Qwen2.5-VL 72B 60.2",,,,,,,,,,,,,,,,,
Minerva,https://arxiv.org/pdf/2505.00681,2025,Reasoning,Short Video,"sports, cooking, short film, science lecture",QA,,,1,"{""Gemini2.5-Pro"": 67.6, ""OpenAI GPT4.1"": 54.0}","Gemini2.5-pro thinking 67.6, OpenAI GPT4.1 54.0",,,,,,,,,,,,,,,,,
TempCompass ,https://arxiv.org/pdf/2403.00476,2024,Reasoning,Short Video,"natural, synthetic",temporal reasoning,,,1,"{""Seed1.5VL"": 83.7, ""Gemini2.5-Pro"": 75.8, ""Qwen2.5VL-72B"": 74.8}","Seed1.5-VL thinking 83.7, Gemini2.5-pro thinking 75.8, Qwen2.5-VL 72B 74.8",,,,,,,,,,,,,,,,,
TVBench,https://arxiv.org/pdf/2410.07752,2024,Reasoning,Short Video,"natural, synthetic",temporal reasoning,,,1,"{""Seed1.5VL"": 63.6, ""Gemini2.5-Pro"": 62.6}","Seed1.5-VL thinking 63.6, Gemini2.5-pro thinking 62.6",,,,,,,,,,,,,,,,,
TOMATO ,https://arxiv.org/pdf/2410.23266,2024,Reasoning,Short Video,natural,temporal reasoning,,,1,"{""Seed1.5VL"": 44.7, ""Gemini2.5-Pro"": 46.9}","Seed1.5-VL thinking 44.7, Gemini2.5-pro thinking 46.9",,,,,,,,,,,,,,,,,
EgoTempo,https://arxiv.org/pdf/2503.13646v1,2025,Reasoning,Short Video,natural,temporal reasoning,,,1,"{""Gemini2.5-Pro"": 44.3, ""OpenAI GPT4.1"": 40.3}","Gemini2.5-pro thinking 44.3, OpenAI GPT4.1 40.3",,,,,,,,,,,,,,,,,
MMBench-Video,https://arxiv.org/pdf/2406.14515,2024,Comprehensive,Short Video,"natural, synthetic","temporal reasoning, commonsense reasoning, attribute reasoning, logic reasoning, relation reasoning, perception",,,1,{"Qwen2.5VL-72B": 2.02},Qwen2.5vl-72B 2.02,,,,,,,,,,,,,,,,,
1H-VideoQA,https://arxiv.org/pdf/2403.05530,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 81.0, ""OpenAI GPT4.1"": 56.8}","Gemini2.5-pro thinking 81.0, OpenAI GPT4.1 56.8",,,,,,,,,,,,,,,,,
LVBench,https://arxiv.org/pdf/2406.08035,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 78.7, ""OpenAI GPT4.1"": 63.4, ""Seed1.5VL"": 64.6, ""Qwen2.5VL-72B"": 47.3}","Gemini2.5-pro thinking 78.7, OpenAI GPT4.1 63.4, Seed1.5-VL thinking 64.6, Qwen2.5-VL 72B 47.3",,,,,,,,,,,,,,,,,
VideoMME w/o subtitle,https://arxiv.org/abs/2405.21075,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4.1"": 72.0, ""Seed1.5VL"": 77.9}","Gemini2.5-pro thinking 84.3, OpenAI GPT4.1 72.0, Seed1.5-vl thinking 77.9",,,,,,,,,,,,,,,,,
MLVU,https://arxiv.org/pdf/2406.04264,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Seed1.5VL"": 82.1, ""Gemini2.5-Pro"": 81.2, ""Qwen2.5VL-72B"": 74.6}","Seed1.5-VL thinking 82.1, Gemini2.5-pro thinking 81.2, Qwen2.5-VL 72B 74.6",,,,,,,,,,,,,,,,,
LongVideoBench ,https://arxiv.org/pdf/2407.15754,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Seed1.5VL"": 74.0, ""OpenAI GPT4o"": 66.7, ""Qwen2.5VL-72B"": 60.7, ""InternVL3-78B"": 65.7}","Seed1.5-VL thinking 74.0, OpenAI GPT4o 66.7, Qwen2.5-VL 72B 60.7, InternVL3 78B 65.7",,,,,,,,,,,,,,,,,
Neptune,https://arxiv.org/pdf/2412.09582,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 87.3, ""OpenAI GPT4.1"": 85.2}","Gemini2.5-pro thinking 87.3, OpenAI GPT4.1 85.2",,,,,,,,,,,,,,,,,
StreamBench ,https://arxiv.org/pdf/2501.13468,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 72.8, ""OpenAI GPT4o"": 68.7}","Seed1.5-VL thinking 72.8, OpenAI GPT4o 68.7",,,,,,,,,,,,,,,,,
OVO-Bench,https://arxiv.org/pdf/2501.05510,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 72.3, ""Gemini1.5-Pro"": 67.7}","Seed1.5-VL thinking 72.3, Gemini1.5-pro 67.7",,,,,,,,,,,,,,,,,
OVBench ,https://arxiv.org/pdf/2501.00584,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,{"Seed1.5VL": 60.0},Seed1.5-VL thinking 60.0,,,,,,,,,,,,,,,,,
NYU-Depth V2 (absolute relative error↓),https://link.springer.com/chapter/10.1007/978-3-642-33715-4_54,2012,Understanding,Spatial & Embodied Reasoning,indoor scene,depth estimation,1,,,"{""Seed1.5VL"": 13.6, ""Gemini2.5-Pro"": 27.5, ""OpenAI GPT4o"": 73.8, ""Qwen2.5VL-72B"": 35.5}","seed1.5vl thinking 13.6, Gemini2.5-pro thinking 27.5, OpenAI GPT4o 73.8, Qwen2.5vl-72B 35.5",,,,,,,,,,,,,,,,,
DA-2K,https://arxiv.org/pdf/2406.09414v1,2024,Understanding,Spatial & Embodied Reasoning,natural,depth estimation,1,,,"{""Seed1.5VL"": 91.7, ""Gemini2.5-Pro"": 73.0, ""OpenAI GPT4o"": 66.9, ""Qwen2.5VL-72B"": 69.6}","seed1.5vl thinking 91.7, Gemini2.5-pro thinking 73.0, OpenAI GPT4o 66.9, Qwen2.5vl-72B 69.6",,,,,,,,,,,,,,,,,
OpenEQA ,https://openaccess.thecvf.com/content/CVPR2024/papers/Majumdar_OpenEQA_Embodied_Question_Answering_in_the_Era_of_Foundation_Models_CVPR_2024_paper.pdf,2024,Reasoning,Spatial & Embodied Reasoning,indoor scene,embodied reasoning,,,1,{"Gemini-Ultra": 57.9},Gemini-ultra 57.9,,,,,,,,,,,,,,,,,
VSI-Bench,https://arxiv.org/abs/2412.14171,2024,Reasoning,Spatial & Embodied Reasoning,indoor scene,"object count, object size, relative distance, absolute distance, appearance order, room size, relative direction, route plan",,,1,"{""InternVL3-78B"": 48.4, ""Gemini1.5-Pro"": 45.4, ""OpenAI GPT4o"": 34.0}","InternVL3-78B 48.4, Gemini-1.5 Pro 45.4, OpenAI GPT 4o 34.0",,,,,,,,,,,,,,,,,
All-Angles Bench,https://arxiv.org/pdf/2504.15280,2025,Reasoning,Spatial & Embodied Reasoning,"indoor scene, residential area, industrial space",spatial reasoning,,1,,"{""Seed1.5VL"": 58.6, ""Gemini2.5-Pro"": 53.4, ""OpenAI GPT4o"": 49.1, ""Qwen2.5VL-72B"": 55.7}","seed1.5vl thinking 58.6, Gemini2.5-pro thinking 53.4, OpenAI GPT4o 49.1, Qwen2.5vl-72B 55.7",,,,,,,,,,,,,,,,,
ERQA,https://storage.googleapis.com/deepmind-media/gemini-robotics/gemini_robotics_report.pdf,2025,Reasoning,Spatial & Embodied Reasoning,"indoor scene, residential area",spatial reasoning,1,1,,"{""Gemini2.0-Pro Experimental"": 54.8, ""OpenAI GPT4o"": 47.0, ""Qwen2.5VL-72B"": 44.8, ""GLM-4.5V"": 50.0}","Gemini2.0-Pro Experimental 54.8, OpenAI GPT4o 47.0, Qwen2.5vl-72B 44.8, GLM-4.5V 50.0",,,,,,,,,,,,,,,,,
ScreenSpot-V2,https://arxiv.org/pdf/2410.23218,2024,Understanding,Agent,GUI,grounding,,1,,"{""Seed1.5VL"": 95.2, ""OpenAI CUA"": 87.9, ""Claude3.7-Sonnet"": 87.6, ""Kimi-VL-A3B"": 92.8}","Seed1.5-VL thinking 95.2, OpenAI CUA 87.9, Claude 3.7 Sonnet 87.6, Kimi VL-A3B 92.8",,,,,,,,,,,,,,,,,
ScreenSpot-Pro,https://arxiv.org/pdf/2504.07981v1,2025,Understanding,Agent,GUI,grounding,1,,,"{""Seed1.5VL"": 60.9, ""OpenAI CUA"": 23.4, ""Claude3.7-Sonnet"": 27.7, ""Kimi-VL-A3B"": 34.5, ""Qwen2.5VL-72B"": 43.6}","Seed1.5-VL thinking 60.9, OpenAI CUA 23.4, Claude 3.7 Sonnet 27.7, Kimi VL-A3B 34.5, Qwen2.5vl 72B 43.6",,,,,,,,,,,,,,,,,
OSWorld ,https://arxiv.org/pdf/2404.07972,2024,Reasoning,Agent,GUI,computer use,,1,,"{""Seed1.5VL"": 36.1, ""OpenAI CUA"": 38.1, ""Claude3.7-Sonnet"": 28.0, ""Kimi-VL-A3B"": 8.2, ""Qwen2.5VL-72B"": 8.8}","Seed1.5-VL thinking 36.1, OpenAI CUA 38.1, Claude 3.7 Sonnet 28.0, Kimi VL-A3B 8.2, Qwen2.5vl 72B 8.8",,,,,,,,,,,,,,,,,
Windows Agent Arena,https://arxiv.org/pdf/2409.08264,2024,Reasoning,Agent,GUI,computer use,,1,,"{""Seed1.5VL"": 39.6, ""Claude3.7-Sonnet"": 38.9, ""Kimi-VL-A3B"": 10.4}","Seed1.5-VL thinking 39.6, Claude 3.7 Sonnet 38.9, Kimi VL-A3B 10.4",,,,,,,,,,,,,,,,,
WebVoyager ,https://arxiv.org/pdf/2401.13919,2024,Reasoning,Agent,GUI,browser use,,1,,"{""Seed1.5VL"": 87.2, ""OpenAI CUA"": 87.0, ""Claude3.7-Sonnet"": 84.1}","Seed1.5-VL thinking 87.2, OpenAI CUA 87.0, Claude 3.7 Sonnet 84.1",,,,,,,,,,,,,,,,,
Online-Mind2Web,https://arxiv.org/pdf/2504.01382,2025,Reasoning,Agent,GUI,browser use,,1,,"{""Seed1.5VL"": 76.4, ""OpenAI CUA"": 71.0, ""Claude3.7-Sonnet"": 62.9}","Seed1.5-VL thinking 76.4, OpenAI CUA 71.0, Claude 3.7 Sonnet 62.9",,,,,,,,,,,,,,,,,
Android World,https://arxiv.org/pdf/2405.14573v2,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Seed1.5VL"": 62.1, ""Qwen2.5VL-72B"": 35.0}","Seed1.5-VL thinking 62.1, Qwen2.5vl 72B 35.0",,,,,,,,,,,,,,,,,
MobileMiniWob++,https://arxiv.org/pdf/2405.14573v2,2024,Reasoning,Agent,GUI,phone use,,1,,{"Qwen2.5VL-72B": 68.0},Qwen2.5vl 72B 68.0,,,,,,,,,,,,,,,,,
Android Control,https://arxiv.org/pdf/2406.03679,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Qwen2.5VL-72B"": {""high"": 67.4, ""low"": 93.7}}",Qwen2.5vl 72B 67.4/93.7 (high/low),,,,,,,,,,,,,,,,,
|