mizan-llm-leaderboard / leaderboard /boards_data /extractive-qa_PQuAD.jsonl
mehran
update thinking col
81c6d0b
{"Model Name":"claude-3-7-sonnet-20250219","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":21.8957345972,"extractive-qa_PQuAD_f1":0.5899280585,"nlu_score":0.7143086066}
{"Model Name":"gemma-3-4b-it","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"4300000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":29.8578199052,"extractive-qa_PQuAD_f1":0.6483891649,"nlu_score":0.6241793507}
{"Model Name":"c4ai-command-r-plus","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"104000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":51.6587677725,"extractive-qa_PQuAD_f1":0.7997294818,"nlu_score":0.6297634971}
{"Model Name":"gemma-3n-E4B-it","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"7850000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":null,"extractive-qa_PQuAD_f1":null,"nlu_score":0.6552152029}
{"Model Name":"gpt-4.1","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":16.2085308057,"extractive-qa_PQuAD_f1":0.5540542726,"nlu_score":0.6758278127}
{"Model Name":"o4-mini","thinking_method":"✔️","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":null,"extractive-qa_PQuAD_f1":null,"nlu_score":null}
{"Model Name":"gemma-3-12b-it","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"12200000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":37.4407582938,"extractive-qa_PQuAD_f1":0.7121215175,"nlu_score":0.699116864}
{"Model Name":"gemma-3-27b-it","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"27400000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":24.9289099526,"extractive-qa_PQuAD_f1":0.5952537387,"nlu_score":0.6898261633}
{"Model Name":"Qwen3-14B","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"14800000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":11.9431279621,"extractive-qa_PQuAD_f1":0.5054306037,"nlu_score":0.6460328733}
{"Model Name":"Qwen3-32B","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"32800000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":12.0379146919,"extractive-qa_PQuAD_f1":0.5152644082,"nlu_score":0.6714091535}
{"Model Name":"claude-3-5-haiku-20241022","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":1.5165876777,"extractive-qa_PQuAD_f1":0.3221621809,"nlu_score":0.3749414991}
{"Model Name":"Mistral-Small-3.1-24B-Instruct-2503","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"24000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":13.0805687204,"extractive-qa_PQuAD_f1":0.5111951184,"nlu_score":0.5661558794}
{"Model Name":"DeepSeek-R1-0528-Qwen3-8B","thinking_method":"✔️","model_url":"https:\/\/google.com","parameters_count":"8190000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":null,"extractive-qa_PQuAD_f1":null,"nlu_score":0.456845738}
{"Model Name":"deepseek-chat","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"671000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":7.0142180095,"extractive-qa_PQuAD_f1":0.4986764425,"nlu_score":0.6752949557}
{"Model Name":"Qwen3-4B","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"4020000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":20.4739336493,"extractive-qa_PQuAD_f1":0.5660677645,"nlu_score":0.5121418762}
{"Model Name":"gemma-3-1b-it","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"1000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":15.6398104265,"extractive-qa_PQuAD_f1":0.4797901431,"nlu_score":0.3619547874}
{"Model Name":"aya-expanse-32b","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"32300000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":0.8530805687,"extractive-qa_PQuAD_f1":0.3570972648,"nlu_score":0.3928685253}
{"Model Name":"Llama-3.3-70B-Instruct","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"70600000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":38.2938388626,"extractive-qa_PQuAD_f1":0.7091014157,"nlu_score":0.6800109206}
{"Model Name":"gpt-4.1-mini","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":29.0995260664,"extractive-qa_PQuAD_f1":0.6500014945,"nlu_score":0.6833497104}
{"Model Name":"o3","thinking_method":"✔️","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":45.5924170616,"extractive-qa_PQuAD_f1":0.7918102773,"nlu_score":0.7207167537}
{"Model Name":"gpt-4o-mini","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":7.2037914692,"extractive-qa_PQuAD_f1":0.4722142546,"nlu_score":0.6459120734}
{"Model Name":"c4ai-command-a-03-2025","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"111000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":5.8767772512,"extractive-qa_PQuAD_f1":0.4459269248,"nlu_score":0.4824528512}
{"Model Name":"gemini-2.0-flash","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":37.4407582938,"extractive-qa_PQuAD_f1":0.6861140935,"nlu_score":0.7050532433}
{"Model Name":"gemini-2.5-flash","thinking_method":"✔️","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":45.1184834123,"extractive-qa_PQuAD_f1":0.7795163265,"nlu_score":0.6944128198}
{"Model Name":"gemini-2.0-flash-lite","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":33.2701421801,"extractive-qa_PQuAD_f1":0.6885320288,"nlu_score":0.6914202844}
{"Model Name":"c4ai-command-r-v01","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"35000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":41.990521327,"extractive-qa_PQuAD_f1":0.7401025641,"nlu_score":0.531045981}
{"Model Name":"gpt-4.1-nano","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":21.5165876777,"extractive-qa_PQuAD_f1":0.6052090568,"nlu_score":0.6262096694}
{"Model Name":"Qwen3-8B","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"8190000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":1.8957345972,"extractive-qa_PQuAD_f1":0.4954484984,"nlu_score":0.5968415875}
{"Model Name":"Mistral-7B-Instruct-v0.3","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"7250000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":0.4739336493,"extractive-qa_PQuAD_f1":0.3440209421,"nlu_score":0.3916645306}
{"Model Name":"gpt-4o","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":17.5355450237,"extractive-qa_PQuAD_f1":0.5641459437,"nlu_score":0.7146808531}
{"Model Name":"deepseek-reasoner","thinking_method":"✔️","model_url":"https:\/\/google.com","parameters_count":"671000000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":36.7772511848,"extractive-qa_PQuAD_f1":0.7059801524,"nlu_score":0.6361186163}
{"Model Name":"Qwen3-30B-A3B","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"30500000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":1.4218009479,"extractive-qa_PQuAD_f1":0.6109462131,"nlu_score":0.6255818412}
{"Model Name":"Llama-3.2-3B-Instruct","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"3210000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":2.3696682464,"extractive-qa_PQuAD_f1":0.4003473594,"nlu_score":0.1368924446}
{"Model Name":"Llama-3.2-1B-Instruct","thinking_method":"❌","model_url":"https:\/\/google.com","parameters_count":"1240000000","source_type":"Open-Source","extractive-qa_PQuAD_exact_match":0.663507109,"extractive-qa_PQuAD_f1":0.3378125221,"nlu_score":0.046805056}
{"Model Name":"gemini-2.5-pro","thinking_method":"✔️","model_url":"https:\/\/google.com","parameters_count":"None","source_type":"Closed-Source","extractive-qa_PQuAD_exact_match":49.5734597156,"extractive-qa_PQuAD_f1":0.7803597788,"nlu_score":0.6992555201}