Arabic-LLM-Broad-Leaderboard / results /openai /o3_results_2025-05-23_08-49-13.json
karimouda's picture
Add results file for o3
26bd746 verified
{
"results": {
"average_score": 8.961702127659574,
"speed": 4.236624114787222,
"contamination_score": 0,
"execution_time": 5196.826389,
"errors": [
{
"error": "Error code: 400 - {'error': {'message': 'Invalid prompt: your prompt was flagged as potentially violating our usage policy. Please try again with a different prompt: https://platform.openai.com/docs/guides/reasoning#advice-on-prompting', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_prompt'}}",
"prompt": "{'role': 'user', 'content': '\u0644\u062f\u064a\u0643 \u062b\u0644\u0627\u062b\u0629 \u0635\u0646\u0627\u062f\u064a\u0642 \u0627\u0644\u0623\u0648\u0644\u0649 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0643\u0631\u0627\u062a \u062d\u0645\u0631\u0627\u0621 \u0641\u0642\u0637 \u0648\u0627\u0644\u062b\u0627\u0646\u064a\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0643\u0631\u0627\u062a \u0632\u0631\u0642\u0627\u0621 \u0641\u0642\u0637 \u0648\u0627\u0644\u062b\u0627\u0644\u062b\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0645\u0632\u064a\u062c \u0645\u0646 \u0627\u0644\u0643\u0631\u0627\u062a \u0627\u0644\u062d\u0645\u0631\u0627\u0621 \u0648\u0627\u0644\u0632\u0631\u0642\u0627\u0621 \u062c\u0645\u064a\u0639 \u0627\u0644\u0635\u0646\u0627\u062f\u064a\u0642 \u062a\u0645 \u062a\u0633\u0645\u064a\u062a\u0647\u0627 \u0628\u0634\u0643\u0644 \u062e\u0627\u0637\u0626 \u0643\u064a\u0641 \u064a\u0645\u0643\u0646\u0643 \u062a\u0635\u062d\u064a\u062d \u0627\u0644\u062a\u0633\u0645\u064a\u0627\u062a \u0628\u0633\u062d\u0628 \u0643\u0631\u0629 \u0648\u0627\u062d\u062f\u0629 \u0641\u0642\u0637 \u0645\u0646 \u0625\u062d\u062f\u0649 \u0627\u0644\u0635\u0646\u0627\u062f\u064a\u0642'}"
}
],
"scores_by_category": [
{
"category": "Paraphrasing",
"average_score": 10.0,
"count": 6
},
{
"category": "Coding",
"average_score": 10.0,
"count": 3
},
{
"category": "Sentiment Analysis",
"average_score": 10.0,
"count": 9
},
{
"category": "Reading Comprehension",
"average_score": 10.0,
"count": 17
},
{
"category": "Long Context",
"average_score": 10.0,
"count": 4
},
{
"category": "MMLU",
"average_score": 9.50413223140496,
"count": 121
},
{
"category": "General Knowledge",
"average_score": 9.444444444444445,
"count": 63
},
{
"category": "Trust & Safety",
"average_score": 9.333333333333334,
"count": 30
},
{
"category": "RAG QA",
"average_score": 9.21951219512195,
"count": 41
},
{
"category": "Reasoning & Math",
"average_score": 8.837209302325581,
"count": 43
},
{
"category": "Diacritization",
"average_score": 8.833333333333334,
"count": 12
},
{
"category": "Entity Extraction",
"average_score": 8.8,
"count": 5
},
{
"category": "Instruction Following",
"average_score": 8.714285714285714,
"count": 7
},
{
"category": "Transliteration",
"average_score": 8.5,
"count": 6
},
{
"category": "Function Calling",
"average_score": 8.0,
"count": 3
},
{
"category": "Dialect Detection",
"average_score": 8.0,
"count": 11
},
{
"category": "Translation (incl Dialects)",
"average_score": 7.722222222222222,
"count": 36
},
{
"category": "Structuring",
"average_score": 7.666666666666667,
"count": 3
},
{
"category": "Writing (incl Dialects)",
"average_score": 7.545454545454546,
"count": 22
},
{
"category": "Summarization",
"average_score": 7.375,
"count": 8
},
{
"category": "Arabic Language & Grammar",
"average_score": 7.0,
"count": 17
},
{
"category": "Hallucination",
"average_score": 6.666666666666667,
"count": 3
}
],
"scores_by_format": [
{
"format": "Short Answer",
"average_score": 10.0,
"count": 5
},
{
"format": "MCQ",
"average_score": 9.397379912663755,
"count": 229
},
{
"format": "Fill-in-the-blank",
"average_score": 8.75,
"count": 8
},
{
"format": "Generation",
"average_score": 8.508771929824562,
"count": 228
}
]
},
"config": {
"model": "openai/o3",
"model_sha": "na",
"submitted_time": "2025-05-23 07:22:31",
"likes": -1,
"params": 999,
"license": "closed",
"model_source": "API",
"model_category": "Large"
}
}