|
{ |
|
"results": { |
|
"average_score": 8.961702127659574, |
|
"speed": 4.236624114787222, |
|
"contamination_score": 0, |
|
"execution_time": 5196.826389, |
|
"errors": [ |
|
{ |
|
"error": "Error code: 400 - {'error': {'message': 'Invalid prompt: your prompt was flagged as potentially violating our usage policy. Please try again with a different prompt: https://platform.openai.com/docs/guides/reasoning#advice-on-prompting', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_prompt'}}", |
|
"prompt": "{'role': 'user', 'content': '\u0644\u062f\u064a\u0643 \u062b\u0644\u0627\u062b\u0629 \u0635\u0646\u0627\u062f\u064a\u0642 \u0627\u0644\u0623\u0648\u0644\u0649 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0643\u0631\u0627\u062a \u062d\u0645\u0631\u0627\u0621 \u0641\u0642\u0637 \u0648\u0627\u0644\u062b\u0627\u0646\u064a\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0643\u0631\u0627\u062a \u0632\u0631\u0642\u0627\u0621 \u0641\u0642\u0637 \u0648\u0627\u0644\u062b\u0627\u0644\u062b\u0629 \u062a\u062d\u062a\u0648\u064a \u0639\u0644\u0649 \u0645\u0632\u064a\u062c \u0645\u0646 \u0627\u0644\u0643\u0631\u0627\u062a \u0627\u0644\u062d\u0645\u0631\u0627\u0621 \u0648\u0627\u0644\u0632\u0631\u0642\u0627\u0621 \u062c\u0645\u064a\u0639 \u0627\u0644\u0635\u0646\u0627\u062f\u064a\u0642 \u062a\u0645 \u062a\u0633\u0645\u064a\u062a\u0647\u0627 \u0628\u0634\u0643\u0644 \u062e\u0627\u0637\u0626 \u0643\u064a\u0641 \u064a\u0645\u0643\u0646\u0643 \u062a\u0635\u062d\u064a\u062d \u0627\u0644\u062a\u0633\u0645\u064a\u0627\u062a \u0628\u0633\u062d\u0628 \u0643\u0631\u0629 \u0648\u0627\u062d\u062f\u0629 \u0641\u0642\u0637 \u0645\u0646 \u0625\u062d\u062f\u0649 \u0627\u0644\u0635\u0646\u0627\u062f\u064a\u0642'}" |
|
} |
|
], |
|
"scores_by_category": [ |
|
{ |
|
"category": "Paraphrasing", |
|
"average_score": 10.0, |
|
"count": 6 |
|
}, |
|
{ |
|
"category": "Coding", |
|
"average_score": 10.0, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Sentiment Analysis", |
|
"average_score": 10.0, |
|
"count": 9 |
|
}, |
|
{ |
|
"category": "Reading Comprehension", |
|
"average_score": 10.0, |
|
"count": 17 |
|
}, |
|
{ |
|
"category": "Long Context", |
|
"average_score": 10.0, |
|
"count": 4 |
|
}, |
|
{ |
|
"category": "MMLU", |
|
"average_score": 9.50413223140496, |
|
"count": 121 |
|
}, |
|
{ |
|
"category": "General Knowledge", |
|
"average_score": 9.444444444444445, |
|
"count": 63 |
|
}, |
|
{ |
|
"category": "Trust & Safety", |
|
"average_score": 9.333333333333334, |
|
"count": 30 |
|
}, |
|
{ |
|
"category": "RAG QA", |
|
"average_score": 9.21951219512195, |
|
"count": 41 |
|
}, |
|
{ |
|
"category": "Reasoning & Math", |
|
"average_score": 8.837209302325581, |
|
"count": 43 |
|
}, |
|
{ |
|
"category": "Diacritization", |
|
"average_score": 8.833333333333334, |
|
"count": 12 |
|
}, |
|
{ |
|
"category": "Entity Extraction", |
|
"average_score": 8.8, |
|
"count": 5 |
|
}, |
|
{ |
|
"category": "Instruction Following", |
|
"average_score": 8.714285714285714, |
|
"count": 7 |
|
}, |
|
{ |
|
"category": "Transliteration", |
|
"average_score": 8.5, |
|
"count": 6 |
|
}, |
|
{ |
|
"category": "Function Calling", |
|
"average_score": 8.0, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Dialect Detection", |
|
"average_score": 8.0, |
|
"count": 11 |
|
}, |
|
{ |
|
"category": "Translation (incl Dialects)", |
|
"average_score": 7.722222222222222, |
|
"count": 36 |
|
}, |
|
{ |
|
"category": "Structuring", |
|
"average_score": 7.666666666666667, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Writing (incl Dialects)", |
|
"average_score": 7.545454545454546, |
|
"count": 22 |
|
}, |
|
{ |
|
"category": "Summarization", |
|
"average_score": 7.375, |
|
"count": 8 |
|
}, |
|
{ |
|
"category": "Arabic Language & Grammar", |
|
"average_score": 7.0, |
|
"count": 17 |
|
}, |
|
{ |
|
"category": "Hallucination", |
|
"average_score": 6.666666666666667, |
|
"count": 3 |
|
} |
|
], |
|
"scores_by_format": [ |
|
{ |
|
"format": "Short Answer", |
|
"average_score": 10.0, |
|
"count": 5 |
|
}, |
|
{ |
|
"format": "MCQ", |
|
"average_score": 9.397379912663755, |
|
"count": 229 |
|
}, |
|
{ |
|
"format": "Fill-in-the-blank", |
|
"average_score": 8.75, |
|
"count": 8 |
|
}, |
|
{ |
|
"format": "Generation", |
|
"average_score": 8.508771929824562, |
|
"count": 228 |
|
} |
|
] |
|
}, |
|
"config": { |
|
"model": "openai/o3", |
|
"model_sha": "na", |
|
"submitted_time": "2025-05-23 07:22:31", |
|
"likes": -1, |
|
"params": 999, |
|
"license": "closed", |
|
"model_source": "API", |
|
"model_category": "Large" |
|
} |
|
} |