|
{ |
|
"results": { |
|
"average_score": 8.77659574468085, |
|
"speed": 12.59830004953974, |
|
"contamination_score": 0, |
|
"execution_time": 1313.828051, |
|
"errors": [], |
|
"scores_by_category": [ |
|
{ |
|
"category": "Paraphrasing", |
|
"average_score": 10.0, |
|
"count": 6 |
|
}, |
|
{ |
|
"category": "Coding", |
|
"average_score": 10.0, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Sentiment Analysis", |
|
"average_score": 10.0, |
|
"count": 9 |
|
}, |
|
{ |
|
"category": "Hallucination", |
|
"average_score": 10.0, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Reading Comprehension", |
|
"average_score": 10.0, |
|
"count": 17 |
|
}, |
|
{ |
|
"category": "Entity Extraction", |
|
"average_score": 9.6, |
|
"count": 5 |
|
}, |
|
{ |
|
"category": "MMLU", |
|
"average_score": 9.338842975206612, |
|
"count": 121 |
|
}, |
|
{ |
|
"category": "General Knowledge", |
|
"average_score": 9.19047619047619, |
|
"count": 63 |
|
}, |
|
{ |
|
"category": "Trust & Safety", |
|
"average_score": 8.766666666666667, |
|
"count": 30 |
|
}, |
|
{ |
|
"category": "Diacritization", |
|
"average_score": 8.75, |
|
"count": 12 |
|
}, |
|
{ |
|
"category": "Long Context", |
|
"average_score": 8.75, |
|
"count": 4 |
|
}, |
|
{ |
|
"category": "Structuring", |
|
"average_score": 8.666666666666666, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Function Calling", |
|
"average_score": 8.666666666666666, |
|
"count": 3 |
|
}, |
|
{ |
|
"category": "Reasoning & Math", |
|
"average_score": 8.581395348837209, |
|
"count": 43 |
|
}, |
|
{ |
|
"category": "Transliteration", |
|
"average_score": 8.166666666666666, |
|
"count": 6 |
|
}, |
|
{ |
|
"category": "Instruction Following", |
|
"average_score": 8.142857142857142, |
|
"count": 7 |
|
}, |
|
{ |
|
"category": "Summarization", |
|
"average_score": 8.125, |
|
"count": 8 |
|
}, |
|
{ |
|
"category": "Writing (incl Dialects)", |
|
"average_score": 8.0, |
|
"count": 22 |
|
}, |
|
{ |
|
"category": "RAG QA", |
|
"average_score": 7.975609756097561, |
|
"count": 41 |
|
}, |
|
{ |
|
"category": "Arabic Language & Grammar", |
|
"average_score": 7.764705882352941, |
|
"count": 17 |
|
}, |
|
{ |
|
"category": "Dialect Detection", |
|
"average_score": 7.636363636363637, |
|
"count": 11 |
|
}, |
|
{ |
|
"category": "Translation (incl Dialects)", |
|
"average_score": 7.611111111111111, |
|
"count": 36 |
|
} |
|
], |
|
"scores_by_format": [ |
|
{ |
|
"format": "Short Answer", |
|
"average_score": 10.0, |
|
"count": 5 |
|
}, |
|
{ |
|
"format": "MCQ", |
|
"average_score": 9.222707423580786, |
|
"count": 229 |
|
}, |
|
{ |
|
"format": "Generation", |
|
"average_score": 8.346491228070175, |
|
"count": 228 |
|
}, |
|
{ |
|
"format": "Fill-in-the-blank", |
|
"average_score": 7.5, |
|
"count": 8 |
|
} |
|
] |
|
}, |
|
"config": { |
|
"model": "openai/gpt-4.1-mini", |
|
"model_sha": "na", |
|
"submitted_time": "2025-05-11 08:26:34", |
|
"likes": -1, |
|
"params": 999, |
|
"license": "closed", |
|
"model_source": "API", |
|
"model_category": "Large" |
|
} |
|
} |