from dataclasses import dataclass
from enum import Enum

@dataclass
class EvalDimension:
    """A single evaluation dimension: internal metric key plus display column name."""
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class EvalDimensions(Enum):
    d0 = EvalDimension("speed", "Speed (words/sec)")
    d1 = EvalDimension("contamination_score", "Contamination Score")
    d2 = EvalDimension("paraphrasing", "Paraphrasing")
    d3 = EvalDimension("sentiment analysis", "Sentiment Analysis")
    d4 = EvalDimension("coding", "Coding")
    d5 = EvalDimension("function calling", "Function Calling")
    d6 = EvalDimension("rag qa", "RAG QA")
    d7 = EvalDimension("reading comprehension", "Reading Comprehension")
    d8 = EvalDimension("entity extraction", "Entity Extraction")
    d9 = EvalDimension("summarization", "Summarization")
    d10 = EvalDimension("long context", "Long Context")
    d11 = EvalDimension("mmlu", "MMLU")
    d12 = EvalDimension("arabic language & grammar", "Arabic Language & Grammar")
    d13 = EvalDimension("general knowledge", "General Knowledge")
    d14 = EvalDimension("translation (incl dialects)", "Translation (incl Dialects)")
    d15 = EvalDimension("trust & safety", "Trust & Safety")
    d16 = EvalDimension("writing (incl dialects)", "Writing (incl Dialects)")
    d17 = EvalDimension("dialect detection", "Dialect Detection")
    d18 = EvalDimension("reasoning & math", "Reasoning & Math")
    d19 = EvalDimension("diacritization", "Diacritization")
    d20 = EvalDimension("instruction following", "Instruction Following")
    d21 = EvalDimension("transliteration", "Transliteration")
    d22 = EvalDimension("structuring", "Structuring")
    d23 = EvalDimension("hallucination", "Hallucination")

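
# Illustrative usage sketch (an editor's example, not part of the leaderboard
# pipeline): the enum can be iterated to build the list of display columns, or
# to resolve a column name from its metric key. Both helpers below are
# hypothetical and only demonstrate intended usage of EvalDimensions.
def col_name_for(metric: str) -> str:
    """Return the display column name for a metric key (hypothetical helper)."""
    for dim in EvalDimensions:
        if dim.value.metric == metric:
            return dim.value.col_name
    raise KeyError(f"unknown metric: {metric}")


ALL_COL_NAMES = [dim.value.col_name for dim in EvalDimensions]
# e.g. col_name_for("speed") == "Speed (words/sec)"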


NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------



# Your leaderboard name
TITLE = """<div><img class='abl_header_image' src='https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/resolve/main/src/images/abl_logo.png'></div>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
<h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>. 
With advanced features and innovative visualizations, we provide the community with a comprehensive view of the capabilities of Arabic models, showcasing their speed and diverse skills while also defending against benchmark contamination. 
The benchmark consists of <b>450 high-quality, human-validated questions</b> sampled from <b>63 Arabic benchmarking datasets</b>, evaluating <b>22 categories and skills</b>.
Find more details in the About tab. 


"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """

## FAQ

### What is the difference between ABL and ABB?

ABL is the Leaderboard, which uses the ABB benchmarking dataset and code in the backend to produce the results you see here.


### Where can I learn more about ABL and ABB?

Feel free to read the following resources:
- ABB page:
- ABL blog post:

### How can I reproduce the results?

You can easily run the ABB benchmarking code using the following command on Google Colab or your own infrastructure.
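
While the exact benchmarking command is not reproduced here, a minimal sketch of pulling the benchmark questions from the Hub could look like the following (the split name is an assumption):

```python
from datasets import load_dataset

# Load the ABB benchmark questions from the Hugging Face Hub.
# The dataset ID comes from the ABB page; the split name "test" is an assumption.
abb = load_dataset("silma-ai/arabic-broad-benchmark", split="test")
print(len(abb))  # expected: 450 human-validated questions
```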

### What is the Benchmark Score?

### What is the Contamination Score?

### What is the Speed?

### Why am I not allowed to submit models larger than 15B parameters?


"""

EVALUATION_QUEUE_TEXT = """

"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard"
CITATION_BUTTON_TEXT = r"""

@misc{ABL,
  author = {SILMA AI Team},
  title = {Arabic Broad Leaderboard},
  year = {2025},
  publisher = {SILMA.AI},
  howpublished = "{\url{https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard}}"
}

"""

FOOTER_TEXT = """<div style='display:flex;justify-content:center;align-items:center;'>
<span style='font-size:36px;font-weight:bold;margin-right:20px;'>Sponsored By</span>
<a href='https://silma.ai/?ref=abl' target='_blank'>
<img style='height:60px' src='https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/resolve/main/src/images/silma-logo-wide.png' >
</a>
</div>"""