kiliangoto commited on
Commit
ef54478
·
0 Parent(s):

Initial commit of Sahabat-AI Leaderboard

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sahabat AI Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: true
9
+ license: apache-2.0
10
+ ---
11
+
12
+ # Start the configuration
13
+
14
+ Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
15
+
16
+ Results files should have the following format and be stored as json files:
17
+ ```json
18
+ {
19
+ "config": {
20
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
21
+ "model_name": "path of the model on the hub: org/model",
22
+ "model_sha": "revision on the hub",
23
+ },
24
+ "results": {
25
+ "task_name": {
26
+ "metric_name": score,
27
+ },
28
+ "task_name2": {
29
+ "metric_name": score,
30
+ }
31
+ }
32
+ }
33
+ ```
34
+
35
+ Request files are created automatically by this tool.
36
+
37
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
38
+
39
+ # Code logic for more complex edits
40
+
41
+ You'll find
42
+ - the main table's column names and properties in `src/display/utils.py`
43
+ - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
44
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns

# Leaderboard table loader
from src.populate import load_tables
# Configuration values and informational markdown texts
from src.config import (
    file_path,
    model_types,
    hidden_tabs,
    INTRODUCTION_TEXT,
    TITLE,
    INFO_BENCHMARK_TASK,
    INFO_SCORE_CALCULATION,
    INFO_GOTO_SAHABAT_AI,
    CITATIONS
)

# Build the whole UI inside a single Blocks container.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)  # Main application title
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")  # Introductory text

    # One outer tab per model type, one inner tab per leaderboard table.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        tables = load_tables(file_path)  # Leaderboard data read from file
        for model_type in model_types:
            with gr.TabItem(model_type, elem_id="llm-benchmark-tab-table", id=model_type):
                for idx, table_info in enumerate(tables):
                    # Skip (model type, table) combinations configured as hidden.
                    if (model_type, table_info["name"]) in hidden_tabs:
                        continue
                    with gr.TabItem(table_info["name"], elem_id="llm-benchmark-tab-table", id=idx):
                        # Keep only rows for this model type, then drop columns
                        # that are empty for every remaining row.
                        df = table_info["table"][table_info["table"]["Type"] == model_type]
                        df = df.dropna(axis=1, how='all')
                        leaderboard = Leaderboard(
                            value=df,  # Leaderboard data
                            search_columns=["Model"],  # Searchable columns
                            filter_columns=[
                                ColumnFilter(df["Size"].name, type="checkboxgroup", label="Model sizes"),
                            ],  # Checkbox filter on model size
                            hide_columns=table_info["hidden_col"],  # Per-table hidden columns from config.py
                            interactive=False,
                        )

    # Collapsible informational sections below the tables.
    with gr.Row():
        with gr.Accordion("📚 Benchmark Tasks", open=False):
            gr.Markdown(INFO_BENCHMARK_TASK, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("🧮 Score Calculation", open=False):
            gr.Markdown(INFO_SCORE_CALCULATION, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("🤝 About Sahabat-AI", open=False):
            gr.Markdown(INFO_GOTO_SAHABAT_AI, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📝 Citations", open=False):
            gr.Markdown(CITATIONS, elem_classes="markdown-text")

# Start the Gradio server.
demo.launch()
config/model_performance.jsonl ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model_name": "meta-llama/Llama-3.1-8B", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 70.62232177639267}, "qa": {"total": 74.0400264179707}, "metaphor": {"total": 59.26640926640927}, "total": 67.9762524869242}, "safety": {"toxicity": {"total": 49.02062654372765}, "total": 49.02062654372765}, "nlg": {"abssum": {"total": 23.608069870057587}, "translation-en-xx": {"total": 89.18834918478261}, "translation-xx-en": {"total": 91.81120306324111}, "total": 57.053922997034725}, "nlr": {"causal": {"total": 72.0}, "nli": {"total": 60.66223252911016}, "total": 66.33111626455508}, "linguistic-diagnostics": {"mp-r": {"total": 38.467261904761926}, "pragmatics": {"total": 0}, "total": 19.233630952380963}, "instruction-following": {"total": 17.142857142857142}, "total": 46.126401064579966}, "jv": {"nlu": {"sentiment": {"total": 60.85693940041301}, "qa-mc": {"total": 60.47619047619048}, "metaphor": {"total": 23.65191146881287}, "total": 48.32834711513879}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 62.24888959390863}, "translation-xx-id": {"total": 76.60850253807106}, "total": 69.42869606598984}, "total": 43.696792171487324}, "su": {"nlu": {"sentiment": {"total": 69.74912767927081}, "qa-mc": {"total": 45.714285714285715}, "metaphor": {"total": 10.994668137525277}, "total": 42.15269384369393}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 57.64538388324873}, "translation-xx-id": {"total": 71.5006543464467}, "total": 64.57301911484771}, "total": 41.606983684593246}, "ban": {"nlu": {"sentiment": {"total": 52.53414512568539}, "total": 52.53414512568539}, "instruction-following": {"total": 26.666666666666668}, "nlg": {"translation-id-xx": {"total": 52.05575824873097}, "translation-xx-id": {"total": 63.63691703680203}, "total": 57.8463376427665}, "total": 45.68238314503952}, "bbc": {"nlu": {"sentiment": {"total": 19.372142704550303}, "qa": {"total": 
36.86877604474056}, "total": 28.120459374645428}, "instruction-following": {"total": 21.904761904761905}, "nlg": {"translation-id-xx": {"total": 22.465418781725887}, "translation-xx-id": {"total": 53.68107550761421}, "total": 38.07324714467005}, "total": 29.366156141359127}}
2
+ {"model_name": "meta-llama/Meta-Llama-3-8B", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 71.67852930355365}, "qa": {"total": 74.29545907237966}, "metaphor": {"total": 57.901268615554315}, "total": 67.95841899716254}, "safety": {"toxicity": {"total": 51.13260813250443}, "total": 51.13260813250443}, "nlg": {"abssum": {"total": 22.55571208466509}, "translation-en-xx": {"total": 89.17333791378458}, "translation-xx-en": {"total": 91.6809458374506}, "total": 56.49142698014134}, "nlr": {"causal": {"total": 68.8}, "nli": {"total": 51.71622423858406}, "total": 60.25811211929203}, "linguistic-diagnostics": {"mp-r": {"total": 43.08035714285714}, "pragmatics": {"total": 0}, "total": 21.54017857142857}, "instruction-following": {"total": 13.333333333333334}, "total": 45.11901302231038}, "jv": {"nlu": {"sentiment": {"total": 66.6700135298725}, "qa-mc": {"total": 58.57142857142858}, "metaphor": {"total": 24.28571428571429}, "total": 49.842385462338456}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 62.900222081218274}, "translation-xx-id": {"total": 77.09444400380711}, "total": 69.9973330425127}, "total": 43.75609664447419}, "su": {"nlu": {"sentiment": {"total": 67.27850174464145}, "qa-mc": {"total": 43.80952380952382}, "metaphor": {"total": 4.908990623276344}, "total": 38.6656720591472}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 55.41981281725889}, "translation-xx-id": {"total": 72.32473429568527}, "total": 63.87227355647208}, "total": 38.306299332190555}, "ban": {"nlu": {"sentiment": {"total": 52.15723136082033}, "total": 52.15723136082033}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 50.92592005076142}, "translation-xx-id": {"total": 64.71889871510152}, "total": 57.82240938293147}, "total": 42.056705644742664}, "bbc": {"nlu": {"sentiment": {"total": 13.632699565619873}, "qa": {"total": 
40.6976614170028}, "total": 27.165180491311336}, "instruction-following": {"total": 25.71428571428571}, "nlg": {"translation-id-xx": {"total": 20.986992385786802}, "translation-xx-id": {"total": 53.95026967005076}, "total": 37.46863102791878}, "total": 30.116032411171943}}
3
+ {"model_name": "meta-llama/Llama-3.3-70B-Instruct", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 74.03171395489761}, "qa": {"total": 76.30208921433771}, "metaphor": {"total": 75.53318624747196}, "total": 75.28899647223575}, "safety": {"toxicity": {"total": 51.150070847815535}, "total": 51.150070847815535}, "nlg": {"abssum": {"total": 17.398772409440475}, "translation-en-xx": {"total": 91.30748594985178}, "translation-xx-en": {"total": 92.44811287981719}, "total": 54.638285912137476}, "nlr": {"causal": {"total": 93.19999999999999}, "nli": {"total": 78.47882864922653}, "total": 85.83941432461326}, "linguistic-diagnostics": {"mp-r": {"total": 42.782738095238074}, "pragmatics": {"total": 74.8076923076923}, "total": 58.79521520146519}, "instruction-following": {"total": 93.33333333333333}, "multi-turn": {"total": 66.71768707482994}, "total": 69.3947147380615}, "jv": {"nlu": {"sentiment": {"total": 64.5398419141209}, "qa-mc": {"total": 85.71428571428572}, "metaphor": {"total": 47.48490945674044}, "total": 65.91301236171569}, "instruction-following": {"total": 30.476190476190478}, "nlg": {"translation-id-xx": {"total": 76.54370241116752}, "translation-xx-id": {"total": 87.42237071700508}, "total": 81.98303656408629}, "multi-turn": {"total": 69.60884353741497}, "total": 61.99527073485186}, "su": {"nlu": {"sentiment": {"total": 60.20650858078758}, "qa-mc": {"total": 76.19047619047619}, "metaphor": {"total": 34.8271741128884}, "total": 57.07471962805072}, "instruction-following": {"total": 50.476190476190474}, "nlg": {"translation-id-xx": {"total": 72.01518876903553}, "translation-xx-id": {"total": 81.19715656725889}, "total": 76.60617266814721}, "multi-turn": {"total": 74.48979591836735}, "total": 64.66171967268895}, "ban": {"nlu": {"sentiment": {"total": 55.27572456027914}, "total": 55.27572456027914}, "instruction-following": {"total": 30.476190476190478}, "nlg": {"translation-id-xx": {"total": 52.35049175126903}, 
"translation-xx-id": {"total": 73.82601324555837}, "total": 63.0882524984137}, "multi-turn": {"total": 68.41836734693878}, "total": 54.31463372045552}, "bbc": {"nlu": {"sentiment": {"total": 23.92152673930071}, "qa": {"total": 37.0966478780963}, "total": 30.509087308698504}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 40.36294416243655}, "translation-xx-id": {"total": 58.640625}, "total": 49.501784581218274}, "multi-turn": {"total": 64.21768707482994}, "total": 38.914282598329535}, "indommlu": {"total": 58.94715166043224, "STEM": {"total": 60.65596005103318}, "Humanities": {"total": 70.13279289089097}, "Social science": {"total": 68.19753140670149}, "Indonesian language": {"total": 62.79989190728599}, "Local languages and cultures": {"total": 37.77960008071277}}}
4
+ {"model_name": "meta-llama/Llama-3.1-8B-Instruct", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 72.43735932562872}, "qa": {"total": 72.98486282425593}, "metaphor": {"total": 58.645890788747934}, "total": 68.02270431287752}, "safety": {"toxicity": {"total": 38.908373094726265}, "total": 38.908373094726265}, "nlg": {"abssum": {"total": 18.678845956389008}, "translation-en-xx": {"total": 89.42876497653162}, "translation-xx-en": {"total": 91.56136132040514}, "total": 54.58695455242869}, "nlr": {"causal": {"total": 63.20000000000001}, "nli": {"total": 58.413678956243096}, "total": 60.80683947812155}, "linguistic-diagnostics": {"mp-r": {"total": 22.54464285714286}, "pragmatics": {"total": 14.32692307692307}, "total": 18.435782967032964}, "instruction-following": {"total": 76.19047619047619}, "multi-turn": {"total": 52.61904761904762}, "total": 52.79573974495869}, "jv": {"nlu": {"sentiment": {"total": 53.942391226945794}, "qa-mc": {"total": 58.57142857142858}, "metaphor": {"total": 17.444668008048293}, "total": 43.319495935474215}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 59.02189086294416}, "translation-xx-id": {"total": 75.13764078362944}, "total": 67.0797658232868}, "multi-turn": {"total": 53.40136054421769}, "total": 43.33110795669706}, "su": {"nlu": {"sentiment": {"total": 43.91839350566117}, "qa-mc": {"total": 43.33333333333333}, "metaphor": {"total": 5.244530244530243}, "total": 30.832085694508248}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 46.92480964467005}, "translation-xx-id": {"total": 61.685398159898476}, "total": 54.30510390228426}, "multi-turn": {"total": 58.11224489795919}, "total": 37.717120528449826}, "ban": {"nlu": {"sentiment": {"total": 33.98760948515275}, "total": 33.98760948515275}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 35.4739451142132}, 
"translation-xx-id": {"total": 59.36871430837564}, "total": 47.42132971129442}, "multi-turn": {"total": 52.908163265306115}, "total": 35.9602279963907}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 25.257711433282036}, "total": 12.628855716641018}, "instruction-following": {"total": 0.9523809523809524}, "nlg": {"translation-id-xx": {"total": 30.788229695431472}, "translation-xx-id": {"total": 38.916164340101524}, "total": 34.8521970177665}, "multi-turn": {"total": 43.265306122448976}, "total": 22.924684952309363}, "indommlu": {"total": 40.89436643992288, "STEM": {"total": 39.189826629857905}, "Humanities": {"total": 48.02667298928306}, "Social science": {"total": 51.41311806730703}, "Indonesian language": {"total": 47.57202823512603}, "Local languages and cultures": {"total": 21.777177572443968}}}
5
+ {"model_name": "meta-llama/Llama-3.1-70B-Instruct", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 74.03171395489761}, "qa": {"total": 75.11092022580092}, "metaphor": {"total": 70.78047435190294}, "total": 73.30770284420049}, "safety": {"toxicity": {"total": 52.48912859389008}, "total": 52.48912859389008}, "nlg": {"abssum": {"total": 22.789696909710898}, "translation-en-xx": {"total": 91.94257001914525}, "translation-xx-en": {"total": 92.4886296087574}, "total": 57.50264836183111}, "nlr": {"causal": {"total": 92.39999999999999}, "nli": {"total": 74.06280477957523}, "total": 83.2314023897876}, "linguistic-diagnostics": {"mp-r": {"total": 54.077380952380935}, "pragmatics": {"total": 57.01923076923077}, "total": 55.548305860805854}, "instruction-following": {"total": 83.80952380952381}, "multi-turn": {"total": 64.28571428571429}, "total": 67.16777516367902}, "jv": {"nlu": {"sentiment": {"total": 65.16378266752119}, "qa-mc": {"total": 84.14785756249171}, "metaphor": {"total": 47.40442655935615}, "total": 65.57202226312302}, "instruction-following": {"total": 24.761904761904763}, "nlg": {"translation-id-xx": {"total": 79.32576538705584}, "translation-xx-id": {"total": 85.88955425126903}, "total": 82.60765981916244}, "multi-turn": {"total": 69.84693877551021}, "total": 60.69713140492511}, "su": {"nlu": {"sentiment": {"total": 63.15274513992737}, "qa-mc": {"total": 72.35942918869749}, "metaphor": {"total": 25.284978856407434}, "total": 53.599051061677436}, "instruction-following": {"total": 40.95238095238095}, "nlg": {"translation-id-xx": {"total": 74.6673144035533}, "translation-xx-id": {"total": 81.10521097715736}, "total": 77.88626269035532}, "multi-turn": {"total": 66.20748299319727}, "total": 59.66129442440274}, "ban": {"nlu": {"sentiment": {"total": 58.48444064658549}, "total": 58.48444064658549}, "instruction-following": {"total": 25.71428571428571}, "nlg": {"translation-id-xx": {"total": 50.2741116751269}, 
"translation-xx-id": {"total": 73.7688075031726}, "total": 62.02145958914975}, "multi-turn": {"total": 68.36734693877551}, "total": 53.646883222199115}, "bbc": {"nlu": {"sentiment": {"total": 21.778466139713746}, "qa": {"total": 36.54511563444106}, "total": 29.1617908870774}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 40.4752538071066}, "translation-xx-id": {"total": 57.3776173857868}, "total": 48.926435596446694}, "multi-turn": {"total": 56.173469387755105}, "total": 36.42256682496266}, "indommlu": {"total": 59.20845024270631, "STEM": {"total": 61.23914364790225}, "Humanities": {"total": 70.01440666360243}, "Social science": {"total": 68.1316435966181}, "Indonesian language": {"total": 63.557422093070194}, "Local languages and cultures": {"total": 37.485822556014845}}}
6
+ {"model_name": "meta-llama/Meta-Llama-3-8B-Instruct", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 65.89674501147039}, "qa": {"total": 71.49747763705508}, "metaphor": {"total": 55.906416620702345}, "total": 64.43354642307594}, "safety": {"toxicity": {"total": 30.5564686517935}, "total": 30.5564686517935}, "nlg": {"abssum": {"total": 18.04238577912569}, "translation-en-xx": {"total": 87.625}, "translation-xx-en": {"total": 90.98604007766181}, "total": 53.6739529089783}, "nlr": {"causal": {"total": 68.00000000000001}, "nli": {"total": 58.605562589496515}, "total": 63.302781294748264}, "linguistic-diagnostics": {"mp-r": {"total": 20.818452380952365}, "pragmatics": {"total": 30.57692307692306}, "total": 25.697687728937712}, "instruction-following": {"total": 28.57142857142857}, "multi-turn": {"total": 55.90136054421768}, "total": 46.01960373188285}, "jv": {"nlu": {"sentiment": {"total": 42.61952574236275}, "qa-mc": {"total": 48.0952380952381}, "metaphor": {"total": 16.056338028169016}, "total": 35.59036728858996}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 43.50182423857868}, "translation-xx-id": {"total": 70.62736952728426}, "total": 57.06459688293147}, "multi-turn": {"total": 61.683673469387756}, "total": 39.29894512451301}, "su": {"nlu": {"sentiment": {"total": 28.000000000000004}, "qa-mc": {"total": 31.428571428571427}, "metaphor": {"total": 0}, "total": 19.80952380952381}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 45.93464467005076}, "translation-xx-id": {"total": 58.81846843274111}, "total": 52.376556551395936}, "multi-turn": {"total": 56.88775510204081}, "total": 33.22083981812109}, "ban": {"nlu": {"sentiment": {"total": 0}, "total": 0.0}, "instruction-following": {"total": 0.9523809523809524}, "nlg": {"translation-id-xx": {"total": 57.54861992385787}, "translation-xx-id": {"total": 52.827906884517766}, "total": 
55.18826340418782}, "multi-turn": {"total": 54.48979591836734}, "total": 27.65761006873403}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 18.09226209386497}, "total": 9.046131046932485}, "nlg": {"translation-id-xx": {"total": 18.681789340101524}, "translation-xx-id": {"total": 32.99952411167513}, "total": 25.840656725888326}, "multi-turn": {"total": 48.33333333333334}, "total": 27.740040368718052}, "indommlu": {"total": 32.80503442646069, "STEM": {"total": 30.745384112748297}, "Humanities": {"total": 40.495667733523575}, "Social science": {"total": 44.50789696902289}, "Indonesian language": {"total": 36.58659081993335}, "Local languages and cultures": {"total": 15.699814759911021}}}
7
+ {"model_name": "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 80.03073193957495}, "qa": {"total": 73.67491779837458}, "metaphor": {"total": 63.297481154624016}, "total": 72.33437696419118}, "safety": {"toxicity": {"total": 49.08728635049469}, "total": 49.08728635049469}, "nlg": {"abssum": {"total": 21.912194636447175}, "translation-en-xx": {"total": 91.51027513586956}, "translation-xx-en": {"total": 91.94370869874012}, "total": 56.819593276876006}, "nlr": {"causal": {"total": 76.0}, "nli": {"total": 55.57921414897125}, "total": 65.78960707448563}, "linguistic-diagnostics": {"mp-r": {"total": 33.928571428571445}, "pragmatics": {"total": 22.307692307692317}, "total": 28.118131868131883}, "instruction-following": {"total": 79.04761904761905}, "multi-turn": {"total": 52.312925170068034}, "total": 57.64421996455234}, "jv": {"nlu": {"sentiment": {"total": 76.31624296802674}, "qa-mc": {"total": 56.66666666666668}, "metaphor": {"total": 39.81891348088533}, "total": 57.600607705192914}, "instruction-following": {"total": 30.476190476190478}, "nlg": {"translation-id-xx": {"total": 72.59359137055837}, "translation-xx-id": {"total": 85.57393123413705}, "total": 79.08376130234771}, "multi-turn": {"total": 58.40136054421768}, "total": 56.3904800069872}, "su": {"nlu": {"sentiment": {"total": 68.27543972085736}, "qa-mc": {"total": 43.8095238095238}, "metaphor": {"total": 24.774774774774766}, "total": 45.61991276838531}, "instruction-following": {"total": 40.95238095238095}, "nlg": {"translation-id-xx": {"total": 63.15315672588832}, "translation-xx-id": {"total": 80.32120479060913}, "total": 71.73718075824873}, "multi-turn": {"total": 55.2891156462585}, "total": 53.399647531318365}, "ban": {"nlu": {"sentiment": {"total": 37.35099337748344}, "total": 37.35099337748344}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 53.2818845177665}, 
"translation-xx-id": {"total": 62.99171161167513}, "total": 58.136798064720814}, "multi-turn": {"total": 47.32993197278911}, "total": 38.79966894898644}, "bbc": {"nlu": {"sentiment": {"total": 7.942604856512146}, "qa": {"total": 25.678260705469608}, "total": 16.810432780990876}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 23.885786802030456}, "translation-xx-id": {"total": 44.879917512690355}, "total": 34.382852157360404}, "multi-turn": {"total": 42.89115646258503}, "total": 26.85444368356741}, "indommlu": {"total": 31.99357367401823, "STEM": {"total": 33.22726354840381}, "Humanities": {"total": 41.0664295998681}, "Social science": {"total": 41.72069115515451}, "Indonesian language": {"total": 31.98855253821362}, "Local languages and cultures": {"total": 16.08184881924881}}}
8
+ {"model_name": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct", "model_type": "Instruct", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 78.58212407479547}, "qa": {"total": 78.3094264272406}, "metaphor": {"total": 73.4969663541092}, "total": 76.79617228538176}, "safety": {"toxicity": {"total": 36.55204361614938}, "total": 36.55204361614938}, "nlg": {"abssum": {"total": 21.005682373720838}, "translation-en-xx": {"total": 92.34778990010498}, "translation-xx-en": {"total": 92.74215179563983}, "total": 56.775326610796625}, "nlr": {"causal": {"total": 88.39999999999999}, "nli": {"total": 64.9508863012699}, "total": 76.67544315063495}, "linguistic-diagnostics": {"mp-r": {"total": 43.764880952380935}, "pragmatics": {"total": 58.07692307692307}, "total": 50.920902014652}, "instruction-following": {"total": 85.71428571428571}, "multi-turn": {"total": 55.935374149659864}, "total": 62.767078220222906}, "jv": {"nlu": {"sentiment": {"total": 76.60243537705617}, "qa-mc": {"total": 83.33333333333334}, "metaphor": {"total": 50.30181086519114}, "total": 70.07919319186021}, "instruction-following": {"total": 49.523809523809526}, "nlg": {"translation-id-xx": {"total": 73.32515069796955}, "translation-xx-id": {"total": 88.26601165926395}, "total": 80.79558117861674}, "multi-turn": {"total": 66.39455782312925}, "total": 66.69828542935393}, "su": {"nlu": {"sentiment": {"total": 72.50786868902657}, "qa-mc": {"total": 71.42857142857143}, "metaphor": {"total": 37.57124471410185}, "total": 60.50256161056662}, "instruction-following": {"total": 55.23809523809524}, "nlg": {"translation-id-xx": {"total": 62.6653711928934}, "translation-xx-id": {"total": 83.43284026015229}, "total": 73.04910572652284}, "multi-turn": {"total": 69.4047619047619}, "total": 64.54863111998665}, "ban": {"nlu": {"sentiment": {"total": 55.57380901516771}, "total": 55.57380901516771}, "instruction-following": {"total": 6.666666666666667}, "nlg": {"translation-id-xx": {"total": 34.44487626903553}, 
"translation-xx-id": {"total": 59.96829394035533}, "total": 47.206585104695435}, "multi-turn": {"total": 59.336734693877546}, "total": 42.195948870101844}, "bbc": {"nlu": {"sentiment": {"total": 11.256070640176597}, "qa": {"total": 25.453627519664387}, "total": 18.35484907992049}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 32.850372779187815}, "translation-xx-id": {"total": 26.463039340101524}, "total": 29.65670605964467}, "multi-turn": {"total": 54.96598639455783}, "total": 29.077718716864084}, "indommlu": {"total": 52.11201654890406, "STEM": {"total": 53.8416812123537}, "Humanities": {"total": 58.044855388925995}, "Social science": {"total": 59.625418126693894}, "Indonesian language": {"total": 57.43955221495361}, "Local languages and cultures": {"total": 34.63575868668624}}}
9
+ {"model_name": "GoToCompany/llama3-8b-cpt-sahabatai-v1-base", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 77.33188059992209}, "qa": {"total": 74.70098262553658}, "metaphor": {"total": 68.10994668137526}, "total": 73.3809366356113}, "safety": {"toxicity": {"total": 54.40860992141678}, "total": 54.40860992141678}, "nlg": {"abssum": {"total": 24.768063265770042}, "translation-en-xx": {"total": 91.82524356163538}, "translation-xx-en": {"total": 92.12213665506114}, "total": 58.370876687059145}, "nlr": {"causal": {"total": 78.4}, "nli": {"total": 65.01227670867794}, "total": 71.70613835433898}, "linguistic-diagnostics": {"mp-r": {"total": 37.85714285714286}, "pragmatics": {"total": 32.21153846153846}, "total": 35.03434065934066}, "instruction-following": {"total": 15.238095238095239}, "total": 51.356499582643686}, "jv": {"nlu": {"sentiment": {"total": 73.97244178594318}, "qa-mc": {"total": 64.76190476190476}, "metaphor": {"total": 51.026156941649894}, "total": 63.25350116316594}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 82.35594463832487}, "translation-xx-id": {"total": 88.25025777284264}, "total": 85.30310120558376}, "total": 55.55061348799594}, "su": {"nlu": {"sentiment": {"total": 75.53820408744569}, "qa-mc": {"total": 54.76190476190478}, "metaphor": {"total": 39.630446773303916}, "total": 56.64351854088479}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 78.38067100253807}, "translation-xx-id": {"total": 87.74551871827411}, "total": 83.0630948604061}, "total": 53.55299811789062}, "ban": {"nlu": {"sentiment": {"total": 58.36509292886136}, "total": 58.36509292886136}, "instruction-following": {"total": 24.761904761904763}, "nlg": {"translation-id-xx": {"total": 43.015704314720814}, "translation-xx-id": {"total": 69.08879282994924}, "total": 56.05224857233503}, "total": 46.393082087700385}, "bbc": {"nlu": {"sentiment": {"total": 
19.190699992879008}, "qa": {"total": 36.80323137445211}, "total": 27.996965683665557}, "instruction-following": {"total": 24.761904761904763}, "nlg": {"translation-id-xx": {"total": 20.583439086294415}, "translation-xx-id": {"total": 58.37277918781726}, "total": 39.47810913705584}, "total": 30.74565986087539}}
10
+ {"model_name": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-base", "model_type": "Base", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 79.61513385707484}, "qa": {"total": 74.32736269344839}, "metaphor": {"total": 71.46534289391431}, "total": 75.13594648147918}, "safety": {"toxicity": {"total": 63.078376622102674}, "total": 63.078376622102674}, "nlg": {"abssum": {"total": 26.31522882135615}, "translation-en-xx": {"total": 92.13952684967886}, "translation-xx-en": {"total": 92.6986272156003}, "total": 59.36715292699786}, "nlr": {"causal": {"total": 87.99999999999999}, "nli": {"total": 50.45527660978115}, "total": 69.22763830489058}, "linguistic-diagnostics": {"mp-r": {"total": 55.610119047619065}, "pragmatics": {"total": 49.90384615384615}, "total": 52.75698260073261}, "instruction-following": {"total": 34.285714285714285}, "total": 58.975301870319534}, "jv": {"nlu": {"sentiment": {"total": 77.19824823755607}, "qa-mc": {"total": 78.09523809523809}, "metaphor": {"total": 58.79275653923541}, "total": 71.3620809573432}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 84.98171795685279}, "translation-xx-id": {"total": 89.59297668147208}, "total": 87.28734731916244}, "total": 57.327587203279656}, "su": {"nlu": {"sentiment": {"total": 70.37057608773053}, "qa-mc": {"total": 70.4761904761905}, "metaphor": {"total": 45.04504504504503}, "total": 61.96393720298869}, "instruction-following": {"total": 19.047619047619047}, "nlg": {"translation-id-xx": {"total": 77.55825666243655}, "translation-xx-id": {"total": 88.59196541878173}, "total": 83.07511104060913}, "total": 54.69555576373896}, "ban": {"nlu": {"sentiment": {"total": 68.91348002563555}, "total": 68.91348002563555}, "instruction-following": {"total": 20.0}, "nlg": {"translation-id-xx": {"total": 61.31940038071066}, "translation-xx-id": {"total": 75.09899468591371}, "total": 68.20919753331219}, "total": 52.37422585298258}, "bbc": {"nlu": {"sentiment": {"total": 
25.21889909563484}, "qa": {"total": 37.68915856623248}, "total": 31.45402883093366}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 33.5982709390863}, "translation-xx-id": {"total": 64.51764752538071}, "total": 49.0579592322335}, "total": 32.55161506867477}}
11
+ {"model_name": "google/gemma-2-9b-it", "model_type": "Instruct", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 67.69724170021209}, "qa": {"total": 79.92186846501487}, "metaphor": {"total": 74.20022062879205}, "total": 73.93977693133968}, "safety": {"toxicity": {"total": 37.87798080322485}, "total": 37.87798080322485}, "nlg": {"abssum": {"total": 19.35211019808279}, "translation-en-xx": {"total": 92.32352106750247}, "translation-xx-en": {"total": 92.73650375185277}, "total": 55.941061303880204}, "nlr": {"causal": {"total": 88.39999999999999}, "nli": {"total": 61.13890565692307}, "total": 74.76945282846154}, "linguistic-diagnostics": {"mp-r": {"total": 34.747023809523796}, "pragmatics": {"total": 50.86538461538461}, "total": 42.806204212454205}, "instruction-following": {"total": 92.38095238095238}, "multi-turn": {"total": 64.09863945578232}, "total": 63.116295416585025}, "jv": {"nlu": {"sentiment": {"total": 61.17040518407748}, "qa-mc": {"total": 75.23809523809524}, "metaphor": {"total": 39.708249496981885}, "total": 58.705583306384874}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 68.26288864213198}, "translation-xx-id": {"total": 84.07596367385787}, "total": 76.16942615799493}, "multi-turn": {"total": 71.61564625850339}, "total": 55.90837821643508}, "su": {"nlu": {"sentiment": {"total": 60.04358043153173}, "qa-mc": {"total": 68.57142857142856}, "metaphor": {"total": 24.080713366427652}, "total": 50.898574123129315}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 60.161326142131976}, "translation-xx-id": {"total": 79.23067695114213}, "total": 69.69600154663705}, "multi-turn": {"total": 67.87414965986393}, "total": 52.35527657050281}, "ban": {"nlu": {"sentiment": {"total": 52.08858506017232}, "total": 52.08858506017232}, "instruction-following": {"total": 20.0}, "nlg": {"translation-id-xx": {"total": 40.809327411167516}, "translation-xx-id": {"total": 
72.12482154187818}, "total": 56.467074476522846}, "multi-turn": {"total": 63.23129251700681}, "total": 47.946738013425495}, "bbc": {"nlu": {"sentiment": {"total": 12.640390230007828}, "qa": {"total": 35.19812281429937}, "total": 23.919256522153596}, "instruction-following": {"total": 10.476190476190476}, "nlg": {"translation-id-xx": {"total": 26.89752538071066}, "translation-xx-id": {"total": 39.81813134517766}, "total": 33.35782836294416}, "multi-turn": {"total": 60.71428571428571}, "total": 32.116890268893485}, "indommlu": {"total": 48.999188639177675, "STEM": {"total": 51.07174652386319}, "Humanities": {"total": 55.98237552708435}, "Social science": {"total": 58.995928840011516}, "Indonesian language": {"total": 55.75432178720666}, "Local languages and cultures": {"total": 25.93271589159855}}}
12
+ {"model_name": "google/gemma-2-9b", "model_type": "Base", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 78.6433985845994}, "qa": {"total": 71.57453630485351}, "metaphor": {"total": 74.1956241956242}, "total": 74.8045196950257}, "safety": {"toxicity": {"total": 59.060094322285295}, "total": 59.060094322285295}, "nlg": {"abssum": {"total": 26.225004066556945}, "translation-en-xx": {"total": 91.69915120121047}, "translation-xx-en": {"total": 92.80231412503088}, "total": 59.23786836483881}, "nlr": {"causal": {"total": 90.8}, "nli": {"total": 43.8597978672891}, "total": 67.32989893364456}, "linguistic-diagnostics": {"mp-r": {"total": 43.05059523809524}, "pragmatics": {"total": 49.519230769230774}, "total": 46.28491300366301}, "instruction-following": {"total": 4.761904761904762}, "total": 51.91319984689369}, "jv": {"nlu": {"sentiment": {"total": 77.6460158085879}, "qa-mc": {"total": 71.42857142857144}, "metaphor": {"total": 43.14889336016097}, "total": 64.0744935324401}, "instruction-following": {"total": 0.9523809523809524}, "nlg": {"translation-id-xx": {"total": 73.56111199238579}, "translation-xx-id": {"total": 81.72416322969544}, "total": 77.64263761104061}, "total": 47.556504031953885}, "su": {"nlu": {"sentiment": {"total": 71.01381471195613}, "qa-mc": {"total": 63.33333333333333}, "metaphor": {"total": 25.36771465342893}, "total": 53.23828756623946}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 58.14768401015228}, "translation-xx-id": {"total": 77.19652204949239}, "total": 67.67210302982234}, "total": 42.84314607170314}, "ban": {"nlu": {"sentiment": {"total": 49.60136722922452}, "total": 49.60136722922452}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 59.54909581218274}, "translation-xx-id": {"total": 70.86218075824873}, "total": 65.20563828521574}, "total": 43.983287552432465}, "bbc": {"nlu": {"sentiment": {"total": 21.214412874741864}, "qa": 
{"total": 39.3478532528149}, "total": 30.281133063778384}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 31.178616751269036}, "translation-xx-id": {"total": 62.37186706852792}, "total": 46.775241909898476}, "total": 31.082283721384346}}
13
+ {"model_name": "google/gemma-3-27b-it", "model_type": "Instruct", "model_size": "27B", "id": {"nlu": {"sentiment": {"total": 73.50222373717699}, "qa": {"total": 78.15773978542872}, "metaphor": {"total": 69.41993013421586}, "total": 73.6932978856072}, "safety": {"toxicity": {"total": 48.404320615693344}, "total": 48.404320615693344}, "nlg": {"abssum": {"total": 16.524665817864868}, "translation-en-xx": {"total": 94.28980360671937}, "translation-xx-en": {"total": 93.21113821640316}, "total": 55.13756836471307}, "nlr": {"causal": {"total": 89.6}, "nli": {"total": 73.55881285868958}, "total": 81.57940642934479}, "linguistic-diagnostics": {"mp-r": {"total": 48.422619047619065}, "pragmatics": {"total": 60.48076923076921}, "total": 54.45169413919414}, "instruction-following": {"total": 85.71428571428571}, "multi-turn": {"total": 74.8809523809524}, "total": 67.69450364711295}, "jv": {"nlu": {"sentiment": {"total": 68.99366232286546}, "qa-mc": {"total": 87.61904761904762}, "metaphor": {"total": 55.955734406438616}, "total": 70.85614811611723}, "instruction-following": {"total": 65.71428571428571}, "nlg": {"translation-id-xx": {"total": 85.44648239213198}, "translation-xx-id": {"total": 91.4309664498731}, "total": 88.43872442100255}, "multi-turn": {"total": 72.29591836734694}, "total": 74.3262691546881}, "su": {"nlu": {"sentiment": {"total": 68.94652139856157}, "qa-mc": {"total": 80.95238095238096}, "metaphor": {"total": 40.926640926640935}, "total": 63.60851442586116}, "instruction-following": {"total": 68.57142857142857}, "nlg": {"translation-id-xx": {"total": 78.67233105964468}, "translation-xx-id": {"total": 89.0990046002538}, "total": 83.88566782994924}, "multi-turn": {"total": 77.90816326530613}, "total": 73.49344352313628}, "ban": {"nlu": {"sentiment": {"total": 64.34244819483017}, "total": 64.34244819483017}, "instruction-following": {"total": 45.714285714285715}, "nlg": {"translation-id-xx": {"total": 64.00178458121827}, "translation-xx-id": {"total": 
80.85844305203045}, "total": 72.43011381662436}, "multi-turn": {"total": 73.72448979591837}, "total": 64.05283438041465}, "bbc": {"nlu": {"sentiment": {"total": 41.17895036673077}, "qa": {"total": 53.90620481957703}, "total": 47.542577593153894}, "instruction-following": {"total": 36.19047619047619}, "nlg": {"translation-id-xx": {"total": 34.14482868020305}, "translation-xx-id": {"total": 72.01943210659898}, "total": 53.082130393401016}, "multi-turn": {"total": 71.76870748299321}, "total": 52.14597291500607}, "indommlu": {"total": 57.00052675017508, "STEM": {"total": 59.886083357110124}, "Humanities": {"total": 63.22328475452199}, "Social science": {"total": 64.23406160745138}, "Indonesian language": {"total": 60.906785648619774}, "Local languages and cultures": {"total": 38.257958112495274}}}
14
+ {"model_name": "GoToCompany/Llama-Sahabat-AI-v2-70B-IT", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 77.6732188460373}, "qa": {"total": 76.61819825000589}, "metaphor": {"total": 70.78047435190294}, "total": 75.02396381598204}, "safety": {"toxicity": {"total": 51.04618340675695}, "total": 51.04618340675695}, "nlg": {"abssum": {"total": 19.1930577833964}, "translation-en-xx": {"total": 93.29174129199605}, "translation-xx-en": {"total": 92.92486162147975}, "total": 56.15067962006715}, "nlr": {"causal": {"total": 94.8}, "nli": {"total": 79.08219508232072}, "total": 86.94109754116036}, "linguistic-diagnostics": {"mp-r": {"total": 59.50892857142859}, "pragmatics": {"total": 73.5576923076923}, "total": 66.53331043956045}, "instruction-following": {"total": 93.33333333333333}, "multi-turn": {"total": 76.6326530612245}, "total": 72.23731731686925}, "jv": {"nlu": {"sentiment": {"total": 67.1011892045859}, "qa-mc": {"total": 90.0}, "metaphor": {"total": 50.231388329979865}, "total": 69.11085917818859}, "instruction-following": {"total": 69.52380952380952}, "nlg": {"translation-id-xx": {"total": 86.94888166243655}, "translation-xx-id": {"total": 89.48649666878173}, "total": 88.21768916560913}, "multi-turn": {"total": 74.08163265306122}, "total": 75.23349763016712}, "su": {"nlu": {"sentiment": {"total": 68.94210638752403}, "qa-mc": {"total": 82.85714285714285}, "metaphor": {"total": 39.51553594410737}, "total": 63.77159506292475}, "instruction-following": {"total": 69.52380952380952}, "nlg": {"translation-id-xx": {"total": 85.46533946700508}, "translation-xx-id": {"total": 86.5930262531726}, "total": 86.02918286008884}, "multi-turn": {"total": 76.27551020408163}, "total": 73.90002441272618}, "ban": {"nlu": {"sentiment": {"total": 64.59389019440292}, "total": 64.59389019440292}, "instruction-following": {"total": 64.76190476190476}, "nlg": {"translation-id-xx": {"total": 76.8918543781726}, "translation-xx-id": {"total": 
80.28165648794416}, "total": 78.58675543305839}, "multi-turn": {"total": 67.6190476190476}, "total": 68.89039950210342}, "bbc": {"nlu": {"sentiment": {"total": 48.27437157302571}, "qa": {"total": 53.313288510304716}, "total": 50.79383004166522}, "instruction-following": {"total": 46.666666666666664}, "nlg": {"translation-id-xx": {"total": 57.41640228426396}, "translation-xx-id": {"total": 73.42498810279187}, "total": 65.42069519352792}, "multi-turn": {"total": 66.56462585034014}, "total": 57.361454438049975}, "indommlu": {"total": 60.70334495757589, "STEM": {"total": 61.3415240409803}, "Humanities": {"total": 70.51818982615777}, "Social science": {"total": 68.75885762465622}, "Indonesian language": {"total": 65.02913061953353}, "Local languages and cultures": {"total": 41.79340142831706}}}
15
+ {"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "model_type": "Reasoning", "model_size": "32B", "id": {"nlu": {"sentiment": {"total": 78.9701959702203}, "qa": {"total": 69.14921529154698}, "metaphor": {"total": 81.66023166023164}, "total": 76.59321430733297}, "safety": {"toxicity": {"total": 48.532687488463964}, "total": 48.532687488463964}, "nlg": {"abssum": {"total": 15.771986184570713}, "translation-en-xx": {"total": 89.06264281744072}, "translation-xx-en": {"total": 91.78938769917244}, "total": 53.09900072143864}, "nlr": {"causal": {"total": 92.0}, "nli": {"total": 79.68625340846559}, "total": 85.84312670423279}, "linguistic-diagnostics": {"mp-r": {"total": 52.70833333333334}, "pragmatics": {"total": 60.28846153846155}, "total": 56.498397435897445}, "instruction-following": {"total": 53.333333333333336}, "multi-turn": {"total": 68.26530612244899}, "total": 63.166438016164015}, "jv": {"nlu": {"sentiment": {"total": 67.3692231004771}, "qa-mc": {"total": 65.71428571428571}, "metaphor": {"total": 27.66599597585513}, "total": 53.583168263539314}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 44.75182423857868}, "translation-xx-id": {"total": 66.04903632614213}, "total": 55.400430282360404}, "multi-turn": {"total": 68.62244897959184}, "total": 45.115797595658606}, "su": {"nlu": {"sentiment": {"total": 60.04471978921884}, "qa-mc": {"total": 57.14285714285715}, "metaphor": {"total": 17.268799411656556}, "total": 44.81879211457751}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 44.04267131979695}, "translation-xx-id": {"total": 57.660691624365484}, "total": 50.85168147208122}, "multi-turn": {"total": 66.13945578231292}, "total": 42.357244247004814}, "ban": {"nlu": {"sentiment": {"total": 0}, "total": 0.0}, "instruction-following": {"total": 1.9047619047619049}, "nlg": {"translation-id-xx": {"total": 50.61798064720812}, "translation-xx-id": {"total": 
37.4948048857868}, "total": 44.05639276649746}, "multi-turn": {"total": 61.819727891156454}, "total": 26.945220640603956}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 9.281469990670297}, "total": 4.640734995335149}, "instruction-following": {"total": 1.9047619047619049}, "nlg": {"translation-id-xx": {"total": 34.2337404822335}, "translation-xx-id": {"total": 34.2864451142132}, "total": 34.260092798223354}, "multi-turn": {"total": 53.31632653061224}, "total": 23.530479057233162}, "indommlu": {"total": 56.97297175288766, "STEM": {"total": 76.60105360009733}, "Humanities": {"total": 61.78918336683863}, "Social science": {"total": 63.76683210862538}, "Indonesian language": {"total": 61.01908232177998}, "Local languages and cultures": {"total": 23.38479571344331}}}
16
+ {"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "model_type": "Reasoning", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 78.14143455395404}, "qa": {"total": 69.26059901277308}, "metaphor": {"total": 80.29049457620887}, "total": 75.89750938097866}, "safety": {"toxicity": {"total": 46.041727141219575}, "total": 46.041727141219575}, "nlg": {"abssum": {"total": 17.28477573793091}, "translation-en-xx": {"total": 91.97944972826087}, "translation-xx-en": {"total": 92.36568116199358}, "total": 54.72867059152907}, "nlr": {"causal": {"total": 94.0}, "nli": {"total": 77.47431525513804}, "total": 85.73715762756902}, "linguistic-diagnostics": {"mp-r": {"total": 44.15178571428572}, "pragmatics": {"total": 64.32692307692307}, "total": 54.239354395604394}, "instruction-following": {"total": 88.57142857142857}, "multi-turn": {"total": 69.57482993197279}, "total": 67.82723966290028}, "jv": {"nlu": {"sentiment": {"total": 72.96802677490564}, "qa-mc": {"total": 87.61904761904762}, "metaphor": {"total": 52.49496981891348}, "total": 71.02734807095558}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 73.34557423857868}, "translation-xx-id": {"total": 81.28239014911168}, "total": 77.31398219384518}, "multi-turn": {"total": 71.47959183673468}, "total": 56.859992430145766}, "su": {"nlu": {"sentiment": {"total": 70.92088585060172}, "qa-mc": {"total": 80.4761904761905}, "metaphor": {"total": 38.93178893178892}, "total": 63.44295508619371}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 63.705385469543145}, "translation-xx-id": {"total": 74.6843075824873}, "total": 69.19484652601523}, "multi-turn": {"total": 72.55102040816325}, "total": 55.82101502890257}, "ban": {"nlu": {"sentiment": {"total": 62.72819198177025}, "total": 62.72819198177025}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 57.72533312182741}, 
"translation-xx-id": {"total": 64.42384795368021}, "total": 61.07459053775381}, "multi-turn": {"total": 66.44557823129252}, "total": 50.657328282942245}, "bbc": {"nlu": {"sentiment": {"total": 4.338460442925295}, "qa": {"total": 16.706621074803703}, "total": 10.5225407588645}, "instruction-following": {"total": 1.9047619047619049}, "nlg": {"translation-id-xx": {"total": 34.16560913705584}, "translation-xx-id": {"total": 43.58887214467005}, "total": 38.877240640862944}, "multi-turn": {"total": 61.156462585034014}, "total": 28.11525147238084}, "indommlu": {"total": 63.62162170332392, "STEM": {"total": 76.83877111633697}, "Humanities": {"total": 70.83927436481935}, "Social science": {"total": 71.29522879069941}, "Indonesian language": {"total": 64.73211959812697}, "Local languages and cultures": {"total": 36.80837078959374}}}
17
+ {"model_name": "GoToCompany/Llama-Sahabat-AI-v2-70B-R", "model_type": "Reasoning", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 74.36716822490585}, "qa": {"total": 68.70316576851644}, "metaphor": {"total": 82.3405037690752}, "total": 75.1369459208325}, "safety": {"toxicity": {"total": 44.805565887667676}, "total": 44.805565887667676}, "nlg": {"abssum": {"total": 15.770754059792369}, "translation-en-xx": {"total": 92.4779240906003}, "translation-xx-en": {"total": 89.78252763710475}, "total": 53.45048996182245}, "nlr": {"causal": {"total": 94.8}, "nli": {"total": 79.85016753970572}, "total": 87.32508376985285}, "linguistic-diagnostics": {"mp-r": {"total": 51.205357142857146}, "pragmatics": {"total": 69.71153846153845}, "total": 60.458447802197796}, "instruction-following": {"total": 86.66666666666667}, "multi-turn": {"total": 71.6326530612245}, "total": 68.4965504386092}, "jv": {"nlu": {"sentiment": {"total": 74.37662892544327}, "qa-mc": {"total": 88.57142857142857}, "metaphor": {"total": 58.08853118712274}, "total": 73.67886289466486}, "instruction-following": {"total": 52.38095238095239}, "nlg": {"translation-id-xx": {"total": 84.78731757614213}, "translation-xx-id": {"total": 86.91073128172589}, "total": 85.849024428934}, "multi-turn": {"total": 77.90816326530611}, "total": 72.45425074246434}, "su": {"nlu": {"sentiment": {"total": 53.23912269458092}, "qa-mc": {"total": 79.04761904761905}, "metaphor": {"total": 44.3417907703622}, "total": 58.87617750418739}, "instruction-following": {"total": 63.8095238095238}, "nlg": {"translation-id-xx": {"total": 81.1491513324873}, "translation-xx-id": {"total": 81.8542393718274}, "total": 81.50169535215736}, "multi-turn": {"total": 77.89115646258503}, "total": 70.51963828211339}, "ban": {"nlu": {"sentiment": {"total": 65.37499109876808}, "total": 65.37499109876808}, "instruction-following": {"total": 63.8095238095238}, "nlg": {"translation-id-xx": {"total": 74.0709470177665}, "translation-xx-id": {"total": 
68.70261936865482}, "total": 71.38678319321066}, "multi-turn": {"total": 71.64965986394559}, "total": 68.05523949136203}, "bbc": {"nlu": {"sentiment": {"total": 60.06950081891333}, "qa": {"total": 39.05529170888816}, "total": 49.56239626390075}, "instruction-following": {"total": 57.14285714285714}, "nlg": {"translation-id-xx": {"total": 54.30369606598985}, "translation-xx-id": {"total": 66.19709708121827}, "total": 60.250396573604064}, "multi-turn": {"total": 74.93197278911566}, "total": 60.471905692369404}, "indommlu": {"total": 65.61855199966932, "STEM": {"total": 76.76900555375443}, "Humanities": {"total": 72.57418960079825}, "Social science": {"total": 71.07883839300611}, "Indonesian language": {"total": 67.2507866798964}, "Local languages and cultures": {"total": 43.22103260819583}}}
18
+ {"model_name": "aisingapore/Llama-SEA-LION-v2-8B", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 69.60385285460764}, "qa": {"total": 73.73839363312449}, "metaphor": {"total": 59.914506343077754}, "total": 67.7522509436033}, "safety": {"toxicity": {"total": 45.25492601518173}, "total": 45.25492601518173}, "nlg": {"abssum": {"total": 25.372425005335675}, "translation-en-xx": {"total": 91.61413815464427}, "translation-xx-en": {"total": 92.07667631971034}, "total": 58.60891612125649}, "nlr": {"causal": {"total": 68.39999999999999}, "nli": {"total": 60.31150864698831}, "total": 64.35575432349415}, "linguistic-diagnostics": {"mp-r": {"total": 41.309523809523796}, "pragmatics": {"total": 6.634615384615383}, "total": 23.97206959706959}, "instruction-following": {"total": 16.19047619047619}, "total": 46.02239886518024}, "jv": {"nlu": {"sentiment": {"total": 66.13458662678914}, "qa-mc": {"total": 46.666666666666664}, "metaphor": {"total": 24.637826961770635}, "total": 45.81302675174214}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 61.99734295685279}, "translation-xx-id": {"total": 78.83339942893402}, "total": 70.4153711928934}, "total": 42.55232312440233}, "su": {"nlu": {"sentiment": {"total": 65.45908993804741}, "qa-mc": {"total": 37.61904761904762}, "metaphor": {"total": 5.5295091009376796}, "total": 36.2025488860109}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 51.8411326142132}, "translation-xx-id": {"total": 72.78686151649747}, "total": 62.31399706535534}, "total": 36.01345182505859}, "ban": {"nlu": {"sentiment": {"total": 55.74115217546106}, "total": 55.74115217546106}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 53.35564720812183}, "translation-xx-id": {"total": 64.95974777918782}, "total": 59.15769749365482}, "total": 44.331362588118}, "bbc": {"nlu": {"sentiment": {"total": 
14.378765221106596}, "qa": {"total": 37.418918485381994}, "total": 25.898841853244296}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 25.358819796954315}, "translation-xx-id": {"total": 59.188650063451774}, "total": 42.27373493020305}, "total": 28.75593829289515}}
19
+ {"model_name": "aisingapore/Gemma-SEA-LION-v3-9B-IT", "model_type": "Instruct", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 71.48882179803489}, "qa": {"total": 79.70399822335983}, "metaphor": {"total": 72.84427284427287}, "total": 74.67903095522253}, "safety": {"toxicity": {"total": 40.59132401534749}, "total": 40.59132401534749}, "nlg": {"abssum": {"total": 18.181388830492043}, "translation-en-xx": {"total": 93.09884945111784}, "translation-xx-en": {"total": 93.12180880218627}, "total": 55.64585897857205}, "nlr": {"causal": {"total": 90.8}, "nli": {"total": 72.54852270674931}, "total": 81.67426135337465}, "linguistic-diagnostics": {"mp-r": {"total": 34.85119047619048}, "pragmatics": {"total": 60.57692307692308}, "total": 47.71405677655678}, "instruction-following": {"total": 96.19047619047619}, "multi-turn": {"total": 71.53061224489797}, "total": 66.86080293063539}, "jv": {"nlu": {"sentiment": {"total": 69.40888698995941}, "qa-mc": {"total": 82.3809523809524}, "metaphor": {"total": 46.73038229376259}, "total": 66.17340722155814}, "instruction-following": {"total": 22.857142857142858}, "nlg": {"translation-id-xx": {"total": 76.26477236675127}, "translation-xx-id": {"total": 87.09299651015229}, "total": 81.67888443845177}, "multi-turn": {"total": 74.67687074829932}, "total": 61.34657631636303}, "su": {"nlu": {"sentiment": {"total": 62.93826105533004}, "qa-mc": {"total": 70.95238095238095}, "metaphor": {"total": 35.59018201875346}, "total": 56.49360800882149}, "instruction-following": {"total": 35.23809523809524}, "nlg": {"translation-id-xx": {"total": 69.71629124365482}, "translation-xx-id": {"total": 82.27563848350253}, "total": 75.99596486357868}, "multi-turn": {"total": 74.72789115646259}, "total": 60.6138898167395}, "ban": {"nlu": {"sentiment": {"total": 50.81122267321797}, "total": 50.81122267321797}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 64.24619289340102}, "translation-xx-id": {"total": 
76.28541402284264}, "total": 70.26580345812184}, "multi-turn": {"total": 72.3809523809524}, "total": 52.65020891378734}, "bbc": {"nlu": {"sentiment": {"total": 20.32713807590971}, "qa": {"total": 39.7914974064387}, "total": 30.059317741174205}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 47.01633883248731}, "translation-xx-id": {"total": 54.60501269035533}, "total": 50.81067576142132}, "multi-turn": {"total": 69.57482993197279}, "total": 39.99215823959446}, "indommlu": {"total": 50.91995623941112, "STEM": {"total": 53.621038289268476}, "Humanities": {"total": 58.20677354222946}, "Social science": {"total": 59.187432464425626}, "Indonesian language": {"total": 57.20965571441104}, "Local languages and cultures": {"total": 30.158333200310455}}}
20
+ {"model_name": "aisingapore/Llama-SEA-LION-v3-70B-IT", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 77.15671395489763}, "qa": {"total": 76.84738358190378}, "metaphor": {"total": 73.49236992094134}, "total": 75.83215581924757}, "safety": {"toxicity": {"total": 51.98770863470513}, "total": 51.98770863470513}, "nlg": {"abssum": {"total": 19.525191817646014}, "translation-en-xx": {"total": 92.33508252532114}, "translation-xx-en": {"total": 92.7480999490489}, "total": 56.033391527415525}, "nlr": {"causal": {"total": 93.6}, "nli": {"total": 79.65788716619113}, "total": 86.62894358309556}, "linguistic-diagnostics": {"mp-r": {"total": 53.154761904761926}, "pragmatics": {"total": 75.76923076923077}, "total": 64.46199633699635}, "instruction-following": {"total": 92.38095238095238}, "multi-turn": {"total": 69.01360544217687}, "total": 70.90553624636992}, "jv": {"nlu": {"sentiment": {"total": 69.79377625863418}, "qa-mc": {"total": 88.09523809523809}, "metaphor": {"total": 47.40442655935615}, "total": 68.43114697107613}, "instruction-following": {"total": 34.285714285714285}, "nlg": {"translation-id-xx": {"total": 82.43952252538071}, "translation-xx-id": {"total": 87.85191941624366}, "total": 85.14572097081219}, "multi-turn": {"total": 73.48639455782313}, "total": 65.33724419635644}, "su": {"nlu": {"sentiment": {"total": 65.91839350566117}, "qa-mc": {"total": 80.0}, "metaphor": {"total": 26.65011950726237}, "total": 57.52283767097452}, "instruction-following": {"total": 56.19047619047619}, "nlg": {"translation-id-xx": {"total": 77.60628172588832}, "translation-xx-id": {"total": 82.63099817576142}, "total": 80.11863995082487}, "multi-turn": {"total": 75.5612244897959}, "total": 67.34829457551787}, "ban": {"nlu": {"sentiment": {"total": 63.236843979206725}, "total": 63.236843979206725}, "instruction-following": {"total": 36.19047619047619}, "nlg": {"translation-id-xx": {"total": 57.826142131979694}, "translation-xx-id": {"total": 
74.99389276649747}, "total": 66.41001744923858}, "multi-turn": {"total": 68.72448979591837}, "total": 58.64045685370996}, "bbc": {"nlu": {"sentiment": {"total": 30.247383037812455}, "qa": {"total": 41.793369541286886}, "total": 36.020376289549674}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 44.33359771573604}, "translation-xx-id": {"total": 59.36377696700507}, "total": 51.84868734137056}, "multi-turn": {"total": 63.58843537414967}, "total": 41.91199379888652}, "indommlu": {"total": 60.18025084141225, "STEM": {"total": 62.267906671911646}, "Humanities": {"total": 70.42539141840543}, "Social science": {"total": 68.85334900407847}, "Indonesian language": {"total": 64.34575595370362}, "Local languages and cultures": {"total": 39.88441676395111}}}
21
+ {"model_name": "aisingapore/Gemma-SEA-LION-v3-9B", "model_type": "Base", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 78.19838062156428}, "qa": {"total": 74.12483728239906}, "metaphor": {"total": 73.50156278727708}, "total": 75.27492689708014}, "safety": {"toxicity": {"total": 59.66682908880377}, "total": 59.66682908880377}, "nlg": {"abssum": {"total": 26.54738670064436}, "translation-en-xx": {"total": 91.58656473875988}, "translation-xx-en": {"total": 92.55769052618577}, "total": 59.30975716655858}, "nlr": {"causal": {"total": 91.19999999999999}, "nli": {"total": 74.75970856366459}, "total": 82.97985428183229}, "linguistic-diagnostics": {"mp-r": {"total": 57.23214285714286}, "pragmatics": {"total": 52.01923076923079}, "total": 54.625686813186825}, "instruction-following": {"total": 26.666666666666668}, "total": 59.753953485688044}, "jv": {"nlu": {"sentiment": {"total": 79.42270170191551}, "qa-mc": {"total": 73.33333333333334}, "metaphor": {"total": 38.87323943661971}, "total": 63.87642482395619}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 72.42651491116752}, "translation-xx-id": {"total": 85.33161484771574}, "total": 78.87906487944163}, "total": 52.98198863129134}, "su": {"nlu": {"sentiment": {"total": 73.00719219539987}, "qa-mc": {"total": 66.19047619047619}, "metaphor": {"total": 34.16988416988418}, "total": 57.78918418525341}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 55.703362944162436}, "translation-xx-id": {"total": 81.53739689086295}, "total": 68.6203799175127}, "total": 49.12064835171569}, "ban": {"nlu": {"sentiment": {"total": 62.65954568112227}, "total": 62.65954568112227}, "instruction-following": {"total": 25.71428571428571}, "nlg": {"translation-id-xx": {"total": 53.75531408629442}, "translation-xx-id": {"total": 71.69029584390863}, "total": 62.722804965101524}, "total": 50.36554545350317}, "bbc": {"nlu": {"sentiment": {"total": 
37.39357687103894}, "qa": {"total": 40.46068444098612}, "total": 38.92713065601253}, "instruction-following": {"total": 22.857142857142858}, "nlg": {"translation-id-xx": {"total": 29.435279187817258}, "translation-xx-id": {"total": 62.45594067258883}, "total": 45.94560993020305}, "total": 35.90996114778614}}
22
+ {"model_name": "aisingapore/Llama-SEA-LION-v3.5-70B-R", "model_type": "Reasoning", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 75.73130383499978}, "qa": {"total": 69.35122429051765}, "metaphor": {"total": 82.33590733590734}, "total": 75.80614515380826}, "safety": {"toxicity": {"total": 45.285611387219845}, "total": 45.285611387219845}, "nlg": {"abssum": {"total": 15.83263587689335}, "translation-en-xx": {"total": 92.98521646492095}, "translation-xx-en": {"total": 92.67311307281373}, "total": 54.330900322880346}, "nlr": {"causal": {"total": 93.6}, "nli": {"total": 79.33422124444434}, "total": 86.46711062222217}, "linguistic-diagnostics": {"mp-r": {"total": 43.110119047619065}, "pragmatics": {"total": 68.46153846153847}, "total": 55.785828754578766}, "instruction-following": {"total": 87.61904761904762}, "multi-turn": {"total": 76.49659863945578}, "total": 68.8273203570304}, "jv": {"nlu": {"sentiment": {"total": 71.35512354909919}, "qa-mc": {"total": 89.04761904761904}, "metaphor": {"total": 52.464788732394354}, "total": 70.95584377637086}, "instruction-following": {"total": 38.095238095238095}, "nlg": {"translation-id-xx": {"total": 81.60529029187818}, "translation-xx-id": {"total": 86.06099302030456}, "total": 83.83314165609137}, "multi-turn": {"total": 79.08163265306122}, "total": 67.99146404519038}, "su": {"nlu": {"sentiment": {"total": 65.80702129174678}, "qa-mc": {"total": 81.90476190476191}, "metaphor": {"total": 44.34638720353008}, "total": 64.01939013334625}, "instruction-following": {"total": 58.0952380952381}, "nlg": {"translation-id-xx": {"total": 74.65085659898477}, "translation-xx-id": {"total": 80.70001189720813}, "total": 77.67543424809645}, "multi-turn": {"total": 75.13605442176869}, "total": 68.73152922461237}, "ban": {"nlu": {"sentiment": {"total": 62.439792067222086}, "total": 62.439792067222086}, "instruction-following": {"total": 37.142857142857146}, "nlg": {"translation-id-xx": {"total": 62.54401967005076}, "translation-xx-id": 
{"total": 67.96902760152284}, "total": 65.2565236357868}, "multi-turn": {"total": 74.91496598639456}, "total": 59.93853470806515}, "bbc": {"nlu": {"sentiment": {"total": 37.488072349213134}, "qa": {"total": 38.34901535900559}, "total": 37.91854385410936}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 40.71541878172589}, "translation-xx-id": {"total": 50.81959866751269}, "total": 45.76750872461929}, "multi-turn": {"total": 63.55442176870748}, "total": 42.04821382495427}, "indommlu": {"total": 64.19240645617727, "STEM": {"total": 77.44502530319998}, "Humanities": {"total": 70.1734518684928}, "Social science": {"total": 70.29975159070486}, "Indonesian language": {"total": 64.49084946417122}, "Local languages and cultures": {"total": 40.95263915781106}}}
23
+ {"model_name": "aisingapore/Llama-SEA-LION-v2-8B-IT", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 74.54389040384366}, "qa": {"total": 71.6100894869895}, "metaphor": {"total": 61.29803272660417}, "total": 69.15067087247911}, "safety": {"toxicity": {"total": 41.1573698344055}, "total": 41.1573698344055}, "nlg": {"abssum": {"total": 19.705313096632644}, "translation-en-xx": {"total": 91.68619148344861}, "translation-xx-en": {"total": 91.56246719058794}, "total": 55.66482121682546}, "nlr": {"causal": {"total": 78.0}, "nli": {"total": 53.512191896867066}, "total": 65.75609594843354}, "linguistic-diagnostics": {"mp-r": {"total": 16.42857142857143}, "pragmatics": {"total": 23.076923076923084}, "total": 19.752747252747255}, "instruction-following": {"total": 72.38095238095238}, "multi-turn": {"total": 57.31292517006803}, "total": 54.45365466798732}, "jv": {"nlu": {"sentiment": {"total": 62.67798903368226}, "qa-mc": {"total": 43.33333333333334}, "metaphor": {"total": 14.014084507042245}, "total": 40.00846895801928}, "instruction-following": {"total": 6.666666666666667}, "nlg": {"translation-id-xx": {"total": 61.38527125634518}, "translation-xx-id": {"total": 67.46113578680203}, "total": 64.42320352157361}, "multi-turn": {"total": 57.874149659863946}, "total": 42.24312220153087}, "su": {"nlu": {"sentiment": {"total": 51.23876664530371}, "qa-mc": {"total": 33.8095238095238}, "metaphor": {"total": 8.586137157565732}, "total": 31.211475870797745}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 54.9426951142132}, "translation-xx-id": {"total": 68.41759200507614}, "total": 61.68014355964468}, "multi-turn": {"total": 59.93197278911564}, "total": 39.158279007270465}, "ban": {"nlu": {"sentiment": {"total": 29.35099337748344}, "total": 29.35099337748344}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 56.31150856598985}, "translation-xx-id": 
{"total": 61.053676237309645}, "total": 58.68259240164974}, "multi-turn": {"total": 47.89115646258503}, "total": 34.93356651281051}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 20.077159425640094}, "total": 10.038579712820047}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 40.314800126903556}, "translation-xx-id": {"total": 37.23913388324873}, "total": 38.776967005076145}, "multi-turn": {"total": 47.976190476190474}, "total": 25.388410488997856}, "indommlu": {"total": 44.174114800549994, "STEM": {"total": 40.5167101219968}, "Humanities": {"total": 54.183577604266866}, "Social science": {"total": 55.14234869840264}, "Indonesian language": {"total": 50.39557152880211}, "Local languages and cultures": {"total": 25.18883493645394}}}
24
+ {"model_name": "Qwen/Qwen2.5-7B-Instruct", "model_type": "Instruct", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 75.78669436869671}, "qa": {"total": 74.27600858973535}, "metaphor": {"total": 60.594778451921314}, "total": 70.21916047011779}, "safety": {"toxicity": {"total": 46.48682318715473}, "total": 46.48682318715473}, "nlg": {"abssum": {"total": 18.283972085777638}, "translation-en-xx": {"total": 85.15674793107708}, "translation-xx-en": {"total": 91.37426830186203}, "total": 53.2747401011236}, "nlr": {"causal": {"total": 80.0}, "nli": {"total": 68.64494448299581}, "total": 74.3224722414979}, "linguistic-diagnostics": {"mp-r": {"total": 36.383928571428584}, "pragmatics": {"total": 41.634615384615394}, "total": 39.009271978021985}, "instruction-following": {"total": 79.04761904761905}, "multi-turn": {"total": 61.0204081632653}, "total": 60.482927884114325}, "jv": {"nlu": {"sentiment": {"total": 60.33945738090152}, "qa-mc": {"total": 39.99999999999999}, "metaphor": {"total": 16.830985915492967}, "total": 39.05681443213149}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 42.45994606598985}, "translation-xx-id": {"total": 61.021930520304565}, "total": 51.74093829314721}, "multi-turn": {"total": 61.25850340136054}, "total": 41.34739736499314}, "su": {"nlu": {"sentiment": {"total": 41.98518835006765}, "qa-mc": {"total": 31.428571428571423}, "metaphor": {"total": 0}, "total": 24.47125325954636}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 22.513642131979694}, "translation-xx-id": {"total": 31.6829394035533}, "total": 27.098290767766496}, "multi-turn": {"total": 48.18027210884354}, "total": 28.27078736737243}, "ban": {"nlu": {"sentiment": {"total": 31.45495976643166}, "total": 31.45495976643166}, "instruction-following": {"total": 5.714285714285714}, "nlg": {"translation-id-xx": {"total": 37.45213356598985}, "translation-xx-id": {"total": 36.39344067258883}, 
"total": 36.92278711928934}, "multi-turn": {"total": 51.10544217687074}, "total": 31.299368694219364}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 33.60734871091491}, "total": 16.803674355457456}, "instruction-following": {"total": 5.714285714285714}, "nlg": {"translation-id-xx": {"total": 27.23080583756345}, "translation-xx-id": {"total": 33.87468274111675}, "total": 30.5527442893401}, "multi-turn": {"total": 53.656462585034014}, "total": 26.68179173602932}, "indommlu": {"total": 42.71309418959688, "STEM": {"total": 45.47521929491425}, "Humanities": {"total": 48.733629272051395}, "Social science": {"total": 53.93752543659862}, "Indonesian language": {"total": 51.20940125670692}, "Local languages and cultures": {"total": 17.313678586984217}}}
25
+ {"model_name": "Qwen/Qwen2-7B", "model_type": "Base", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 75.21946554560013}, "qa": {"total": 71.52413905008522}, "metaphor": {"total": 57.2209965067108}, "total": 67.98820036746538}, "safety": {"toxicity": {"total": 44.205661659340315}, "total": 44.205661659340315}, "nlg": {"abssum": {"total": 23.68972738280887}, "translation-en-xx": {"total": 84.4631994194664}, "translation-xx-en": {"total": 90.75560944447875}, "total": 55.64956590739072}, "nlr": {"causal": {"total": 84.39999999999999}, "nli": {"total": 72.91286068288636}, "total": 78.65643034144318}, "linguistic-diagnostics": {"mp-r": {"total": 38.39285714285714}, "pragmatics": {"total": 0}, "total": 19.19642857142857}, "instruction-following": {"total": 25.71428571428571}, "total": 48.56842876022565}, "jv": {"nlu": {"sentiment": {"total": 64.0681478316599}, "qa-mc": {"total": 48.57142857142858}, "metaphor": {"total": 26.12676056338028}, "total": 46.25544565548959}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 41.66949555837564}, "translation-xx-id": {"total": 61.29969067258883}, "total": 51.484593115482234}, "total": 33.849854193498544}, "su": {"nlu": {"sentiment": {"total": 42.50680054119491}, "qa-mc": {"total": 41.428571428571416}, "metaphor": {"total": 9.624931053502483}, "total": 31.186767674422935}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 40.04616116751269}, "translation-xx-id": {"total": 52.83169416243655}, "total": 46.43892766497462}, "total": 27.145073049640455}, "ban": {"nlu": {"sentiment": {"total": 50.76493626717937}, "total": 50.76493626717937}, "instruction-following": {"total": 21.904761904761905}, "nlg": {"translation-id-xx": {"total": 39.58026649746193}, "translation-xx-id": {"total": 50.566782994923855}, "total": 45.07352474619289}, "total": 39.24774097271139}, "bbc": {"nlu": {"sentiment": {"total": 26.338745282347077}, "qa": {"total": 
35.88321060236509}, "total": 31.110977942356083}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 19.100571065989847}, "translation-xx-id": {"total": 43.421478426395936}, "total": 31.26102474619289}, "total": 26.187492959675055}}
26
+ {"model_name": "Qwen/Qwen2.5-7B", "model_type": "Base", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 75.75328420551442}, "qa": {"total": 73.32407561711896}, "metaphor": {"total": 65.36127964699394}, "total": 71.47954648987577}, "safety": {"toxicity": {"total": 31.55389939249849}, "total": 31.55389939249849}, "nlg": {"abssum": {"total": 21.564449902216676}, "translation-en-xx": {"total": 86.72052942811264}, "translation-xx-en": {"total": 91.35743827970603}, "total": 55.30171687806301}, "nlr": {"causal": {"total": 84.4}, "nli": {"total": 73.39944270760374}, "total": 78.89972135380188}, "linguistic-diagnostics": {"mp-r": {"total": 44.89583333333334}, "pragmatics": {"total": 33.94230769230768}, "total": 39.41907051282051}, "instruction-following": {"total": 34.285714285714285}, "total": 51.823278152128985}, "jv": {"nlu": {"sentiment": {"total": 65.79192480239264}, "qa-mc": {"total": 49.523809523809526}, "metaphor": {"total": 24.75855130784708}, "total": 46.691428544683085}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 45.925364847715734}, "translation-xx-id": {"total": 64.8521573604061}, "total": 55.38876110406092}, "total": 37.20133305751784}, "su": {"nlu": {"sentiment": {"total": 47.658477533290615}, "qa-mc": {"total": 41.42857142857143}, "metaphor": {"total": 8.416069130354842}, "total": 32.5010393640723}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 43.161326142131976}, "translation-xx-id": {"total": 61.348350253807105}, "total": 52.25483819796954}, "total": 29.839260774648864}, "ban": {"nlu": {"sentiment": {"total": 53.75033824681337}, "total": 53.75033824681337}, "instruction-following": {"total": 14.285714285714285}, "nlg": {"translation-id-xx": {"total": 44.255869289340104}, "translation-xx-id": {"total": 60.060259359137056}, "total": 52.15806432423858}, "total": 40.06470561892208}, "bbc": {"nlu": {"sentiment": {"total": 20.73417360962757}, "qa": 
{"total": 41.071441696158544}, "total": 30.90280765289306}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 26.63626269035533}, "translation-xx-id": {"total": 49.21890862944162}, "total": 37.927585659898476}, "total": 27.0704485645813}}
27
+ {"model_name": "Qwen/Qwen2-7B-Instruct", "model_type": "Instruct", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 76.32118934770374}, "qa": {"total": 60.29704883952192}, "metaphor": {"total": 54.490715205000924}, "total": 63.70298446407552}, "safety": {"toxicity": {"total": 23.21240291206743}, "total": 23.21240291206743}, "nlg": {"abssum": {"total": 13.130711645467363}, "translation-en-xx": {"total": 82.98908797554348}, "translation-xx-en": {"total": 83.69165328557312}, "total": 48.235541138012834}, "nlr": {"causal": {"total": 81.2}, "nli": {"total": 52.41372244666343}, "total": 66.80686122333171}, "linguistic-diagnostics": {"mp-r": {"total": 3.3333333333333286}, "pragmatics": {"total": 27.019230769230784}, "total": 15.176282051282056}, "instruction-following": {"total": 65.71428571428571}, "multi-turn": {"total": 51.83673469387754}, "total": 47.81215602813325}, "jv": {"nlu": {"sentiment": {"total": 56.66424553158155}, "qa-mc": {"total": 24.28571428571429}, "metaphor": {"total": 3.6116700201207275}, "total": 28.18720994580552}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 58.805242703045685}, "translation-xx-id": {"total": 54.17488895939086}, "total": 56.490065831218274}, "multi-turn": {"total": 54.234693877551024}, "total": 35.44227812792942}, "su": {"nlu": {"sentiment": {"total": 47.37171544541765}, "qa-mc": {"total": 31.428571428571423}, "metaphor": {"total": 0}, "total": 26.266762291329695}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 31.82955266497462}, "translation-xx-id": {"total": 34.00594860406091}, "total": 32.917750634517766}, "multi-turn": {"total": 47.15986394557824}, "total": 27.776570408332617}, "ban": {"nlu": {"sentiment": {"total": 0}, "total": 0.0}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 38.09668464467005}, "translation-xx-id": {"total": 36.76491116751269}, "total": 
37.43079790609137}, "multi-turn": {"total": 39.94897959183673}, "total": 21.725896755434405}, "bbc": {"nlu": {"sentiment": {"total": 9.251655629139064}, "qa": {"total": 27.430561483601497}, "total": 18.34110855637028}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 36.072572969543145}, "translation-xx-id": {"total": 22.299254441624367}, "total": 29.185913705583758}, "multi-turn": {"total": 45.527210884353735}, "total": 23.977844000862657}, "indommlu": {"total": 19.439219720805205, "STEM": {"total": 19.089113128479287}, "Humanities": {"total": 27.818499157288194}, "Social science": {"total": 22.833935000753403}, "Indonesian language": {"total": 34.0245970704392}, "Local languages and cultures": {"total": 0.0}}}
28
+ {"model_name": "Qwen/Qwen2.5-72B-Instruct", "model_type": "Instruct", "model_size": "72B", "id": {"nlu": {"sentiment": {"total": 78.40107345366401}, "qa": {"total": 74.64988471304491}, "metaphor": {"total": 72.14561500275786}, "total": 75.06552438982227}, "safety": {"toxicity": {"total": 50.59214880483599}, "total": 50.59214880483599}, "nlg": {"abssum": {"total": 17.336644608612378}, "translation-en-xx": {"total": 91.94026178050889}, "translation-xx-en": {"total": 93.00841464920948}, "total": 54.90549141173578}, "nlr": {"causal": {"total": 94.39999999999999}, "nli": {"total": 67.33660629863192}, "total": 80.86830314931595}, "linguistic-diagnostics": {"mp-r": {"total": 57.961309523809526}, "pragmatics": {"total": 67.3076923076923}, "total": 62.63450091575092}, "instruction-following": {"total": 92.38095238095238}, "multi-turn": {"total": 71.4455782312925}, "total": 69.69892846910082}, "jv": {"nlu": {"sentiment": {"total": 66.68461155023854}, "qa-mc": {"total": 77.6190476190476}, "metaphor": {"total": 34.14486921529174}, "total": 59.4828427948593}, "instruction-following": {"total": 40.0}, "nlg": {"translation-id-xx": {"total": 55.5690038071066}, "translation-xx-id": {"total": 80.43935398159898}, "total": 68.00417889435279}, "multi-turn": {"total": 76.9047619047619}, "total": 61.0979458984935}, "su": {"nlu": {"sentiment": {"total": 62.30413729260128}, "qa-mc": {"total": 64.76190476190476}, "metaphor": {"total": 20.104798676227233}, "total": 49.05694691024443}, "instruction-following": {"total": 45.714285714285715}, "nlg": {"translation-id-xx": {"total": 46.21391180203046}, "translation-xx-id": {"total": 69.89271692576142}, "total": 58.053314363895936}, "multi-turn": {"total": 64.03061224489795}, "total": 54.21378980833101}, "ban": {"nlu": {"sentiment": {"total": 53.58263903724273}, "total": 53.58263903724273}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 43.244923857868024}, "translation-xx-id": {"total": 
59.21627141497462}, "total": 51.23059763642132}, "multi-turn": {"total": 62.704081632653065}, "total": 43.06980576705547}, "bbc": {"nlu": {"sentiment": {"total": 26.478459018728184}, "qa": {"total": 37.11172567294669}, "total": 31.79509234583744}, "instruction-following": {"total": 6.666666666666667}, "nlg": {"translation-id-xx": {"total": 22.50555203045685}, "translation-xx-id": {"total": 31.081535532994923}, "total": 26.793543781725887}, "multi-turn": {"total": 69.35374149659863}, "total": 33.652261072707155}, "indommlu": {"total": 56.41249578205506, "STEM": {"total": 63.61631263878045}, "Humanities": {"total": 66.03320962601761}, "Social science": {"total": 67.05889343710753}, "Indonesian language": {"total": 62.046257290964135}, "Local languages and cultures": {"total": 28.10104489182636}}}
dataframe.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Model,Type,Size,Indonesian Language (Average),ID,JV,SU,Open LLM Leaderboard 2 (EN)
2
+ 10,GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct,Instruct,9B,59.91,63.96,60.1,55.66,33.67
3
+ 9,GoToCompany/gemma2-9b-cpt-sahabatai-v1-base,Base,9B,51.43,54.74,51.46,48.08,19.62
4
+ 11,GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct,Instruct,8B,49.98,56.56,49.64,43.74,24.43
5
+ 6,aisingapore/gemma2-9b-cpt-sea-lionv3-base,Base,9B,49.23,55.09,47.65,44.95,21.99
6
+ 0,google/gemma-2-9b-it,Instruct,9B,48.85,63.09,43.36,40.1,23.49
7
+ 12,GoToCompany/llama3-8b-cpt-sahabatai-v1-base,Base,8B,47.78,46.67,49.15,47.51,13.92
8
+ 1,google/gemma-2-9b,Base,9B,43.35,48.02,42.83,39.2,13.34
9
+ 15,meta-llama/Llama-3.1-8B,Base,8B,39.8,41.93,41.53,35.94,13.69
10
+ 16,meta-llama/Meta-Llama-3-8B,Base,8B,38.52,41.14,40.04,34.39,13.56
11
+ 7,aisingapore/llama3-8b-cpt-sea-lionv2-base,Base,8B,38.5,42.45,38.94,34.11,12.77
12
+ 8,aisingapore/llama3-8b-cpt-sea-lionv2.1-instruct,Instruct,8B,38.29,54.35,33.2,27.32,24.52
13
+ 14,meta-llama/Llama-3.1-8B-Instruct,Instruct,8B,37.74,50.52,35.26,27.43,27.98
14
+ 2,Qwen/Qwen2.5-7B-Instruct,Instruct,7B,37.74,59.2,30.47,23.54,27.75
15
+ 5,Qwen/Qwen2.5-7B,Base,7B,37.53,51.66,34.11,26.82,24.65
16
+ 4,Qwen/Qwen2-7B,Base,7B,33.26,44.81,29.87,25.11,23.68
17
+ 3,Qwen/Qwen2-7B-Instruct,Instruct,7B,30.58,46.25,23.29,22.21,24.48
18
+ 13,meta-llama/Meta-Llama-3-8B-Instruct,Instruct,8B,30.44,41.36,28.91,21.04,23.91
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler
2
+ black
3
+ datasets
4
+ gradio
5
+ gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
+ gradio_client
8
+ huggingface-hub>=0.18.0
9
+ matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
13
+ tqdm
14
+ transformers
15
+ tokenizers>=0.15.0
16
+ sentencepiece
src/config.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DESCRIPTION CONFIG
2
+
3
+ # Title for the leaderboard page
4
+ TITLE = """<h1 align="center" id="space-title">Sahabat-AI Leaderboard</h1>"""
5
+
6
+ # Introduction text providing an overview of the leaderboard
7
+ INTRODUCTION_TEXT = """
8
+ Sahabat-AI (Indonesian language for "close friends") is a collection of large language models which has been pretrained and instruct-tuned for Indonesian language and its various local languages.
9
+ This leaderboard evaluates general language capabilities of Sahabat-AI and other open source models using SEA-HELM and IndoMMLU, focusing on Indonesian, Javanese, Sundanese, Balinese, and Batak.
10
+ """
11
+
12
+ # Detailed information about benchmark tasks evaluated in the leaderboard
13
+ INFO_BENCHMARK_TASK = """
14
+ ## Overview
15
+ This leaderboard evaluates the performance of various Large Language Models (LLMs) using SEA-HELM and IndoMMLU.
16
+ SEA-HELM is a benchmark that evaluates LLM on Natural Language Processing (NLP) classic tasks, safety, linguistics, culture, instruction following, and chat capabilities.
17
+ We focus on Indonesian, Javanese, Sundanese, Balinese, and Batak languages, adding tasks that are relevant to these languages.
18
+ IndoMMLU covers various subjects and educational levels, including STEM, social sciences, humanities, Indonesian language, and local languages & cultures.
19
+
20
+ ## Competencies
21
+
22
+ ### Natural Language Understanding (NLU)
23
+ - **Sentiment Analysis:** Classifies sentences as positive, negative, or neutral.
24
+ - **Question Answering (QA):** Answers questions based on a given passage. For Javanese and Sundanese, we employ a multiple-choice format.
25
+ - **Metaphor Recognition:** Selects between two options that best explain a given metaphorical sentence.
26
+
27
+ ### Natural Language Generation (NLG)
28
+ - **Translation:** For Indonesian, we evaluate translation to and from English. For the local languages, we evaluate translation to and from Indonesian.
29
+ - **Abstractive Summarization:** Summarize a passage into 1 or 2 sentences.
30
+
31
+ ### Natural Language Reasoning (NLR)
32
+ - **Causal Reasoning:** Given a premise and two options, select one which is the cause or effect of the premise.
33
+ - **Natural Language Inference (NLI):** Determine the relationship between a premise and hypothesis, classifying it as entailment, contradiction, or neutral.
34
+
35
+ ### Safety
36
+ - **Toxicity Detection:** Classifies sentences as toxic, hate speech, or clean.
37
+
38
+ ### Linguistic Diagnostics
39
+ - **Syntax:** Selects the grammatically correct sentence from two minimally differed sentences.
40
+ - **Pragmatics:** Given a situation, determines whether a sentence is true or false.
41
+
42
+ ### Instruction Following
43
+ - Follows human instructions to respond using a specific format, e.g., using JSON, mentioning a certain keyword, or providing a specific number of sentences.
44
+
45
+ ### Multi Turn
46
+ - Holds a human-like conversation in a multi-turn setting.
47
+ """
48
+
49
+ # Explanation of score calculation methodology
50
+ INFO_SCORE_CALCULATION = """
51
+ - The **overall score** for a language is computed as the **average** of all competency scores.
52
+ - Each **competency score** is computed as the **average** of its tasks.
53
+ - Normalization is applied for classification tasks by substracting the random baseline score and scaling it to the range of 0-100.
54
+ """
55
+
56
+ # Placeholder information about GoTo and Sahabat AI
57
+ INFO_GOTO_SAHABAT_AI = """
58
+ Sahabat-AI (Indonesian language for “close friends”) is a local open source Large Language Model (LLM) ecosystem in Indonesian language, co-initiated by Indonesian tech and telecommunication companies: GoTo Group and Indosat Ooredoo Hutchison. Sahabat-AI ecosystem aims to empower Indonesians who want to develop AI-based services and applications using Bahasa Indonesia and its various local languages.
59
+
60
+ We are supported by research centers and global tech experts such as AI Singapore to train the model to gain general language understanding.
61
+
62
+ We also collaborate with key top Indonesia universities such as University of Indonesia, Gadjah Mada University, Bogor Institute of Agriculture, Bandung Institute of Technology, University of North Sumatera (Universitas Sumatera Utara), and Udayana University, including top Indonesian media groups, such as Kompas Gramedia Group, and Republika, Tempo, and Hukumonline to train and enrich the model in Bahasa Indonesia, ensuring optimum provision of local context and cultural relevance.
63
+
64
+ We would like to invite researchers, developers, and language enthusiasts to actively contribute to the enhancement and expansion of Sahabat-AI. Your collaborations can involve:
65
+ - Identifying and reporting technical issues
66
+ - Sharing pre-training, instruction, and preference data
67
+ - Improving documentation usability
68
+ - Proposing and implementing new model evaluation tasks and metrics
69
+
70
+ Join us in shaping the future of Sahabat-AI by sharing your expertise and insights to make these models more accessible, accurate, and versatile.
71
+
72
+ You can contribute your ideas through [this form](https://docs.google.com/forms/d/1_us969eQtEooYOn4XkvGkdP5VHOyCbO6L_sd9kTMnaA).
73
+ """
74
+
75
+ CITATIONS = """
76
+ ```
77
+ @misc{susanto2025seahelmsoutheastasianholistic,
78
+ title={SEA-HELM: Southeast Asian Holistic Evaluation of Language Models},
79
+ author={Yosephine Susanto and Adithya Venkatadri Hulagadri and Jann Railey Montalan and Jian Gang Ngui and Xian Bin Yong and Weiqi Leong and Hamsawardhini Rengarajan and Peerat Limkonchotiwat and Yifan Mai and William Chandra Tjhi},
80
+ year={2025},
81
+ eprint={2502.14301},
82
+ archivePrefix={arXiv},
83
+ primaryClass={cs.CL},
84
+ url={https://arxiv.org/abs/2502.14301},
85
+ }
86
+ ```
87
+ ```
88
+ @inproceedings{koto-etal-2023-indommlu,
89
+ title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}",
90
+ author = "Fajri Koto and Nurul Aisyah and Haonan Li and Timothy Baldwin",
91
+ booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
92
+ month = December,
93
+ year = "2023",
94
+ address = "Singapore",
95
+ publisher = "Association for Computational Linguistics",
96
+ }
97
+ ```
98
+ """
99
+
100
+ # LEADERBOARD CONFIGURATION
101
+
102
+ # Path to the JSON file containing model performance data
103
+ file_path = "config/model_performance.jsonl"
104
+
105
+ # Label for the average score of SEA-HELM Indonesian languages
106
+ avg_label = "Indonesian Languages Average"
107
+
108
+ # Number of decimal places for rounding scores
109
+ round_precision = 2
110
+
111
+ # Delimiter used in dataset keys
112
+ delimiter = "."
113
+
114
+ model_types = ["Instruct", "Base"]
115
+
116
+ # Base information about model to be displayed in every leaderboard
117
+ # key is from JSONL, so it must be the same
118
+ # display used as column name in leaderboard
119
+ base_info = [
120
+ {
121
+ "key": "model_name",
122
+ "display": "Model"
123
+ },
124
+ {
125
+ "key": "model_type",
126
+ "display": "Type"
127
+ },
128
+ {
129
+ "key": "model_size",
130
+ "display": "Size"
131
+ },
132
+ ]
133
+
134
+ # List of languages evaluated in the leaderboard
135
+ # key: is from JSONL, so it must be the same
136
+ # display: used as column name in overall leaderboard
137
+ # main_table_avg: determine if the language shoul be added to average in overall leaderboard
138
+ # tab: tab name in top of leaderboard
139
+ # hidden_col: list of column to be hidden from leaderboard, so it must be the same col name as in leaderboard
140
+
141
+ language_list = [
142
+ {
143
+ "key": "id",
144
+ "display": "ID",
145
+ "main_table_avg": True,
146
+ "tab": "Indonesian",
147
+ "hidden_col": ["nlg", "nlu", "nlr", "safety", "linguistic-diagnostics"]
148
+ },
149
+ {
150
+ "key": "jv",
151
+ "display": "JV",
152
+ "main_table_avg": True,
153
+ "tab": "Javanese",
154
+ "hidden_col": ["nlg", "nlu", "nlr"]
155
+ },
156
+ {
157
+ "key": "su",
158
+ "display": "SU",
159
+ "main_table_avg": True,
160
+ "tab": "Sundanese",
161
+ "hidden_col": ["nlg", "nlu", "nlr"]
162
+ },
163
+ {
164
+ "key": "ban",
165
+ "display": "BAN",
166
+ "main_table_avg": True,
167
+ "tab": "Balinese",
168
+ "hidden_col": ["nlg", "nlu", "nlr"]
169
+ },
170
+ {
171
+ "key": "bbc",
172
+ "display": "BBC",
173
+ "main_table_avg": True,
174
+ "tab": "Batak",
175
+ "hidden_col": ["nlg", "nlu", "nlr"]
176
+ },
177
+ {
178
+ "key": "indommlu",
179
+ "display": "IndoMMLU",
180
+ "main_table_avg": False,
181
+ "tab": "IndoMMLU",
182
+ "hidden_col": []
183
+ }
184
+ ]
185
+
186
+ hidden_tabs = [
187
+ ("Base", "IndoMMLU")
188
+ ]
src/populate.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+
4
+ from .config import base_info, language_list, delimiter, avg_label, round_precision
5
+
6
+ def load_tables(file_path: str) -> pd.DataFrame:
7
+ """
8
+ Load and process the leaderboard data from a JSONL file.
9
+ - Flattens nested JSON structures.
10
+ - Computes total scores for each language.
11
+ - Sorts models by their performance.
12
+ """
13
+ data = []
14
+ with open(file_path, "r", encoding="utf-8") as f:
15
+ for line in f:
16
+ json_obj = json.loads(line) # Load each JSON object from the file
17
+ flattened = pd.json_normalize(json_obj, sep=delimiter) # Flatten the nested JSON structure
18
+ data.append(flattened)
19
+
20
+ # Combine all JSON objects into a single DataFrame
21
+ df = pd.concat(data, ignore_index=True)
22
+
23
+ # Round numeric values to the specified precision
24
+ df = df.map(lambda x: round(x, round_precision) if isinstance(x, (int, float)) else x)
25
+
26
+ base = pd.DataFrame()
27
+
28
+ # Extract base information (e.g., model name, type, size)
29
+ for info in base_info:
30
+ base[info["display"]] = df[info["key"]]
31
+
32
+ # Create the main leaderboard table
33
+ main_table = base.copy()
34
+
35
+ detailed_tables = []
36
+
37
+ for lang in language_list:
38
+ # Add total scores for each language to the main table
39
+ main_table[lang['display']] = df[f"{lang['key']}{delimiter}total"]
40
+
41
+ # Identify all columns related to the language
42
+ cols = [col for col in df.columns if col.startswith(lang["key"])]
43
+ total_col = None
44
+ table = base.copy()
45
+
46
+ for col in cols:
47
+ display_col = col.split(delimiter)[:-1] # Extract display column name
48
+
49
+ # Identify the total column (if it exists)
50
+ if len(display_col) == 1:
51
+ total_col = col
52
+
53
+ # Format column name for better readability
54
+ display_col = col if len(display_col) < 2 else " - ".join(display_col[1:])
55
+ table[display_col] = df[col]
56
+
57
+ # If a total column exists, move it to the front and sort the table
58
+ if total_col:
59
+ total_col_data = table.pop(total_col)
60
+ table.insert(len(base.columns), "Total", total_col_data)
61
+ table = table.sort_values(by="Total", ascending=False)
62
+
63
+ detailed_tables.append(table)
64
+
65
+ # Compute the overall average score for Indonesian languages
66
+ main_table[avg_label] = sum(
67
+ [main_table[lang["display"]] if lang["main_table_avg"] else 0 for lang in language_list]
68
+ )
69
+ main_table[avg_label] = round(
70
+ main_table[avg_label] / sum(lang["main_table_avg"] for lang in language_list), round_precision
71
+ )
72
+
73
+ # Move the average score column to the rightmost position
74
+ last_col = main_table.pop(main_table.columns[-1])
75
+ main_table.insert(len(base.columns), last_col.name, last_col)
76
+
77
+ # Sort models by the average score in descending order
78
+ main_table = main_table.sort_values(by=avg_label, ascending=False)
79
+
80
+ # Return structured leaderboard tables (overall + language-specific)
81
+ return [{"name": "Overall", "table": main_table, "hidden_col": []}] + [
82
+ {"name": lang["tab"], "table": table, "hidden_col": lang["hidden_col"]}
83
+ for lang, table in zip(language_list, detailed_tables)
84
+ ]