Commit
·
ef54478
0
Parent(s):
Initial commit of Sahabat-AI Leaderboard
Browse files- .gitattributes +35 -0
- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- README.md +44 -0
- app.py +65 -0
- config/model_performance.jsonl +28 -0
- dataframe.csv +18 -0
- pyproject.toml +13 -0
- requirements.txt +16 -0
- src/config.py +188 -0
- src/populate.py +84 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
auto_evals/
|
2 |
+
venv/
|
3 |
+
__pycache__/
|
4 |
+
.env
|
5 |
+
.ipynb_checkpoints
|
6 |
+
*ipynb
|
7 |
+
.vscode/
|
8 |
+
|
9 |
+
eval-queue/
|
10 |
+
eval-results/
|
11 |
+
eval-queue-bk/
|
12 |
+
eval-results-bk/
|
13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
default_language_version:
|
16 |
+
python: python3
|
17 |
+
|
18 |
+
ci:
|
19 |
+
autofix_prs: true
|
20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
21 |
+
autoupdate_schedule: quarterly
|
22 |
+
|
23 |
+
repos:
|
24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
25 |
+
rev: v4.3.0
|
26 |
+
hooks:
|
27 |
+
- id: check-yaml
|
28 |
+
- id: check-case-conflict
|
29 |
+
- id: detect-private-key
|
30 |
+
- id: check-added-large-files
|
31 |
+
args: ['--maxkb=1000']
|
32 |
+
- id: requirements-txt-fixer
|
33 |
+
- id: end-of-file-fixer
|
34 |
+
- id: trailing-whitespace
|
35 |
+
|
36 |
+
- repo: https://github.com/PyCQA/isort
|
37 |
+
rev: 5.12.0
|
38 |
+
hooks:
|
39 |
+
- id: isort
|
40 |
+
name: Format imports
|
41 |
+
|
42 |
+
- repo: https://github.com/psf/black
|
43 |
+
rev: 22.12.0
|
44 |
+
hooks:
|
45 |
+
- id: black
|
46 |
+
name: Format code
|
47 |
+
additional_dependencies: ['click==8.0.2']
|
48 |
+
|
49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
50 |
+
# Ruff version.
|
51 |
+
rev: 'v0.0.267'
|
52 |
+
hooks:
|
53 |
+
- id: ruff
|
Makefile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.PHONY: style format
|
2 |
+
|
3 |
+
|
4 |
+
style:
|
5 |
+
python -m black --line-length 119 .
|
6 |
+
python -m isort .
|
7 |
+
ruff check --fix .
|
8 |
+
|
9 |
+
|
10 |
+
quality:
|
11 |
+
python -m black --check --line-length 119 .
|
12 |
+
python -m isort --check-only .
|
13 |
+
ruff check .
|
README.md
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Sahabat AI Leaderboard
|
3 |
+
emoji: 🥇
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: gradio
|
7 |
+
app_file: app.py
|
8 |
+
pinned: true
|
9 |
+
license: apache-2.0
|
10 |
+
---
|
11 |
+
|
12 |
+
# Start the configuration
|
13 |
+
|
14 |
+
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
15 |
+
|
16 |
+
Results files should have the following format and be stored as json files:
|
17 |
+
```json
|
18 |
+
{
|
19 |
+
"config": {
|
20 |
+
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
21 |
+
"model_name": "path of the model on the hub: org/model",
|
22 |
+
"model_sha": "revision on the hub",
|
23 |
+
},
|
24 |
+
"results": {
|
25 |
+
"task_name": {
|
26 |
+
"metric_name": score,
|
27 |
+
},
|
28 |
+
"task_name2": {
|
29 |
+
"metric_name": score,
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
```
|
34 |
+
|
35 |
+
Request files are created automatically by this tool.
|
36 |
+
|
37 |
+
If you encounter a problem on the space, don't hesitate to restart it to remove the eval-queue, eval-queue-bk, eval-results and eval-results-bk folders it created.
|
38 |
+
|
39 |
+
# Code logic for more complex edits
|
40 |
+
|
41 |
+
You'll find
|
42 |
+
- the main table's column names and properties in `src/display/utils.py`
|
43 |
+
- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
|
44 |
+
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
app.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns

# Import function to load leaderboard tables
from src.populate import load_tables

# Import configurations and informational texts
from src.config import (
    file_path,
    model_types,
    hidden_tabs,
    INTRODUCTION_TEXT,
    TITLE,
    INFO_BENCHMARK_TASK,
    INFO_SCORE_CALCULATION,
    INFO_GOTO_SAHABAT_AI,
    CITATIONS,
)

# Build the Gradio application with a block-based UI.
# 'Blocks()' groups multiple components into a single interface.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)  # Display the main title of the application
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")  # Introductory text

    # One outer tab per model type, each containing one inner tab per
    # leaderboard table (unless that combination is configured as hidden).
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        tables = load_tables(file_path)  # Load leaderboard data from file
        for model_type in model_types:
            with gr.TabItem(model_type, elem_id="llm-benchmark-tab-table", id=model_type):
                for i, t in enumerate(tables):
                    # Skip (model type, table name) pairs listed in hidden_tabs.
                    if (model_type, t["name"]) in hidden_tabs:
                        continue
                    with gr.TabItem(t["name"], elem_id="llm-benchmark-tab-table", id=i):
                        # Keep only rows of the current model type, then drop
                        # columns that are entirely empty for that subset.
                        table_df = t["table"][t["table"]["Type"] == model_type]
                        table_df = table_df.dropna(axis=1, how="all")
                        # The component registers itself with the enclosing
                        # Blocks context; binding it to a variable is not needed.
                        Leaderboard(
                            value=table_df,  # Leaderboard data
                            search_columns=["Model"],  # Columns that can be searched
                            filter_columns=[
                                # was: table_df["Size"].name — Series.name is
                                # always the column label, so pass it directly.
                                ColumnFilter("Size", type="checkboxgroup", label="Model sizes"),
                            ],
                            hide_columns=t["hidden_col"],  # Hidden columns from config.py
                            interactive=False,
                        )

    # Additional informational sections: one collapsible accordion per topic,
    # rendered in the same order as before.
    for accordion_title, accordion_text in (
        ("📚 Benchmark Tasks", INFO_BENCHMARK_TASK),
        ("🧮 Score Calculation", INFO_SCORE_CALCULATION),
        ("🤝 About Sahabat-AI", INFO_GOTO_SAHABAT_AI),
        ("📝 Citations", CITATIONS),
    ):
        with gr.Row():
            with gr.Accordion(accordion_title, open=False):
                gr.Markdown(accordion_text, elem_classes="markdown-text")

# Run the application
demo.launch()
|
config/model_performance.jsonl
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model_name": "meta-llama/Llama-3.1-8B", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 70.62232177639267}, "qa": {"total": 74.0400264179707}, "metaphor": {"total": 59.26640926640927}, "total": 67.9762524869242}, "safety": {"toxicity": {"total": 49.02062654372765}, "total": 49.02062654372765}, "nlg": {"abssum": {"total": 23.608069870057587}, "translation-en-xx": {"total": 89.18834918478261}, "translation-xx-en": {"total": 91.81120306324111}, "total": 57.053922997034725}, "nlr": {"causal": {"total": 72.0}, "nli": {"total": 60.66223252911016}, "total": 66.33111626455508}, "linguistic-diagnostics": {"mp-r": {"total": 38.467261904761926}, "pragmatics": {"total": 0}, "total": 19.233630952380963}, "instruction-following": {"total": 17.142857142857142}, "total": 46.126401064579966}, "jv": {"nlu": {"sentiment": {"total": 60.85693940041301}, "qa-mc": {"total": 60.47619047619048}, "metaphor": {"total": 23.65191146881287}, "total": 48.32834711513879}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 62.24888959390863}, "translation-xx-id": {"total": 76.60850253807106}, "total": 69.42869606598984}, "total": 43.696792171487324}, "su": {"nlu": {"sentiment": {"total": 69.74912767927081}, "qa-mc": {"total": 45.714285714285715}, "metaphor": {"total": 10.994668137525277}, "total": 42.15269384369393}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 57.64538388324873}, "translation-xx-id": {"total": 71.5006543464467}, "total": 64.57301911484771}, "total": 41.606983684593246}, "ban": {"nlu": {"sentiment": {"total": 52.53414512568539}, "total": 52.53414512568539}, "instruction-following": {"total": 26.666666666666668}, "nlg": {"translation-id-xx": {"total": 52.05575824873097}, "translation-xx-id": {"total": 63.63691703680203}, "total": 57.8463376427665}, "total": 45.68238314503952}, "bbc": {"nlu": {"sentiment": {"total": 19.372142704550303}, "qa": {"total": 
36.86877604474056}, "total": 28.120459374645428}, "instruction-following": {"total": 21.904761904761905}, "nlg": {"translation-id-xx": {"total": 22.465418781725887}, "translation-xx-id": {"total": 53.68107550761421}, "total": 38.07324714467005}, "total": 29.366156141359127}}
|
2 |
+
{"model_name": "meta-llama/Meta-Llama-3-8B", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 71.67852930355365}, "qa": {"total": 74.29545907237966}, "metaphor": {"total": 57.901268615554315}, "total": 67.95841899716254}, "safety": {"toxicity": {"total": 51.13260813250443}, "total": 51.13260813250443}, "nlg": {"abssum": {"total": 22.55571208466509}, "translation-en-xx": {"total": 89.17333791378458}, "translation-xx-en": {"total": 91.6809458374506}, "total": 56.49142698014134}, "nlr": {"causal": {"total": 68.8}, "nli": {"total": 51.71622423858406}, "total": 60.25811211929203}, "linguistic-diagnostics": {"mp-r": {"total": 43.08035714285714}, "pragmatics": {"total": 0}, "total": 21.54017857142857}, "instruction-following": {"total": 13.333333333333334}, "total": 45.11901302231038}, "jv": {"nlu": {"sentiment": {"total": 66.6700135298725}, "qa-mc": {"total": 58.57142857142858}, "metaphor": {"total": 24.28571428571429}, "total": 49.842385462338456}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 62.900222081218274}, "translation-xx-id": {"total": 77.09444400380711}, "total": 69.9973330425127}, "total": 43.75609664447419}, "su": {"nlu": {"sentiment": {"total": 67.27850174464145}, "qa-mc": {"total": 43.80952380952382}, "metaphor": {"total": 4.908990623276344}, "total": 38.6656720591472}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 55.41981281725889}, "translation-xx-id": {"total": 72.32473429568527}, "total": 63.87227355647208}, "total": 38.306299332190555}, "ban": {"nlu": {"sentiment": {"total": 52.15723136082033}, "total": 52.15723136082033}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 50.92592005076142}, "translation-xx-id": {"total": 64.71889871510152}, "total": 57.82240938293147}, "total": 42.056705644742664}, "bbc": {"nlu": {"sentiment": {"total": 13.632699565619873}, "qa": {"total": 
40.6976614170028}, "total": 27.165180491311336}, "instruction-following": {"total": 25.71428571428571}, "nlg": {"translation-id-xx": {"total": 20.986992385786802}, "translation-xx-id": {"total": 53.95026967005076}, "total": 37.46863102791878}, "total": 30.116032411171943}}
|
3 |
+
{"model_name": "meta-llama/Llama-3.3-70B-Instruct", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 74.03171395489761}, "qa": {"total": 76.30208921433771}, "metaphor": {"total": 75.53318624747196}, "total": 75.28899647223575}, "safety": {"toxicity": {"total": 51.150070847815535}, "total": 51.150070847815535}, "nlg": {"abssum": {"total": 17.398772409440475}, "translation-en-xx": {"total": 91.30748594985178}, "translation-xx-en": {"total": 92.44811287981719}, "total": 54.638285912137476}, "nlr": {"causal": {"total": 93.19999999999999}, "nli": {"total": 78.47882864922653}, "total": 85.83941432461326}, "linguistic-diagnostics": {"mp-r": {"total": 42.782738095238074}, "pragmatics": {"total": 74.8076923076923}, "total": 58.79521520146519}, "instruction-following": {"total": 93.33333333333333}, "multi-turn": {"total": 66.71768707482994}, "total": 69.3947147380615}, "jv": {"nlu": {"sentiment": {"total": 64.5398419141209}, "qa-mc": {"total": 85.71428571428572}, "metaphor": {"total": 47.48490945674044}, "total": 65.91301236171569}, "instruction-following": {"total": 30.476190476190478}, "nlg": {"translation-id-xx": {"total": 76.54370241116752}, "translation-xx-id": {"total": 87.42237071700508}, "total": 81.98303656408629}, "multi-turn": {"total": 69.60884353741497}, "total": 61.99527073485186}, "su": {"nlu": {"sentiment": {"total": 60.20650858078758}, "qa-mc": {"total": 76.19047619047619}, "metaphor": {"total": 34.8271741128884}, "total": 57.07471962805072}, "instruction-following": {"total": 50.476190476190474}, "nlg": {"translation-id-xx": {"total": 72.01518876903553}, "translation-xx-id": {"total": 81.19715656725889}, "total": 76.60617266814721}, "multi-turn": {"total": 74.48979591836735}, "total": 64.66171967268895}, "ban": {"nlu": {"sentiment": {"total": 55.27572456027914}, "total": 55.27572456027914}, "instruction-following": {"total": 30.476190476190478}, "nlg": {"translation-id-xx": {"total": 52.35049175126903}, 
"translation-xx-id": {"total": 73.82601324555837}, "total": 63.0882524984137}, "multi-turn": {"total": 68.41836734693878}, "total": 54.31463372045552}, "bbc": {"nlu": {"sentiment": {"total": 23.92152673930071}, "qa": {"total": 37.0966478780963}, "total": 30.509087308698504}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 40.36294416243655}, "translation-xx-id": {"total": 58.640625}, "total": 49.501784581218274}, "multi-turn": {"total": 64.21768707482994}, "total": 38.914282598329535}, "indommlu": {"total": 58.94715166043224, "STEM": {"total": 60.65596005103318}, "Humanities": {"total": 70.13279289089097}, "Social science": {"total": 68.19753140670149}, "Indonesian language": {"total": 62.79989190728599}, "Local languages and cultures": {"total": 37.77960008071277}}}
|
4 |
+
{"model_name": "meta-llama/Llama-3.1-8B-Instruct", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 72.43735932562872}, "qa": {"total": 72.98486282425593}, "metaphor": {"total": 58.645890788747934}, "total": 68.02270431287752}, "safety": {"toxicity": {"total": 38.908373094726265}, "total": 38.908373094726265}, "nlg": {"abssum": {"total": 18.678845956389008}, "translation-en-xx": {"total": 89.42876497653162}, "translation-xx-en": {"total": 91.56136132040514}, "total": 54.58695455242869}, "nlr": {"causal": {"total": 63.20000000000001}, "nli": {"total": 58.413678956243096}, "total": 60.80683947812155}, "linguistic-diagnostics": {"mp-r": {"total": 22.54464285714286}, "pragmatics": {"total": 14.32692307692307}, "total": 18.435782967032964}, "instruction-following": {"total": 76.19047619047619}, "multi-turn": {"total": 52.61904761904762}, "total": 52.79573974495869}, "jv": {"nlu": {"sentiment": {"total": 53.942391226945794}, "qa-mc": {"total": 58.57142857142858}, "metaphor": {"total": 17.444668008048293}, "total": 43.319495935474215}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 59.02189086294416}, "translation-xx-id": {"total": 75.13764078362944}, "total": 67.0797658232868}, "multi-turn": {"total": 53.40136054421769}, "total": 43.33110795669706}, "su": {"nlu": {"sentiment": {"total": 43.91839350566117}, "qa-mc": {"total": 43.33333333333333}, "metaphor": {"total": 5.244530244530243}, "total": 30.832085694508248}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 46.92480964467005}, "translation-xx-id": {"total": 61.685398159898476}, "total": 54.30510390228426}, "multi-turn": {"total": 58.11224489795919}, "total": 37.717120528449826}, "ban": {"nlu": {"sentiment": {"total": 33.98760948515275}, "total": 33.98760948515275}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 35.4739451142132}, 
"translation-xx-id": {"total": 59.36871430837564}, "total": 47.42132971129442}, "multi-turn": {"total": 52.908163265306115}, "total": 35.9602279963907}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 25.257711433282036}, "total": 12.628855716641018}, "instruction-following": {"total": 0.9523809523809524}, "nlg": {"translation-id-xx": {"total": 30.788229695431472}, "translation-xx-id": {"total": 38.916164340101524}, "total": 34.8521970177665}, "multi-turn": {"total": 43.265306122448976}, "total": 22.924684952309363}, "indommlu": {"total": 40.89436643992288, "STEM": {"total": 39.189826629857905}, "Humanities": {"total": 48.02667298928306}, "Social science": {"total": 51.41311806730703}, "Indonesian language": {"total": 47.57202823512603}, "Local languages and cultures": {"total": 21.777177572443968}}}
|
5 |
+
{"model_name": "meta-llama/Llama-3.1-70B-Instruct", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 74.03171395489761}, "qa": {"total": 75.11092022580092}, "metaphor": {"total": 70.78047435190294}, "total": 73.30770284420049}, "safety": {"toxicity": {"total": 52.48912859389008}, "total": 52.48912859389008}, "nlg": {"abssum": {"total": 22.789696909710898}, "translation-en-xx": {"total": 91.94257001914525}, "translation-xx-en": {"total": 92.4886296087574}, "total": 57.50264836183111}, "nlr": {"causal": {"total": 92.39999999999999}, "nli": {"total": 74.06280477957523}, "total": 83.2314023897876}, "linguistic-diagnostics": {"mp-r": {"total": 54.077380952380935}, "pragmatics": {"total": 57.01923076923077}, "total": 55.548305860805854}, "instruction-following": {"total": 83.80952380952381}, "multi-turn": {"total": 64.28571428571429}, "total": 67.16777516367902}, "jv": {"nlu": {"sentiment": {"total": 65.16378266752119}, "qa-mc": {"total": 84.14785756249171}, "metaphor": {"total": 47.40442655935615}, "total": 65.57202226312302}, "instruction-following": {"total": 24.761904761904763}, "nlg": {"translation-id-xx": {"total": 79.32576538705584}, "translation-xx-id": {"total": 85.88955425126903}, "total": 82.60765981916244}, "multi-turn": {"total": 69.84693877551021}, "total": 60.69713140492511}, "su": {"nlu": {"sentiment": {"total": 63.15274513992737}, "qa-mc": {"total": 72.35942918869749}, "metaphor": {"total": 25.284978856407434}, "total": 53.599051061677436}, "instruction-following": {"total": 40.95238095238095}, "nlg": {"translation-id-xx": {"total": 74.6673144035533}, "translation-xx-id": {"total": 81.10521097715736}, "total": 77.88626269035532}, "multi-turn": {"total": 66.20748299319727}, "total": 59.66129442440274}, "ban": {"nlu": {"sentiment": {"total": 58.48444064658549}, "total": 58.48444064658549}, "instruction-following": {"total": 25.71428571428571}, "nlg": {"translation-id-xx": {"total": 50.2741116751269}, "translation-xx-id": 
{"total": 73.7688075031726}, "total": 62.02145958914975}, "multi-turn": {"total": 68.36734693877551}, "total": 53.646883222199115}, "bbc": {"nlu": {"sentiment": {"total": 21.778466139713746}, "qa": {"total": 36.54511563444106}, "total": 29.1617908870774}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 40.4752538071066}, "translation-xx-id": {"total": 57.3776173857868}, "total": 48.926435596446694}, "multi-turn": {"total": 56.173469387755105}, "total": 36.42256682496266}, "indommlu": {"total": 59.20845024270631, "STEM": {"total": 61.23914364790225}, "Humanities": {"total": 70.01440666360243}, "Social science": {"total": 68.1316435966181}, "Indonesian language": {"total": 63.557422093070194}, "Local languages and cultures": {"total": 37.485822556014845}}}
|
6 |
+
{"model_name": "meta-llama/Meta-Llama-3-8B-Instruct", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 65.89674501147039}, "qa": {"total": 71.49747763705508}, "metaphor": {"total": 55.906416620702345}, "total": 64.43354642307594}, "safety": {"toxicity": {"total": 30.5564686517935}, "total": 30.5564686517935}, "nlg": {"abssum": {"total": 18.04238577912569}, "translation-en-xx": {"total": 87.625}, "translation-xx-en": {"total": 90.98604007766181}, "total": 53.6739529089783}, "nlr": {"causal": {"total": 68.00000000000001}, "nli": {"total": 58.605562589496515}, "total": 63.302781294748264}, "linguistic-diagnostics": {"mp-r": {"total": 20.818452380952365}, "pragmatics": {"total": 30.57692307692306}, "total": 25.697687728937712}, "instruction-following": {"total": 28.57142857142857}, "multi-turn": {"total": 55.90136054421768}, "total": 46.01960373188285}, "jv": {"nlu": {"sentiment": {"total": 42.61952574236275}, "qa-mc": {"total": 48.0952380952381}, "metaphor": {"total": 16.056338028169016}, "total": 35.59036728858996}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 43.50182423857868}, "translation-xx-id": {"total": 70.62736952728426}, "total": 57.06459688293147}, "multi-turn": {"total": 61.683673469387756}, "total": 39.29894512451301}, "su": {"nlu": {"sentiment": {"total": 28.000000000000004}, "qa-mc": {"total": 31.428571428571427}, "metaphor": {"total": 0}, "total": 19.80952380952381}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 45.93464467005076}, "translation-xx-id": {"total": 58.81846843274111}, "total": 52.376556551395936}, "multi-turn": {"total": 56.88775510204081}, "total": 33.22083981812109}, "ban": {"nlu": {"sentiment": {"total": 0}, "total": 0.0}, "instruction-following": {"total": 0.9523809523809524}, "nlg": {"translation-id-xx": {"total": 57.54861992385787}, "translation-xx-id": {"total": 52.827906884517766}, "total": 
55.18826340418782}, "multi-turn": {"total": 54.48979591836734}, "total": 27.65761006873403}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 18.09226209386497}, "total": 9.046131046932485}, "nlg": {"translation-id-xx": {"total": 18.681789340101524}, "translation-xx-id": {"total": 32.99952411167513}, "total": 25.840656725888326}, "multi-turn": {"total": 48.33333333333334}, "total": 27.740040368718052}, "indommlu": {"total": 32.80503442646069, "STEM": {"total": 30.745384112748297}, "Humanities": {"total": 40.495667733523575}, "Social science": {"total": 44.50789696902289}, "Indonesian language": {"total": 36.58659081993335}, "Local languages and cultures": {"total": 15.699814759911021}}}
|
7 |
+
{"model_name": "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 80.03073193957495}, "qa": {"total": 73.67491779837458}, "metaphor": {"total": 63.297481154624016}, "total": 72.33437696419118}, "safety": {"toxicity": {"total": 49.08728635049469}, "total": 49.08728635049469}, "nlg": {"abssum": {"total": 21.912194636447175}, "translation-en-xx": {"total": 91.51027513586956}, "translation-xx-en": {"total": 91.94370869874012}, "total": 56.819593276876006}, "nlr": {"causal": {"total": 76.0}, "nli": {"total": 55.57921414897125}, "total": 65.78960707448563}, "linguistic-diagnostics": {"mp-r": {"total": 33.928571428571445}, "pragmatics": {"total": 22.307692307692317}, "total": 28.118131868131883}, "instruction-following": {"total": 79.04761904761905}, "multi-turn": {"total": 52.312925170068034}, "total": 57.64421996455234}, "jv": {"nlu": {"sentiment": {"total": 76.31624296802674}, "qa-mc": {"total": 56.66666666666668}, "metaphor": {"total": 39.81891348088533}, "total": 57.600607705192914}, "instruction-following": {"total": 30.476190476190478}, "nlg": {"translation-id-xx": {"total": 72.59359137055837}, "translation-xx-id": {"total": 85.57393123413705}, "total": 79.08376130234771}, "multi-turn": {"total": 58.40136054421768}, "total": 56.3904800069872}, "su": {"nlu": {"sentiment": {"total": 68.27543972085736}, "qa-mc": {"total": 43.8095238095238}, "metaphor": {"total": 24.774774774774766}, "total": 45.61991276838531}, "instruction-following": {"total": 40.95238095238095}, "nlg": {"translation-id-xx": {"total": 63.15315672588832}, "translation-xx-id": {"total": 80.32120479060913}, "total": 71.73718075824873}, "multi-turn": {"total": 55.2891156462585}, "total": 53.399647531318365}, "ban": {"nlu": {"sentiment": {"total": 37.35099337748344}, "total": 37.35099337748344}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 53.2818845177665}, 
"translation-xx-id": {"total": 62.99171161167513}, "total": 58.136798064720814}, "multi-turn": {"total": 47.32993197278911}, "total": 38.79966894898644}, "bbc": {"nlu": {"sentiment": {"total": 7.942604856512146}, "qa": {"total": 25.678260705469608}, "total": 16.810432780990876}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 23.885786802030456}, "translation-xx-id": {"total": 44.879917512690355}, "total": 34.382852157360404}, "multi-turn": {"total": 42.89115646258503}, "total": 26.85444368356741}, "indommlu": {"total": 31.99357367401823, "STEM": {"total": 33.22726354840381}, "Humanities": {"total": 41.0664295998681}, "Social science": {"total": 41.72069115515451}, "Indonesian language": {"total": 31.98855253821362}, "Local languages and cultures": {"total": 16.08184881924881}}}
|
8 |
+
{"model_name": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct", "model_type": "Instruct", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 78.58212407479547}, "qa": {"total": 78.3094264272406}, "metaphor": {"total": 73.4969663541092}, "total": 76.79617228538176}, "safety": {"toxicity": {"total": 36.55204361614938}, "total": 36.55204361614938}, "nlg": {"abssum": {"total": 21.005682373720838}, "translation-en-xx": {"total": 92.34778990010498}, "translation-xx-en": {"total": 92.74215179563983}, "total": 56.775326610796625}, "nlr": {"causal": {"total": 88.39999999999999}, "nli": {"total": 64.9508863012699}, "total": 76.67544315063495}, "linguistic-diagnostics": {"mp-r": {"total": 43.764880952380935}, "pragmatics": {"total": 58.07692307692307}, "total": 50.920902014652}, "instruction-following": {"total": 85.71428571428571}, "multi-turn": {"total": 55.935374149659864}, "total": 62.767078220222906}, "jv": {"nlu": {"sentiment": {"total": 76.60243537705617}, "qa-mc": {"total": 83.33333333333334}, "metaphor": {"total": 50.30181086519114}, "total": 70.07919319186021}, "instruction-following": {"total": 49.523809523809526}, "nlg": {"translation-id-xx": {"total": 73.32515069796955}, "translation-xx-id": {"total": 88.26601165926395}, "total": 80.79558117861674}, "multi-turn": {"total": 66.39455782312925}, "total": 66.69828542935393}, "su": {"nlu": {"sentiment": {"total": 72.50786868902657}, "qa-mc": {"total": 71.42857142857143}, "metaphor": {"total": 37.57124471410185}, "total": 60.50256161056662}, "instruction-following": {"total": 55.23809523809524}, "nlg": {"translation-id-xx": {"total": 62.6653711928934}, "translation-xx-id": {"total": 83.43284026015229}, "total": 73.04910572652284}, "multi-turn": {"total": 69.4047619047619}, "total": 64.54863111998665}, "ban": {"nlu": {"sentiment": {"total": 55.57380901516771}, "total": 55.57380901516771}, "instruction-following": {"total": 6.666666666666667}, "nlg": {"translation-id-xx": {"total": 34.44487626903553}, 
"translation-xx-id": {"total": 59.96829394035533}, "total": 47.206585104695435}, "multi-turn": {"total": 59.336734693877546}, "total": 42.195948870101844}, "bbc": {"nlu": {"sentiment": {"total": 11.256070640176597}, "qa": {"total": 25.453627519664387}, "total": 18.35484907992049}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 32.850372779187815}, "translation-xx-id": {"total": 26.463039340101524}, "total": 29.65670605964467}, "multi-turn": {"total": 54.96598639455783}, "total": 29.077718716864084}, "indommlu": {"total": 52.11201654890406, "STEM": {"total": 53.8416812123537}, "Humanities": {"total": 58.044855388925995}, "Social science": {"total": 59.625418126693894}, "Indonesian language": {"total": 57.43955221495361}, "Local languages and cultures": {"total": 34.63575868668624}}}
|
9 |
+
{"model_name": "GoToCompany/llama3-8b-cpt-sahabatai-v1-base", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 77.33188059992209}, "qa": {"total": 74.70098262553658}, "metaphor": {"total": 68.10994668137526}, "total": 73.3809366356113}, "safety": {"toxicity": {"total": 54.40860992141678}, "total": 54.40860992141678}, "nlg": {"abssum": {"total": 24.768063265770042}, "translation-en-xx": {"total": 91.82524356163538}, "translation-xx-en": {"total": 92.12213665506114}, "total": 58.370876687059145}, "nlr": {"causal": {"total": 78.4}, "nli": {"total": 65.01227670867794}, "total": 71.70613835433898}, "linguistic-diagnostics": {"mp-r": {"total": 37.85714285714286}, "pragmatics": {"total": 32.21153846153846}, "total": 35.03434065934066}, "instruction-following": {"total": 15.238095238095239}, "total": 51.356499582643686}, "jv": {"nlu": {"sentiment": {"total": 73.97244178594318}, "qa-mc": {"total": 64.76190476190476}, "metaphor": {"total": 51.026156941649894}, "total": 63.25350116316594}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 82.35594463832487}, "translation-xx-id": {"total": 88.25025777284264}, "total": 85.30310120558376}, "total": 55.55061348799594}, "su": {"nlu": {"sentiment": {"total": 75.53820408744569}, "qa-mc": {"total": 54.76190476190478}, "metaphor": {"total": 39.630446773303916}, "total": 56.64351854088479}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 78.38067100253807}, "translation-xx-id": {"total": 87.74551871827411}, "total": 83.0630948604061}, "total": 53.55299811789062}, "ban": {"nlu": {"sentiment": {"total": 58.36509292886136}, "total": 58.36509292886136}, "instruction-following": {"total": 24.761904761904763}, "nlg": {"translation-id-xx": {"total": 43.015704314720814}, "translation-xx-id": {"total": 69.08879282994924}, "total": 56.05224857233503}, "total": 46.393082087700385}, "bbc": {"nlu": {"sentiment": {"total": 
19.190699992879008}, "qa": {"total": 36.80323137445211}, "total": 27.996965683665557}, "instruction-following": {"total": 24.761904761904763}, "nlg": {"translation-id-xx": {"total": 20.583439086294415}, "translation-xx-id": {"total": 58.37277918781726}, "total": 39.47810913705584}, "total": 30.74565986087539}}
|
10 |
+
{"model_name": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-base", "model_type": "Base", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 79.61513385707484}, "qa": {"total": 74.32736269344839}, "metaphor": {"total": 71.46534289391431}, "total": 75.13594648147918}, "safety": {"toxicity": {"total": 63.078376622102674}, "total": 63.078376622102674}, "nlg": {"abssum": {"total": 26.31522882135615}, "translation-en-xx": {"total": 92.13952684967886}, "translation-xx-en": {"total": 92.6986272156003}, "total": 59.36715292699786}, "nlr": {"causal": {"total": 87.99999999999999}, "nli": {"total": 50.45527660978115}, "total": 69.22763830489058}, "linguistic-diagnostics": {"mp-r": {"total": 55.610119047619065}, "pragmatics": {"total": 49.90384615384615}, "total": 52.75698260073261}, "instruction-following": {"total": 34.285714285714285}, "total": 58.975301870319534}, "jv": {"nlu": {"sentiment": {"total": 77.19824823755607}, "qa-mc": {"total": 78.09523809523809}, "metaphor": {"total": 58.79275653923541}, "total": 71.3620809573432}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 84.98171795685279}, "translation-xx-id": {"total": 89.59297668147208}, "total": 87.28734731916244}, "total": 57.327587203279656}, "su": {"nlu": {"sentiment": {"total": 70.37057608773053}, "qa-mc": {"total": 70.4761904761905}, "metaphor": {"total": 45.04504504504503}, "total": 61.96393720298869}, "instruction-following": {"total": 19.047619047619047}, "nlg": {"translation-id-xx": {"total": 77.55825666243655}, "translation-xx-id": {"total": 88.59196541878173}, "total": 83.07511104060913}, "total": 54.69555576373896}, "ban": {"nlu": {"sentiment": {"total": 68.91348002563555}, "total": 68.91348002563555}, "instruction-following": {"total": 20.0}, "nlg": {"translation-id-xx": {"total": 61.31940038071066}, "translation-xx-id": {"total": 75.09899468591371}, "total": 68.20919753331219}, "total": 52.37422585298258}, "bbc": {"nlu": {"sentiment": {"total": 
25.21889909563484}, "qa": {"total": 37.68915856623248}, "total": 31.45402883093366}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 33.5982709390863}, "translation-xx-id": {"total": 64.51764752538071}, "total": 49.0579592322335}, "total": 32.55161506867477}}
|
11 |
+
{"model_name": "google/gemma-2-9b-it", "model_type": "Instruct", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 67.69724170021209}, "qa": {"total": 79.92186846501487}, "metaphor": {"total": 74.20022062879205}, "total": 73.93977693133968}, "safety": {"toxicity": {"total": 37.87798080322485}, "total": 37.87798080322485}, "nlg": {"abssum": {"total": 19.35211019808279}, "translation-en-xx": {"total": 92.32352106750247}, "translation-xx-en": {"total": 92.73650375185277}, "total": 55.941061303880204}, "nlr": {"causal": {"total": 88.39999999999999}, "nli": {"total": 61.13890565692307}, "total": 74.76945282846154}, "linguistic-diagnostics": {"mp-r": {"total": 34.747023809523796}, "pragmatics": {"total": 50.86538461538461}, "total": 42.806204212454205}, "instruction-following": {"total": 92.38095238095238}, "multi-turn": {"total": 64.09863945578232}, "total": 63.116295416585025}, "jv": {"nlu": {"sentiment": {"total": 61.17040518407748}, "qa-mc": {"total": 75.23809523809524}, "metaphor": {"total": 39.708249496981885}, "total": 58.705583306384874}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 68.26288864213198}, "translation-xx-id": {"total": 84.07596367385787}, "total": 76.16942615799493}, "multi-turn": {"total": 71.61564625850339}, "total": 55.90837821643508}, "su": {"nlu": {"sentiment": {"total": 60.04358043153173}, "qa-mc": {"total": 68.57142857142856}, "metaphor": {"total": 24.080713366427652}, "total": 50.898574123129315}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 60.161326142131976}, "translation-xx-id": {"total": 79.23067695114213}, "total": 69.69600154663705}, "multi-turn": {"total": 67.87414965986393}, "total": 52.35527657050281}, "ban": {"nlu": {"sentiment": {"total": 52.08858506017232}, "total": 52.08858506017232}, "instruction-following": {"total": 20.0}, "nlg": {"translation-id-xx": {"total": 40.809327411167516}, "translation-xx-id": {"total": 
72.12482154187818}, "total": 56.467074476522846}, "multi-turn": {"total": 63.23129251700681}, "total": 47.946738013425495}, "bbc": {"nlu": {"sentiment": {"total": 12.640390230007828}, "qa": {"total": 35.19812281429937}, "total": 23.919256522153596}, "instruction-following": {"total": 10.476190476190476}, "nlg": {"translation-id-xx": {"total": 26.89752538071066}, "translation-xx-id": {"total": 39.81813134517766}, "total": 33.35782836294416}, "multi-turn": {"total": 60.71428571428571}, "total": 32.116890268893485}, "indommlu": {"total": 48.999188639177675, "STEM": {"total": 51.07174652386319}, "Humanities": {"total": 55.98237552708435}, "Social science": {"total": 58.995928840011516}, "Indonesian language": {"total": 55.75432178720666}, "Local languages and cultures": {"total": 25.93271589159855}}}
|
12 |
+
{"model_name": "google/gemma-2-9b", "model_type": "Base", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 78.6433985845994}, "qa": {"total": 71.57453630485351}, "metaphor": {"total": 74.1956241956242}, "total": 74.8045196950257}, "safety": {"toxicity": {"total": 59.060094322285295}, "total": 59.060094322285295}, "nlg": {"abssum": {"total": 26.225004066556945}, "translation-en-xx": {"total": 91.69915120121047}, "translation-xx-en": {"total": 92.80231412503088}, "total": 59.23786836483881}, "nlr": {"causal": {"total": 90.8}, "nli": {"total": 43.8597978672891}, "total": 67.32989893364456}, "linguistic-diagnostics": {"mp-r": {"total": 43.05059523809524}, "pragmatics": {"total": 49.519230769230774}, "total": 46.28491300366301}, "instruction-following": {"total": 4.761904761904762}, "total": 51.91319984689369}, "jv": {"nlu": {"sentiment": {"total": 77.6460158085879}, "qa-mc": {"total": 71.42857142857144}, "metaphor": {"total": 43.14889336016097}, "total": 64.0744935324401}, "instruction-following": {"total": 0.9523809523809524}, "nlg": {"translation-id-xx": {"total": 73.56111199238579}, "translation-xx-id": {"total": 81.72416322969544}, "total": 77.64263761104061}, "total": 47.556504031953885}, "su": {"nlu": {"sentiment": {"total": 71.01381471195613}, "qa-mc": {"total": 63.33333333333333}, "metaphor": {"total": 25.36771465342893}, "total": 53.23828756623946}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 58.14768401015228}, "translation-xx-id": {"total": 77.19652204949239}, "total": 67.67210302982234}, "total": 42.84314607170314}, "ban": {"nlu": {"sentiment": {"total": 49.60136722922452}, "total": 49.60136722922452}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 59.54909581218274}, "translation-xx-id": {"total": 70.86218075824873}, "total": 65.20563828521574}, "total": 43.983287552432465}, "bbc": {"nlu": {"sentiment": {"total": 21.214412874741864}, "qa": {"total": 
39.3478532528149}, "total": 30.281133063778384}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 31.178616751269036}, "translation-xx-id": {"total": 62.37186706852792}, "total": 46.775241909898476}, "total": 31.082283721384346}}
|
13 |
+
{"model_name": "google/gemma-3-27b-it", "model_type": "Instruct", "model_size": "27B", "id": {"nlu": {"sentiment": {"total": 73.50222373717699}, "qa": {"total": 78.15773978542872}, "metaphor": {"total": 69.41993013421586}, "total": 73.6932978856072}, "safety": {"toxicity": {"total": 48.404320615693344}, "total": 48.404320615693344}, "nlg": {"abssum": {"total": 16.524665817864868}, "translation-en-xx": {"total": 94.28980360671937}, "translation-xx-en": {"total": 93.21113821640316}, "total": 55.13756836471307}, "nlr": {"causal": {"total": 89.6}, "nli": {"total": 73.55881285868958}, "total": 81.57940642934479}, "linguistic-diagnostics": {"mp-r": {"total": 48.422619047619065}, "pragmatics": {"total": 60.48076923076921}, "total": 54.45169413919414}, "instruction-following": {"total": 85.71428571428571}, "multi-turn": {"total": 74.8809523809524}, "total": 67.69450364711295}, "jv": {"nlu": {"sentiment": {"total": 68.99366232286546}, "qa-mc": {"total": 87.61904761904762}, "metaphor": {"total": 55.955734406438616}, "total": 70.85614811611723}, "instruction-following": {"total": 65.71428571428571}, "nlg": {"translation-id-xx": {"total": 85.44648239213198}, "translation-xx-id": {"total": 91.4309664498731}, "total": 88.43872442100255}, "multi-turn": {"total": 72.29591836734694}, "total": 74.3262691546881}, "su": {"nlu": {"sentiment": {"total": 68.94652139856157}, "qa-mc": {"total": 80.95238095238096}, "metaphor": {"total": 40.926640926640935}, "total": 63.60851442586116}, "instruction-following": {"total": 68.57142857142857}, "nlg": {"translation-id-xx": {"total": 78.67233105964468}, "translation-xx-id": {"total": 89.0990046002538}, "total": 83.88566782994924}, "multi-turn": {"total": 77.90816326530613}, "total": 73.49344352313628}, "ban": {"nlu": {"sentiment": {"total": 64.34244819483017}, "total": 64.34244819483017}, "instruction-following": {"total": 45.714285714285715}, "nlg": {"translation-id-xx": {"total": 64.00178458121827}, "translation-xx-id": {"total": 
80.85844305203045}, "total": 72.43011381662436}, "multi-turn": {"total": 73.72448979591837}, "total": 64.05283438041465}, "bbc": {"nlu": {"sentiment": {"total": 41.17895036673077}, "qa": {"total": 53.90620481957703}, "total": 47.542577593153894}, "instruction-following": {"total": 36.19047619047619}, "nlg": {"translation-id-xx": {"total": 34.14482868020305}, "translation-xx-id": {"total": 72.01943210659898}, "total": 53.082130393401016}, "multi-turn": {"total": 71.76870748299321}, "total": 52.14597291500607}, "indommlu": {"total": 57.00052675017508, "STEM": {"total": 59.886083357110124}, "Humanities": {"total": 63.22328475452199}, "Social science": {"total": 64.23406160745138}, "Indonesian language": {"total": 60.906785648619774}, "Local languages and cultures": {"total": 38.257958112495274}}}
|
14 |
+
{"model_name": "GoToCompany/Llama-Sahabat-AI-v2-70B-IT", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 77.6732188460373}, "qa": {"total": 76.61819825000589}, "metaphor": {"total": 70.78047435190294}, "total": 75.02396381598204}, "safety": {"toxicity": {"total": 51.04618340675695}, "total": 51.04618340675695}, "nlg": {"abssum": {"total": 19.1930577833964}, "translation-en-xx": {"total": 93.29174129199605}, "translation-xx-en": {"total": 92.92486162147975}, "total": 56.15067962006715}, "nlr": {"causal": {"total": 94.8}, "nli": {"total": 79.08219508232072}, "total": 86.94109754116036}, "linguistic-diagnostics": {"mp-r": {"total": 59.50892857142859}, "pragmatics": {"total": 73.5576923076923}, "total": 66.53331043956045}, "instruction-following": {"total": 93.33333333333333}, "multi-turn": {"total": 76.6326530612245}, "total": 72.23731731686925}, "jv": {"nlu": {"sentiment": {"total": 67.1011892045859}, "qa-mc": {"total": 90.0}, "metaphor": {"total": 50.231388329979865}, "total": 69.11085917818859}, "instruction-following": {"total": 69.52380952380952}, "nlg": {"translation-id-xx": {"total": 86.94888166243655}, "translation-xx-id": {"total": 89.48649666878173}, "total": 88.21768916560913}, "multi-turn": {"total": 74.08163265306122}, "total": 75.23349763016712}, "su": {"nlu": {"sentiment": {"total": 68.94210638752403}, "qa-mc": {"total": 82.85714285714285}, "metaphor": {"total": 39.51553594410737}, "total": 63.77159506292475}, "instruction-following": {"total": 69.52380952380952}, "nlg": {"translation-id-xx": {"total": 85.46533946700508}, "translation-xx-id": {"total": 86.5930262531726}, "total": 86.02918286008884}, "multi-turn": {"total": 76.27551020408163}, "total": 73.90002441272618}, "ban": {"nlu": {"sentiment": {"total": 64.59389019440292}, "total": 64.59389019440292}, "instruction-following": {"total": 64.76190476190476}, "nlg": {"translation-id-xx": {"total": 76.8918543781726}, "translation-xx-id": {"total": 
80.28165648794416}, "total": 78.58675543305839}, "multi-turn": {"total": 67.6190476190476}, "total": 68.89039950210342}, "bbc": {"nlu": {"sentiment": {"total": 48.27437157302571}, "qa": {"total": 53.313288510304716}, "total": 50.79383004166522}, "instruction-following": {"total": 46.666666666666664}, "nlg": {"translation-id-xx": {"total": 57.41640228426396}, "translation-xx-id": {"total": 73.42498810279187}, "total": 65.42069519352792}, "multi-turn": {"total": 66.56462585034014}, "total": 57.361454438049975}, "indommlu": {"total": 60.70334495757589, "STEM": {"total": 61.3415240409803}, "Humanities": {"total": 70.51818982615777}, "Social science": {"total": 68.75885762465622}, "Indonesian language": {"total": 65.02913061953353}, "Local languages and cultures": {"total": 41.79340142831706}}}
|
15 |
+
{"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "model_type": "Reasoning", "model_size": "32B", "id": {"nlu": {"sentiment": {"total": 78.9701959702203}, "qa": {"total": 69.14921529154698}, "metaphor": {"total": 81.66023166023164}, "total": 76.59321430733297}, "safety": {"toxicity": {"total": 48.532687488463964}, "total": 48.532687488463964}, "nlg": {"abssum": {"total": 15.771986184570713}, "translation-en-xx": {"total": 89.06264281744072}, "translation-xx-en": {"total": 91.78938769917244}, "total": 53.09900072143864}, "nlr": {"causal": {"total": 92.0}, "nli": {"total": 79.68625340846559}, "total": 85.84312670423279}, "linguistic-diagnostics": {"mp-r": {"total": 52.70833333333334}, "pragmatics": {"total": 60.28846153846155}, "total": 56.498397435897445}, "instruction-following": {"total": 53.333333333333336}, "multi-turn": {"total": 68.26530612244899}, "total": 63.166438016164015}, "jv": {"nlu": {"sentiment": {"total": 67.3692231004771}, "qa-mc": {"total": 65.71428571428571}, "metaphor": {"total": 27.66599597585513}, "total": 53.583168263539314}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 44.75182423857868}, "translation-xx-id": {"total": 66.04903632614213}, "total": 55.400430282360404}, "multi-turn": {"total": 68.62244897959184}, "total": 45.115797595658606}, "su": {"nlu": {"sentiment": {"total": 60.04471978921884}, "qa-mc": {"total": 57.14285714285715}, "metaphor": {"total": 17.268799411656556}, "total": 44.81879211457751}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 44.04267131979695}, "translation-xx-id": {"total": 57.660691624365484}, "total": 50.85168147208122}, "multi-turn": {"total": 66.13945578231292}, "total": 42.357244247004814}, "ban": {"nlu": {"sentiment": {"total": 0}, "total": 0.0}, "instruction-following": {"total": 1.9047619047619049}, "nlg": {"translation-id-xx": {"total": 50.61798064720812}, "translation-xx-id": {"total": 
37.4948048857868}, "total": 44.05639276649746}, "multi-turn": {"total": 61.819727891156454}, "total": 26.945220640603956}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 9.281469990670297}, "total": 4.640734995335149}, "instruction-following": {"total": 1.9047619047619049}, "nlg": {"translation-id-xx": {"total": 34.2337404822335}, "translation-xx-id": {"total": 34.2864451142132}, "total": 34.260092798223354}, "multi-turn": {"total": 53.31632653061224}, "total": 23.530479057233162}, "indommlu": {"total": 56.97297175288766, "STEM": {"total": 76.60105360009733}, "Humanities": {"total": 61.78918336683863}, "Social science": {"total": 63.76683210862538}, "Indonesian language": {"total": 61.01908232177998}, "Local languages and cultures": {"total": 23.38479571344331}}}
|
16 |
+
{"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "model_type": "Reasoning", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 78.14143455395404}, "qa": {"total": 69.26059901277308}, "metaphor": {"total": 80.29049457620887}, "total": 75.89750938097866}, "safety": {"toxicity": {"total": 46.041727141219575}, "total": 46.041727141219575}, "nlg": {"abssum": {"total": 17.28477573793091}, "translation-en-xx": {"total": 91.97944972826087}, "translation-xx-en": {"total": 92.36568116199358}, "total": 54.72867059152907}, "nlr": {"causal": {"total": 94.0}, "nli": {"total": 77.47431525513804}, "total": 85.73715762756902}, "linguistic-diagnostics": {"mp-r": {"total": 44.15178571428572}, "pragmatics": {"total": 64.32692307692307}, "total": 54.239354395604394}, "instruction-following": {"total": 88.57142857142857}, "multi-turn": {"total": 69.57482993197279}, "total": 67.82723966290028}, "jv": {"nlu": {"sentiment": {"total": 72.96802677490564}, "qa-mc": {"total": 87.61904761904762}, "metaphor": {"total": 52.49496981891348}, "total": 71.02734807095558}, "instruction-following": {"total": 7.6190476190476195}, "nlg": {"translation-id-xx": {"total": 73.34557423857868}, "translation-xx-id": {"total": 81.28239014911168}, "total": 77.31398219384518}, "multi-turn": {"total": 71.47959183673468}, "total": 56.859992430145766}, "su": {"nlu": {"sentiment": {"total": 70.92088585060172}, "qa-mc": {"total": 80.4761904761905}, "metaphor": {"total": 38.93178893178892}, "total": 63.44295508619371}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 63.705385469543145}, "translation-xx-id": {"total": 74.6843075824873}, "total": 69.19484652601523}, "multi-turn": {"total": 72.55102040816325}, "total": 55.82101502890257}, "ban": {"nlu": {"sentiment": {"total": 62.72819198177025}, "total": 62.72819198177025}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 57.72533312182741}, "translation-xx-id": 
{"total": 64.42384795368021}, "total": 61.07459053775381}, "multi-turn": {"total": 66.44557823129252}, "total": 50.657328282942245}, "bbc": {"nlu": {"sentiment": {"total": 4.338460442925295}, "qa": {"total": 16.706621074803703}, "total": 10.5225407588645}, "instruction-following": {"total": 1.9047619047619049}, "nlg": {"translation-id-xx": {"total": 34.16560913705584}, "translation-xx-id": {"total": 43.58887214467005}, "total": 38.877240640862944}, "multi-turn": {"total": 61.156462585034014}, "total": 28.11525147238084}, "indommlu": {"total": 63.62162170332392, "STEM": {"total": 76.83877111633697}, "Humanities": {"total": 70.83927436481935}, "Social science": {"total": 71.29522879069941}, "Indonesian language": {"total": 64.73211959812697}, "Local languages and cultures": {"total": 36.80837078959374}}}
|
17 |
+
{"model_name": "GoToCompany/Llama-Sahabat-AI-v2-70B-R", "model_type": "Reasoning", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 74.36716822490585}, "qa": {"total": 68.70316576851644}, "metaphor": {"total": 82.3405037690752}, "total": 75.1369459208325}, "safety": {"toxicity": {"total": 44.805565887667676}, "total": 44.805565887667676}, "nlg": {"abssum": {"total": 15.770754059792369}, "translation-en-xx": {"total": 92.4779240906003}, "translation-xx-en": {"total": 89.78252763710475}, "total": 53.45048996182245}, "nlr": {"causal": {"total": 94.8}, "nli": {"total": 79.85016753970572}, "total": 87.32508376985285}, "linguistic-diagnostics": {"mp-r": {"total": 51.205357142857146}, "pragmatics": {"total": 69.71153846153845}, "total": 60.458447802197796}, "instruction-following": {"total": 86.66666666666667}, "multi-turn": {"total": 71.6326530612245}, "total": 68.4965504386092}, "jv": {"nlu": {"sentiment": {"total": 74.37662892544327}, "qa-mc": {"total": 88.57142857142857}, "metaphor": {"total": 58.08853118712274}, "total": 73.67886289466486}, "instruction-following": {"total": 52.38095238095239}, "nlg": {"translation-id-xx": {"total": 84.78731757614213}, "translation-xx-id": {"total": 86.91073128172589}, "total": 85.849024428934}, "multi-turn": {"total": 77.90816326530611}, "total": 72.45425074246434}, "su": {"nlu": {"sentiment": {"total": 53.23912269458092}, "qa-mc": {"total": 79.04761904761905}, "metaphor": {"total": 44.3417907703622}, "total": 58.87617750418739}, "instruction-following": {"total": 63.8095238095238}, "nlg": {"translation-id-xx": {"total": 81.1491513324873}, "translation-xx-id": {"total": 81.8542393718274}, "total": 81.50169535215736}, "multi-turn": {"total": 77.89115646258503}, "total": 70.51963828211339}, "ban": {"nlu": {"sentiment": {"total": 65.37499109876808}, "total": 65.37499109876808}, "instruction-following": {"total": 63.8095238095238}, "nlg": {"translation-id-xx": {"total": 74.0709470177665}, "translation-xx-id": {"total": 
68.70261936865482}, "total": 71.38678319321066}, "multi-turn": {"total": 71.64965986394559}, "total": 68.05523949136203}, "bbc": {"nlu": {"sentiment": {"total": 60.06950081891333}, "qa": {"total": 39.05529170888816}, "total": 49.56239626390075}, "instruction-following": {"total": 57.14285714285714}, "nlg": {"translation-id-xx": {"total": 54.30369606598985}, "translation-xx-id": {"total": 66.19709708121827}, "total": 60.250396573604064}, "multi-turn": {"total": 74.93197278911566}, "total": 60.471905692369404}, "indommlu": {"total": 65.61855199966932, "STEM": {"total": 76.76900555375443}, "Humanities": {"total": 72.57418960079825}, "Social science": {"total": 71.07883839300611}, "Indonesian language": {"total": 67.2507866798964}, "Local languages and cultures": {"total": 43.22103260819583}}}
|
18 |
+
{"model_name": "aisingapore/Llama-SEA-LION-v2-8B", "model_type": "Base", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 69.60385285460764}, "qa": {"total": 73.73839363312449}, "metaphor": {"total": 59.914506343077754}, "total": 67.7522509436033}, "safety": {"toxicity": {"total": 45.25492601518173}, "total": 45.25492601518173}, "nlg": {"abssum": {"total": 25.372425005335675}, "translation-en-xx": {"total": 91.61413815464427}, "translation-xx-en": {"total": 92.07667631971034}, "total": 58.60891612125649}, "nlr": {"causal": {"total": 68.39999999999999}, "nli": {"total": 60.31150864698831}, "total": 64.35575432349415}, "linguistic-diagnostics": {"mp-r": {"total": 41.309523809523796}, "pragmatics": {"total": 6.634615384615383}, "total": 23.97206959706959}, "instruction-following": {"total": 16.19047619047619}, "total": 46.02239886518024}, "jv": {"nlu": {"sentiment": {"total": 66.13458662678914}, "qa-mc": {"total": 46.666666666666664}, "metaphor": {"total": 24.637826961770635}, "total": 45.81302675174214}, "instruction-following": {"total": 11.428571428571429}, "nlg": {"translation-id-xx": {"total": 61.99734295685279}, "translation-xx-id": {"total": 78.83339942893402}, "total": 70.4153711928934}, "total": 42.55232312440233}, "su": {"nlu": {"sentiment": {"total": 65.45908993804741}, "qa-mc": {"total": 37.61904761904762}, "metaphor": {"total": 5.5295091009376796}, "total": 36.2025488860109}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 51.8411326142132}, "translation-xx-id": {"total": 72.78686151649747}, "total": 62.31399706535534}, "total": 36.01345182505859}, "ban": {"nlu": {"sentiment": {"total": 55.74115217546106}, "total": 55.74115217546106}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 53.35564720812183}, "translation-xx-id": {"total": 64.95974777918782}, "total": 59.15769749365482}, "total": 44.331362588118}, "bbc": {"nlu": {"sentiment": {"total": 
14.378765221106596}, "qa": {"total": 37.418918485381994}, "total": 25.898841853244296}, "instruction-following": {"total": 18.095238095238095}, "nlg": {"translation-id-xx": {"total": 25.358819796954315}, "translation-xx-id": {"total": 59.188650063451774}, "total": 42.27373493020305}, "total": 28.75593829289515}}
|
19 |
+
{"model_name": "aisingapore/Gemma-SEA-LION-v3-9B-IT", "model_type": "Instruct", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 71.48882179803489}, "qa": {"total": 79.70399822335983}, "metaphor": {"total": 72.84427284427287}, "total": 74.67903095522253}, "safety": {"toxicity": {"total": 40.59132401534749}, "total": 40.59132401534749}, "nlg": {"abssum": {"total": 18.181388830492043}, "translation-en-xx": {"total": 93.09884945111784}, "translation-xx-en": {"total": 93.12180880218627}, "total": 55.64585897857205}, "nlr": {"causal": {"total": 90.8}, "nli": {"total": 72.54852270674931}, "total": 81.67426135337465}, "linguistic-diagnostics": {"mp-r": {"total": 34.85119047619048}, "pragmatics": {"total": 60.57692307692308}, "total": 47.71405677655678}, "instruction-following": {"total": 96.19047619047619}, "multi-turn": {"total": 71.53061224489797}, "total": 66.86080293063539}, "jv": {"nlu": {"sentiment": {"total": 69.40888698995941}, "qa-mc": {"total": 82.3809523809524}, "metaphor": {"total": 46.73038229376259}, "total": 66.17340722155814}, "instruction-following": {"total": 22.857142857142858}, "nlg": {"translation-id-xx": {"total": 76.26477236675127}, "translation-xx-id": {"total": 87.09299651015229}, "total": 81.67888443845177}, "multi-turn": {"total": 74.67687074829932}, "total": 61.34657631636303}, "su": {"nlu": {"sentiment": {"total": 62.93826105533004}, "qa-mc": {"total": 70.95238095238095}, "metaphor": {"total": 35.59018201875346}, "total": 56.49360800882149}, "instruction-following": {"total": 35.23809523809524}, "nlg": {"translation-id-xx": {"total": 69.71629124365482}, "translation-xx-id": {"total": 82.27563848350253}, "total": 75.99596486357868}, "multi-turn": {"total": 74.72789115646259}, "total": 60.6138898167395}, "ban": {"nlu": {"sentiment": {"total": 50.81122267321797}, "total": 50.81122267321797}, "instruction-following": {"total": 17.142857142857142}, "nlg": {"translation-id-xx": {"total": 64.24619289340102}, "translation-xx-id": {"total": 
76.28541402284264}, "total": 70.26580345812184}, "multi-turn": {"total": 72.3809523809524}, "total": 52.65020891378734}, "bbc": {"nlu": {"sentiment": {"total": 20.32713807590971}, "qa": {"total": 39.7914974064387}, "total": 30.059317741174205}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 47.01633883248731}, "translation-xx-id": {"total": 54.60501269035533}, "total": 50.81067576142132}, "multi-turn": {"total": 69.57482993197279}, "total": 39.99215823959446}, "indommlu": {"total": 50.91995623941112, "STEM": {"total": 53.621038289268476}, "Humanities": {"total": 58.20677354222946}, "Social science": {"total": 59.187432464425626}, "Indonesian language": {"total": 57.20965571441104}, "Local languages and cultures": {"total": 30.158333200310455}}}
|
20 |
+
{"model_name": "aisingapore/Llama-SEA-LION-v3-70B-IT", "model_type": "Instruct", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 77.15671395489763}, "qa": {"total": 76.84738358190378}, "metaphor": {"total": 73.49236992094134}, "total": 75.83215581924757}, "safety": {"toxicity": {"total": 51.98770863470513}, "total": 51.98770863470513}, "nlg": {"abssum": {"total": 19.525191817646014}, "translation-en-xx": {"total": 92.33508252532114}, "translation-xx-en": {"total": 92.7480999490489}, "total": 56.033391527415525}, "nlr": {"causal": {"total": 93.6}, "nli": {"total": 79.65788716619113}, "total": 86.62894358309556}, "linguistic-diagnostics": {"mp-r": {"total": 53.154761904761926}, "pragmatics": {"total": 75.76923076923077}, "total": 64.46199633699635}, "instruction-following": {"total": 92.38095238095238}, "multi-turn": {"total": 69.01360544217687}, "total": 70.90553624636992}, "jv": {"nlu": {"sentiment": {"total": 69.79377625863418}, "qa-mc": {"total": 88.09523809523809}, "metaphor": {"total": 47.40442655935615}, "total": 68.43114697107613}, "instruction-following": {"total": 34.285714285714285}, "nlg": {"translation-id-xx": {"total": 82.43952252538071}, "translation-xx-id": {"total": 87.85191941624366}, "total": 85.14572097081219}, "multi-turn": {"total": 73.48639455782313}, "total": 65.33724419635644}, "su": {"nlu": {"sentiment": {"total": 65.91839350566117}, "qa-mc": {"total": 80.0}, "metaphor": {"total": 26.65011950726237}, "total": 57.52283767097452}, "instruction-following": {"total": 56.19047619047619}, "nlg": {"translation-id-xx": {"total": 77.60628172588832}, "translation-xx-id": {"total": 82.63099817576142}, "total": 80.11863995082487}, "multi-turn": {"total": 75.5612244897959}, "total": 67.34829457551787}, "ban": {"nlu": {"sentiment": {"total": 63.236843979206725}, "total": 63.236843979206725}, "instruction-following": {"total": 36.19047619047619}, "nlg": {"translation-id-xx": {"total": 57.826142131979694}, "translation-xx-id": {"total": 
74.99389276649747}, "total": 66.41001744923858}, "multi-turn": {"total": 68.72448979591837}, "total": 58.64045685370996}, "bbc": {"nlu": {"sentiment": {"total": 30.247383037812455}, "qa": {"total": 41.793369541286886}, "total": 36.020376289549674}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 44.33359771573604}, "translation-xx-id": {"total": 59.36377696700507}, "total": 51.84868734137056}, "multi-turn": {"total": 63.58843537414967}, "total": 41.91199379888652}, "indommlu": {"total": 60.18025084141225, "STEM": {"total": 62.267906671911646}, "Humanities": {"total": 70.42539141840543}, "Social science": {"total": 68.85334900407847}, "Indonesian language": {"total": 64.34575595370362}, "Local languages and cultures": {"total": 39.88441676395111}}}
|
21 |
+
{"model_name": "aisingapore/Gemma-SEA-LION-v3-9B", "model_type": "Base", "model_size": "9B", "id": {"nlu": {"sentiment": {"total": 78.19838062156428}, "qa": {"total": 74.12483728239906}, "metaphor": {"total": 73.50156278727708}, "total": 75.27492689708014}, "safety": {"toxicity": {"total": 59.66682908880377}, "total": 59.66682908880377}, "nlg": {"abssum": {"total": 26.54738670064436}, "translation-en-xx": {"total": 91.58656473875988}, "translation-xx-en": {"total": 92.55769052618577}, "total": 59.30975716655858}, "nlr": {"causal": {"total": 91.19999999999999}, "nli": {"total": 74.75970856366459}, "total": 82.97985428183229}, "linguistic-diagnostics": {"mp-r": {"total": 57.23214285714286}, "pragmatics": {"total": 52.01923076923079}, "total": 54.625686813186825}, "instruction-following": {"total": 26.666666666666668}, "total": 59.753953485688044}, "jv": {"nlu": {"sentiment": {"total": 79.42270170191551}, "qa-mc": {"total": 73.33333333333334}, "metaphor": {"total": 38.87323943661971}, "total": 63.87642482395619}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 72.42651491116752}, "translation-xx-id": {"total": 85.33161484771574}, "total": 78.87906487944163}, "total": 52.98198863129134}, "su": {"nlu": {"sentiment": {"total": 73.00719219539987}, "qa-mc": {"total": 66.19047619047619}, "metaphor": {"total": 34.16988416988418}, "total": 57.78918418525341}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 55.703362944162436}, "translation-xx-id": {"total": 81.53739689086295}, "total": 68.6203799175127}, "total": 49.12064835171569}, "ban": {"nlu": {"sentiment": {"total": 62.65954568112227}, "total": 62.65954568112227}, "instruction-following": {"total": 25.71428571428571}, "nlg": {"translation-id-xx": {"total": 53.75531408629442}, "translation-xx-id": {"total": 71.69029584390863}, "total": 62.722804965101524}, "total": 50.36554545350317}, "bbc": {"nlu": {"sentiment": {"total": 
37.39357687103894}, "qa": {"total": 40.46068444098612}, "total": 38.92713065601253}, "instruction-following": {"total": 22.857142857142858}, "nlg": {"translation-id-xx": {"total": 29.435279187817258}, "translation-xx-id": {"total": 62.45594067258883}, "total": 45.94560993020305}, "total": 35.90996114778614}}
|
22 |
+
{"model_name": "aisingapore/Llama-SEA-LION-v3.5-70B-R", "model_type": "Reasoning", "model_size": "70B", "id": {"nlu": {"sentiment": {"total": 75.73130383499978}, "qa": {"total": 69.35122429051765}, "metaphor": {"total": 82.33590733590734}, "total": 75.80614515380826}, "safety": {"toxicity": {"total": 45.285611387219845}, "total": 45.285611387219845}, "nlg": {"abssum": {"total": 15.83263587689335}, "translation-en-xx": {"total": 92.98521646492095}, "translation-xx-en": {"total": 92.67311307281373}, "total": 54.330900322880346}, "nlr": {"causal": {"total": 93.6}, "nli": {"total": 79.33422124444434}, "total": 86.46711062222217}, "linguistic-diagnostics": {"mp-r": {"total": 43.110119047619065}, "pragmatics": {"total": 68.46153846153847}, "total": 55.785828754578766}, "instruction-following": {"total": 87.61904761904762}, "multi-turn": {"total": 76.49659863945578}, "total": 68.8273203570304}, "jv": {"nlu": {"sentiment": {"total": 71.35512354909919}, "qa-mc": {"total": 89.04761904761904}, "metaphor": {"total": 52.464788732394354}, "total": 70.95584377637086}, "instruction-following": {"total": 38.095238095238095}, "nlg": {"translation-id-xx": {"total": 81.60529029187818}, "translation-xx-id": {"total": 86.06099302030456}, "total": 83.83314165609137}, "multi-turn": {"total": 79.08163265306122}, "total": 67.99146404519038}, "su": {"nlu": {"sentiment": {"total": 65.80702129174678}, "qa-mc": {"total": 81.90476190476191}, "metaphor": {"total": 44.34638720353008}, "total": 64.01939013334625}, "instruction-following": {"total": 58.0952380952381}, "nlg": {"translation-id-xx": {"total": 74.65085659898477}, "translation-xx-id": {"total": 80.70001189720813}, "total": 77.67543424809645}, "multi-turn": {"total": 75.13605442176869}, "total": 68.73152922461237}, "ban": {"nlu": {"sentiment": {"total": 62.439792067222086}, "total": 62.439792067222086}, "instruction-following": {"total": 37.142857142857146}, "nlg": {"translation-id-xx": {"total": 62.54401967005076}, "translation-xx-id": 
{"total": 67.96902760152284}, "total": 65.2565236357868}, "multi-turn": {"total": 74.91496598639456}, "total": 59.93853470806515}, "bbc": {"nlu": {"sentiment": {"total": 37.488072349213134}, "qa": {"total": 38.34901535900559}, "total": 37.91854385410936}, "instruction-following": {"total": 20.952380952380953}, "nlg": {"translation-id-xx": {"total": 40.71541878172589}, "translation-xx-id": {"total": 50.81959866751269}, "total": 45.76750872461929}, "multi-turn": {"total": 63.55442176870748}, "total": 42.04821382495427}, "indommlu": {"total": 64.19240645617727, "STEM": {"total": 77.44502530319998}, "Humanities": {"total": 70.1734518684928}, "Social science": {"total": 70.29975159070486}, "Indonesian language": {"total": 64.49084946417122}, "Local languages and cultures": {"total": 40.95263915781106}}}
|
23 |
+
{"model_name": "aisingapore/Llama-SEA-LION-v2-8B-IT", "model_type": "Instruct", "model_size": "8B", "id": {"nlu": {"sentiment": {"total": 74.54389040384366}, "qa": {"total": 71.6100894869895}, "metaphor": {"total": 61.29803272660417}, "total": 69.15067087247911}, "safety": {"toxicity": {"total": 41.1573698344055}, "total": 41.1573698344055}, "nlg": {"abssum": {"total": 19.705313096632644}, "translation-en-xx": {"total": 91.68619148344861}, "translation-xx-en": {"total": 91.56246719058794}, "total": 55.66482121682546}, "nlr": {"causal": {"total": 78.0}, "nli": {"total": 53.512191896867066}, "total": 65.75609594843354}, "linguistic-diagnostics": {"mp-r": {"total": 16.42857142857143}, "pragmatics": {"total": 23.076923076923084}, "total": 19.752747252747255}, "instruction-following": {"total": 72.38095238095238}, "multi-turn": {"total": 57.31292517006803}, "total": 54.45365466798732}, "jv": {"nlu": {"sentiment": {"total": 62.67798903368226}, "qa-mc": {"total": 43.33333333333334}, "metaphor": {"total": 14.014084507042245}, "total": 40.00846895801928}, "instruction-following": {"total": 6.666666666666667}, "nlg": {"translation-id-xx": {"total": 61.38527125634518}, "translation-xx-id": {"total": 67.46113578680203}, "total": 64.42320352157361}, "multi-turn": {"total": 57.874149659863946}, "total": 42.24312220153087}, "su": {"nlu": {"sentiment": {"total": 51.23876664530371}, "qa-mc": {"total": 33.8095238095238}, "metaphor": {"total": 8.586137157565732}, "total": 31.211475870797745}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 54.9426951142132}, "translation-xx-id": {"total": 68.41759200507614}, "total": 61.68014355964468}, "multi-turn": {"total": 59.93197278911564}, "total": 39.158279007270465}, "ban": {"nlu": {"sentiment": {"total": 29.35099337748344}, "total": 29.35099337748344}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 56.31150856598985}, "translation-xx-id": 
{"total": 61.053676237309645}, "total": 58.68259240164974}, "multi-turn": {"total": 47.89115646258503}, "total": 34.93356651281051}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 20.077159425640094}, "total": 10.038579712820047}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 40.314800126903556}, "translation-xx-id": {"total": 37.23913388324873}, "total": 38.776967005076145}, "multi-turn": {"total": 47.976190476190474}, "total": 25.388410488997856}, "indommlu": {"total": 44.174114800549994, "STEM": {"total": 40.5167101219968}, "Humanities": {"total": 54.183577604266866}, "Social science": {"total": 55.14234869840264}, "Indonesian language": {"total": 50.39557152880211}, "Local languages and cultures": {"total": 25.18883493645394}}}
|
24 |
+
{"model_name": "Qwen/Qwen2.5-7B-Instruct", "model_type": "Instruct", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 75.78669436869671}, "qa": {"total": 74.27600858973535}, "metaphor": {"total": 60.594778451921314}, "total": 70.21916047011779}, "safety": {"toxicity": {"total": 46.48682318715473}, "total": 46.48682318715473}, "nlg": {"abssum": {"total": 18.283972085777638}, "translation-en-xx": {"total": 85.15674793107708}, "translation-xx-en": {"total": 91.37426830186203}, "total": 53.2747401011236}, "nlr": {"causal": {"total": 80.0}, "nli": {"total": 68.64494448299581}, "total": 74.3224722414979}, "linguistic-diagnostics": {"mp-r": {"total": 36.383928571428584}, "pragmatics": {"total": 41.634615384615394}, "total": 39.009271978021985}, "instruction-following": {"total": 79.04761904761905}, "multi-turn": {"total": 61.0204081632653}, "total": 60.482927884114325}, "jv": {"nlu": {"sentiment": {"total": 60.33945738090152}, "qa-mc": {"total": 39.99999999999999}, "metaphor": {"total": 16.830985915492967}, "total": 39.05681443213149}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 42.45994606598985}, "translation-xx-id": {"total": 61.021930520304565}, "total": 51.74093829314721}, "multi-turn": {"total": 61.25850340136054}, "total": 41.34739736499314}, "su": {"nlu": {"sentiment": {"total": 41.98518835006765}, "qa-mc": {"total": 31.428571428571423}, "metaphor": {"total": 0}, "total": 24.47125325954636}, "instruction-following": {"total": 13.333333333333334}, "nlg": {"translation-id-xx": {"total": 22.513642131979694}, "translation-xx-id": {"total": 31.6829394035533}, "total": 27.098290767766496}, "multi-turn": {"total": 48.18027210884354}, "total": 28.27078736737243}, "ban": {"nlu": {"sentiment": {"total": 31.45495976643166}, "total": 31.45495976643166}, "instruction-following": {"total": 5.714285714285714}, "nlg": {"translation-id-xx": {"total": 37.45213356598985}, "translation-xx-id": {"total": 36.39344067258883}, 
"total": 36.92278711928934}, "multi-turn": {"total": 51.10544217687074}, "total": 31.299368694219364}, "bbc": {"nlu": {"sentiment": {"total": 0}, "qa": {"total": 33.60734871091491}, "total": 16.803674355457456}, "instruction-following": {"total": 5.714285714285714}, "nlg": {"translation-id-xx": {"total": 27.23080583756345}, "translation-xx-id": {"total": 33.87468274111675}, "total": 30.5527442893401}, "multi-turn": {"total": 53.656462585034014}, "total": 26.68179173602932}, "indommlu": {"total": 42.71309418959688, "STEM": {"total": 45.47521929491425}, "Humanities": {"total": 48.733629272051395}, "Social science": {"total": 53.93752543659862}, "Indonesian language": {"total": 51.20940125670692}, "Local languages and cultures": {"total": 17.313678586984217}}}
|
25 |
+
{"model_name": "Qwen/Qwen2-7B", "model_type": "Base", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 75.21946554560013}, "qa": {"total": 71.52413905008522}, "metaphor": {"total": 57.2209965067108}, "total": 67.98820036746538}, "safety": {"toxicity": {"total": 44.205661659340315}, "total": 44.205661659340315}, "nlg": {"abssum": {"total": 23.68972738280887}, "translation-en-xx": {"total": 84.4631994194664}, "translation-xx-en": {"total": 90.75560944447875}, "total": 55.64956590739072}, "nlr": {"causal": {"total": 84.39999999999999}, "nli": {"total": 72.91286068288636}, "total": 78.65643034144318}, "linguistic-diagnostics": {"mp-r": {"total": 38.39285714285714}, "pragmatics": {"total": 0}, "total": 19.19642857142857}, "instruction-following": {"total": 25.71428571428571}, "total": 48.56842876022565}, "jv": {"nlu": {"sentiment": {"total": 64.0681478316599}, "qa-mc": {"total": 48.57142857142858}, "metaphor": {"total": 26.12676056338028}, "total": 46.25544565548959}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 41.66949555837564}, "translation-xx-id": {"total": 61.29969067258883}, "total": 51.484593115482234}, "total": 33.849854193498544}, "su": {"nlu": {"sentiment": {"total": 42.50680054119491}, "qa-mc": {"total": 41.428571428571416}, "metaphor": {"total": 9.624931053502483}, "total": 31.186767674422935}, "instruction-following": {"total": 3.8095238095238098}, "nlg": {"translation-id-xx": {"total": 40.04616116751269}, "translation-xx-id": {"total": 52.83169416243655}, "total": 46.43892766497462}, "total": 27.145073049640455}, "ban": {"nlu": {"sentiment": {"total": 50.76493626717937}, "total": 50.76493626717937}, "instruction-following": {"total": 21.904761904761905}, "nlg": {"translation-id-xx": {"total": 39.58026649746193}, "translation-xx-id": {"total": 50.566782994923855}, "total": 45.07352474619289}, "total": 39.24774097271139}, "bbc": {"nlu": {"sentiment": {"total": 26.338745282347077}, "qa": {"total": 
35.88321060236509}, "total": 31.110977942356083}, "instruction-following": {"total": 16.19047619047619}, "nlg": {"translation-id-xx": {"total": 19.100571065989847}, "translation-xx-id": {"total": 43.421478426395936}, "total": 31.26102474619289}, "total": 26.187492959675055}}
|
26 |
+
{"model_name": "Qwen/Qwen2.5-7B", "model_type": "Base", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 75.75328420551442}, "qa": {"total": 73.32407561711896}, "metaphor": {"total": 65.36127964699394}, "total": 71.47954648987577}, "safety": {"toxicity": {"total": 31.55389939249849}, "total": 31.55389939249849}, "nlg": {"abssum": {"total": 21.564449902216676}, "translation-en-xx": {"total": 86.72052942811264}, "translation-xx-en": {"total": 91.35743827970603}, "total": 55.30171687806301}, "nlr": {"causal": {"total": 84.4}, "nli": {"total": 73.39944270760374}, "total": 78.89972135380188}, "linguistic-diagnostics": {"mp-r": {"total": 44.89583333333334}, "pragmatics": {"total": 33.94230769230768}, "total": 39.41907051282051}, "instruction-following": {"total": 34.285714285714285}, "total": 51.823278152128985}, "jv": {"nlu": {"sentiment": {"total": 65.79192480239264}, "qa-mc": {"total": 49.523809523809526}, "metaphor": {"total": 24.75855130784708}, "total": 46.691428544683085}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 45.925364847715734}, "translation-xx-id": {"total": 64.8521573604061}, "total": 55.38876110406092}, "total": 37.20133305751784}, "su": {"nlu": {"sentiment": {"total": 47.658477533290615}, "qa-mc": {"total": 41.42857142857143}, "metaphor": {"total": 8.416069130354842}, "total": 32.5010393640723}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 43.161326142131976}, "translation-xx-id": {"total": 61.348350253807105}, "total": 52.25483819796954}, "total": 29.839260774648864}, "ban": {"nlu": {"sentiment": {"total": 53.75033824681337}, "total": 53.75033824681337}, "instruction-following": {"total": 14.285714285714285}, "nlg": {"translation-id-xx": {"total": 44.255869289340104}, "translation-xx-id": {"total": 60.060259359137056}, "total": 52.15806432423858}, "total": 40.06470561892208}, "bbc": {"nlu": {"sentiment": {"total": 20.73417360962757}, "qa": 
{"total": 41.071441696158544}, "total": 30.90280765289306}, "instruction-following": {"total": 12.380952380952381}, "nlg": {"translation-id-xx": {"total": 26.63626269035533}, "translation-xx-id": {"total": 49.21890862944162}, "total": 37.927585659898476}, "total": 27.0704485645813}}
|
27 |
+
{"model_name": "Qwen/Qwen2-7B-Instruct", "model_type": "Instruct", "model_size": "7B", "id": {"nlu": {"sentiment": {"total": 76.32118934770374}, "qa": {"total": 60.29704883952192}, "metaphor": {"total": 54.490715205000924}, "total": 63.70298446407552}, "safety": {"toxicity": {"total": 23.21240291206743}, "total": 23.21240291206743}, "nlg": {"abssum": {"total": 13.130711645467363}, "translation-en-xx": {"total": 82.98908797554348}, "translation-xx-en": {"total": 83.69165328557312}, "total": 48.235541138012834}, "nlr": {"causal": {"total": 81.2}, "nli": {"total": 52.41372244666343}, "total": 66.80686122333171}, "linguistic-diagnostics": {"mp-r": {"total": 3.3333333333333286}, "pragmatics": {"total": 27.019230769230784}, "total": 15.176282051282056}, "instruction-following": {"total": 65.71428571428571}, "multi-turn": {"total": 51.83673469387754}, "total": 47.81215602813325}, "jv": {"nlu": {"sentiment": {"total": 56.66424553158155}, "qa-mc": {"total": 24.28571428571429}, "metaphor": {"total": 3.6116700201207275}, "total": 28.18720994580552}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 58.805242703045685}, "translation-xx-id": {"total": 54.17488895939086}, "total": 56.490065831218274}, "multi-turn": {"total": 54.234693877551024}, "total": 35.44227812792942}, "su": {"nlu": {"sentiment": {"total": 47.37171544541765}, "qa-mc": {"total": 31.428571428571423}, "metaphor": {"total": 0}, "total": 26.266762291329695}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 31.82955266497462}, "translation-xx-id": {"total": 34.00594860406091}, "total": 32.917750634517766}, "multi-turn": {"total": 47.15986394557824}, "total": 27.776570408332617}, "ban": {"nlu": {"sentiment": {"total": 0}, "total": 0.0}, "instruction-following": {"total": 9.523809523809524}, "nlg": {"translation-id-xx": {"total": 38.09668464467005}, "translation-xx-id": {"total": 36.76491116751269}, "total": 37.43079790609137}, 
"multi-turn": {"total": 39.94897959183673}, "total": 21.725896755434405}, "bbc": {"nlu": {"sentiment": {"total": 9.251655629139064}, "qa": {"total": 27.430561483601497}, "total": 18.34110855637028}, "instruction-following": {"total": 2.857142857142857}, "nlg": {"translation-id-xx": {"total": 36.072572969543145}, "translation-xx-id": {"total": 22.299254441624367}, "total": 29.185913705583758}, "multi-turn": {"total": 45.527210884353735}, "total": 23.977844000862657}, "indommlu": {"total": 19.439219720805205, "STEM": {"total": 19.089113128479287}, "Humanities": {"total": 27.818499157288194}, "Social science": {"total": 22.833935000753403}, "Indonesian language": {"total": 34.0245970704392}, "Local languages and cultures": {"total": 0.0}}}
|
28 |
+
{"model_name": "Qwen/Qwen2.5-72B-Instruct", "model_type": "Instruct", "model_size": "72B", "id": {"nlu": {"sentiment": {"total": 78.40107345366401}, "qa": {"total": 74.64988471304491}, "metaphor": {"total": 72.14561500275786}, "total": 75.06552438982227}, "safety": {"toxicity": {"total": 50.59214880483599}, "total": 50.59214880483599}, "nlg": {"abssum": {"total": 17.336644608612378}, "translation-en-xx": {"total": 91.94026178050889}, "translation-xx-en": {"total": 93.00841464920948}, "total": 54.90549141173578}, "nlr": {"causal": {"total": 94.39999999999999}, "nli": {"total": 67.33660629863192}, "total": 80.86830314931595}, "linguistic-diagnostics": {"mp-r": {"total": 57.961309523809526}, "pragmatics": {"total": 67.3076923076923}, "total": 62.63450091575092}, "instruction-following": {"total": 92.38095238095238}, "multi-turn": {"total": 71.4455782312925}, "total": 69.69892846910082}, "jv": {"nlu": {"sentiment": {"total": 66.68461155023854}, "qa-mc": {"total": 77.6190476190476}, "metaphor": {"total": 34.14486921529174}, "total": 59.4828427948593}, "instruction-following": {"total": 40.0}, "nlg": {"translation-id-xx": {"total": 55.5690038071066}, "translation-xx-id": {"total": 80.43935398159898}, "total": 68.00417889435279}, "multi-turn": {"total": 76.9047619047619}, "total": 61.0979458984935}, "su": {"nlu": {"sentiment": {"total": 62.30413729260128}, "qa-mc": {"total": 64.76190476190476}, "metaphor": {"total": 20.104798676227233}, "total": 49.05694691024443}, "instruction-following": {"total": 45.714285714285715}, "nlg": {"translation-id-xx": {"total": 46.21391180203046}, "translation-xx-id": {"total": 69.89271692576142}, "total": 58.053314363895936}, "multi-turn": {"total": 64.03061224489795}, "total": 54.21378980833101}, "ban": {"nlu": {"sentiment": {"total": 53.58263903724273}, "total": 53.58263903724273}, "instruction-following": {"total": 4.761904761904762}, "nlg": {"translation-id-xx": {"total": 43.244923857868024}, "translation-xx-id": {"total": 
59.21627141497462}, "total": 51.23059763642132}, "multi-turn": {"total": 62.704081632653065}, "total": 43.06980576705547}, "bbc": {"nlu": {"sentiment": {"total": 26.478459018728184}, "qa": {"total": 37.11172567294669}, "total": 31.79509234583744}, "instruction-following": {"total": 6.666666666666667}, "nlg": {"translation-id-xx": {"total": 22.50555203045685}, "translation-xx-id": {"total": 31.081535532994923}, "total": 26.793543781725887}, "multi-turn": {"total": 69.35374149659863}, "total": 33.652261072707155}, "indommlu": {"total": 56.41249578205506, "STEM": {"total": 63.61631263878045}, "Humanities": {"total": 66.03320962601761}, "Social science": {"total": 67.05889343710753}, "Indonesian language": {"total": 62.046257290964135}, "Local languages and cultures": {"total": 28.10104489182636}}}
|
dataframe.csv
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,Model,Type,Size,Indonesian Language (Average),ID,JV,SU,Open LLM Leaderboard 2 (EN)
|
2 |
+
10,GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct,Instruct,9B,59.91,63.96,60.1,55.66,33.67
|
3 |
+
9,GoToCompany/gemma2-9b-cpt-sahabatai-v1-base,Base,9B,51.43,54.74,51.46,48.08,19.62
|
4 |
+
11,GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct,Instruct,8B,49.98,56.56,49.64,43.74,24.43
|
5 |
+
6,aisingapore/gemma2-9b-cpt-sea-lionv3-base,Base,9B,49.23,55.09,47.65,44.95,21.99
|
6 |
+
0,google/gemma-2-9b-it,Instruct,9B,48.85,63.09,43.36,40.1,23.49
|
7 |
+
12,GoToCompany/llama3-8b-cpt-sahabatai-v1-base,Base,8B,47.78,46.67,49.15,47.51,13.92
|
8 |
+
1,google/gemma-2-9b,Base,9B,43.35,48.02,42.83,39.2,13.34
|
9 |
+
15,meta-llama/Llama-3.1-8B,Base,8B,39.8,41.93,41.53,35.94,13.69
|
10 |
+
16,meta-llama/Meta-Llama-3-8B,Base,8B,38.52,41.14,40.04,34.39,13.56
|
11 |
+
7,aisingapore/llama3-8b-cpt-sea-lionv2-base,Base,8B,38.5,42.45,38.94,34.11,12.77
|
12 |
+
8,aisingapore/llama3-8b-cpt-sea-lionv2.1-instruct,Instruct,8B,38.29,54.35,33.2,27.32,24.52
|
13 |
+
14,meta-llama/Llama-3.1-8B-Instruct,Instruct,8B,37.74,50.52,35.26,27.43,27.98
|
14 |
+
2,Qwen/Qwen2.5-7B-Instruct,Instruct,7B,37.74,59.2,30.47,23.54,27.75
|
15 |
+
5,Qwen/Qwen2.5-7B,Base,7B,37.53,51.66,34.11,26.82,24.65
|
16 |
+
4,Qwen/Qwen2-7B,Base,7B,33.26,44.81,29.87,25.11,23.68
|
17 |
+
3,Qwen/Qwen2-7B-Instruct,Instruct,7B,30.58,46.25,23.29,22.21,24.48
|
18 |
+
13,meta-llama/Meta-Llama-3-8B-Instruct,Instruct,8B,30.44,41.36,28.91,21.04,23.91
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.ruff]
|
2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
3 |
+
select = ["E", "F"]
|
4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
5 |
+
line-length = 119
|
6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
7 |
+
|
8 |
+
[tool.isort]
|
9 |
+
profile = "black"
|
10 |
+
line_length = 119
|
11 |
+
|
12 |
+
[tool.black]
|
13 |
+
line-length = 119
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
APScheduler
|
2 |
+
black
|
3 |
+
datasets
|
4 |
+
gradio
|
5 |
+
gradio[oauth]
|
6 |
+
gradio_leaderboard==0.0.13
|
7 |
+
gradio_client
|
8 |
+
huggingface-hub>=0.18.0
|
9 |
+
matplotlib
|
10 |
+
numpy
|
11 |
+
pandas
|
12 |
+
python-dateutil
|
13 |
+
tqdm
|
14 |
+
transformers
|
15 |
+
tokenizers>=0.15.0
|
16 |
+
sentencepiece
|
src/config.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# DESCRIPTION CONFIG

# Title for the leaderboard page (rendered as raw HTML by Gradio).
TITLE = """<h1 align="center" id="space-title">Sahabat-AI Leaderboard</h1>"""

# Introduction text providing an overview of the leaderboard.
INTRODUCTION_TEXT = """
Sahabat-AI (Indonesian language for "close friends") is a collection of large language models which has been pretrained and instruct-tuned for Indonesian language and its various local languages.
This leaderboard evaluates general language capabilities of Sahabat-AI and other open source models using SEA-HELM and IndoMMLU, focusing on Indonesian, Javanese, Sundanese, Balinese, and Batak.
"""

# Detailed information about benchmark tasks evaluated in the leaderboard.
INFO_BENCHMARK_TASK = """
## Overview
This leaderboard evaluates the performance of various Large Language Models (LLMs) using SEA-HELM and IndoMMLU.
SEA-HELM is a benchmark that evaluates LLM on Natural Language Processing (NLP) classic tasks, safety, linguistics, culture, instruction following, and chat capabilities.
We focus on Indonesian, Javanese, Sundanese, Balinese, and Batak languages, adding tasks that are relevant to these languages.
IndoMMLU covers various subjects and educational levels, including STEM, social sciences, humanities, Indonesian language, and local languages & cultures.

## Competencies

### Natural Language Understanding (NLU)
- **Sentiment Analysis:** Classifies sentences as positive, negative, or neutral.
- **Question Answering (QA):** Answers questions based on a given passage. For Javanese and Sundanese, we employ a multiple-choice format.
- **Metaphor Recognition:** Selects between two options that best explain a given metaphorical sentence.

### Natural Language Generation (NLG)
- **Translation:** For Indonesian, we evaluate translation to and from English. For the local languages, we evaluate translation to and from Indonesian.
- **Abstractive Summarization:** Summarize a passage into 1 or 2 sentences.

### Natural Language Reasoning (NLR)
- **Causal Reasoning:** Given a premise and two options, select one which is the cause or effect of the premise.
- **Natural Language Inference (NLI):** Determine the relationship between a premise and hypothesis, classifying it as entailment, contradiction, or neutral.

### Safety
- **Toxicity Detection:** Classifies sentences as toxic, hate speech, or clean.

### Linguistic Diagnostics
- **Syntax:** Selects the grammatically correct sentence from two minimally different sentences.
- **Pragmatics:** Given a situation, determines whether a sentence is true or false.

### Instruction Following
- Follows human instructions to respond using a specific format, e.g., using JSON, mentioning a certain keyword, or providing a specific number of sentences.

### Multi Turn
- Holds a human-like conversation in a multi-turn setting.
"""

# Explanation of score calculation methodology.
# NOTE: fixed user-facing typo "substracting" -> "subtracting".
INFO_SCORE_CALCULATION = """
- The **overall score** for a language is computed as the **average** of all competency scores.
- Each **competency score** is computed as the **average** of its tasks.
- Normalization is applied for classification tasks by subtracting the random baseline score and scaling it to the range of 0-100.
"""

# Information about GoTo and Sahabat-AI shown on the "About" section.
INFO_GOTO_SAHABAT_AI = """
Sahabat-AI (Indonesian language for “close friends”) is a local open source Large Language Model (LLM) ecosystem in Indonesian language, co-initiated by Indonesian tech and telecommunication companies: GoTo Group and Indosat Ooredoo Hutchison. Sahabat-AI ecosystem aims to empower Indonesians who want to develop AI-based services and applications using Bahasa Indonesia and its various local languages.

We are supported by research centers and global tech experts such as AI Singapore to train the model to gain general language understanding.

We also collaborate with key top Indonesia universities such as University of Indonesia, Gadjah Mada University, Bogor Institute of Agriculture, Bandung Institute of Technology, University of North Sumatera (Universitas Sumatera Utara), and Udayana University, including top Indonesian media groups, such as Kompas Gramedia Group, and Republika, Tempo, and Hukumonline to train and enrich the model in Bahasa Indonesia, ensuring optimum provision of local context and cultural relevance.

We would like to invite researchers, developers, and language enthusiasts to actively contribute to the enhancement and expansion of Sahabat-AI. Your collaborations can involve:
- Identifying and reporting technical issues
- Sharing pre-training, instruction, and preference data
- Improving documentation usability
- Proposing and implementing new model evaluation tasks and metrics

Join us in shaping the future of Sahabat-AI by sharing your expertise and insights to make these models more accessible, accurate, and versatile.

You can contribute your ideas through [this form](https://docs.google.com/forms/d/1_us969eQtEooYOn4XkvGkdP5VHOyCbO6L_sd9kTMnaA).
"""

# BibTeX citations for the benchmarks used by this leaderboard.
# NOTE: `month = December` was not a valid BibTeX month macro; the standard
# three-letter macro `dec` is used instead.
CITATIONS = """
```
@misc{susanto2025seahelmsoutheastasianholistic,
title={SEA-HELM: Southeast Asian Holistic Evaluation of Language Models},
author={Yosephine Susanto and Adithya Venkatadri Hulagadri and Jann Railey Montalan and Jian Gang Ngui and Xian Bin Yong and Weiqi Leong and Hamsawardhini Rengarajan and Peerat Limkonchotiwat and Yifan Mai and William Chandra Tjhi},
year={2025},
eprint={2502.14301},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.14301},
}
```
```
@inproceedings{koto-etal-2023-indommlu,
title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}",
author = "Fajri Koto and Nurul Aisyah and Haonan Li and Timothy Baldwin",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
}
```
"""
# LEADERBOARD CONFIGURATION

# Path to the JSONL file containing model performance data.
file_path = "config/model_performance.jsonl"

# Label for the average score of SEA-HELM Indonesian languages.
avg_label = "Indonesian Languages Average"

# Number of decimal places for rounding scores.
round_precision = 2

# Delimiter used in dataset keys (flattened JSON paths, e.g. "id.nlu.total").
delimiter = "."

# Model types that get their own leaderboard view.
model_types = ["Instruct", "Base"]

# Base information about a model to be displayed in every leaderboard.
# key:     field name in the JSONL records, so it must match exactly
# display: used as the column name in the leaderboard
base_info = [
    {
        "key": "model_name",
        "display": "Model"
    },
    {
        "key": "model_type",
        "display": "Type"
    },
    {
        "key": "model_size",
        "display": "Size"
    },
]

# List of languages evaluated in the leaderboard.
# key:            field name in the JSONL records, so it must match exactly
# display:        used as the column name in the overall leaderboard
# main_table_avg: determines if the language should be added to the average
#                 in the overall leaderboard
# tab:            tab name at the top of the leaderboard
# hidden_col:     list of columns to be hidden from the leaderboard; entries
#                 must match the column names used in the leaderboard

language_list = [
    {
        "key": "id",
        "display": "ID",
        "main_table_avg": True,
        "tab": "Indonesian",
        "hidden_col": ["nlg", "nlu", "nlr", "safety", "linguistic-diagnostics"]
    },
    {
        "key": "jv",
        "display": "JV",
        "main_table_avg": True,
        "tab": "Javanese",
        "hidden_col": ["nlg", "nlu", "nlr"]
    },
    {
        "key": "su",
        "display": "SU",
        "main_table_avg": True,
        "tab": "Sundanese",
        "hidden_col": ["nlg", "nlu", "nlr"]
    },
    {
        "key": "ban",
        "display": "BAN",
        "main_table_avg": True,
        "tab": "Balinese",
        "hidden_col": ["nlg", "nlu", "nlr"]
    },
    {
        "key": "bbc",
        "display": "BBC",
        "main_table_avg": True,
        "tab": "Batak",
        "hidden_col": ["nlg", "nlu", "nlr"]
    },
    {
        "key": "indommlu",
        "display": "IndoMMLU",
        "main_table_avg": False,
        "tab": "IndoMMLU",
        "hidden_col": []
    }
]

# (model_type, tab) pairs whose tab is hidden for that model-type view,
# e.g. IndoMMLU is not shown for Base models.
hidden_tabs = [
    ("Base", "IndoMMLU")
]
src/populate.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import json
|
3 |
+
|
4 |
+
from .config import base_info, language_list, delimiter, avg_label, round_precision
|
5 |
+
|
6 |
+
def load_tables(file_path: str) -> list[dict]:
    """
    Load and process the leaderboard data from a JSONL file.

    - Flattens each nested JSON record into "<lang><delimiter><task>" columns.
    - Builds one overall table (per-language totals + average) and one
      detailed table per language.
    - Sorts every table by its score column in descending order.

    Parameters
    ----------
    file_path : str
        Path to a JSONL file with one model-performance object per line.

    Returns
    -------
    list[dict]
        ``[{"name", "table", "hidden_col"}, ...]`` — the "Overall" entry
        first, followed by one entry per language in ``language_list``.
    """
    frames = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # One JSON object per line; flatten nested score dicts using the
            # configured delimiter so e.g. {"id": {"total": 1}} -> "id<delim>total".
            frames.append(pd.json_normalize(json.loads(line), sep=delimiter))

    # Combine all records into a single DataFrame.
    df = pd.concat(frames, ignore_index=True)

    # Round numeric cells to the configured precision. Exclude bool explicitly:
    # bool is a subclass of int, and round() would silently coerce flags to ints.
    df = df.map(
        lambda x: round(x, round_precision)
        if isinstance(x, (int, float)) and not isinstance(x, bool)
        else x
    )

    # Shared identity columns (e.g. model name/type) repeated in every table.
    base = pd.DataFrame()
    for info in base_info:
        base[info["display"]] = df[info["key"]]

    # Main leaderboard: base info + one total-score column per language.
    main_table = base.copy()
    detailed_tables = []

    for lang in language_list:
        main_table[lang["display"]] = df[f"{lang['key']}{delimiter}total"]

        # Match "<key><delimiter>" rather than the bare key so a language key
        # that happens to be a prefix of another column name is not absorbed.
        prefix = f"{lang['key']}{delimiter}"
        cols = [col for col in df.columns if col.startswith(prefix)]
        total_col = None
        table = base.copy()

        for col in cols:
            parts = col.split(delimiter)[:-1]  # drop the trailing metric name

            # Exactly one part means this is the language-level total column.
            if len(parts) == 1:
                total_col = col

            # Human-readable header: "task - subtask" without the language key.
            display_col = col if len(parts) < 2 else " - ".join(parts[1:])
            table[display_col] = df[col]

        # If a total column exists, rename it "Total", move it right after the
        # base columns, and sort the table by it.
        if total_col:
            total_col_data = table.pop(total_col)
            table.insert(len(base.columns), "Total", total_col_data)
            table = table.sort_values(by="Total", ascending=False)

        detailed_tables.append(table)

    # Overall average over the languages flagged with main_table_avg.
    included = [lang["display"] for lang in language_list if lang["main_table_avg"]]
    main_table[avg_label] = round(
        sum(main_table[col] for col in included) / len(included), round_precision
    )

    # Move the average column right after the base columns.
    last_col = main_table.pop(main_table.columns[-1])
    main_table.insert(len(base.columns), last_col.name, last_col)

    # Rank models by the average score, best first.
    main_table = main_table.sort_values(by=avg_label, ascending=False)

    # Overall table first, then one language-specific table per config entry.
    return [{"name": "Overall", "table": main_table, "hidden_col": []}] + [
        {"name": lang["tab"], "table": table, "hidden_col": lang["hidden_col"]}
        for lang, table in zip(language_list, detailed_tables)
    ]
|