|
|
|
|
|
|
|
# HTML heading markup carrying the leaderboard's title text.
TITLE = '<h1 align="center" id="space-title">Sahabat-AI Leaderboard</h1>'
|
|
|
|
|
# Introductory text: two paragraphs, each surrounded by blank lines.
# Built from adjacent literals so every paragraph sits on its own source line.
INTRODUCTION_TEXT = (
    "\n\n"
    'Sahabat-AI (Indonesian language for "close friends") is a collection of large language models which has been pretrained and instruct-tuned for Indonesian language and its various local languages.'
    "\n\n"
    "This leaderboard evaluates general language capabilities of Sahabat-AI and other open source models using SEA-HELM and IndoMMLU, focusing on Indonesian, Javanese, Sundanese, Balinese, and Batak."
    "\n\n"
)
|
|
|
|
|
# Markdown description of the benchmarks (SEA-HELM and IndoMMLU) and of the
# competencies/tasks that are evaluated. This is user-facing text: wording
# fixes only ("LLMs on classic NLP tasks", "minimally different sentences").
INFO_BENCHMARK_TASK = """

## Overview

This leaderboard evaluates the performance of various Large Language Models (LLMs) using SEA-HELM and IndoMMLU.

SEA-HELM is a benchmark that evaluates LLMs on classic Natural Language Processing (NLP) tasks, safety, linguistics, culture, instruction following, and chat capabilities.

We focus on Indonesian, Javanese, Sundanese, Balinese, and Batak languages, adding tasks that are relevant to these languages.

IndoMMLU covers various subjects and educational levels, including STEM, social sciences, humanities, Indonesian language, and local languages & cultures.



## Competencies



### Natural Language Understanding (NLU)

- **Sentiment Analysis:** Classifies sentences as positive, negative, or neutral.

- **Question Answering (QA):** Answers questions based on a given passage. For Javanese and Sundanese, we employ a multiple-choice format.

- **Metaphor Recognition:** Selects between two options that best explain a given metaphorical sentence.



### Natural Language Generation (NLG)

- **Translation:** For Indonesian, we evaluate translation to and from English. For the local languages, we evaluate translation to and from Indonesian.

- **Abstractive Summarization:** Summarize a passage into 1 or 2 sentences.



### Natural Language Reasoning (NLR)

- **Causal Reasoning:** Given a premise and two options, select one which is the cause or effect of the premise.

- **Natural Language Inference (NLI):** Determine the relationship between a premise and hypothesis, classifying it as entailment, contradiction, or neutral.



### Safety

- **Toxicity Detection:** Classifies sentences as toxic, hate speech, or clean.



### Linguistic Diagnostics

- **Syntax:** Selects the grammatically correct sentence from two minimally different sentences.

- **Pragmatics:** Given a situation, determines whether a sentence is true or false.



### Instruction Following

- Follows human instructions to respond using a specific format, e.g., using JSON, mentioning a certain keyword, or providing a specific number of sentences.



### Multi Turn

- Holds a human-like conversation in a multi-turn setting.

"""
|
|
|
|
|
# Markdown bullet list explaining how the scores are aggregated and
# normalized. User-facing text; the "substracting" typo is corrected.
INFO_SCORE_CALCULATION = """

- The **overall score** for a language is computed as the **average** of all competency scores.

- Each **competency score** is computed as the **average** of its tasks.

- Normalization is applied for classification tasks by subtracting the random baseline score and scaling it to the range of 0-100.

"""
|
|
|
|
|
# Markdown "about" text: who is behind Sahabat-AI, its collaborators, and how
# to contribute (including the link to the contribution form). User-facing
# text; the collaborators sentence is repaired ("top Indonesian universities",
# and a well-formed media-group list).
INFO_GOTO_SAHABAT_AI = """

Sahabat-AI (Indonesian language for “close friends”) is a local open source Large Language Model (LLM) ecosystem in Indonesian language, co-initiated by Indonesian tech and telecommunication companies: GoTo Group and Indosat Ooredoo Hutchison. Sahabat-AI ecosystem aims to empower Indonesians who want to develop AI-based services and applications using Bahasa Indonesia and its various local languages.



We are supported by research centers and global tech experts such as AI Singapore to train the model to gain general language understanding.



We also collaborate with top Indonesian universities such as University of Indonesia, Gadjah Mada University, Bogor Institute of Agriculture, Bandung Institute of Technology, University of North Sumatera (Universitas Sumatera Utara), and Udayana University, as well as top Indonesian media groups such as Kompas Gramedia Group, Republika, Tempo, and Hukumonline, to train and enrich the model in Bahasa Indonesia, ensuring optimum provision of local context and cultural relevance.



We would like to invite researchers, developers, and language enthusiasts to actively contribute to the enhancement and expansion of Sahabat-AI. Your collaborations can involve:

- Identifying and reporting technical issues

- Sharing pre-training, instruction, and preference data

- Improving documentation usability

- Proposing and implementing new model evaluation tasks and metrics



Join us in shaping the future of Sahabat-AI by sharing your expertise and insights to make these models more accessible, accurate, and versatile.



You can contribute your ideas through [this form](https://docs.google.com/forms/d/1_us969eQtEooYOn4XkvGkdP5VHOyCbO6L_sd9kTMnaA).

"""
|
|
|
# Markdown with two fenced BibTeX entries (SEA-HELM and IndoMMLU) for users to
# copy. The IndoMMLU entry uses the standard BibTeX month macro `dec`; an
# unquoted `December` is an undefined macro in the standard styles.
CITATIONS = """

```

@misc{susanto2025seahelmsoutheastasianholistic,

title={SEA-HELM: Southeast Asian Holistic Evaluation of Language Models},

author={Yosephine Susanto and Adithya Venkatadri Hulagadri and Jann Railey Montalan and Jian Gang Ngui and Xian Bin Yong and Weiqi Leong and Hamsawardhini Rengarajan and Peerat Limkonchotiwat and Yifan Mai and William Chandra Tjhi},

year={2025},

eprint={2502.14301},

archivePrefix={arXiv},

primaryClass={cs.CL},

url={https://arxiv.org/abs/2502.14301},

}

```

```

@inproceedings{koto-etal-2023-indommlu,

title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}",

author = "Fajri Koto and Nurul Aisyah and Haonan Li and Timothy Baldwin",

booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP)",

month = dec,

year = "2023",

address = "Singapore",

publisher = "Association for Computational Linguistics",

}

```

"""
|
|
|
|
|
|
|
|
|
# --- Scalar settings ---------------------------------------------------------

# Relative path to a JSON Lines file (by its name, model performance records).
file_path = "config/model_performance.jsonl"

# Label text for the averaged column/row.
# NOTE(review): presumably the average over the languages flagged with
# `main_table_avg` in `language_list`; confirm against the consumer.
avg_label = "Indonesian Languages Average"

# NOTE(review): presumably the number of decimal places scores are rounded to;
# confirm against the consumer.
round_precision = 2

# NOTE(review): presumably the separator used when composing or splitting
# compound keys; the consumer is not visible here, so confirm.
delimiter = "."

# Recognised model type labels, in this order.
model_types = ["Instruct", "Base"]
|
|
|
|
|
|
|
|
|
# Descriptive (non-score) model fields: each entry maps a `key` to the
# `display` label paired with it. Order of the entries is preserved.
base_info = [
    {"key": column_key, "display": header}
    for column_key, header in (
        ("model_name", "Model"),
        ("model_type", "Type"),
        ("model_size", "Size"),
    )
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# One entry per evaluated language/benchmark. Row columns, in order:
#   key            - identifier of the language/benchmark
#   display        - short label
#   main_table_avg - boolean flag (True for the five languages, False for IndoMMLU)
#   tab            - tab name
#   hidden_col     - competency keys listed as hidden for this entry
# Every entry gets its own fresh `hidden_col` list.
language_list = [
    {
        "key": lang_key,
        "display": label,
        "main_table_avg": in_main_avg,
        "tab": tab_name,
        "hidden_col": list(hidden),
    }
    for lang_key, label, in_main_avg, tab_name, hidden in (
        ("id", "ID", True, "Indonesian", ("nlg", "nlu", "nlr", "safety", "linguistic-diagnostics")),
        ("jv", "JV", True, "Javanese", ("nlg", "nlu", "nlr")),
        ("su", "SU", True, "Sundanese", ("nlg", "nlu", "nlr")),
        ("ban", "BAN", True, "Balinese", ("nlg", "nlu", "nlr")),
        ("bbc", "BBC", True, "Batak", ("nlg", "nlu", "nlr")),
        ("indommlu", "IndoMMLU", False, "IndoMMLU", ()),
    )
]
|
|
|
# Pairs of strings marking tabs as hidden.
# NOTE(review): each pair looks like (model type, tab name): "Base" appears in
# `model_types` and "IndoMMLU" is a `tab` value in `language_list`. Confirm.
hidden_tabs = [("Base", "IndoMMLU")]
|
|