Sahabat-AI-Leaderboard

Running

App Files Files Community

Sahabat-AI-Leaderboard / src /config.py

kiliangoto

Initial commit of Sahabat-AI Leaderboard

ef54478 15 days ago

raw

history blame contribute delete

8.28 kB

	# DESCRIPTION CONFIG

	# HTML <h1> heading (centered, id "space-title") carrying the leaderboard title
	TITLE = '<h1 align="center" id="space-title">Sahabat-AI Leaderboard</h1>'

	# Short introduction for the leaderboard: explains what Sahabat-AI is and
	# names the benchmarks (SEA-HELM, IndoMMLU) and the languages being evaluated.
	INTRODUCTION_TEXT = """
	Sahabat-AI (Indonesian language for "close friends") is a collection of large language models which has been pretrained and instruct-tuned for Indonesian language and its various local languages.
	This leaderboard evaluates general language capabilities of Sahabat-AI and other open source models using SEA-HELM and IndoMMLU, focusing on Indonesian, Javanese, Sundanese, Balinese, and Batak.
	"""

	# Markdown description of the benchmarks and of every evaluated competency
	# (NLU, NLG, NLR, Safety, Linguistic Diagnostics, Instruction Following,
	# Multi Turn), with the individual tasks listed under each heading.
	INFO_BENCHMARK_TASK = """
	## Overview
	This leaderboard evaluates the performance of various Large Language Models (LLMs) using SEA-HELM and IndoMMLU.
	SEA-HELM is a benchmark that evaluates LLM on Natural Language Processing (NLP) classic tasks, safety, linguistics, culture, instruction following, and chat capabilities.
	We focus on Indonesian, Javanese, Sundanese, Balinese, and Batak languages, adding tasks that are relevant to these languages.
	IndoMMLU covers various subjects and educational levels, including STEM, social sciences, humanities, Indonesian language, and local languages & cultures.

	## Competencies

	### Natural Language Understanding (NLU)
	- Sentiment Analysis: Classifies sentences as positive, negative, or neutral.
	- Question Answering (QA): Answers questions based on a given passage. For Javanese and Sundanese, we employ a multiple-choice format.
	- Metaphor Recognition: Selects between two options that best explain a given metaphorical sentence.

	### Natural Language Generation (NLG)
	- Translation: For Indonesian, we evaluate translation to and from English. For the local languages, we evaluate translation to and from Indonesian.
	- Abstractive Summarization: Summarize a passage into 1 or 2 sentences.

	### Natural Language Reasoning (NLR)
	- Causal Reasoning: Given a premise and two options, select one which is the cause or effect of the premise.
	- Natural Language Inference (NLI): Determine the relationship between a premise and hypothesis, classifying it as entailment, contradiction, or neutral.

	### Safety
	- Toxicity Detection: Classifies sentences as toxic, hate speech, or clean.

	### Linguistic Diagnostics
	- Syntax: Selects the grammatically correct sentence from two minimally differed sentences.
	- Pragmatics: Given a situation, determines whether a sentence is true or false.

	### Instruction Following
	- Follows human instructions to respond using a specific format, e.g., using JSON, mentioning a certain keyword, or providing a specific number of sentences.

	### Multi Turn
	- Holds a human-like conversation in a multi-turn setting.
	"""

	# Explanation of the scoring methodology shown to readers: tasks are averaged
	# into competency scores, competency scores are averaged into the overall
	# score for a language, and classification tasks are normalized against the
	# random baseline.
	INFO_SCORE_CALCULATION = """
	- The overall score for a language is computed as the average of all competency scores.
	- Each competency score is computed as the average of its tasks.
	- Normalization is applied for classification tasks by subtracting the random baseline score and scaling it to the range of 0-100.
	"""

	# Background text about the Sahabat-AI initiative: its initiators (GoTo Group
	# and Indosat Ooredoo Hutchison), supporting partners, the ways people can
	# contribute, and a Markdown link to the contribution form.
	# The original comment labels this text as a placeholder.
	INFO_GOTO_SAHABAT_AI = """
	Sahabat-AI (Indonesian language for “close friends”) is a local open source Large Language Model (LLM) ecosystem in Indonesian language, co-initiated by Indonesian tech and telecommunication companies: GoTo Group and Indosat Ooredoo Hutchison. Sahabat-AI ecosystem aims to empower Indonesians who want to develop AI-based services and applications using Bahasa Indonesia and its various local languages.

	We are supported by research centers and global tech experts such as AI Singapore to train the model to gain general language understanding.

	We also collaborate with key top Indonesia universities such as University of Indonesia, Gadjah Mada University, Bogor Institute of Agriculture, Bandung Institute of Technology, University of North Sumatera (Universitas Sumatera Utara), and Udayana University, including top Indonesian media groups, such as Kompas Gramedia Group, and Republika, Tempo, and Hukumonline to train and enrich the model in Bahasa Indonesia, ensuring optimum provision of local context and cultural relevance.

	We would like to invite researchers, developers, and language enthusiasts to actively contribute to the enhancement and expansion of Sahabat-AI. Your collaborations can involve:
	- Identifying and reporting technical issues
	- Sharing pre-training, instruction, and preference data
	- Improving documentation usability
	- Proposing and implementing new model evaluation tasks and metrics

	Join us in shaping the future of Sahabat-AI by sharing your expertise and insights to make these models more accessible, accurate, and versatile.

	You can contribute your ideas through [this form](https://docs.google.com/forms/d/1_us969eQtEooYOn4XkvGkdP5VHOyCbO6L_sd9kTMnaA).
	"""

	# BibTeX entries for the two benchmarks named in the texts above (SEA-HELM and
	# IndoMMLU), each wrapped in its own Markdown code fence.
	CITATIONS = """
	```
	@misc{susanto2025seahelmsoutheastasianholistic,
	title={SEA-HELM: Southeast Asian Holistic Evaluation of Language Models},
	author={Yosephine Susanto and Adithya Venkatadri Hulagadri and Jann Railey Montalan and Jian Gang Ngui and Xian Bin Yong and Weiqi Leong and Hamsawardhini Rengarajan and Peerat Limkonchotiwat and Yifan Mai and William Chandra Tjhi},
	year={2025},
	eprint={2502.14301},
	archivePrefix={arXiv},
	primaryClass={cs.CL},
	url={https://arxiv.org/abs/2502.14301},
	}
	```
	```
	@inproceedings{koto-etal-2023-indommlu,
	title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}",
	author = "Fajri Koto and Nurul Aisyah and Haonan Li and Timothy Baldwin",
	booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
	month = December,
	year = "2023",
	address = "Singapore",
	publisher = "Association for Computational Linguistics",
	}
	```
	"""

	# LEADERBOARD CONFIGURATION

	# Location of the JSONL file that holds the model performance data
	file_path: str = "config/model_performance.jsonl"

	# Label used for the averaged SEA-HELM score across the Indonesian languages
	avg_label: str = "Indonesian Languages Average"

	# Scores are rounded to this many decimal places
	round_precision: int = 2

	# Separator character used inside dataset keys
	delimiter: str = "."

	# Model type labels
	# NOTE(review): presumably the possible values of the "model_type" field — confirm
	model_types = ["Instruct", "Base"]

	# Model metadata columns displayed in every leaderboard.
	#   key     - field name taken from the JSONL, so it must match exactly
	#   display - column name used in the leaderboard
	base_info = [
		{"key": "model_name", "display": "Model"},
		{"key": "model_type", "display": "Type"},
		{"key": "model_size", "display": "Size"},
	]

	# Languages / benchmarks evaluated in the leaderboard, one dict per entry:
	#   key            - identifier taken from the JSONL, so it must match exactly
	#   display        - column name used in the overall leaderboard
	#   main_table_avg - whether the entry should be added to the average in the overall leaderboard
	#   tab            - tab name at the top of the leaderboard
	#   hidden_col     - columns to hide from the leaderboard; names must match the leaderboard column names
	language_list = [
		dict(
			key="id",
			display="ID",
			main_table_avg=True,
			tab="Indonesian",
			hidden_col=["nlg", "nlu", "nlr", "safety", "linguistic-diagnostics"],
		),
		dict(
			key="jv",
			display="JV",
			main_table_avg=True,
			tab="Javanese",
			hidden_col=["nlg", "nlu", "nlr"],
		),
		dict(
			key="su",
			display="SU",
			main_table_avg=True,
			tab="Sundanese",
			hidden_col=["nlg", "nlu", "nlr"],
		),
		dict(
			key="ban",
			display="BAN",
			main_table_avg=True,
			tab="Balinese",
			hidden_col=["nlg", "nlu", "nlr"],
		),
		dict(
			key="bbc",
			display="BBC",
			main_table_avg=True,
			tab="Batak",
			hidden_col=["nlg", "nlu", "nlr"],
		),
		dict(
			key="indommlu",
			display="IndoMMLU",
			main_table_avg=False,
			tab="IndoMMLU",
			hidden_col=[],
		),
	]

	# Tabs to hide.
	# NOTE(review): each entry looks like a (model type, tab name) pair — "Base" is
	# one of model_types and "IndoMMLU" is a tab in language_list; confirm against
	# the code that reads this list.
	hidden_tabs = [("Base", "IndoMMLU")]