import re
from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class TaskType(Enum):
    NLU = TaskDetails("nlu", "NLU", "🧠")
    NLG = TaskDetails("nlg", "NLG", "✍️")


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    url: str
    task_type: TaskType
    is_primary_metric: bool = True
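

# Illustrative helper, not part of the original file: it assumes each per-model results
# JSON is keyed by benchmark name and then by metric name, which is how `benchmark` and
# `metric` above appear intended to be used; `col_name` is then the leaderboard column
# the extracted score is displayed under.
def _example_score_lookup(results: dict, task: Task) -> float:
    """Read a single score, e.g. results == {"sentiment_mlt": {"f1": 0.0}}."""
    return results[task.benchmark][task.metric]
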
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # benchmark key in the results json, metric key in the results json, name to display in the leaderboard,
    # source URL, task type, and whether this metric is the benchmark's primary one
    task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", "https://github.com/jerbarnes/typology_of_crosslingual/tree/master/data/sentiment/mt", TaskType.NLU)
    task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", "https://huggingface.co/datasets/Davlan/sib200/viewer/mlt_Latn", TaskType.NLU)
    task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", "https://github.com/cisnlp/Taxi1500", TaskType.NLU)
    task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", "https://huggingface.co/datasets/MLRS/maltese_news_categories", TaskType.NLU)
    task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", "https://huggingface.co/datasets/nlpaueb/multi_eurlex", TaskType.NLU)
    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", "https://huggingface.co/datasets/facebook/belebele/viewer/mlt_Latn", TaskType.NLU)
    task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", "https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed", TaskType.NLG, False)
    task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", "https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed", TaskType.NLG)
    task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", "https://huggingface.co/datasets/Muennighoff/flores200", TaskType.NLG, False)
    task9 = Task("flores200_eng-mlt", "chrf", "Flores-200 EN→MT (ChrF)", "https://huggingface.co/datasets/Muennighoff/flores200", TaskType.NLG)
    task10 = Task("webnlg_mlt", "chrf", "WebNLG (ChrF)", "https://synalp.gitlabpages.inria.fr/webnlg-challenge/challenge_2023/", TaskType.NLG)
    task11 = Task("webnlg_mlt", "rouge", "WebNLG (Rouge-L)", "https://synalp.gitlabpages.inria.fr/webnlg-challenge/challenge_2023/", TaskType.NLG, False)
    task12 = Task("eurlexsum_mlt", "chrf", "EUR-Lex-Sum (ChrF)", "https://huggingface.co/datasets/dennlinger/eur-lex-sum", TaskType.NLG, False)
    task13 = Task("eurlexsum_mlt", "rouge", "EUR-Lex-Sum (Rouge-L)", "https://huggingface.co/datasets/dennlinger/eur-lex-sum", TaskType.NLG)
    task14 = Task("maltese_news_headlines", "chrf", "Maltese News Headlines (ChrF)", "https://huggingface.co/datasets/MLRS/maltese_news_headlines", TaskType.NLG, False)
    task15 = Task("maltese_news_headlines", "rouge", "Maltese News Headlines (Rouge-L)", "https://huggingface.co/datasets/MLRS/maltese_news_headlines", TaskType.NLG)
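

# Illustrative sketch, an assumption about how `is_primary_metric` is meant to be used
# (not taken from the rest of the leaderboard code): each benchmark has exactly one
# metric flagged as primary, so a default column set could be derived like this.
_PRIMARY_COLUMNS = [task.value.col_name for task in Tasks if task.value.is_primary_metric]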

NUM_FEWSHOT = 0  # Change to your few-shot setting
# ---------------------------------------------------
# Your leaderboard name
TITLE = """
<h1 align="center" id="space-title">
<img src="https://raw.githubusercontent.com/MLRS/MELABench/refs/heads/main/logo.jpg" alt="MELABench logo" width="200px">
Leaderboard
</h1>
"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
<p align="center">A Maltese Evaluation Language Benchmark 🇲🇹</p>
"""
# Which evaluations are you running? How can people reproduce what you have?
tasks = {task_type.value.display_name: {} for task_type in TaskType}
for task in Tasks:
    task_name = re.sub(r" \(.*\)$", "", task.value.col_name)
    tasks[task.value.task_type.value.display_name][task_name] = task.value.url
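# After this loop, `tasks` maps each task-type display name to {task name: dataset URL},
# with the "(metric)" suffix stripped from the column name; for example,
# tasks["NLU"]["Sentiment Analysis"] == Tasks.task0.value.url.
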
LLM_BENCHMARKS_TEXT = """
MELABench evaluates language model capabilities on Maltese.
Currently, the following tasks are supported:
""" + \
    "\n".join([
        f"- {task_type}:\n" + "\n".join(f"  - [{task}]({url})" for task, url in sub_tasks.items()) + "\n"
        for task_type, sub_tasks in tasks.items()
    ]) + \
"""
The leaderboard is developed and maintained by the people managing [MLRS](https://mlrs.research.um.edu.mt/).
We plan to expand our initial work with more tasks; if you would like to contribute your data, please reach out!
If you would like to see results for models or setups we did not include, we also accept submissions.
This work was introduced in [MELABenchv1: Benchmarking Large Language Models against Smaller Fine-Tuned Models for Low-Resource Maltese NLP](https://arxiv.org/abs/2506.04385).
"""
EVALUATION_QUEUE_TEXT = """
To include new results on this benchmark, follow the instructions on our [GitHub Repository](https://github.com/MLRS/MELABench/tree/main/prompting).
You can then upload the output files, which should include the configuration/results file and all the prediction files.
We also ask for some additional metadata about model training.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@inproceedings{micallef-borg-2025-melabenchv1,
    title = "{MELAB}enchv1: Benchmarking Large Language Models against Smaller Fine-Tuned Models for Low-Resource {M}altese {NLP}",
    author = "Micallef, Kurt and
      Borg, Claudia",
    editor = "Che, Wanxiang and
      Nabende, Joyce and
      Shutova, Ekaterina and
      Pilehvar, Mohammad Taher",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-acl.1053/",
    doi = "10.18653/v1/2025.findings-acl.1053",
    pages = "20505--20527",
    ISBN = "979-8-89176-256-5",
}
"""