File size: 5,777 Bytes
d088b76
 
29546b4
91e8a06
6dff40c
2caaddf
 
 
 
 
 
 
 
 
 
 
 
29546b4
 
 
 
 
d088b76
2caaddf
 
29546b4
91e8a06
32b707a
 
29546b4
4f3c2a8
d088b76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ea22b
 
32b707a
 
29546b4
 
 
d088b76
 
 
 
 
 
58733e4
29546b4
b98f07f
d088b76
e7226cc
 
29546b4
d088b76
 
 
e7226cc
d088b76
 
 
 
 
 
 
 
 
 
 
f7d1b51
d088b76
3aa78c2
 
b98f07f
236bb17
 
 
58733e4
2a73469
 
fccd458
d088b76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a860f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import re

from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskDetails:
    """Descriptive metadata attached to a category of tasks.

    Only ``name`` is required; ``display_name`` and ``symbol`` fall back to
    the empty string when they are not supplied.
    """

    # Identifier of the category.
    name: str
    # Label for the category; empty when not supplied.
    display_name: str = ""
    # Emoji for the category; empty when not supplied.
    symbol: str = ""


class TaskType(Enum):
    """Task categories; the value of each member is a ``TaskDetails`` record."""

    NLU = TaskDetails(name="nlu", display_name="NLU", symbol="🧠")
    NLG = TaskDetails(name="nlg", display_name="NLG", symbol="✍️")


@dataclass
class Task:
    """One benchmark/metric pair together with how it is labelled and categorised.

    All fields are required except ``is_primary_metric``, which defaults to
    ``True``.
    """

    # Key of the task in the results JSON file.
    benchmark: str
    # Key of the metric in the results JSON file.
    metric: str
    # Name to display in the leaderboard.
    col_name: str
    # Link associated with the benchmark.
    url: str
    # Category this task belongs to.
    task_type: TaskType
    # NOTE(review): presumably marks the headline metric when a benchmark is
    # reported under several metrics — confirm against the code that reads it.
    is_primary_metric: bool = True


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Registry of the benchmark/metric combinations known to this module.

    Each member's value is a ``Task``. ``benchmark`` and ``metric`` are the
    task and metric keys in the JSON file, and ``col_name`` is the name to
    display in the leaderboard. Several benchmarks appear twice, once per
    metric, with ``is_primary_metric=False`` on one of the two entries.
    """

    task0 = Task(
        benchmark="sentiment_mlt",
        metric="f1",
        col_name="Sentiment Analysis (F1)",
        url="https://github.com/jerbarnes/typology_of_crosslingual/tree/master/data/sentiment/mt",
        task_type=TaskType.NLU,
    )
    task1 = Task(
        benchmark="sib200_mlt",
        metric="f1",
        col_name="SIB200 (F1)",
        url="https://huggingface.co/datasets/Davlan/sib200/viewer/mlt_Latn",
        task_type=TaskType.NLU,
    )
    task2 = Task(
        benchmark="taxi1500_mlt",
        metric="f1",
        col_name="Taxi1500 (F1)",
        url="https://github.com/cisnlp/Taxi1500",
        task_type=TaskType.NLU,
    )
    # NOTE(review): task3 and task4 use the metric key "loglikelihood" while
    # their column label says F1 — confirm this pairing is intended.
    task3 = Task(
        benchmark="maltese_news_categories",
        metric="loglikelihood",
        col_name="Maltese News Categories (F1)",
        url="https://huggingface.co/datasets/MLRS/maltese_news_categories",
        task_type=TaskType.NLU,
    )
    task4 = Task(
        benchmark="multieurlex_mlt",
        metric="loglikelihood",
        col_name="MultiEURLEX (F1)",
        url="https://huggingface.co/datasets/nlpaueb/multi_eurlex",
        task_type=TaskType.NLU,
    )
    task5 = Task(
        benchmark="belebele_mlt",
        metric="acc",
        col_name="Belebele (Accuracy)",
        url="https://huggingface.co/datasets/facebook/belebele/viewer/mlt_Latn",
        task_type=TaskType.NLU,
    )
    task6 = Task(
        benchmark="opus100_eng-mlt",
        metric="bleu",
        col_name="OPUS-100 EN→MT (BLEU)",
        url="https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed",
        task_type=TaskType.NLG,
        is_primary_metric=False,
    )
    task7 = Task(
        benchmark="opus100_eng-mlt",
        metric="chrf",
        col_name="OPUS-100 EN→MT (ChrF)",
        url="https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed",
        task_type=TaskType.NLG,
    )
    task8 = Task(
        benchmark="flores200_eng-mlt",
        metric="bleu",
        col_name="Flores-200 EN→MT (BLEU)",
        url="https://huggingface.co/datasets/Muennighoff/flores200",
        task_type=TaskType.NLG,
        is_primary_metric=False,
    )
    task9 = Task(
        benchmark="flores200_eng-mlt",
        metric="chrf",
        col_name="Flores-200 EN→MT (ChrF)",
        url="https://huggingface.co/datasets/Muennighoff/flores200",
        task_type=TaskType.NLG,
    )
    task10 = Task(
        benchmark="webnlg_mlt",
        metric="chrf",
        col_name="WebNLG (ChrF)",
        url="https://synalp.gitlabpages.inria.fr/webnlg-challenge/challenge_2023/",
        task_type=TaskType.NLG,
    )
    task11 = Task(
        benchmark="webnlg_mlt",
        metric="rouge",
        col_name="WebNLG (Rouge-L)",
        url="https://synalp.gitlabpages.inria.fr/webnlg-challenge/challenge_2023/",
        task_type=TaskType.NLG,
        is_primary_metric=False,
    )
    task12 = Task(
        benchmark="eurlexsum_mlt",
        metric="chrf",
        col_name="EUR-Lex-Sum (ChrF)",
        url="https://huggingface.co/datasets/dennlinger/eur-lex-sum",
        task_type=TaskType.NLG,
        is_primary_metric=False,
    )
    task13 = Task(
        benchmark="eurlexsum_mlt",
        metric="rouge",
        col_name="EUR-Lex-Sum (Rouge-L)",
        url="https://huggingface.co/datasets/dennlinger/eur-lex-sum",
        task_type=TaskType.NLG,
    )
    task14 = Task(
        benchmark="maltese_news_headlines",
        metric="chrf",
        col_name="Maltese News Headlines (ChrF)",
        url="https://huggingface.co/datasets/MLRS/maltese_news_headlines",
        task_type=TaskType.NLG,
        is_primary_metric=False,
    )
    task15 = Task(
        benchmark="maltese_news_headlines",
        metric="rouge",
        col_name="Maltese News Headlines (Rouge-L)",
        url="https://huggingface.co/datasets/MLRS/maltese_news_headlines",
        task_type=TaskType.NLG,
    )

NUM_FEWSHOT = 0  # Few-shot setting, currently 0; change it to match your own few-shot configuration
# ---------------------------------------------------



# Leaderboard title: an HTML <h1> holding the MELABench logo image followed by
# the word "Leaderboard".
TITLE = """
<h1 align="center" id="space-title">
<img src="https://raw.githubusercontent.com/MLRS/MELABench/refs/heads/main/logo.jpg" alt="MELABench logo" width="200px">
Leaderboard
</h1>
"""

# One-line description of what the leaderboard evaluates, as a centred HTML
# paragraph.
INTRODUCTION_TEXT = """
<p align="center">A Maltese Evaluation Language Benchmark 🇲🇹</p>
"""

# Which evaluations are you running? How can people reproduce what you have?

# Matches the trailing " (<metric>)" part of a column name, e.g. " (F1)".
_METRIC_SUFFIX = re.compile(r" \(.*\)$")


def _render_task_section(heading, links):
    """Return a markdown bullet for *heading* with one nested link bullet per entry of *links*."""
    nested = "\n".join(f"  - [{label}]({href})" for label, href in links.items())
    return f"- {heading}:\n" + nested + "\n"


# Maps each task type's display name to a {benchmark label: URL} dict. The
# label is the column name without its metric suffix, so a benchmark listed
# under several metrics ends up with a single entry.
tasks = {kind.value.display_name: {} for kind in TaskType}
for task in Tasks:
    tasks[task.value.task_type.value.display_name][
        _METRIC_SUFFIX.sub("", task.value.col_name)
    ] = task.value.url

# Markdown description of the benchmark: fixed intro, the generated per-type
# task list, then the fixed closing paragraphs.
LLM_BENCHMARKS_TEXT = "".join((
    """
MELABench evaluates language model capabilities on Maltese.
Currently, the following tasks are supported:
""",
    "\n".join(_render_task_section(heading, links) for heading, links in tasks.items()),
    """
The leaderboard is developed and maintained by people managing [MLRS](https://mlrs.research.um.edu.mt/).
We plan to expand our initial work with more tasks, if you would like to contribute your data, please reach out!
If you would like to include results for models/setups we did not include, we also accept submissions.

This work was introduced in [MELABenchv1: Benchmarking Large Language Models against Smaller Fine-Tuned Models for Low-Resource Maltese NLP](https://arxiv.org/abs/2506.04385).
""",
))

# Markdown instructions for contributing new results: points to the prompting
# instructions in the GitHub repository and lists what to upload.
EVALUATION_QUEUE_TEXT = """
To include new results on this benchmark, follow the instructions on our [GitHub Repository](https://github.com/MLRS/MELABench/tree/main/prompting).
You can then upload the output files which should include the configuration/results file and all the prediction files.
In addition, we ask for additional metadata about model training. 
"""

# Prompt text that accompanies the citation snippet below.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entry for the MELABenchv1 paper. Written as a raw string so the BibTeX
# source is taken literally, with no escape-sequence processing.
CITATION_BUTTON_TEXT = r"""
@inproceedings{micallef-borg-2025-melabenchv1,
    title = "{MELAB}enchv1: Benchmarking Large Language Models against Smaller Fine-Tuned Models for Low-Resource {M}altese {NLP}",
    author = "Micallef, Kurt  and
      Borg, Claudia",
    editor = "Che, Wanxiang  and
      Nabende, Joyce  and
      Shutova, Ekaterina  and
      Pilehvar, Mohammad Taher",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-acl.1053/",
    doi = "10.18653/v1/2025.findings-acl.1053",
    pages = "20505--20527",
    ISBN = "979-8-89176-256-5",
}
"""