from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class TaskType(Enum):
    NLU = TaskDetails("nlu", "NLU", "🧠")
    NLG = TaskDetails("nlg", "NLG", "✍️")


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    task_type: TaskType
    is_primary_metric: bool = True


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", TaskType.NLU)
    task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", TaskType.NLU)
    task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", TaskType.NLU)
    task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", TaskType.NLU)
    task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", TaskType.NLU)
    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", TaskType.NLU)
    task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", TaskType.NLG, False)
    task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", TaskType.NLG)
    task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", TaskType.NLG, False)
    task9 = Task("flores200_eng-mlt", "chrf", "Flores-200 EN→MT (ChrF)", TaskType.NLG)
    task10 = Task("webnlg_mlt", "chrf", "WebNLG (ChrF)", TaskType.NLG)
    task11 = Task("webnlg_mlt", "rouge", "WebNLG (Rouge-L)", TaskType.NLG, False)
    task12 = Task("eurlexsum_mlt", "chrf", "EUR-Lex-Sum (ChrF)", TaskType.NLG, False)
    task13 = Task("eurlexsum_mlt", "rouge", "EUR-Lex-Sum (Rouge-L)", TaskType.NLG)
    task14 = Task("maltese_news_headlines", "chrf", "Maltese News Headlines (ChrF)", TaskType.NLG, False)
    task15 = Task("maltese_news_headlines", "rouge", "Maltese News Headlines (Rouge-L)", TaskType.NLG)
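

# Illustrative usage sketch (a hypothetical helper, not part of the original file): it only
# shows how the `Tasks` enum above is typically consumed, e.g. to collect the display
# column of every task whose metric is flagged as primary.
def primary_metric_columns() -> list[str]:
    """Return the col_name of each task marked as a primary metric."""
    return [task.value.col_name for task in Tasks if task.value.is_primary_metric]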

NUM_FEWSHOT = 0  # Change to your number of few-shot examples
# ---------------------------------------------------

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🇲🇹 MELABench Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
A Maltese Evaluation Language Benchmark
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:
"""

EVALUATION_QUEUE_TEXT = """
To include new results on this benchmark, follow the instructions in our [GitHub Repository](https://github.com/MLRS/MELABench/tree/main/prompting).
You can then upload the output files, which should include the configuration/results file and all of the prediction files.
We also ask for additional metadata about model training.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""