from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class TaskType(Enum):
    NLU = TaskDetails("nlu", "NLU", "🧠")
    NLG = TaskDetails("nlg", "NLG", "✍️")


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    task_type: TaskType
    is_primary_metric: bool = True


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", TaskType.NLU)
    task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", TaskType.NLU)
    task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", TaskType.NLU)
    task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", TaskType.NLU)
    task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", TaskType.NLU)
    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", TaskType.NLU)
    task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", TaskType.NLG, False)
    task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", TaskType.NLG)
    task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", TaskType.NLG, False)
    task9 = Task("flores200_eng-mlt", "chrf", "Flores-200 EN→MT (ChrF)", TaskType.NLG)
    task10 = Task("webnlg_mlt", "chrf", "WebNLG (ChrF)", TaskType.NLG)
    task11 = Task("webnlg_mlt", "rouge", "WebNLG (Rouge-L)", TaskType.NLG, False)
    task12 = Task("eurlexsum_mlt", "chrf", "EUR-Lex-Sum (ChrF)", TaskType.NLG, False)
    task13 = Task("eurlexsum_mlt", "rouge", "EUR-Lex-Sum (Rouge-L)", TaskType.NLG)
    task14 = Task("maltese_news_headlines", "chrf", "Maltese News Headlines (ChrF)", TaskType.NLG, False)
    task15 = Task("maltese_news_headlines", "rouge", "Maltese News Headlines (Rouge-L)", TaskType.NLG)
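

# Illustrative usage sketch (a hypothetical helper, not part of the original file): it only
# shows how the `Tasks` enum above is typically consumed, e.g. to collect the display
# column of every task whose metric is flagged as primary.
def primary_metric_columns() -> list[str]:
    """Return the col_name of each task marked as a primary metric."""
    return [task.value.col_name for task in Tasks if task.value.is_primary_metric]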

NUM_FEWSHOT = 0  # Change to your number of few-shot examples
# ---------------------------------------------------

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🇲🇹 MELABench Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
A Maltese Evaluation Language Benchmark
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:
"""

EVALUATION_QUEUE_TEXT = """
To include new results on this benchmark, follow the instructions in our [GitHub Repository](https://github.com/MLRS/MELABench/tree/main/prompting).
You can then upload the output files, which should include the configuration/results file and all of the prediction files.
We also ask for additional metadata about model training.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""