Leaderboard configuration & descriptions.
Changed files:
- README.md +5 -5
- src/about.py +18 -4
- src/display/utils.py +3 -0
- src/envs.py +4 -4
- src/leaderboard/read_evals.py +21 -5
- src/submission/check_validity.py +1 -1
README.md
CHANGED
@@ -1,13 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Maltese MELABench Leaderboard 🇲🇹
+emoji: 🥇🇲🇹
+colorFrom: gray
+colorTo: red
 sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description:
+short_description: Evaluation of language models on Maltese tasks
 sdk_version: 5.19.0
 ---
src/about.py
CHANGED
@@ -12,8 +12,22 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("sentiment", "f1,none", "Sentiment Analysis (F1)")
+    task1 = Task("sib200", "f1,none", "SIB200 (F1)")
+    task2 = Task("taxi1500", "f1,none", "Taxi1500 (F1)")
+    task3 = Task("maltese_news_categories", "loglikelihood,none", "Maltese News Categories (F1)")
+    task4 = Task("multi_eurlex", "loglikelihood,none", "MultiEURLEX (F1)")
+    task5 = Task("belebele", "acc,none", "Belebele (Accuracy)")
+    task6 = Task("opus100_en-mt", "bleu,none", "OPUS-100 EN→MT (BLEU)")
+    task7 = Task("opus100_en-mt", "chrf,none", "OPUS-100 EN→MT (ChrF)")
+    task8 = Task("flores200_en-mt", "bleu,none", "Flores-200 EN→MT (BLEU)")
+    task9 = Task("flores200_en-mt", "chrf,none", "Flores-200 EN→MT (ChrF)")
+    task10 = Task("webnlg", "chrf,none", "WebNLG (ChrF)")
+    task11 = Task("webnlg", "rouge,none", "WebNLG (Rouge-L)")
+    task12 = Task("eurlex_sum", "chrf,none", "EUR-Lex-Sum (ChrF)")
+    task13 = Task("eurlex_sum", "rouge,none", "EUR-Lex-Sum (Rouge-L)")
+    task14 = Task("maltese_news_headlines", "chrf,none", "Maltese News Headlines (ChrF)")
+    task15 = Task("maltese_news_headlines", "rouge,none", "Maltese News Headlines (Rouge-L)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +35,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"
+TITLE = """<h1 align="center" id="space-title">🇲🇹 MELABench Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+A Maltese Evaluation Language Benchmark
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
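Each `Tasks` member pairs an lm-evaluation-harness task name with the metric key stored in the results JSON and the column name shown in the leaderboard. Below is a minimal sketch of how a results file maps onto these columns, assuming the template's `Task` dataclass (`benchmark`, `metric`, `col_name`); the scores are invented for illustration.

```python
from src.about import Tasks

# Illustrative results payload in the leaderboard-template format (scores are invented).
data = {
    "results": {
        "sentiment": {"f1,none": 0.61},
        "belebele": {"acc,none": 0.34},
        "opus100_en-mt": {"bleu,none": 12.4, "chrf,none": 38.2},
    }
}

# Mirror what read_evals.py does: look up each task's metric under its benchmark key.
row = {}
for task in Tasks:
    task = task.value
    score = data["results"].get(task.benchmark, {}).get(task.metric)
    if score is not None:
        row[task.col_name] = score

print(row)
# {'Sentiment Analysis (F1)': 0.61, 'Belebele (Accuracy)': 0.34,
#  'OPUS-100 EN→MT (BLEU)': 12.4, 'OPUS-100 EN→MT (ChrF)': 38.2}
```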
src/display/utils.py
CHANGED
@@ -89,11 +89,14 @@ class WeightType(Enum):
     Delta = ModelDetails("Delta")
 
 class Precision(Enum):
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
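The new `float32` member lets full-precision submissions resolve to a concrete precision instead of `?`. A quick usage sketch, assuming the template's usual fallback to `Precision.Unknown` for unrecognised strings:

```python
from src.display.utils import Precision

# Dtype strings come from the request file / results config ("model_dtype").
assert Precision.from_str("torch.float32") is Precision.float32
assert Precision.from_str("float32") is Precision.float32
assert Precision.from_str("bfloat16") is Precision.bfloat16
assert Precision.from_str("int8") is Precision.Unknown  # unrecognised dtypes stay "?"
```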
src/envs.py
CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "
+OWNER = "MLRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
-QUEUE_REPO = f"{OWNER}/
-RESULTS_REPO = f"{OWNER}/
+REPO_ID = f"{OWNER}/MELABench"
+QUEUE_REPO = f"{OWNER}/MELABench_requests"
+RESULTS_REPO = f"{OWNER}/MELABench_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
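`REPO_ID` names the Space itself, while `QUEUE_REPO` and `RESULTS_REPO` point at the request and results datasets under the MLRS organisation. A minimal sketch of how the template typically pulls both datasets down at startup, using illustrative local paths (the template keeps its own path constants in `envs.py`):

```python
import os
from huggingface_hub import snapshot_download

from src.envs import QUEUE_REPO, RESULTS_REPO, TOKEN, CACHE_PATH

# Illustrative local targets for the two dataset snapshots.
eval_requests_path = os.path.join(CACHE_PATH, "eval-queue")
eval_results_path = os.path.join(CACHE_PATH, "eval-results")

# Pull the request queue and the per-model result files from their dataset repos.
snapshot_download(repo_id=QUEUE_REPO, repo_type="dataset",
                  local_dir=eval_requests_path, token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, repo_type="dataset",
                  local_dir=eval_results_path, token=TOKEN)
```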
src/leaderboard/read_evals.py
CHANGED
@@ -44,7 +44,7 @@ class EvalResult:
         precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
-        org_and_model = config.get("model_name",
+        org_and_model = config.get("model_name", None)
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -57,14 +57,27 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
+        revision = config.get("model_sha", config.get("model_revision", "main"))
+
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model,
+            full_model, revision, trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
+        license = "?"
+        likes = 0
+        if still_on_hub:
+            try:
+                model_info = API.model_info(repo_id=full_model, revision=revision)
+                if not model_size:
+                    model_size = get_model_size(model_info=model_info, precision=precision)
+                license = model_info.cardData.get("license")
+                likes = model_info.likes
+            except Exception:
+                pass
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -76,7 +89,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -86,9 +99,12 @@ class EvalResult:
             model=model,
             results=results,
             precision=precision,
-            revision=
+            revision=revision,
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            likes=likes,
+            num_params=round(model_size / 1e9, 3),
+            license=license,
         )
 
     def update_with_request_file(self, requests_path):
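The revision now falls back from `model_sha` to `model_revision` to `"main"`, and models still on the Hub get their license, likes and parameter count pulled from Hub metadata (note that `model_size` is read here but not assigned within this hunk, so it is presumably initialised earlier in the file). Below is a self-contained sketch of the same best-effort lookup, assuming only `huggingface_hub`; the function name is illustrative, not one of the repo's own helpers.

```python
from huggingface_hub import HfApi

api = HfApi()

def hub_metadata(full_model: str, revision: str = "main"):
    """Best-effort license/likes/size lookup, mirroring the guarded block in the diff."""
    license, likes, total_params = "?", 0, 0
    try:
        info = api.model_info(repo_id=full_model, revision=revision)
        likes = info.likes or 0
        if info.cardData:
            license = info.cardData.get("license", "?")
        if info.safetensors:
            total_params = info.safetensors["total"]  # raw parameter count
    except Exception:
        pass  # gated, deleted, or missing metadata: keep the placeholders
    return license, likes, round(total_params / 1e9, 3)  # size in billions of parameters

# Example: hub_metadata("bert-base-multilingual-cased")  # -> (license, likes, params_in_B)
```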
src/submission/check_validity.py
CHANGED
@@ -62,7 +62,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
 def get_model_size(model_info: ModelInfo, precision: str):
     """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
     try:
-        model_size =
+        model_size = model_info.safetensors["total"]
     except (AttributeError, TypeError):
         return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
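`model_info.safetensors["total"]` is the total parameter count reported in the model's safetensors metadata; the conversion to billions now happens in `read_evals.py` (`num_params=round(model_size / 1e9, 3)`), so `get_model_size` returns the raw count. A tiny worked example with an illustrative count:

```python
# Illustrative: a checkpoint whose safetensors metadata reports ~7.24B parameters.
safetensors_total = 7_241_732_096              # what model_info.safetensors["total"] would return
num_params = round(safetensors_total / 1e9, 3)
print(num_params)                              # 7.242 -> parameter count (in B) shown on the leaderboard
```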