KurtMica committed on
Commit 1b780de · 1 Parent(s): 2b848e2

Leaderboard configuration & descriptions.

README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title: Example Leaderboard Template
-emoji: 🥇
-colorFrom: green
-colorTo: indigo
+title: Maltese MELABench Leaderboard 🇲🇹
+emoji: 🥇🇲🇹
+colorFrom: gray
+colorTo: red
 sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: Duplicate this leaderboard to initialize your own!
+short_description: Evaluation of language models on Maltese tasks
 sdk_version: 5.19.0
 ---
src/about.py CHANGED
@@ -12,8 +12,22 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("sentiment", "f1,none", "Sentiment Analysis (F1)")
+    task1 = Task("sib200", "f1,none", "SIB200 (F1)")
+    task2 = Task("taxi1500", "f1,none", "Taxi1500 (F1)")
+    task3 = Task("maltese_news_categories", "loglikelihood,none", "Maltese News Categories (F1)")
+    task4 = Task("multi_eurlex", "loglikelihood,none", "MultiEURLEX (F1)")
+    task5 = Task("belebele", "acc,none", "Belebele (Accuracy)")
+    task6 = Task("opus100_en-mt", "bleu,none", "OPUS-100 EN→MT (BLEU)")
+    task7 = Task("opus100_en-mt", "chrf,none", "OPUS-100 EN→MT (ChrF)")
+    task8 = Task("flores200_en-mt", "bleu,none", "Flores-200 EN→MT (BLEU)")
+    task9 = Task("flores200_en-mt", "chrf,none", "Flores-200 EN→MT (ChrF)")
+    task10 = Task("webnlg", "chrf,none", "WebNLG (ChrF)")
+    task11 = Task("webnlg", "rouge,none", "WebNLG (Rouge-L)")
+    task12 = Task("eurlex_sum", "chrf,none", "EUR-Lex-Sum (ChrF)")
+    task13 = Task("eurlex_sum", "rouge,none", "EUR-Lex-Sum (Rouge-L)")
+    task14 = Task("maltese_news_headlines", "chrf,none", "Maltese News Headlines (ChrF)")
+    task15 = Task("maltese_news_headlines", "rouge,none", "Maltese News Headlines (Rouge-L)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +35,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🇲🇹 MELABench Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+A Maltese Evaluation Language Benchmark
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
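Each Task entry pairs a benchmark key and a metric key (as they appear in the result files) with a display name, so the same benchmark can surface as several leaderboard columns under different metrics. A minimal, hypothetical sketch of that lookup; the payload below is illustrative, not a real result file:

```python
# Hypothetical result-file fragment in the lm-evaluation-harness style implied above;
# benchmark and metric keys mirror the Tasks enum, the scores are made up.
data = {
    "results": {
        "sentiment": {"f1,none": 0.61},
        "belebele": {"acc,none": 0.42},
        "opus100_en-mt": {"bleu,none": 18.7, "chrf,none": 46.2},
    }
}

# Each task's score is read as data["results"][benchmark][metric], so
# ("opus100_en-mt", "bleu,none") and ("opus100_en-mt", "chrf,none") become two columns.
for benchmark, metric, column in [
    ("sentiment", "f1,none", "Sentiment Analysis (F1)"),
    ("opus100_en-mt", "bleu,none", "OPUS-100 EN→MT (BLEU)"),
    ("opus100_en-mt", "chrf,none", "OPUS-100 EN→MT (ChrF)"),
]:
    score = data["results"].get(benchmark, {}).get(metric)
    print(f"{column}: {score}")
```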
src/display/utils.py CHANGED
@@ -89,11 +89,14 @@ class WeightType(Enum):
     Delta = ModelDetails("Delta")
 
 class Precision(Enum):
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
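With the new member, a dtype recorded either as "torch.float32" or as plain "float32" resolves to the same enum value instead of falling back to Unknown. A quick sketch, assuming the patched enum is importable as src.display.utils.Precision:

```python
from src.display.utils import Precision  # enum extended above

# Both spellings of the dtype now map onto the new float32 member.
assert Precision.from_str("torch.float32") is Precision.float32
assert Precision.from_str("float32") is Precision.float32
```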
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "MLRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/MELABench"
+QUEUE_REPO = f"{OWNER}/MELABench_requests"
+RESULTS_REPO = f"{OWNER}/MELABench_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
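REPO_ID points at the Space itself, while QUEUE_REPO and RESULTS_REPO are the repositories holding submission requests and evaluation outputs. A hedged sketch of how such repos are typically mirrored locally at startup; the directory names and the dataset repo type are assumptions, not taken from this commit:

```python
import os
from huggingface_hub import snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
CACHE_PATH = os.getenv("HF_HOME", ".")
QUEUE_REPO = "MLRS/MELABench_requests"
RESULTS_REPO = "MLRS/MELABench_results"

# Pull both repositories into the local cache so the app can read pending
# requests and finished results from disk instead of calling the Hub repeatedly.
for repo_id, local_dir in [
    (QUEUE_REPO, os.path.join(CACHE_PATH, "eval-queue")),
    (RESULTS_REPO, os.path.join(CACHE_PATH, "eval-results")),
]:
    snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset", token=TOKEN)
```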
src/leaderboard/read_evals.py CHANGED
@@ -44,7 +44,7 @@ class EvalResult:
         precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = config.get("model_name", None)
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -57,14 +57,27 @@
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
+        revision = config.get("model_sha", config.get("model_revision", "main"))
+
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision, trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
+        license = "?"
+        likes = 0
+        if still_on_hub:
+            try:
+                model_info = API.model_info(repo_id=full_model, revision=revision)
+                if not model_size:
+                    model_size = get_model_size(model_info=model_info, precision=precision)
+                license = model_info.cardData.get("license")
+                likes = model_info.likes
+            except Exception:
+                pass
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -76,7 +89,7 @@
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs) * 100.0
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -86,9 +99,12 @@
             model=model,
             results=results,
             precision=precision,
-            revision= config.get("model_sha", ""),
+            revision=revision,
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            likes=likes,
+            num_params=round(model_size / 1e9, 3),
+            license=license,
         )
 
     def update_with_request_file(self, requests_path):
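Two of these edits shift where scaling happens rather than change what is computed: mean_acc keeps each metric's native scale (no × 100.0), and the division by 1e9 for num_params now lives here because get_model_size (next file) returns a raw parameter count. A small illustrative sketch of the revision fallback and the parameter-count conversion; the values are made up:

```python
# Revision resolution now falls back from model_sha to model_revision before "main".
config = {"model_revision": "abc123"}  # hypothetical config fragment
revision = config.get("model_sha", config.get("model_revision", "main"))
print(revision)  # abc123

# get_model_size returns a raw parameter count; num_params is reported in billions.
model_size = 3_210_000_000  # illustrative count for a ~3.2B-parameter model
print(round(model_size / 1e9, 3))  # 3.21
```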
src/submission/check_validity.py CHANGED
@@ -62,7 +62,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
 def get_model_size(model_info: ModelInfo, precision: str):
     """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
     try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+        model_size = model_info.safetensors["total"]
     except (AttributeError, TypeError):
         return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
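safetensors["total"] is the total parameter count reported in a model's safetensors metadata; with the rounding removed here, the raw count is returned and read_evals converts it to billions. A hedged usage sketch: the repo id is only an example, and the subscript access mirrors the template's own try/except, since the metadata may be absent or shaped differently across huggingface_hub versions:

```python
from huggingface_hub import HfApi

api = HfApi()
info = api.model_info("MLRS/BERTu")  # example repo id; any model with safetensors weights will do

try:
    total_params = info.safetensors["total"]  # raw parameter count, as used above
except (AttributeError, TypeError):
    total_params = 0  # unknown size, mirroring get_model_size's fallback

print(round(total_params / 1e9, 3), "B parameters")
```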