KurtMica committed on
Commit 1b780de · 1 Parent(s): 2b848e2

Leaderboard configuration & descriptions.

README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title: Example Leaderboard Template
-emoji: 🥇
-colorFrom: green
-colorTo: indigo
+title: Maltese MELABench Leaderboard 🇲🇹
+emoji: 🥇🇲🇹
+colorFrom: gray
+colorTo: red
 sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: Duplicate this leaderboard to initialize your own!
+short_description: Evaluation of language models on Maltese tasks
 sdk_version: 5.19.0
 ---
src/about.py CHANGED
@@ -12,8 +12,22 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("sentiment", "f1,none", "Sentiment Analysis (F1)")
+    task1 = Task("sib200", "f1,none", "SIB200 (F1)")
+    task2 = Task("taxi1500", "f1,none", "Taxi1500 (F1)")
+    task3 = Task("maltese_news_categories", "loglikelihood,none", "Maltese News Categories (F1)")
+    task4 = Task("multi_eurlex", "loglikelihood,none", "MultiEURLEX (F1)")
+    task5 = Task("belebele", "acc,none", "Belebele (Accuracy)")
+    task6 = Task("opus100_en-mt", "bleu,none", "OPUS-100 EN→MT (BLEU)")
+    task7 = Task("opus100_en-mt", "chrf,none", "OPUS-100 EN→MT (ChrF)")
+    task8 = Task("flores200_en-mt", "bleu,none", "Flores-200 EN→MT (BLEU)")
+    task9 = Task("flores200_en-mt", "chrf,none", "Flores-200 EN→MT (ChrF)")
+    task10 = Task("webnlg", "chrf,none", "WebNLG (ChrF)")
+    task11 = Task("webnlg", "rouge,none", "WebNLG (Rouge-L)")
+    task12 = Task("eurlex_sum", "chrf,none", "EUR-Lex-Sum (ChrF)")
+    task13 = Task("eurlex_sum", "rouge,none", "EUR-Lex-Sum (Rouge-L)")
+    task14 = Task("maltese_news_headlines", "chrf,none", "Maltese News Headlines (ChrF)")
+    task15 = Task("maltese_news_headlines", "rouge,none", "Maltese News Headlines (Rouge-L)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +35,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🇲🇹 MELABench Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+A Maltese Evaluation Language Benchmark
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
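Each Task entry pairs a benchmark key and a metric key (as they appear in the result files) with a display name, so the same benchmark can surface as several leaderboard columns under different metrics. A minimal, hypothetical sketch of that lookup; the payload below is illustrative, not a real result file:

```python
# Hypothetical result-file fragment in the lm-evaluation-harness style implied above;
# benchmark and metric keys mirror the Tasks enum, the scores are made up.
data = {
    "results": {
        "sentiment": {"f1,none": 0.61},
        "belebele": {"acc,none": 0.42},
        "opus100_en-mt": {"bleu,none": 18.7, "chrf,none": 46.2},
    }
}

# Each task's score is read as data["results"][benchmark][metric], so
# ("opus100_en-mt", "bleu,none") and ("opus100_en-mt", "chrf,none") become two columns.
for benchmark, metric, column in [
    ("sentiment", "f1,none", "Sentiment Analysis (F1)"),
    ("opus100_en-mt", "bleu,none", "OPUS-100 EN→MT (BLEU)"),
    ("opus100_en-mt", "chrf,none", "OPUS-100 EN→MT (ChrF)"),
]:
    score = data["results"].get(benchmark, {}).get(metric)
    print(f"{column}: {score}")
```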
src/display/utils.py CHANGED
@@ -89,11 +89,14 @@ class WeightType(Enum):
     Delta = ModelDetails("Delta")
 
 class Precision(Enum):
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
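With the new member, a dtype recorded either as "torch.float32" or as plain "float32" resolves to the same enum value instead of falling back to Unknown. A quick sketch, assuming the patched enum is importable as src.display.utils.Precision:

```python
from src.display.utils import Precision  # enum extended above

# Both spellings of the dtype now map onto the new float32 member.
assert Precision.from_str("torch.float32") is Precision.float32
assert Precision.from_str("float32") is Precision.float32
```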
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "MLRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/MELABench"
+QUEUE_REPO = f"{OWNER}/MELABench_requests"
+RESULTS_REPO = f"{OWNER}/MELABench_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
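REPO_ID points at the Space itself, while QUEUE_REPO and RESULTS_REPO are the repositories holding submission requests and evaluation outputs. A hedged sketch of how such repos are typically mirrored locally at startup; the directory names and the dataset repo type are assumptions, not taken from this commit:

```python
import os
from huggingface_hub import snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
CACHE_PATH = os.getenv("HF_HOME", ".")
QUEUE_REPO = "MLRS/MELABench_requests"
RESULTS_REPO = "MLRS/MELABench_results"

# Pull both repositories into the local cache so the app can read pending
# requests and finished results from disk instead of calling the Hub repeatedly.
for repo_id, local_dir in [
    (QUEUE_REPO, os.path.join(CACHE_PATH, "eval-queue")),
    (RESULTS_REPO, os.path.join(CACHE_PATH, "eval-results")),
]:
    snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset", token=TOKEN)
```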
src/leaderboard/read_evals.py CHANGED
@@ -44,7 +44,7 @@ class EvalResult:
         precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = config.get("model_name", None)
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -57,14 +57,27 @@
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
+        revision = config.get("model_sha", config.get("model_revision", "main"))
+
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision, trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
+        license = "?"
+        likes = 0
+        if still_on_hub:
+            try:
+                model_info = API.model_info(repo_id=full_model, revision=revision)
+                if not model_size:
+                    model_size = get_model_size(model_info=model_info, precision=precision)
+                license = model_info.cardData.get("license")
+                likes = model_info.likes
+            except Exception:
+                pass
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -76,7 +89,7 @@
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs) * 100.0
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -86,9 +99,12 @@
             model=model,
             results=results,
             precision=precision,
-            revision= config.get("model_sha", ""),
+            revision=revision,
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            likes=likes,
+            num_params=round(model_size / 1e9, 3),
+            license=license,
         )
 
     def update_with_request_file(self, requests_path):
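Two of these edits shift where scaling happens rather than change what is computed: mean_acc keeps each metric's native scale (no × 100.0), and the division by 1e9 for num_params now lives here because get_model_size (next file) returns a raw parameter count. A small illustrative sketch of the revision fallback and the parameter-count conversion; the values are made up:

```python
# Revision resolution now falls back from model_sha to model_revision before "main".
config = {"model_revision": "abc123"}  # hypothetical config fragment
revision = config.get("model_sha", config.get("model_revision", "main"))
print(revision)  # abc123

# get_model_size returns a raw parameter count; num_params is reported in billions.
model_size = 3_210_000_000  # illustrative count for a ~3.2B-parameter model
print(round(model_size / 1e9, 3))  # 3.21
```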
src/submission/check_validity.py CHANGED
@@ -62,7 +62,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
 def get_model_size(model_info: ModelInfo, precision: str):
     """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
     try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+        model_size = model_info.safetensors["total"]
     except (AttributeError, TypeError):
         return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
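safetensors["total"] is the total parameter count reported in a model's safetensors metadata; with the rounding removed here, the raw count is returned and read_evals converts it to billions. A hedged usage sketch: the repo id is only an example, and the subscript access mirrors the template's own try/except, since the metadata may be absent or shaped differently across huggingface_hub versions:

```python
from huggingface_hub import HfApi

api = HfApi()
info = api.model_info("MLRS/BERTu")  # example repo id; any model with safetensors weights will do

try:
    total_params = info.safetensors["total"]  # raw parameter count, as used above
except (AttributeError, TypeError):
    total_params = 0  # unknown size, mirroring get_model_size's fallback

print(round(total_params / 1e9, 3), "B parameters")
```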