Contamination + main page styling

Changed files:
- app.py +11 -20
- src/about.py +16 -5
- src/display/css_html_js.py +9 -1
- src/display/formatting.py +5 -0
- src/leaderboard/read_evals.py +6 -2
- src/populate.py +2 -0
app.py CHANGED

@@ -1,5 +1,5 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 #from huggingface_hub import snapshot_download

@@ -62,25 +62,16 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 def init_leaderboard(dataframe):
     #if dataframe is None or dataframe.empty:
     #raise ValueError("Leaderboard DataFrame is empty or None.")
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_source.name, type="checkboxgroup", label="Model Source"),
-            ColumnFilter(AutoEvalColumn.model_category.name, type="checkboxgroup", label="Model Category"),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=True,
-    )
+    return gr.Dataframe(
+        value=dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]],
+        datatype="markdown",
+        wrap=True,
+        show_fullscreen_button=False,
+        interactive=False,
+        column_widths=[20, 60, 40, 150, 60, 70, 70],
+        max_height=420,
+        elem_classes="leaderboard_col_style",
+    )
 
 
 demo = gr.Blocks(css=custom_css)
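For context on what this hunk does at runtime: `init_leaderboard` now returns a plain, read-only `gr.Dataframe` instead of the interactive `gradio_leaderboard.Leaderboard` component, which is why the column filters in the removed lines go away. A minimal, self-contained sketch of the new rendering path, using only parameters that appear in the diff (the toy data and column names are made up for illustration):

# Minimal sketch of the new rendering path: a static gr.Dataframe with
# markdown cells (toy data; column names are invented for this example).
import gradio as gr
import pandas as pd

df = pd.DataFrame({
    "Rank": [1, 2],
    "Model": ["[model-a](https://huggingface.co/model-a)",
              "[model-b](https://huggingface.co/model-b)"],
    "Score": [71.3, 68.9],
})

with gr.Blocks() as demo:
    gr.Dataframe(
        value=df,
        datatype="markdown",   # render the model links as markdown
        wrap=True,
        interactive=False,     # read-only, unlike the old Leaderboard component
        max_height=420,
    )

if __name__ == "__main__":
    demo.launch()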
src/about.py CHANGED

@@ -10,7 +10,7 @@ class EvalDimension:
 # Select your tasks here
 # ---------------------------------------------------
 class EvalDimensions(Enum):
-    d0 = EvalDimension("speed", "Speed (
+    d0 = EvalDimension("speed", "Speed (words/sec)")
     d1 = EvalDimension("contamination_score", "Contamination Score")
 
 NUM_FEWSHOT = 0 # Change with your few shot

@@ -23,8 +23,10 @@ TITLE = """<img src='https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Lea
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-<h1 style='width: 100%;text-align: center;' id="space-title">
+<h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) is the first comprehensive leaderboard for Arabic LLMs</h1>
+ABL is the official leaderboard of the <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>. ABB is a compact yet comprehensive benchmark aiming to evaluate Arabic LLMs from all angles. The benchmark consists of <b>450</b> high-quality questions sampled from <b>63</b> Arabic benchmarking datasets, covering <b>22 categories</b>, some of which, such as diacritization and dialects, are unique to the Arabic language. Find more details in the About tab.
+<br/>
+<br/>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?

@@ -66,8 +68,17 @@ Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard"
 CITATION_BUTTON_TEXT = r"""
+
+@misc{ABL,
+  author = {SILMA AI Team},
+  title = {Arabic Broad Leaderboard},
+  year = {2025},
+  publisher = {SILMA.AI},
+  howpublished = "{\url{https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard}}"
+}
+
 """
 
-FOOTER_TEXT = """<div style='display:flex;justify-content:center;align-items:center;'><span style='font-size:
+FOOTER_TEXT = """<div style='display:flex;justify-content:center;align-items:center;'><span style='font-size:36px;font-weight:bold;margin-right:20px;'>Sponsored By</span><a href='https://silma.ai/?ref=abl' target='_blank'><img src='https://blog.silma.ai/wp-content/uploads/2024/10/cropped-silma-logo-box.png' style='height:60px'></a></div>"""
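The `EvalDimensions` enum in the first hunk above holds `EvalDimension` instances whose definition sits outside this diff. A sketch of what that dataclass presumably looks like, with field names inferred from the `eval_dim.value.metric` / `eval_dim.value.col_name` accesses in src/leaderboard/read_evals.py below:

# Inferred sketch of EvalDimension (the real definition is outside this diff);
# field names follow the accesses in read_evals.py.
from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class EvalDimension:
    metric: str    # key into a model's results dict, e.g. "speed"
    col_name: str  # column header shown on the leaderboard

class EvalDimensions(Enum):
    d0 = EvalDimension("speed", "Speed (words/sec)")
    d1 = EvalDimension("contamination_score", "Contamination Score")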
src/display/css_html_js.py CHANGED

@@ -47,7 +47,7 @@ custom_css = """
 }
 
 .tab-buttons button {
-    font-size:
+    font-size: 20px;
 }
 
 #scale-logo {

@@ -94,6 +94,14 @@ custom_css = """
 #box-filter > .form{
   border: 0
 }
+
+.leaderboard_col_style{
+
+}
+.leaderboard_col_style th button {
+  font-size: 14px !important
+}
+
 """
 
 get_window_url_params = """
src/display/formatting.py CHANGED

@@ -6,6 +6,11 @@ def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
+def make_contamination_red(contamination_score):
+    if contamination_score <= 0:
+        return f"<div style='background-color:green;padding:5px;color: white; text-align: center;margin:0px' title='Clean model!'>{round(contamination_score)}</div>"
+    else:
+        return f"<div style='background-color:red;padding:5px;color: white; text-align: center;margin:0px' title='Contaminated model!'>{round(contamination_score, 2)}</div>"
 
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
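A quick check of the helper's two branches: a score of zero or below renders as a green "clean" badge, anything positive as a red badge rounded to two decimals (the scores here are made up):

# Exercising both branches of make_contamination_red (scores are made up).
from src.display.formatting import make_contamination_red

print(make_contamination_red(0))     # green badge, tooltip "Clean model!", shows 0
print(make_contamination_red(0.37))  # red badge, tooltip "Contaminated model!", shows 0.37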
src/leaderboard/read_evals.py CHANGED

@@ -7,7 +7,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np
 
-from src.display.formatting import make_clickable_model
+from src.display.formatting import make_clickable_model, make_contamination_red
 from src.display.utils import AutoEvalColumn, EvalDimensions #, ModelType, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub

@@ -135,7 +135,11 @@ class EvalResult:
         }
 
         for eval_dim in EvalDimensions:
+            dimension_name = eval_dim.value.col_name
+            dimension_value = self.results[eval_dim.value.metric]
+            if dimension_name == "Contamination Score":
+                dimension_value = make_contamination_red(dimension_value)
+            data_dict[dimension_name] = dimension_value
 
         return data_dict
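To make the new loop concrete, a toy walkthrough with a hand-written `results` dict (the real one is parsed from a model's eval results file; `EvalDimensions` and `make_contamination_red` are the imports shown above):

# Toy walkthrough of the new to_dict loop (results values are made up).
from src.display.formatting import make_contamination_red
from src.display.utils import EvalDimensions

results = {"speed": 42.5, "contamination_score": 0.12}
data_dict = {}

for eval_dim in EvalDimensions:
    dimension_name = eval_dim.value.col_name
    dimension_value = results[eval_dim.value.metric]
    if dimension_name == "Contamination Score":
        # wrap the raw score in the red/green HTML badge
        dimension_value = make_contamination_red(dimension_value)
    data_dict[dimension_name] = dimension_value

# data_dict now maps "Speed (words/sec)" to 42.5 and "Contamination Score"
# to a red <div> badge containing 0.12.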
src/populate.py CHANGED

@@ -25,6 +25,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         df.insert(0, "Rank", range(1, len(df) + 1))
         df = df[cols].round(decimals=2)
         print(df)
+
+
         return df
     else:
         return pd.DataFrame(columns=cols)
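For reference, a self-contained sketch of the ranking step these touched lines sit in (the column names are made up, and the dataframe is assumed to be pre-sorted by score upstream):

# Toy sketch of the Rank/rounding step above (column names are invented;
# the dataframe is assumed to be pre-sorted by score).
import pandas as pd

cols = ["Rank", "Model", "Average"]
df = pd.DataFrame({"Model": ["model-a", "model-b"], "Average": [71.333, 68.912]})

df.insert(0, "Rank", range(1, len(df) + 1))  # 1-based rank in current row order
df = df[cols].round(decimals=2)              # keep leaderboard columns, 2 decimals
print(df)
#    Rank    Model  Average
# 0     1  model-a    71.33
# 1     2  model-b    68.91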