Contamination + main page styling

Changed files:
- app.py +11 -20
- src/about.py +16 -5
- src/display/css_html_js.py +9 -1
- src/display/formatting.py +5 -0
- src/leaderboard/read_evals.py +6 -2
- src/populate.py +2 -0
app.py CHANGED

@@ -1,5 +1,5 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 #from huggingface_hub import snapshot_download

@@ -62,25 +62,16 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 def init_leaderboard(dataframe):
     #if dataframe is None or dataframe.empty:
     #raise ValueError("Leaderboard DataFrame is empty or None.")
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_source.name, type="checkboxgroup", label="Model Source"),
-            ColumnFilter(AutoEvalColumn.model_category.name, type="checkboxgroup", label="Model Category"),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=True,
-    )
+    return gr.Dataframe(
+        value=dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]],
+        datatype="markdown",
+        wrap=True,
+        show_fullscreen_button=False,
+        interactive=False,
+        column_widths=[20, 60, 40, 150, 60, 70, 70],
+        max_height=420,
+        elem_classes="leaderboard_col_style",
+    )
 
 
 demo = gr.Blocks(css=custom_css)
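For context on what this hunk does at runtime: `init_leaderboard` now returns a plain, read-only `gr.Dataframe` instead of the interactive `gradio_leaderboard.Leaderboard` component, which is why the column filters in the removed lines go away. A minimal, self-contained sketch of the new rendering path, using only parameters that appear in the diff (the toy data and column names are made up for illustration):

# Minimal sketch of the new rendering path: a static gr.Dataframe with
# markdown cells (toy data; column names are invented for this example).
import gradio as gr
import pandas as pd

df = pd.DataFrame({
    "Rank": [1, 2],
    "Model": ["[model-a](https://huggingface.co/model-a)",
              "[model-b](https://huggingface.co/model-b)"],
    "Score": [71.3, 68.9],
})

with gr.Blocks() as demo:
    gr.Dataframe(
        value=df,
        datatype="markdown",   # render the model links as markdown
        wrap=True,
        interactive=False,     # read-only, unlike the old Leaderboard component
        max_height=420,
    )

if __name__ == "__main__":
    demo.launch()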
src/about.py CHANGED

@@ -10,7 +10,7 @@ class EvalDimension:
 # Select your tasks here
 # ---------------------------------------------------
 class EvalDimensions(Enum):
-    d0 = EvalDimension("speed", "Speed (
+    d0 = EvalDimension("speed", "Speed (words/sec)")
     d1 = EvalDimension("contamination_score", "Contamination Score")
 
 NUM_FEWSHOT = 0 # Change with your few shot

@@ -23,8 +23,10 @@ TITLE = """<img src='https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Lea
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-<h1 style='width: 100%;text-align: center;' id="space-title">
+<h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) is the first comprehensive leaderboard for Arabic LLMs</h1>
+ABL is the official leaderboard of the <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>. ABB is a compact yet comprehensive benchmark aiming to evaluate Arabic LLMs from all angles. The benchmark consists of <b>450</b> high-quality questions sampled from <b>63</b> Arabic benchmarking datasets, covering <b>22 categories</b>, some of which, such as diacritization and dialects, are unique to the Arabic language. Find more details in the About tab.
+<br/>
+<br/>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?

@@ -66,8 +68,17 @@ Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard"
 CITATION_BUTTON_TEXT = r"""
+
+@misc{ABL,
+  author = {SILMA AI Team},
+  title = {Arabic Broad Leaderboard},
+  year = {2025},
+  publisher = {SILMA.AI},
+  howpublished = "{\url{https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard}}"
+}
+
 """
 
-FOOTER_TEXT = """<div style='display:flex;justify-content:center;align-items:center;'><span style='font-size:
+FOOTER_TEXT = """<div style='display:flex;justify-content:center;align-items:center;'><span style='font-size:36px;font-weight:bold;margin-right:20px;'>Sponsored By</span><a href='https://silma.ai/?ref=abl' target='_blank'><img src='https://blog.silma.ai/wp-content/uploads/2024/10/cropped-silma-logo-box.png' style='height:60px'></a></div>"""
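The `EvalDimensions` enum in the first hunk above holds `EvalDimension` instances whose definition sits outside this diff. A sketch of what that dataclass presumably looks like, with field names inferred from the `eval_dim.value.metric` / `eval_dim.value.col_name` accesses in src/leaderboard/read_evals.py below:

# Inferred sketch of EvalDimension (the real definition is outside this diff);
# field names follow the accesses in read_evals.py.
from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class EvalDimension:
    metric: str    # key into a model's results dict, e.g. "speed"
    col_name: str  # column header shown on the leaderboard

class EvalDimensions(Enum):
    d0 = EvalDimension("speed", "Speed (words/sec)")
    d1 = EvalDimension("contamination_score", "Contamination Score")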
src/display/css_html_js.py CHANGED

@@ -47,7 +47,7 @@ custom_css = """
 }
 
 .tab-buttons button {
-    font-size:
+    font-size: 20px;
 }
 
 #scale-logo {

@@ -94,6 +94,14 @@ custom_css = """
 #box-filter > .form{
   border: 0
 }
+
+.leaderboard_col_style{
+
+}
+.leaderboard_col_style th button {
+  font-size: 14px !important
+}
+
 """
 
 get_window_url_params = """
src/display/formatting.py CHANGED

@@ -6,6 +6,11 @@ def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
+def make_contamination_red(contamination_score):
+    if contamination_score <= 0:
+        return f"<div style='background-color:green;padding:5px;color: white; text-align: center;margin:0px' title='Clean model!'>{round(contamination_score)}</div>"
+    else:
+        return f"<div style='background-color:red;padding:5px;color: white; text-align: center;margin:0px' title='Contaminated model!'>{round(contamination_score, 2)}</div>"
 
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
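A quick check of the helper's two branches: a score of zero or below renders as a green "clean" badge, anything positive as a red badge rounded to two decimals (the scores here are made up):

# Exercising both branches of make_contamination_red (scores are made up).
from src.display.formatting import make_contamination_red

print(make_contamination_red(0))     # green badge, tooltip "Clean model!", shows 0
print(make_contamination_red(0.37))  # red badge, tooltip "Contaminated model!", shows 0.37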
src/leaderboard/read_evals.py CHANGED

@@ -7,7 +7,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np
 
-from src.display.formatting import make_clickable_model
+from src.display.formatting import make_clickable_model, make_contamination_red
 from src.display.utils import AutoEvalColumn, EvalDimensions #, ModelType, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub

@@ -135,7 +135,11 @@ class EvalResult:
         }
 
         for eval_dim in EvalDimensions:
+            dimension_name = eval_dim.value.col_name
+            dimension_value = self.results[eval_dim.value.metric]
+            if dimension_name == "Contamination Score":
+                dimension_value = make_contamination_red(dimension_value)
+            data_dict[dimension_name] = dimension_value
 
         return data_dict
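To make the new loop concrete, a toy walkthrough with a hand-written `results` dict (the real one is parsed from a model's eval results file; `EvalDimensions` and `make_contamination_red` are the imports shown above):

# Toy walkthrough of the new to_dict loop (results values are made up).
from src.display.formatting import make_contamination_red
from src.display.utils import EvalDimensions

results = {"speed": 42.5, "contamination_score": 0.12}
data_dict = {}

for eval_dim in EvalDimensions:
    dimension_name = eval_dim.value.col_name
    dimension_value = results[eval_dim.value.metric]
    if dimension_name == "Contamination Score":
        # wrap the raw score in the red/green HTML badge
        dimension_value = make_contamination_red(dimension_value)
    data_dict[dimension_name] = dimension_value

# data_dict now maps "Speed (words/sec)" to 42.5 and "Contamination Score"
# to a red <div> badge containing 0.12.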
src/populate.py CHANGED

@@ -25,6 +25,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         df.insert(0, "Rank", range(1, len(df) + 1))
         df = df[cols].round(decimals=2)
         print(df)
+
+
         return df
     else:
         return pd.DataFrame(columns=cols)
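For reference, a self-contained sketch of the ranking step these touched lines sit in (the column names are made up, and the dataframe is assumed to be pre-sorted by score upstream):

# Toy sketch of the Rank/rounding step above (column names are invented;
# the dataframe is assumed to be pre-sorted by score).
import pandas as pd

cols = ["Rank", "Model", "Average"]
df = pd.DataFrame({"Model": ["model-a", "model-b"], "Average": [71.333, 68.912]})

df.insert(0, "Rank", range(1, len(df) + 1))  # 1-based rank in current row order
df = df[cols].round(decimals=2)              # keep leaderboard columns, 2 decimals
print(df)
#    Rank    Model  Average
# 0     1  model-a    71.33
# 1     2  model-b    68.91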