Spaces:
Running
Running
File size: 9,906 Bytes
a147f3f b08e344 a147f3f b08e344 a147f3f b08e344 a147f3f 5390c45 a147f3f b08e344 a147f3f b08e344 a147f3f b08e344 a147f3f b08e344 a147f3f b08e344 a147f3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# some code blocks are taken from https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard/tree/main
import os
import gradio as gr
from huggingface_hub import HfApi
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
from src.utils import (
plot_leaderboard_scores,
create_result_dataframes, create_result_dataframes_lite
)
# Hub access token read from the environment; None when HF_TOKEN is not set.
TOKEN = os.environ.get("HF_TOKEN")
# HfApi's first positional parameter is `endpoint`, not the token, so the
# token has to be passed by keyword or it would be used as the Hub URL.
api = HfApi(token=TOKEN)
def filter_items(df, leaderboard_table, query):
    """Restrict the leaderboard to rows whose "T" column carries a marker.

    Args:
        df: Full results frame. Must contain a "T" column and every column
            listed in ``leaderboard_table.columns``.
        leaderboard_table: Object exposing ``.columns``; only those columns
            are returned.
        query: ``"all"`` (or an empty/``None`` selection) keeps every row.
            Otherwise only the first character of the string is used as the
            marker to look for inside "T".

    Returns:
        The matching rows of ``df``, limited to the displayed columns.
    """
    visible = leaderboard_table.columns
    # An empty or missing selection is treated like "all" rather than
    # failing on ``query[0]``.
    if not query or query == "all":
        return df[visible]

    marker = query[0]
    # regex=False: the marker is matched literally, so characters such as
    # "+" or "(" cannot be misread as a regular expression.
    mask = df["T"].str.contains(marker, na=False, regex=False)
    return df.loc[mask, visible]
def search_table(df, leaderboard_table, query):
    """Keep the rows whose "Model" value contains the search text.

    Args:
        df: Full results frame. Must contain a "Model" column and every
            column listed in ``leaderboard_table.columns``.
        leaderboard_table: Object exposing ``.columns``; only those columns
            are returned.
        query: Text to look for, compared case-insensitively.

    Returns:
        The matching rows of ``df``, limited to the displayed columns.
    """
    # regex=False: user-typed text such as "c++" or "(" is matched literally
    # instead of being compiled as a regular expression.
    # na=False: rows without a model name simply do not match, instead of
    # putting NaN into the boolean mask.
    mask = df["Model"].str.contains(query, case=False, na=False, regex=False)
    return df.loc[mask, leaderboard_table.columns]
# Static page copy, kept at module level so the layout code below stays
# readable.
# NOTE(review): the lone "π" that prefixes several labels in this file looks
# like a mis-decoded emoji. The intended glyph cannot be recovered from here,
# so it is left unchanged — restore it from the upstream file.
_HEADER_HTML = """
<div style="text-align: center;">
<h1>π AfroBench <span style='color: #e6b800;'>Leaderboard</span></h1>
</div>
<p style="text-align: center; font-size: 16px;">
This leaderboard tracks the performance of multilingual models across <b>64 African languages</b>, <b>15 NLP tasks</b> and <b>22 datasets</b>,
covering a range of tasks from POS tagging to question answering, summarization, and machine translation.
</p>
<p style="font-size: 14px; text-align: center;">
It's based on the <a href="https://mcgill-nlp.github.io/AfroBench/index.html" target="_blank">AfroBench benchmark</a> and is designed
to highlight both full-scale evaluations and cost-efficient subsets (AfroBench-Lite).<br><br>
We aim to support better transparency and tooling for evaluating models in African languages.
</p>
"""

_NOTES_MD = """
**Notes:**
- Score is the average across all the columns you're seeing in the leaderboard, based on the view and filters you’ve selected.
- For more details check the π About section.
"""

# Views offered for each leaderboard source; the first entry is used as the
# default when the source changes.
view_options_map = {
    "afrobench": ["category", "task", "dataset"],
    "afrobench_lite": ["task", "dataset", "language"],
}


def _load_scored_frame(view_type, source):
    """Load the results for ``source`` at ``view_type`` level and add "Score".

    Any source other than "afrobench_lite" falls back to the full AfroBench
    file. "Score" is the row-wise mean of every column except "Model",
    rounded to one decimal.
    """
    if source == "afrobench_lite":
        df = create_result_dataframes_lite(
            "data/leaderboard_json/afrobench_lite.json", level=view_type
        )
    else:
        df = create_result_dataframes(
            "data/leaderboard_json/afrobench.json", level=view_type
        )
    # Not done in place, so the frame handed back by the loader is never
    # mutated. Presumably the loader returns model names as an unnamed index
    # (hence the "index" -> "Model" rename) — TODO confirm.
    df = df.reset_index().rename(columns={"index": "Model"})
    metric_cols = [col for col in df.columns if col != "Model"]
    df["Score"] = df[metric_cols].mean(axis=1).round(1)
    return df


def update_view_selector(source):
    """Return the dropdown update (choices + default view) for ``source``."""
    options = view_options_map[source]
    return gr.update(choices=options, value=options[0])


def refresh_table_and_columns(view_type, source):
    """Rebuild the table and reset the column picker to show every metric.

    Returns:
        ``(df, column_update)``: the frame ordered as "Model", "Score", then
        the metric columns alphabetically, and a checkbox-group update that
        offers and selects all metric columns.
    """
    df = _load_scored_frame(view_type, source)
    metric_cols = sorted(col for col in df.columns if col not in ("Model", "Score"))
    df = df[["Model", "Score"] + metric_cols]
    return df, gr.update(choices=metric_cols, value=metric_cols)


def refresh_table_only(view_type, selected_cols, source):
    """Rebuild the table showing only the selected metric columns.

    NOTE(review): "Score" is still averaged over every metric column of the
    view, not only the selected ones; the on-page note could be read as the
    latter — confirm the intended behaviour.
    """
    df = _load_scored_frame(view_type, source)
    kept = [col for col in selected_cols if col in df.columns]
    return df[["Model", "Score"] + kept]


def initialize(_):
    """Populate the table and column picker with the default view."""
    return refresh_table_and_columns("category", "afrobench")


demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(_HEADER_HTML, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("π Evaluation table", id=0):
                    with gr.Column():
                        view_source = gr.Radio(
                            label="π Select Leaderboard Source",
                            choices=["afrobench", "afrobench_lite"],
                            value="afrobench",
                            interactive=True,
                        )
                        view_selector = gr.Dropdown(
                            label="π Select View",
                            choices=["category", "task", "dataset"],
                            value="category",
                            interactive=True,
                        )
                        with gr.Accordion("➡️ See All Columns", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[],
                                value=[],
                                label="Select columns to display",
                                elem_id="column-select",
                                interactive=True,
                            )
                        leaderboard_df = gr.Dataframe(
                            label="Leaderboard",
                            interactive=False,
                            elem_id="leaderboard-table",
                            wrap=True,
                        )
                        # Hidden component that only serves as the dummy
                        # input of `initialize`.
                        init_trigger = gr.Button(visible=False)

                        init_trigger.click(
                            fn=initialize,
                            inputs=[init_trigger],
                            outputs=[leaderboard_df, shown_columns],
                        )
                        # Source change: first reset the view dropdown, then
                        # rebuild the table from the updated dropdown value.
                        # The handle is kept so the plot can be chained after
                        # the refresh further down.
                        source_changed = view_source.change(
                            fn=update_view_selector,
                            inputs=[view_source],
                            outputs=[view_selector],
                        ).then(
                            fn=refresh_table_and_columns,
                            inputs=[view_selector, view_source],
                            outputs=[leaderboard_df, shown_columns],
                        )
                        selector_changed = view_selector.change(
                            fn=refresh_table_and_columns,
                            inputs=[view_selector, view_source],
                            outputs=[leaderboard_df, shown_columns],
                        )
                        shown_columns.change(
                            fn=refresh_table_only,
                            inputs=[view_selector, shown_columns, view_source],
                            outputs=leaderboard_df,
                        )
                        # Populate the table once when the page loads.
                        page_loaded = demo.load(
                            fn=initialize,
                            inputs=[init_trigger],
                            outputs=[leaderboard_df, shown_columns],
                        )
                        gr.Markdown(_NOTES_MD, elem_classes="markdown-text")

                with gr.TabItem("π Performance Plot", id=1):
                    with gr.Row():
                        model_score_plot = gr.Plot(label="Model Score Comparison")

                    plot_inputs = [view_selector, shown_columns, view_source]
                    # Redraw the plot only after the table/column refresh of
                    # the same event has finished. Running it in parallel
                    # would hand it the previous view or column selection
                    # (e.g. "category" for afrobench_lite, which has no such
                    # view).
                    for refreshed in (source_changed, selector_changed, page_loaded):
                        refreshed.then(
                            fn=plot_leaderboard_scores,
                            inputs=plot_inputs,
                            outputs=model_score_plot,
                        )
                    # A pure column change has no refresh step to wait for.
                    shown_columns.change(
                        fn=plot_leaderboard_scores,
                        inputs=plot_inputs,
                        outputs=model_score_plot,
                    )

                with gr.TabItem("π About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

                with gr.TabItem("Submit results π", id=3):
                    gr.Markdown(SUBMISSION_TEXT_3)

# Bind to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
|