# some code blocks are taken from https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard/tree/main
import os
import gradio as gr
from huggingface_hub import HfApi
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
from src.utils import (
    plot_leaderboard_scores,
    create_result_dataframes,
    create_result_dataframes_lite,
)
TOKEN = os.environ.get("HF_TOKEN", None)
api = HfApi(token=TOKEN)
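
# Filter rows by the model-type column "T" using the query's leading symbol; "all" keeps every row.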
def filter_items(df, leaderboard_table, query):
    if query == "all":
        return df[leaderboard_table.columns]
    else:
        query = query[0]
    filtered_df = df[df["T"].str.contains(query, na=False)]
    return filtered_df[leaderboard_table.columns]
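
# Case-insensitive substring search over the "Model" column.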
def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["Model"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]
demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """
            🌍 AfroBench Leaderboard

            This leaderboard tracks the performance of multilingual models across 64 African languages, 15 NLP tasks, and 22 datasets,
            ranging from POS tagging to question answering, summarization, and machine translation.
            It's based on the AfroBench benchmark and is designed
            to highlight both full-scale evaluations and a cost-efficient subset (AfroBench-Lite).
            We aim to support better transparency and tooling for evaluating models in African languages.
            """,
            elem_classes="markdown-text",
        )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluation table", id=0):
                    with gr.Column():
                        view_source = gr.Radio(
                            label="📂 Select Leaderboard Source",
                            choices=["afrobench", "afrobench_lite"],
                            value="afrobench",
                            interactive=True,
                        )
                        view_selector = gr.Dropdown(
                            label="📂 Select View",
                            choices=["category", "task", "dataset"],
                            value="category",
                            interactive=True,
                        )
                        with gr.Accordion("➡️ See All Columns", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[],
                                value=[],
                                label="Select columns to display",
                                elem_id="column-select",
                                interactive=True,
                            )
                        leaderboard_df = gr.Dataframe(
                            label="Leaderboard",
                            interactive=False,
                            elem_id="leaderboard-table",
                            wrap=True,
                        )
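                        # Each leaderboard source exposes a different set of view levels.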
                        view_options_map = {
                            "afrobench": ["category", "task", "dataset"],
                            "afrobench_lite": ["task", "dataset", "language"],
                        }
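                        # Hidden button, used only to trigger the initial table load.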
                        init_trigger = gr.Button(visible=False)
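                        # When the source changes, reset the view dropdown to that source's first option.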
                        def update_view_selector(source):
                            options = view_options_map[source]
                            default = options[0]
                            return gr.update(choices=options, value=default), default
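                        # Rebuild the leaderboard dataframe for the chosen source/view and
                        # refresh the column checkboxes so every metric column is selected.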
                        def refresh_table_and_columns(view_type, source):
                            path = (
                                "data/leaderboard_json/afrobench_lite.json"
                                if source == "afrobench_lite"
                                else "data/leaderboard_json/afrobench.json"
                            )
                            if source == "afrobench_lite":
                                df = create_result_dataframes_lite(path, level=view_type)
                            else:
                                df = create_result_dataframes(path, level=view_type)
                            df.reset_index(inplace=True)
                            df.rename(columns={"index": "Model"}, inplace=True)
                            metric_cols = [col for col in df.columns if col != "Model"]
                            df["Score"] = df[metric_cols].mean(axis=1).round(1)
                            all_cols = ["Model", "Score"] + sorted(
                                [col for col in df.columns if col not in ["Model", "Score"]]
                            )
                            df = df[all_cols]
                            shown_choices = sorted(
                                [col for col in df.columns if col not in ["Model", "Score"]]
                            )
                            return df, gr.update(choices=shown_choices, value=shown_choices), shown_choices
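                        # Rebuild the dataframe but display only the user-selected columns.
                        # Score is still averaged over all metric columns for the view, not just the visible ones.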
                        def refresh_table_only(view_type, selected_cols, source):
                            path = (
                                "data/leaderboard_json/afrobench_lite.json"
                                if source == "afrobench_lite"
                                else "data/leaderboard_json/afrobench.json"
                            )
                            if source == "afrobench_lite":
                                df = create_result_dataframes_lite(path, level=view_type)
                            else:
                                df = create_result_dataframes(path, level=view_type)
                            df.reset_index(inplace=True)
                            df.rename(columns={"index": "Model"}, inplace=True)
                            metric_cols = [col for col in df.columns if col != "Model"]
                            df["Score"] = df[metric_cols].mean(axis=1).round(1)
                            return df[["Model", "Score"] + [c for c in selected_cols if c in df.columns]]
                        # Trigger once on launch
                        def initialize(_):
                            return refresh_table_and_columns("category", "afrobench")
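                        # Event wiring: changing the source or view rebuilds the table and its
                        # column choices; toggling columns rebuilds the table with only the selected columns shown.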
                        init_trigger.click(
                            fn=initialize,
                            inputs=[init_trigger],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        view_source.change(
                            fn=update_view_selector,
                            inputs=[view_source],
                            outputs=[view_selector, view_selector],
                        ).then(
                            fn=refresh_table_and_columns,
                            inputs=[view_selector, view_source],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        view_selector.change(
                            fn=refresh_table_and_columns,
                            inputs=[view_selector, view_source],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        shown_columns.change(
                            fn=refresh_table_only,
                            inputs=[view_selector, shown_columns, view_source],
                            outputs=leaderboard_df,
                        )
                        demo.load(
                            fn=initialize,
                            inputs=[init_trigger],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        gr.Markdown(
                            """
                            **Notes:**
                            - **Score** is the average of all metric columns for the currently selected view, rounded to one decimal place.
                            - For more details, check the 📝 About section.
                            """,
                            elem_classes="markdown-text",
                        )
with gr.TabItem("📊 Performance Plot", id=1):
with gr.Row():
model_score_plot = gr.Plot(label="Model Score Comparison")
# Update plot when view_source, view_selector, or shown_columns change
view_source.change(
fn=plot_leaderboard_scores,
inputs=[view_selector, shown_columns, view_source],
outputs=model_score_plot,
)
view_selector.change(
fn=plot_leaderboard_scores,
inputs=[view_selector, shown_columns, view_source],
outputs=model_score_plot,
)
shown_columns.change(
fn=plot_leaderboard_scores,
inputs=[view_selector, shown_columns, view_source],
outputs=model_score_plot,
)
demo.load(
fn=plot_leaderboard_scores,
inputs=[view_selector, shown_columns, view_source],
outputs=model_score_plot,
)
with gr.TabItem("📝 About", id=2):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
with gr.TabItem("Submit results 🚀", id=3):
gr.Markdown(SUBMISSION_TEXT_3)
# demo.launch()
demo.launch(server_name="0.0.0.0", server_port=7860)