# some code blocks are taken from https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard/tree/main
import os
import gradio as gr
from huggingface_hub import HfApi
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
from src.utils import (
    plot_leaderboard_scores,
    create_result_dataframes, create_result_dataframes_lite
)
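
# Optional Hugging Face token (typically provided as a Space secret) for authenticated Hub API calls.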
TOKEN = os.environ.get("HF_TOKEN", None)
# Pass the token by keyword: HfApi's first positional argument is the endpoint, not the token.
api = HfApi(token=TOKEN)
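

# Table helpers adapted from the bigcode-models-leaderboard space referenced above:
# filter by model type (the "T" column) and free-text search over model names.
# Neither is wired to a component in the UI below.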
def filter_items(df, leaderboard_table, query):
    if query == "all":
        return df[leaderboard_table.columns]
    else:
        query = query[0]
    filtered_df = df[df["T"].str.contains(query, na=False)]
    return filtered_df[leaderboard_table.columns]


def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["Model"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]
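

# Build the Gradio Blocks UI: a header row plus tabs for the evaluation table,
# the performance plot, the About page, and result submission.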
demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """
            <div style="text-align: center;">
            <h1>🌍 AfroBench <span style='color: #e6b800;'>Leaderboard</span></h1>
            </div>
            <p style="text-align: center; font-size: 16px;">
            This leaderboard tracks the performance of multilingual models across <b>64 African languages</b>, <b>15 NLP tasks</b>, and <b>22 datasets</b>,
            covering tasks from POS tagging to question answering, summarization, and machine translation.
            </p>
            <p style="font-size: 14px; text-align: center;">
            It's based on the <a href="https://mcgill-nlp.github.io/AfroBench/index.html" target="_blank">AfroBench benchmark</a> and is designed
            to highlight both full-scale evaluations and a cost-efficient subset (AfroBench-Lite).<br><br>
            We aim to support better transparency and tooling for evaluating models in African languages.
            </p>
            """,
            elem_classes="markdown-text",
        )
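
    # Main tab group; all interactive controls live in the Evaluation table tab.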
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluation table", id=0):
                    with gr.Column():
                        view_source = gr.Radio(
                            label="📂 Select Leaderboard Source",
                            choices=["afrobench", "afrobench_lite"],
                            value="afrobench",
                            interactive=True,
                        )
                        view_selector = gr.Dropdown(
                            label="📂 Select View",
                            choices=["category", "task", "dataset"],
                            value="category",
                            interactive=True,
                        )
                        with gr.Accordion("➡️ See All Columns", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[],
                                value=[],
                                label="Select columns to display",
                                elem_id="column-select",
                                interactive=True,
                            )
                        leaderboard_df = gr.Dataframe(
                            label="Leaderboard",
                            interactive=False,
                            elem_id="leaderboard-table",
                            wrap=True,
                        )

                        view_options_map = {
                            "afrobench": ["category", "task", "dataset"],
                            "afrobench_lite": ["task", "dataset", "language"],
                        }
                        init_trigger = gr.Button(visible=False)
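
                        # Callbacks for the table view. The available views depend on the source
                        # (afrobench: category/task/dataset; afrobench_lite: task/dataset/language);
                        # each refresh rebuilds the dataframe from the JSON results and adds an
                        # average "Score" column over the view's metric columns.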
                        def update_view_selector(source):
                            options = view_options_map[source]
                            default = options[0]
                            return gr.update(choices=options, value=default), default

                        def refresh_table_and_columns(view_type, source):
                            path = "data/leaderboard_json/afrobench_lite.json" if source == "afrobench_lite" else "data/leaderboard_json/afrobench.json"
                            if source == "afrobench_lite":
                                df = create_result_dataframes_lite(path, level=view_type)
                            else:
                                df = create_result_dataframes(path, level=view_type)
                            df.reset_index(inplace=True)
                            df.rename(columns={"index": "Model"}, inplace=True)
                            metric_cols = [col for col in df.columns if col != "Model"]
                            df["Score"] = df[metric_cols].mean(axis=1).round(1)
                            all_cols = ["Model", "Score"] + sorted(
                                [col for col in df.columns if col not in ["Model", "Score"]])
                            df = df[all_cols]
                            shown_choices = sorted([col for col in df.columns if col not in ["Model", "Score"]])
                            return df, gr.update(choices=shown_choices, value=shown_choices), shown_choices

                        def refresh_table_only(view_type, selected_cols, source):
                            path = "data/leaderboard_json/afrobench_lite.json" if source == "afrobench_lite" else "data/leaderboard_json/afrobench.json"
                            if source == "afrobench_lite":
                                df = create_result_dataframes_lite(path, level=view_type)
                            else:
                                df = create_result_dataframes(path, level=view_type)
                            df.reset_index(inplace=True)
                            df.rename(columns={"index": "Model"}, inplace=True)
                            metric_cols = [col for col in df.columns if col != "Model"]
                            df["Score"] = df[metric_cols].mean(axis=1).round(1)
                            return df[["Model", "Score"] + [c for c in selected_cols if c in df.columns]]

                        # Trigger once on launch
                        def initialize(_):
                            return refresh_table_and_columns("category", "afrobench")
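
                        # Event wiring: changing the source resets the view dropdown and refreshes the
                        # table; changing the view or the column selection refreshes the table only.
                        # demo.load() populates the table on page load; the hidden init_trigger button
                        # provides the same initialization path.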
                        init_trigger.click(
                            fn=initialize,
                            inputs=[init_trigger],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        view_source.change(
                            fn=update_view_selector,
                            inputs=[view_source],
                            outputs=[view_selector, view_selector],
                        ).then(
                            fn=refresh_table_and_columns,
                            inputs=[view_selector, view_source],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        view_selector.change(
                            fn=refresh_table_and_columns,
                            inputs=[view_selector, view_source],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        shown_columns.change(
                            fn=refresh_table_only,
                            inputs=[view_selector, shown_columns, view_source],
                            outputs=leaderboard_df,
                        )
                        demo.load(
                            fn=initialize,
                            inputs=[init_trigger],
                            outputs=[leaderboard_df, shown_columns, shown_columns],
                        )
                        gr.Markdown(
                            """
                            **Notes:**
                            - Score is the average across all the columns you're seeing in the leaderboard, based on the view and filters you've selected.
                            - For more details, check the 📝 About section.
                            """,
                            elem_classes="markdown-text",
                        )
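
                # The plot tab reuses the controls defined in the Evaluation table tab:
                # the selected source, view, and columns feed plot_leaderboard_scores.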
                with gr.TabItem("📊 Performance Plot", id=1):
                    with gr.Row():
                        model_score_plot = gr.Plot(label="Model Score Comparison")

                    # Update plot when view_source, view_selector, or shown_columns change
                    view_source.change(
                        fn=plot_leaderboard_scores,
                        inputs=[view_selector, shown_columns, view_source],
                        outputs=model_score_plot,
                    )
                    view_selector.change(
                        fn=plot_leaderboard_scores,
                        inputs=[view_selector, shown_columns, view_source],
                        outputs=model_score_plot,
                    )
                    shown_columns.change(
                        fn=plot_leaderboard_scores,
                        inputs=[view_selector, shown_columns, view_source],
                        outputs=model_score_plot,
                    )
                    demo.load(
                        fn=plot_leaderboard_scores,
                        inputs=[view_selector, shown_columns, view_source],
                        outputs=model_score_plot,
                    )
                with gr.TabItem("📝 About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

                with gr.TabItem("Submit results 🚀", id=3):
                    gr.Markdown(SUBMISSION_TEXT_3)

# demo.launch()
demo.launch(server_name="0.0.0.0", server_port=7860)
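
# Binding to 0.0.0.0:7860 matches what a Hugging Face Space container typically expects;
# for local runs, a plain demo.launch() (commented out above) works as well.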