# some code blocks are taken from https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard/tree/main
import os

import gradio as gr
from huggingface_hub import HfApi

from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
from src.utils import (
    plot_leaderboard_scores,
    create_result_dataframes,
    create_result_dataframes_lite
)

TOKEN = os.environ.get("HF_TOKEN", None)
# HfApi's first positional argument is the endpoint, so pass the token by keyword
api = HfApi(token=TOKEN)


# Table filtering helpers adapted from the BigCode models leaderboard; not referenced elsewhere in this file.
def filter_items(df, leaderboard_table, query):
    if query == "all":
        return df[leaderboard_table.columns]
    else:
        query = query[0]
    filtered_df = df[df["T"].str.contains(query, na=False)]
    return filtered_df[leaderboard_table.columns]


def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["Model"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]


demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """

# 🌍 AfroBench Leaderboard

This leaderboard tracks the performance of multilingual models across 64 African languages, 15 NLP tasks, and 22 datasets, ranging from POS tagging to question answering, summarization, and machine translation.

It is based on the AfroBench benchmark and is designed to highlight both the full-scale evaluation and the cost-efficient AfroBench-Lite subset.

We aim to support better transparency and tooling for evaluating models in African languages.

""", elem_classes="markdown-text", ) with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.Column(): with gr.Tabs(elem_classes="A100-tabs") as A100_tabs: with gr.TabItem("🔍 Evaluation table", id=0): with gr.Column(): view_source = gr.Radio( label="📂 Select Leaderboard Source", choices=["afrobench", "afrobench_lite"], value="afrobench", interactive=True, ) view_selector = gr.Dropdown( label="📂 Select View", choices=["category", "task", "dataset"], value="category", interactive=True, ) with gr.Accordion("➡️ See All Columns", open=False): shown_columns = gr.CheckboxGroup( choices=[], value=[], label="Select columns to display", elem_id="column-select", interactive=True, ) leaderboard_df = gr.Dataframe( label="Leaderboard", interactive=False, elem_id="leaderboard-table", wrap=True, ) view_options_map = { "afrobench": ["category", "task", "dataset"], "afrobench_lite": ["task", "dataset", "language"], } init_trigger = gr.Button(visible=False) def update_view_selector(source): options = view_options_map[source] default = options[0] return gr.update(choices=options, value=default), default def refresh_table_and_columns(view_type, source): path = "data/leaderboard_json/afrobench_lite.json" if source == "afrobench_lite" else "data/leaderboard_json/afrobench.json" if source == "afrobench_lite": df = create_result_dataframes_lite(path, level=view_type) else: df = create_result_dataframes(path, level=view_type) df.reset_index(inplace=True) df.rename(columns={"index": "Model"}, inplace=True) metric_cols = [col for col in df.columns if col != "Model"] df["Score"] = df[metric_cols].mean(axis=1).round(1) all_cols = ["Model", "Score"] + sorted( [col for col in df.columns if col not in ["Model", "Score"]]) df = df[all_cols] shown_choices = sorted([col for col in df.columns if col not in ["Model", "Score"]]) return df, gr.update(choices=shown_choices, value=shown_choices), shown_choices def refresh_table_only(view_type, selected_cols, source): path = "data/leaderboard_json/afrobench_lite.json" if source == "afrobench_lite" else "data/leaderboard_json/afrobench.json" if source == "afrobench_lite": df = create_result_dataframes_lite(path, level=view_type) else: df = create_result_dataframes(path, level=view_type) df.reset_index(inplace=True) df.rename(columns={"index": "Model"}, inplace=True) metric_cols = [col for col in df.columns if col != "Model"] df["Score"] = df[metric_cols].mean(axis=1).round(1) return df[["Model", "Score"] + [c for c in selected_cols if c in df.columns]] # Trigger once on launch def initialize(_): return refresh_table_and_columns("category", "afrobench") init_trigger.click( fn=initialize, inputs=[init_trigger], outputs=[leaderboard_df, shown_columns, shown_columns], ) view_source.change( fn=update_view_selector, inputs=[view_source], outputs=[view_selector, view_selector], ).then( fn=refresh_table_and_columns, inputs=[view_selector, view_source], outputs=[leaderboard_df, shown_columns, shown_columns], ) view_selector.change( fn=refresh_table_and_columns, inputs=[view_selector, view_source], outputs=[leaderboard_df, shown_columns, shown_columns], ) shown_columns.change( fn=refresh_table_only, inputs=[view_selector, shown_columns, view_source], outputs=leaderboard_df, ) demo.load( fn=initialize, inputs=[init_trigger], outputs=[leaderboard_df, shown_columns, shown_columns], ) gr.Markdown( """ **Notes:** - Score is the average across all the columns you're seeing in the leaderboard, based on the view and filters you’ve selected. - For more details check the 📝 About section. 
""", elem_classes="markdown-text", ) with gr.TabItem("📊 Performance Plot", id=1): with gr.Row(): model_score_plot = gr.Plot(label="Model Score Comparison") # Update plot when view_source, view_selector, or shown_columns change view_source.change( fn=plot_leaderboard_scores, inputs=[view_selector, shown_columns, view_source], outputs=model_score_plot, ) view_selector.change( fn=plot_leaderboard_scores, inputs=[view_selector, shown_columns, view_source], outputs=model_score_plot, ) shown_columns.change( fn=plot_leaderboard_scores, inputs=[view_selector, shown_columns, view_source], outputs=model_score_plot, ) demo.load( fn=plot_leaderboard_scores, inputs=[view_selector, shown_columns, view_source], outputs=model_score_plot, ) with gr.TabItem("📝 About", id=2): gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text") with gr.TabItem("Submit results 🚀", id=3): gr.Markdown(SUBMISSION_TEXT_3) # demo.launch() demo.launch(server_name="0.0.0.0", server_port=7860)