|
import gradio as gr |
|
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
|
import pandas as pd |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
|
|
import re |
|
import plotly.graph_objects as go

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
    FOOTER_TEXT,
)
|
from src.display.css_html_js import custom_css |
|
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    fields,
)
|
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN |
|
from src.populate import get_evaluation_queue_df, get_leaderboard_df |
|
from src.submission.submit import add_new_eval |
|
from src.leaderboard.read_evals import get_model_answers_html_file |
|
|
|
# Skill columns shown in the per-skill leaderboard, the Deep Dive bar plot, and the Compare radar chart.
skills = [
    'MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)',
    'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension',
    'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis',
    'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing',
    'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling',
    'Structuring',
]

def restart_space(): |
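    """Restart the Hugging Face Space; the background scheduler calls this to reload fresh data."""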
|
API.restart_space(repo_id=REPO_ID) |
|
|
|
|
|
""" |
|
try: |
|
print(EVAL_REQUESTS_PATH) |
|
snapshot_download( |
|
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN |
|
) |
|
except Exception: |
|
restart_space() |
|
try: |
|
print(EVAL_RESULTS_PATH) |
|
snapshot_download( |
|
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN |
|
) |
|
except Exception: |
|
restart_space() |
|
""" |
|
|
|
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) |
|
|
|
( |
|
finished_eval_queue_df, |
|
running_eval_queue_df, |
|
pending_eval_queue_df, |
|
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) |
|
|
|
def hide_skill_columns(dataframe, exceptions=()):
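    """Keep only the columns displayed by default, plus any listed in `exceptions`."""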
|
return dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default or c.name in exceptions]] |
|
|
|
|
|
def perform_cell_formatting(dataframe): |
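    """Round the score columns to two decimals and color-code the contamination column."""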
|
    return dataframe.style.format(
        {"Contamination Score": "{:.2f}", "Benchmark Score": "{:.2f}", "Speed (words/sec)": "{:.2f}"}
    ).apply(
        # Red when a contamination score is above zero, green otherwise.
        lambda values: [
            "background-color: red;" if value > 0 else "background-color: green;" for value in values
        ],
        subset=["Contamination Score"],
    )
|
|
|
def init_leaderboard(dataframe): |
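    """Build the main leaderboard table (default columns, styled, searchable)."""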
|
|
|
dataframe = hide_skill_columns(dataframe) |
|
|
|
|
|
styler = perform_cell_formatting(dataframe) |
|
|
|
return gr.Dataframe( |
|
value=styler, |
|
datatype="markdown", |
|
wrap=True, |
|
show_fullscreen_button=False, |
|
interactive=False, |
|
column_widths=[30,50,50,150,60,60,60], |
|
max_height=420, |
|
elem_classes="leaderboard_col_style", |
|
show_search="search" |
|
) |
|
|
|
|
|
def init_skill_leaderboard(dataframe):
    """Render a per-skill leaderboard controlled by a skill dropdown."""
skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0]) |
|
|
|
def filter_dataframe(skill): |
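        """Sort by the chosen skill, surface its column, and re-rank the models."""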
|
filtered_df = dataframe.sort_values(by=[skill], ascending=False).reset_index(drop=True) |
|
filtered_df = hide_skill_columns(filtered_df, exceptions=[skill]) |
|
filtered_df["Rank"] = range(1, len(filtered_df) + 1) |
|
styler = perform_cell_formatting(filtered_df) |
|
return gr.Dataframe( |
|
value=styler, |
|
datatype="markdown", |
|
wrap=True, |
|
show_fullscreen_button=False, |
|
interactive=False, |
|
column_widths=[30,50,50,150,60,60,60,80], |
|
max_height=420, |
|
elem_classes="leaderboard_col_style" |
|
) |
|
|
|
leaderboard_by_skill = filter_dataframe(skills[0]) |
|
skills_dropdown.change(filter_dataframe, inputs=skills_dropdown, outputs=leaderboard_by_skill) |
|
return leaderboard_by_skill |
|
|
|
|
|
|
|
def init_size_leaderboard(dataframe): |
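    """Render a leaderboard filtered by model size category."""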
|
|
|
dataframe = hide_skill_columns(dataframe) |
|
|
|
size_keys = ["Large","Medium","Small","Nano"] |
|
|
|
size_names = ["Large (More than 30B Parameter)","Medium (~30B)","Small (~10B)","Nano (~3B)"] |
|
sizes_dropdown = gr.Dropdown(choices=size_names, label="Select Model Size", value=size_names[0]) |
|
|
|
def filter_dataframe(size_name): |
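        """Keep models in the chosen size bucket and all smaller buckets, re-ranked."""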
|
|
|
        size_name_mapped_to_key = size_keys[size_names.index(size_name)]

        # Keep the chosen bucket plus every smaller one (size_keys runs large -> small).
        size_list = size_keys[size_keys.index(size_name_mapped_to_key):]
        filtered_df = dataframe[dataframe["Category"].isin(size_list)].reset_index(drop=True)
        filtered_df["Rank"] = range(1, len(filtered_df) + 1)
|
styler = perform_cell_formatting(filtered_df) |
|
return gr.Dataframe( |
|
value=styler, |
|
datatype="markdown", |
|
wrap=True, |
|
show_fullscreen_button=False, |
|
interactive=False, |
|
column_widths=[30,50,50,150,60,60,60], |
|
max_height=420, |
|
elem_classes="leaderboard_col_style" |
|
) |
|
|
|
    leaderboard_by_size = filter_dataframe(size_names[0])
    sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_size)
    return leaderboard_by_size
|
|
|
def strip_html_tags(model_name): |
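    """Strip HTML tags from a model-name cell."""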
|
return re.sub('<[^<]+?>', '', model_name) |
|
|
|
|
|
|
|
def get_model_info_blocks(chosen_model_name): |
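    """Build the Deep Dive view: headline metrics, a skills bar plot, and sample model answers."""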
|
|
|
model_names = LEADERBOARD_DF["Model Name"].unique().tolist() |
|
model_names_clean = [strip_html_tags(model_name) for model_name in model_names] |
|
|
|
model_name_full = model_names[model_names_clean.index(chosen_model_name)] |
|
filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True) |
|
skills_bar_df = pd.DataFrame({ |
|
'Skills': skills, |
|
'Scores': filtered_df[skills].values[0] |
|
}) |
|
|
|
skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True) |
|
|
|
|
|
with gr.Accordion("Model Details"): |
|
|
|
with gr.Row(): |
|
model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span> """.format(chosen_model_name)) |
|
with gr.Row(): |
|
            benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b> {}/10</span>""".format(filtered_df["Benchmark Score"][0]))
            rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b> {}</span>""".format(filtered_df["Rank"][0]))
            speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b> {} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
            contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b> {}</span>""".format(filtered_df["Contamination Score"][0]))
            size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b> {}</span>""".format(filtered_df["Category"][0]))
|
|
|
with gr.Row(): |
|
skills_bar = gr.BarPlot( |
|
value=skills_bar_df, |
|
x="Skills", |
|
y="Scores", |
|
width=500, |
|
height=500, |
|
x_label_angle=45, |
|
color="Skills", |
|
color_title=None, |
|
label="Model Skills" |
|
) |
|
|
|
|
|
html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name) |
|
|
|
if html_file_content == "EMPTY": |
|
answers_html = gr.Markdown("") |
|
else: |
|
        with gr.Row():
            # Strip embedded styles/scripts and the RTL <html> attributes so the
            # answers page renders inside the app without restyling the whole Space.
            html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
            html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
            html_file_content = html_file_content.replace('<html lang="ar" dir="rtl">', '<html>')

            answers_html = gr.HTML(html_file_content, max_height=500, show_label=True,
                                   label="Model Responses", container=True, elem_classes="model_responses_container")
|
|
|
|
|
return model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html |
|
|
|
|
|
|
|
|
|
def init_compare_tab(dataframe): |
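    """Build the Compare tab: a multi-select model dropdown driving a radar chart."""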
|
|
|
model_names = dataframe["Model Name"].unique().tolist() |
|
model_names_clean = [strip_html_tags(model_name) for model_name in model_names] |
|
with gr.Row(): |
|
        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Models",
                                      value=[model_names_clean[0]], multiselect=True)
|
|
|
|
|
def draw_radar_chart(models): |
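        """Overlay one radar trace per selected model across all skill scores."""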
|
|
|
|
|
|
fig = go.Figure() |
|
|
|
for model_name in models: |
|
model_name_full = model_names[model_names_clean.index(model_name)] |
|
skill_scores = dataframe[dataframe["Model Name"] == model_name_full][skills].values[0] |
|
|
|
            fig.add_trace(go.Scatterpolar(
                r=skill_scores,
                theta=skills,
                fill='toself',
                name=model_name,
            ))
|
|
|
fig.update_layout( |
|
polar=dict( |
|
radialaxis=dict(visible=True) |
|
), |
|
showlegend=True, |
|
height=500, |
|
width=900, |
|
margin=dict(l=0, r=0, t=40, b=40), |
|
legend=dict( |
|
orientation="h", |
|
yanchor="bottom", |
|
y=-0.2, |
|
xanchor="center", |
|
x=0.5 |
|
            )
        )
|
|
|
return gr.Plot(value=fig) |
|
|
|
radar_chart = draw_radar_chart(models_dropdown.value) |
|
models_dropdown.change(draw_radar_chart, inputs=models_dropdown, outputs=radar_chart) |
|
|
|
|
|
return radar_chart |
|
|
|
|
|
demo = gr.Blocks(css=custom_css) |
|
with demo: |
|
gr.HTML(TITLE, elem_classes="abl_header") |
|
gr.HTML(INTRODUCTION_TEXT, elem_classes="abl_desc_text") |
|
|
|
with gr.Tabs(elem_classes="tab-buttons") as tabs: |
|
with gr.TabItem("π
Leaderboard - Top Models", elem_id="llm-benchmark-tab-table", id=0): |
|
leaderboard = init_leaderboard(LEADERBOARD_DF) |
|
|
|
with gr.TabItem("π
Top by Size", elem_id="llm-benchmark-tab-size", id=1): |
|
leaderboard = init_size_leaderboard(LEADERBOARD_DF) |
|
|
|
with gr.TabItem("π
Top by Skill", elem_id="llm-benchmark-tab-skills", id=2): |
|
leaderboard = init_skill_leaderboard(LEADERBOARD_DF) |
|
|
|
with gr.TabItem("βοΈ Compare", elem_id="llm-benchmark-tab-compare", id=3): |
|
init_compare_tab(LEADERBOARD_DF) |
|
|
|
with gr.TabItem("π¬ Deep Dive", elem_id="llm-benchmark-tab-compare", id=4): |
|
|
|
|
|
model_names = LEADERBOARD_DF["Model Name"].unique().tolist() |
|
model_names_clean = [strip_html_tags(model_name) for model_name in model_names] |
|
with gr.Row(): |
|
models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0]) |
|
|
|
|
|
model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html = get_model_info_blocks(models_dropdown.value) |
|
|
|
models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html]) |
|
|
|
with gr.TabItem("π Submit here", elem_id="llm-benchmark-tab-submit", id=5): |
|
with gr.Row(): |
|
gr.Markdown("# Submit your model", elem_classes="markdown-text") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
model_name_textbox = gr.Textbox(label="Model name") |
|
|
|
                    submit_button = gr.Button("Submit Eval", variant="huggingface")
|
submission_result = gr.Markdown() |
|
submit_button.click( |
|
add_new_eval, |
|
[ |
|
model_name_textbox, |
|
], |
|
submission_result, |
|
) |
|
with gr.Column(): |
|
with gr.Row(): |
|
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") |
|
|
|
with gr.Column(): |
|
with gr.Accordion( |
|
f"β
Finished Evaluations ({len(finished_eval_queue_df)})", |
|
open=False, |
|
): |
|
with gr.Row(): |
|
finished_eval_table = gr.components.Dataframe( |
|
value=finished_eval_queue_df, |
|
headers=EVAL_COLS, |
|
datatype=EVAL_TYPES, |
|
row_count=5, |
|
) |
|
with gr.Accordion( |
|
f"π Running Evaluation Queue ({len(running_eval_queue_df)})", |
|
open=False, |
|
): |
|
with gr.Row(): |
|
running_eval_table = gr.components.Dataframe( |
|
value=running_eval_queue_df, |
|
headers=EVAL_COLS, |
|
datatype=EVAL_TYPES, |
|
row_count=5, |
|
) |
|
|
|
with gr.Accordion( |
|
f"β³ Pending Evaluation Queue ({len(pending_eval_queue_df)})", |
|
open=False, |
|
): |
|
with gr.Row(): |
|
pending_eval_table = gr.components.Dataframe( |
|
value=pending_eval_queue_df, |
|
headers=EVAL_COLS, |
|
datatype=EVAL_TYPES, |
|
row_count=5, |
|
) |
|
|
|
with gr.TabItem("π About", elem_id="llm-benchmark-tab-about", id=6): |
|
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") |
|
|
|
with gr.Row(): |
|
with gr.Accordion("π Citation", open=False): |
|
citation_button = gr.Textbox( |
|
value=CITATION_BUTTON_TEXT, |
|
label=CITATION_BUTTON_LABEL, |
|
lines=10, |
|
elem_id="citation-button", |
|
show_copy_button=True, |
|
) |
|
|
|
with gr.Row(): |
|
gr.HTML(FOOTER_TEXT) |
|
|
|
scheduler = BackgroundScheduler() |
|
# Restart the Space every 15 minutes so newly pushed results are picked up.
scheduler.add_job(restart_space, "interval", seconds=900)
|
scheduler.start() |
|
demo.queue(default_concurrency_limit=40).launch() |