import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
#from huggingface_hub import snapshot_download
import re
import plotly.graph_objects as go
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
FOOTER_TEXT
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
# ModelType,
fields,
#WeightType,
#Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.leaderboard.read_evals import get_model_answers_html_file
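# Skill columns expected in the results data; used by the skill leaderboard,
# the per-model bar plot and the comparison radar chart below.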
skills = [
    'MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)',
    'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension',
    'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis',
    'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing',
    'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling',
    'Structuring'
]
def restart_space():
API.restart_space(repo_id=REPO_ID)
### Space initialisation
"""
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
"""
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
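# Keep only the columns displayed by default (plus any explicitly requested exceptions),
# hiding the per-skill score columns from the main tables.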
def hide_skill_columns(dataframe, exceptions=[]):
return dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default or c.name in exceptions]]
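# Format the numeric columns to two decimals and flag contamination:
# red background when Contamination Score is above 0, green otherwise.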
def perform_cell_formatting(dataframe):
return dataframe.style.format({'Contamination Score': "{:.2f}",'Benchmark Score': "{:.2f}",'Speed (words/sec)': "{:.2f}"}).apply(
lambda rows: [
"background-color: red;" if (value >0) else "background-color: green;" for value in rows
],
subset=["Contamination Score"],
)
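# Main leaderboard table: read-only, searchable, with markdown-rendered cells.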
def init_leaderboard(dataframe):
dataframe = hide_skill_columns(dataframe)
styler = perform_cell_formatting(dataframe)
return gr.Dataframe(
value=styler,
datatype="markdown",
wrap=True,
show_fullscreen_button=False,
interactive=False,
column_widths=[30,50,50,150,60,60,60],
max_height=420,
elem_classes="leaderboard_col_style",
show_search="search"
)
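# Skill leaderboard: a dropdown picks one skill, and the table is re-sorted
# by that skill's score and re-ranked accordingly.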
def init_skill_leaderboard(dataframe):
    ## Create a dropdown of model skills; re-sort and re-rank the table by the selected skill
skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
def filter_dataframe(skill):
filtered_df = dataframe.sort_values(by=[skill], ascending=False).reset_index(drop=True)
filtered_df = hide_skill_columns(filtered_df, exceptions=[skill])
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
styler = perform_cell_formatting(filtered_df)
return gr.Dataframe(
value=styler,
datatype="markdown",
wrap=True,
show_fullscreen_button=False,
interactive=False,
column_widths=[30,50,50,150,60,60,60,80],
max_height=420,
elem_classes="leaderboard_col_style"
)
leaderboard_by_skill = filter_dataframe(skills[0])
skills_dropdown.change(filter_dataframe, inputs=skills_dropdown, outputs=leaderboard_by_skill)
return leaderboard_by_skill
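# Size leaderboard: selecting a size bucket shows that bucket plus every smaller one
# (e.g. "Medium" also includes Small and Nano models).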
def init_size_leaderboard(dataframe):
dataframe = hide_skill_columns(dataframe)
size_keys = ["Large","Medium","Small","Nano"]
size_names = ["Large (More than 30B Parameter)","Medium (~30B)","Small (~10B)","Nano (~3B)"]
sizes_dropdown = gr.Dropdown(choices=size_names, label="Select Model Size", value=size_names[0])
def filter_dataframe(size_name):
##map size name to size key
size_name_mapped_to_key = size_keys[size_names.index(size_name)]
        ##keep the selected size and all smaller sizes (slice from the selected size to the end)
size_list = size_keys[size_keys.index(size_name_mapped_to_key):]
filtered_df = dataframe[dataframe["Category"].isin(size_list)].reset_index(drop=True)
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
styler = perform_cell_formatting(filtered_df)
return gr.Dataframe(
value=styler,
datatype="markdown",
wrap=True,
show_fullscreen_button=False,
interactive=False,
column_widths=[30,50,50,150,60,60,60],
max_height=420,
elem_classes="leaderboard_col_style"
)
leaderboard_by_skill = filter_dataframe(size_names[0])
sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
return leaderboard_by_skill
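# Model names in the dataframe contain HTML markup; strip it for dropdown choices and lookups.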
def strip_html_tags(model_name):
return re.sub('<[^<]+?>', '', model_name)
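# Deep-dive view: headline metrics, a per-skill bar plot and, when available, the model's
# raw answers HTML with style/script tags stripped before embedding.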
def get_model_info_blocks(chosen_model_name):
model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
model_name_full = model_names[model_names_clean.index(chosen_model_name)]
filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
skills_bar_df = pd.DataFrame({
'Skills': skills,
'Scores': filtered_df[skills].values[0]
})
skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
with gr.Accordion("Model Details"):
with gr.Row():
model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span> """.format(chosen_model_name))
with gr.Row():
benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b>{}/10</span>""".format(filtered_df["Benchmark Score"][0]))
rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b>{}</span>""".format(filtered_df["Rank"][0]))
speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b>{} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b>{}</span>""".format(filtered_df["Contamination Score"][0]))
size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b>{}</span>""".format(filtered_df["Category"][0]))
with gr.Row():
skills_bar = gr.BarPlot(
value=skills_bar_df,
x="Skills",
y="Scores",
width=500,
height=500,
x_label_angle=45,
color="Skills",
color_title=None,
label="Model Skills"
)
html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
if html_file_content == "EMPTY":
answers_html = gr.Markdown("")
else:
with gr.Row():
##strip style and script tags from html
html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
html_file_content = html_file_content.replace('<html lang="ar" dir="rtl">','<html>')
answers_html = gr.HTML(html_file_content,max_height=500,show_label=True,
label="Model Responses", container=True, elem_classes="model_responses_container")
return model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html
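# Compare tab: overlay the selected models' skill scores on a single radar chart.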
def init_compare_tab(dataframe):
model_names = dataframe["Model Name"].unique().tolist()
model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
with gr.Row():
models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model",
value=model_names_clean[0], multiselect=True)
def draw_radar_chart(models):
print(models)
fig = go.Figure()
for model_name in models:
model_name_full = model_names[model_names_clean.index(model_name)]
skill_scores = dataframe[dataframe["Model Name"] == model_name_full][skills].values[0]
fig.add_trace(go.Scatterpolar(
r=skill_scores,
theta=skills,
fill='toself',
name=model_name,
))
fig.update_layout(
polar=dict(
radialaxis=dict(visible=True)
),
showlegend=True,
height=500,
width=900,
margin=dict(l=0, r=0, t=40, b=40),
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.2,
xanchor="center",
x=0.5
)
)
return gr.Plot(value=fig)
radar_chart = draw_radar_chart(models_dropdown.value)
models_dropdown.change(draw_radar_chart, inputs=models_dropdown, outputs=radar_chart)
return radar_chart
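# Assemble the Gradio UI: leaderboard tabs, compare/deep-dive views, submission form and docs.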
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE, elem_classes="abl_header")
gr.HTML(INTRODUCTION_TEXT, elem_classes="abl_desc_text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ… Leaderboard - Top Models", elem_id="llm-benchmark-tab-table", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ… Top by Size", elem_id="llm-benchmark-tab-size", id=1):
leaderboard = init_size_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ… Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
with gr.TabItem("βš–οΈ Compare", elem_id="llm-benchmark-tab-compare", id=3):
init_compare_tab(LEADERBOARD_DF)
with gr.TabItem("πŸ”¬ Deep Dive", elem_id="llm-benchmark-tab-compare", id=4):
model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
with gr.Row():
models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0])
model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html = get_model_info_blocks(models_dropdown.value)
models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html])
with gr.TabItem("πŸš€ Submit here", elem_id="llm-benchmark-tab-submit", id=5):
with gr.Row():
gr.Markdown("# Submit your model", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
submit_button = gr.Button("Submit Eval", variant="huggingface" )
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
[
model_name_textbox,
],
submission_result,
)
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-about", id=6):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=10,
elem_id="citation-button",
show_copy_button=True,
)
with gr.Row():
gr.HTML(FOOTER_TEXT)
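# Restart the Space every 15 minutes (900 s), presumably to pick up newly published results.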
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=900)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()