import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
#from huggingface_hub import snapshot_download
import re
import plotly.graph_objects as go
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
FOOTER_TEXT
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
# ModelType,
fields,
#WeightType,
#Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.leaderboard.read_evals import get_model_answers_html_file
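# Skill columns expected in the results data; used by the skill leaderboard,
# the per-model bar plot and the comparison radar chart below.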
skills = [
    'MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)',
    'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension',
    'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis',
    'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing',
    'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling',
    'Structuring'
]
def restart_space():
API.restart_space(repo_id=REPO_ID)
### Space initialisation
"""
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
"""
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
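# Keep only the columns displayed by default (plus any explicitly requested exceptions),
# hiding the per-skill score columns from the main tables.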
def hide_skill_columns(dataframe, exceptions=[]):
return dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default or c.name in exceptions]]
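# Format the numeric columns to two decimals and flag contamination:
# red background when Contamination Score is above 0, green otherwise.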
def perform_cell_formatting(dataframe):
return dataframe.style.format({'Contamination Score': "{:.2f}",'Benchmark Score': "{:.2f}",'Speed (words/sec)': "{:.2f}"}).apply(
lambda rows: [
"background-color: red;" if (value >0) else "background-color: green;" for value in rows
],
subset=["Contamination Score"],
)
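# Main leaderboard table: read-only, searchable, with markdown-rendered cells.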
def init_leaderboard(dataframe):
dataframe = hide_skill_columns(dataframe)
styler = perform_cell_formatting(dataframe)
return gr.Dataframe(
value=styler,
datatype="markdown",
wrap=True,
show_fullscreen_button=False,
interactive=False,
column_widths=[30,50,50,150,60,60,60],
max_height=420,
elem_classes="leaderboard_col_style",
show_search="search"
)
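# Skill leaderboard: a dropdown picks one skill, and the table is re-sorted
# by that skill's score and re-ranked accordingly.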
def init_skill_leaderboard(dataframe):
    ## Create a dropdown of model skills; re-sort and re-rank the table by the selected skill
skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
def filter_dataframe(skill):
filtered_df = dataframe.sort_values(by=[skill], ascending=False).reset_index(drop=True)
filtered_df = hide_skill_columns(filtered_df, exceptions=[skill])
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
styler = perform_cell_formatting(filtered_df)
return gr.Dataframe(
value=styler,
datatype="markdown",
wrap=True,
show_fullscreen_button=False,
interactive=False,
column_widths=[30,50,50,150,60,60,60,80],
max_height=420,
elem_classes="leaderboard_col_style"
)
leaderboard_by_skill = filter_dataframe(skills[0])
skills_dropdown.change(filter_dataframe, inputs=skills_dropdown, outputs=leaderboard_by_skill)
return leaderboard_by_skill
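# Size leaderboard: selecting a size bucket shows that bucket plus every smaller one
# (e.g. "Medium" also includes Small and Nano models).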
def init_size_leaderboard(dataframe):
dataframe = hide_skill_columns(dataframe)
size_keys = ["Large","Medium","Small","Nano"]
size_names = ["Large (More than 30B Parameter)","Medium (~30B)","Small (~10B)","Nano (~3B)"]
sizes_dropdown = gr.Dropdown(choices=size_names, label="Select Model Size", value=size_names[0])
def filter_dataframe(size_name):
##map size name to size key
size_name_mapped_to_key = size_keys[size_names.index(size_name)]
        ##keep the selected size and all smaller sizes (slice from the selected size to the end)
size_list = size_keys[size_keys.index(size_name_mapped_to_key):]
filtered_df = dataframe[dataframe["Category"].isin(size_list)].reset_index(drop=True)
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
styler = perform_cell_formatting(filtered_df)
return gr.Dataframe(
value=styler,
datatype="markdown",
wrap=True,
show_fullscreen_button=False,
interactive=False,
column_widths=[30,50,50,150,60,60,60],
max_height=420,
elem_classes="leaderboard_col_style"
)
leaderboard_by_skill = filter_dataframe(size_names[0])
sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
return leaderboard_by_skill
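# Model names in the dataframe contain HTML markup; strip it for dropdown choices and lookups.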
def strip_html_tags(model_name):
return re.sub('<[^<]+?>', '', model_name)
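# Deep-dive view: headline metrics, a per-skill bar plot and, when available, the model's
# raw answers HTML with style/script tags stripped before embedding.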
def get_model_info_blocks(chosen_model_name):
model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
model_name_full = model_names[model_names_clean.index(chosen_model_name)]
filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
skills_bar_df = pd.DataFrame({
'Skills': skills,
'Scores': filtered_df[skills].values[0]
})
skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
with gr.Accordion("Model Details"):
with gr.Row():
model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span> """.format(chosen_model_name))
with gr.Row():
benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b>{}/10</span>""".format(filtered_df["Benchmark Score"][0]))
rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b>{}</span>""".format(filtered_df["Rank"][0]))
speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b>{} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b>{}</span>""".format(filtered_df["Contamination Score"][0]))
size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b>{}</span>""".format(filtered_df["Category"][0]))
with gr.Row():
skills_bar = gr.BarPlot(
value=skills_bar_df,
x="Skills",
y="Scores",
width=500,
height=500,
x_label_angle=45,
color="Skills",
color_title=None,
label="Model Skills"
)
html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
if html_file_content == "EMPTY":
answers_html = gr.Markdown("")
else:
with gr.Row():
##strip style and script tags from html
html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
html_file_content = html_file_content.replace('<html lang="ar" dir="rtl">','<html>')
answers_html = gr.HTML(html_file_content,max_height=500,show_label=True,
label="Model Responses", container=True, elem_classes="model_responses_container")
return model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html
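# Compare tab: overlay the selected models' skill scores on a single radar chart.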
def init_compare_tab(dataframe):
model_names = dataframe["Model Name"].unique().tolist()
model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
with gr.Row():
models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model",
value=model_names_clean[0], multiselect=True)
def draw_radar_chart(models):
print(models)
fig = go.Figure()
for model_name in models:
model_name_full = model_names[model_names_clean.index(model_name)]
skill_scores = dataframe[dataframe["Model Name"] == model_name_full][skills].values[0]
fig.add_trace(go.Scatterpolar(
r=skill_scores,
theta=skills,
fill='toself',
name=model_name,
))
fig.update_layout(
polar=dict(
radialaxis=dict(visible=True)
),
showlegend=True,
height=500,
width=900,
margin=dict(l=0, r=0, t=40, b=40),
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.2,
xanchor="center",
x=0.5
)
)
return gr.Plot(value=fig)
radar_chart = draw_radar_chart(models_dropdown.value)
models_dropdown.change(draw_radar_chart, inputs=models_dropdown, outputs=radar_chart)
return radar_chart
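# Assemble the Gradio UI: leaderboard tabs, compare/deep-dive views, submission form and docs.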
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE, elem_classes="abl_header")
gr.HTML(INTRODUCTION_TEXT, elem_classes="abl_desc_text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ… Leaderboard - Top Models", elem_id="llm-benchmark-tab-table", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ… Top by Size", elem_id="llm-benchmark-tab-size", id=1):
leaderboard = init_size_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ… Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
with gr.TabItem("βš–οΈ Compare", elem_id="llm-benchmark-tab-compare", id=3):
init_compare_tab(LEADERBOARD_DF)
with gr.TabItem("πŸ”¬ Deep Dive", elem_id="llm-benchmark-tab-compare", id=4):
model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
with gr.Row():
models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0])
model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html = get_model_info_blocks(models_dropdown.value)
models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name,benchmark_score,rank,speed,contamination,size,skills_bar,answers_html])
with gr.TabItem("πŸš€ Submit here", elem_id="llm-benchmark-tab-submit", id=5):
with gr.Row():
gr.Markdown("# Submit your model", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
submit_button = gr.Button("Submit Eval", variant="huggingface" )
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
[
model_name_textbox,
],
submission_result,
)
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-about", id=6):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=10,
elem_id="citation-button",
show_copy_button=True,
)
with gr.Row():
gr.HTML(FOOTER_TEXT)
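# Restart the Space every 15 minutes (900 s), presumably to pick up newly published results.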
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=900)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()