Spaces:

PKU-Alignment
/

EvalAnything-LeaderBoard

Running

App Files Files Community

EvalAnything-LeaderBoard / app.py

htlou

wip

0474b44 8 months ago

raw

history blame

6.68 kB

	import os
	import json
	import gradio as gr
	import pandas as pd
	import numpy as np

	from pathlib import Path
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import snapshot_download

	from src.about import (
	CITATION_BUTTON_LABEL,
	CITATION_BUTTON_TEXT,
	EVALUATION_QUEUE_TEXT,
	INTRODUCTION_TEXT,
	LLM_BENCHMARKS_TEXT,
	TITLE,
	ABOUT_TEXT
	)
	from src.display.css_html_js import custom_css
	from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink

	# Score columns of the two column groups, in display order.
	_AMU_COLUMNS = ["Perception", "Reasoning", "IF", "Safety", "AMU Score"]
	_AMG_COLUMNS = [
	    "Modality Selection",
	    "Instruction Following",
	    "Modality Synergy",
	    "AMG Score",
	]
	_SCORE_COLUMNS = [*_AMU_COLUMNS, *_AMG_COLUMNS, "Overall"]

	# One row per model: (display name, link, scores in _SCORE_COLUMNS order).
	# Keeping a model's name, link and scores on one row prevents the
	# per-column lists from drifting out of alignment.
	_LEADERBOARD_ROWS = [
	    (
	        "LLaVA-v1.5-7B†",
	        "https://huggingface.co/liuhaotian/llava-v1.5-7b",
	        (2.66, 2.67, 2.50, 2.90, 2.68, 0.182, 6.61, 0.43, 1.56, 2.12),
	    ),
	    (
	        "Qwen2-VL-7B-Instruct†",
	        "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
	        (2.76, 3.07, 2.40, 4.05, 3.07, 0.177, 7.01, 0.58, 2.16, 2.62),
	    ),
	    (
	        "Qwen2-Audio-7B-Instruct†",
	        "https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct",
	        (3.58, 4.53, 3.40, 2.65, 3.54, 0.190, 6.69, 0.51, 1.97, 2.73),
	    ),
	    (
	        "Chameleon-7B†",
	        "https://huggingface.co/facebook/chameleon-7b",
	        (1.44, 2.97, 2.80, 2.45, 2.41, 0.156, 6.09, 0.54, 1.57, 1.99),
	    ),
	    (
	        "Llama3.1-8B-Instruct†",
	        "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
	        (1.05, 1.20, 1.20, 1.35, 1.20, 0.231, 7.47, 0.60, 3.08, 2.14),
	    ),
	    (
	        "Gemini-1.5-Pro†",
	        "https://deepmind.google/technologies/gemini/pro/",
	        (5.36, 5.67, 6.70, 6.70, 6.11, 0.227, 8.62, 0.52, 3.05, 4.58),
	    ),
	    (
	        "GPT-4o†",
	        "https://openai.com/index/hello-gpt-4o/",
	        (2.66, 3.48, 4.20, 5.15, 3.87, 0.266, 8.62, 0.58, 3.96, 3.92),
	    ),
	]

	# Model name -> link, in leaderboard row order.
	model_links = {name: url for name, url, _ in _LEADERBOARD_ROWS}

	# Column-wise view of the rows: "Model" first, then one list per score column.
	data = {
	    "Model": list(model_links),
	    **{
	        column: list(values)
	        for column, values in zip(
	            _SCORE_COLUMNS,
	            zip(*(scores for _, _, scores in _LEADERBOARD_ROWS)),
	        )
	    },
	}

	# Master table, best "Overall" score first.
	df = pd.DataFrame(data).sort_values(by="Overall", ascending=False)
	total_models = len(df.index)

	# Named column groups; every group starts with the "Model" column.
	COLUMN_GROUPS = {
	    "ALL": ["Model", *_AMU_COLUMNS, *_AMG_COLUMNS, "Overall"],
	    "AMU": ["Model", *_AMU_COLUMNS],
	    "AMG": ["Model", *_AMG_COLUMNS],
	}

	def format_table(df, links=None):
	    """Return a display-ready copy of a leaderboard dataframe.

	    Every ``float64`` column is rendered as a string with two decimal
	    places, the summary-score columns are converted to strings, and each
	    entry of the ``Model`` column becomes a Markdown link.

	    Args:
	        df: Dataframe containing a ``Model`` column. It is not modified.
	        links: Optional mapping from model name to URL. Defaults to the
	            module-level ``model_links``.

	    Returns:
	        A new dataframe holding the formatted values.
	    """
	    if links is None:
	        links = model_links

	    # Work on a copy so the caller's frame keeps its numeric values and so
	    # that a column slice passed in does not trigger SettingWithCopyWarning.
	    formatted = df.copy()

	    # Render floats with two decimal places.
	    for col in formatted.select_dtypes(include=['float64']).columns:
	        formatted[col] = formatted[col].apply(lambda x: f"{x:.2f}")

	    # Summary columns are only converted to plain strings here; no extra
	    # markup is added around the value.
	    for col in ('AMU Score', 'AMG Score', 'Overall'):
	        if col in formatted.columns:
	            formatted[col] = formatted[col].apply(lambda x: f'{x}')

	    def _as_markdown_link(name):
	        # A model without a known URL is shown as plain text instead of
	        # raising KeyError and breaking the whole table.
	        url = links.get(name)
	        return f'[{name}]({url})' if url else name

	    formatted['Model'] = formatted['Model'].apply(_as_markdown_link)
	    return formatted

	def regex_table(dataframe, regex, filter_button, column_group="ALL"):
	    """Build the leaderboard view for a search query and a column group.

	    Args:
	        dataframe: Source leaderboard table. It is copied, never modified.
	        regex: Comma-separated search terms. Each term is a regular
	            expression matched case-insensitively against the ``Model``
	            column; a row is kept when any term matches. An empty or
	            ``None`` query keeps every row.
	        filter_button: Unused; kept so existing callers keep working.
	        column_group: Key of ``COLUMN_GROUPS`` naming the columns to show.
	            Unknown keys fall back to the ``"ALL"`` group.

	    Returns:
	        A new dataframe restricted to the selected columns, sorted in
	        descending order by ``Overall`` (or by the group's last column when
	        ``Overall`` is not shown), passed through ``format_table``, with a
	        leading unnamed column numbering the rows from 1.
	    """
	    import re  # local import: only this function needs it

	    # Select the columns to display and copy them so the source is untouched.
	    columns_to_show = COLUMN_GROUPS.get(column_group, COLUMN_GROUPS["ALL"])
	    df = dataframe[columns_to_show].copy()

	    # Split the query on commas; drop empty terms (e.g. from "gpt,") because
	    # an empty alternative would match every row.
	    terms = [term.strip() for term in (regex or "").split(",")]
	    terms = [term for term in terms if term]
	    if terms:
	        # In Python's ``re`` syntax a bare "|" is alternation; "\|" would
	        # match a literal pipe character and never act as OR.
	        combined_regex = "|".join(terms)
	        try:
	            re.compile(combined_regex)
	        except re.error:
	            # The query is typed by users: fall back to a literal match
	            # rather than letting a malformed pattern crash the callback.
	            combined_regex = "|".join(re.escape(term) for term in terms)
	        df = df[df["Model"].str.contains(combined_regex, case=False, na=False)]

	    sort_column = 'Overall' if 'Overall' in columns_to_show else columns_to_show[-1]
	    df = df.sort_values(by=sort_column, ascending=False).reset_index(drop=True)

	    # Format numbers and add links.
	    df = format_table(df)

	    # Prepend the 1-based position column (its header is the empty string).
	    df.insert(0, '', range(1, 1 + len(df)))

	    return df

	# Assemble the Gradio page: title, introduction, a leaderboard tab, an
	# "About" tab, a citation box, and the callbacks that refresh the table.
	with gr.Blocks(css=custom_css) as app:
	gr.HTML(TITLE)
	with gr.Row():
	with gr.Column(scale=6):
	# INTRODUCTION_TEXT is used as a format template that receives the
	# number of models as a string.
	gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))

	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	with gr.TabItem("🏆 Model Performance Leaderboard"):
	with gr.Row():
	# Free-text model search; its value is the `regex` argument of
	# regex_table (comma-separated terms).
	# NOTE(review): the placeholder mentions pressing ENTER, but the handler
	# below is bound to `.change`, not `.submit` — confirm the intended trigger.
	search_overall = gr.Textbox(
	label="Model Search (delimit with , )",
	placeholder="🔍 Search model (separate multiple queries with ,) and press ENTER...",
	show_label=False
	)
	# Column-group selector: one choice per COLUMN_GROUPS key, starting on "ALL".
	column_group = gr.Radio(
	choices=list(COLUMN_GROUPS.keys()),
	value="ALL",
	label="Select columns to show"
	)

	with gr.Row():
	# Invisible table (visible=False) holding the raw, unformatted df.
	# NOTE(review): no handler in this file reads it — confirm it is still needed.
	performance_table_hidden = gr.Dataframe(
	df,
	headers=df.columns.tolist(),
	elem_id="performance_table_hidden",
	wrap=True,
	visible=False,
	datatype='markdown',
	)
	# Visible table, initialised with the unfiltered "ALL" view.
	# NOTE(review): `headers` lists df's columns, while regex_table's result
	# also carries a leading unnamed position column — confirm which header
	# row Gradio actually displays.
	performance_table = gr.Dataframe(
	regex_table(df.copy(), "", []),
	headers=df.columns.tolist(),
	elem_id="performance_table",
	wrap=True,
	show_label=False,
	datatype='markdown',
	)

	with gr.TabItem("About"):
	with gr.Row():
	gr.Markdown(ABOUT_TEXT)

	# Collapsed-by-default (open=False) box with the citation text and a copy button.
	with gr.Accordion("📚 Citation", open=False):
	citation_button = gr.Textbox(
	value=CITATION_BUTTON_TEXT,
	lines=7,
	label="Copy the following to cite these results.",
	elem_id="citation-button",
	show_copy_button=True,
	)

	# Set up event handlers
	# update_table rebuilds the visible table from the module-level df using
	# the current search text and column group; regex_table's `filter_button`
	# argument is always passed as an empty list.
	def update_table(search_text, selected_group):
	return regex_table(df, search_text, [], selected_group)

	# Both controls feed the same callback with the same inputs, so changing
	# either one re-renders performance_table with both settings applied.
	search_overall.change(
	update_table,
	inputs=[search_overall, column_group],
	outputs=performance_table
	)

	column_group.change(
	update_table,
	inputs=[search_overall, column_group],
	outputs=performance_table
	)

	# Background scheduler with a single placeholder job: the job does nothing
	# and is scheduled on a fixed five-hour interval.
	_NOOP_INTERVAL_SECONDS = 5 * 60 * 60  # 18000 seconds

	scheduler = BackgroundScheduler()
	scheduler.add_job(
	    lambda: None,
	    trigger="interval",
	    seconds=_NOOP_INTERVAL_SECONDS,
	)
	scheduler.start()

	# Start serving the UI; share=True also requests a public share link.
	app.launch(share=True)