# leaderboard/app.py
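"""Gradio app for the MIB leaderboard Space.

Downloads the MIB subgraph and causal-graph results snapshots from their
Hugging Face dataset repos, builds one leaderboard tab per track, and
schedules a periodic restart so the tables are rebuilt as new results land.
"""
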
import json
import gzip
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from io import StringIO
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
BENCHMARK_COLS_MULTIMODAL,
BENCHMARK_COLS_MIB,
COLS,
COLS_MIB,
COLS_MULTIMODAL,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
AutoEvalColumn_mib,
fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib
from src.submission.submit import add_new_eval


def restart_space():
    """Restart this Space; used when a snapshot download fails and on a 30-minute schedule."""
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
# print("EVAL_RESULTS_PATH")
# try:
# print(EVAL_RESULTS_PATH)
# snapshot_download(
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
# )
# except Exception:
# restart_space()
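
# Download the latest results snapshots for the two MIB tracks
# (subgraph and causal graph); restart the Space if a download fails.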
try:
print(RESULTS_REPO_MIB_SUBGRAPH)
snapshot_download(
repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(RESULTS_REPO_MIB_CAUSALGRAPH)
snapshot_download(
repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
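
# Build the per-track leaderboard DataFrames from the downloaded results.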
LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
# LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
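
# Load the evaluation queue tables (finished / running / pending submissions).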
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard_mib(dataframe, track):
    """Build a leaderboard component from a MIB results DataFrame (used for the Subgraph tab)."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    print(f"init_leaderboard_mib: dataframe head is {dataframe.head()}\n")
    # Track filtering is currently disabled, so all rows are shown regardless of `track`.
    # dataframe = dataframe.loc[dataframe["Track"] == track]
return Leaderboard(
value=dataframe,
datatype=[c.type for c in fields(AutoEvalColumn_mib)],
select_columns=SelectColumns(
default_selection=[c.name for c in fields(AutoEvalColumn_mib) if c.displayed_by_default],
cant_deselect=[c.name for c in fields(AutoEvalColumn_mib) if c.never_hidden],
label="Select Columns to Display:",
),
search_columns=["Method"], # Changed from AutoEvalColumn_mib.model.name to "Method"
hide_columns=[c.name for c in fields(AutoEvalColumn_mib) if c.hidden],
bool_checkboxgroup_label="Hide models",
interactive=False,
)


def init_leaderboard_mib_causal(dataframe, track):
    """Creates a leaderboard summary for causal intervention results."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    # Serialise the DataFrame to JSON records; note that only the first record
    # (the first method's results) is summarised below.
    json_data = json.loads(dataframe.to_json(orient='records'))[0]
# Process results into summary format
summary_data = []
method_name = json_data['method_name']
# Extract scores for MCQA task
for model_result in json_data['results']:
model_id = model_result['model_id']
task_data = model_result['task_scores']['MCQA']
# Calculate best layer performance
best_scores = calculate_best_layer_scores(task_data)
summary_row = {
'Method': method_name,
'Model': model_id,
'Best Output Token Score': best_scores['output_token'],
'Best Output Location Score': best_scores['output_location'],
'Best Layer': best_scores['best_layer']
}
summary_data.append(summary_row)
# Convert to DataFrame
results_df = pd.DataFrame(summary_data)
# Round numeric columns to 3 decimal places
numeric_cols = ['Best Output Token Score', 'Best Output Location Score']
results_df[numeric_cols] = results_df[numeric_cols].round(3)
return Leaderboard(
value=results_df,
datatype=['text', 'text', 'number', 'number', 'number'],
select_columns=SelectColumns(
default_selection=['Method', 'Model', 'Best Output Token Score', 'Best Output Location Score', 'Best Layer'],
cant_deselect=['Method', 'Model'],
label="Select Metrics to Display:",
),
search_columns=['Method', 'Model'],
interactive=False,
)


def calculate_best_layer_scores(task_data):
"""Calculate the best scores across all layers for each intervention type"""
best_output_token = 0
best_output_location = 0
best_layer = 0
for layer_data in task_data:
layer_num = int(layer_data['layer'])
layer_scores = layer_data['layer_scores']
        # Average the counterfactual scores for each intervention type.
        # In these results, index 0 holds the output-token intervention and
        # index 1 the output-location intervention.
        output_token_avg = sum(cf['score'] for cf in layer_scores[0]['counterfactual_scores']) / len(layer_scores[0]['counterfactual_scores'])
        output_location_avg = sum(cf['score'] for cf in layer_scores[1]['counterfactual_scores']) / len(layer_scores[1]['counterfactual_scores'])
        # Record the layer as "best" whenever it improves either running maximum;
        # the two reported maxima may therefore come from different layers.
        if output_token_avg > best_output_token or output_location_avg > best_output_location:
            best_output_token = max(best_output_token, output_token_avg)
            best_output_location = max(best_output_location, output_location_avg)
            best_layer = layer_num
return {
'output_token': best_output_token,
'output_location': best_output_location,
'best_layer': best_layer
}
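
# For reference, calculate_best_layer_scores expects task_data shaped roughly
# like the sketch below (inferred from the parsing above, not an authoritative
# schema; the numbers are illustrative only):
#
# task_data = [
#     {
#         "layer": 0,
#         "layer_scores": [
#             {"counterfactual_scores": [{"score": 0.71}, {"score": 0.64}]},  # output token
#             {"counterfactual_scores": [{"score": 0.55}, {"score": 0.60}]},  # output location
#         ],
#     },
#     ...
# ]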


def init_leaderboard(dataframe, track):
    """Build a leaderboard for a single track by filtering on the 'Track' column."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    # Filter for the requested track.
    dataframe = dataframe.loc[dataframe["Track"] == track]
    # print(f"\n\n\n dataframe is {dataframe}\n\n\n")
return Leaderboard(
value=dataframe,
datatype=[c.type for c in fields(AutoEvalColumn)],
select_columns=SelectColumns(
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
label="Select Columns to Display:",
),
search_columns=[AutoEvalColumn.model.name],
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
bool_checkboxgroup_label="Hide models",
interactive=False,
)


def process_json(temp_file):
    """Parse an uploaded JSON (optionally gzip-compressed) submission file."""
    if temp_file is None:
        return {}
    # Handle file upload: accept plain .json or gzipped .json.gz files.
    try:
        file_path = temp_file.name
        if file_path.endswith('.gz'):
            with gzip.open(file_path, 'rt') as f:
                data = json.load(f)
        else:
            with open(file_path, 'r') as f:
                data = json.load(f)
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}")
    # gr.Markdown() created inside an event handler is never rendered;
    # gr.Info() shows a transient notification instead.
    gr.Info("Upload successful!")
    return data
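

### Gradio UI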
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
# with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
# leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
# with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
# leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
# with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
# leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
# with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=4):
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
# with gr.TabItem("πŸ‘Ά Submit", elem_id="llm-benchmark-tab-table", id=5):
# with gr.Column():
# with gr.Row():
# gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
leaderboard = init_leaderboard_mib(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
# leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
# with gr.Row():
# with gr.Accordion("πŸ“™ Citation", open=False):
# citation_button = gr.Textbox(
# value=CITATION_BUTTON_TEXT,
# label=CITATION_BUTTON_LABEL,
# lines=20,
# elem_id="citation-button",
# show_copy_button=True,
# )
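
# Restart the Space every 30 minutes (1800 s); on each restart the results
# snapshots above are re-downloaded, so the leaderboard stays up to date.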
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.launch(share=True, ssr_mode=False)