Spaces:

dtcxzyw
/

llvm-apr-benchmark-leaderboard

Runtime error

App Files Files Community

llvm-apr-benchmark-leaderboard / app.py

dtcxzyw

Update

345bfd4 unverified 3 months ago

raw

history blame

8.8 kB

	import gradio as gr
	from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
	import pandas as pd
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import snapshot_download
	from datasets import load_dataset
	import json

	from src.about import (
	CITATION_BUTTON_LABEL,
	CITATION_BUTTON_TEXT,
	EVALUATION_QUEUE_TEXT,
	INTRODUCTION_TEXT,
	TITLE,
	)
	from src.display.css_html_js import custom_css
	from src.display.utils import (
	COLS,
	AutoEvalColumn,
	fields,
	)
	from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN
	from src.populate import get_leaderboard_df


	def restart_space():
	    """Ask the Hugging Face Hub to restart the Space identified by REPO_ID.

	    Called from the start-up error handler and registered as the periodic
	    job on the background scheduler.
	    """
	    API.restart_space(repo_id=REPO_ID)


	### Space initialisation
	# Download the evaluation-request snapshot and load the benchmark dataset.
	# Everything after this block reads `dataset`, so a failure here is fatal for
	# this process: request a Space restart and then re-raise, so the real cause
	# is reported instead of a later, misleading NameError on `dataset`.
	try:
	    print(EVAL_REQUESTS_PATH)
	    snapshot_download(
	        repo_id=QUEUE_REPO,
	        local_dir=EVAL_REQUESTS_PATH,
	        repo_type="dataset",
	        tqdm_class=None,
	        etag_timeout=30,
	        token=TOKEN,
	    )
	    dataset = load_dataset("dtcxzyw/llvm-apr-benchmark")
	except Exception:
	    restart_space()
	    raise

	# Index the "test" split of the benchmark by bug id.
	total_issues = dataset.num_rows["test"]
	bug_id_to_time = {}
	bug_id_to_type = {}
	bug_id_to_patch = {}
	# Key order matters: the timeline assigns one y lane per category in this order.
	bug_id_by_cat = {"crash": [], "miscompilation": [], "hang": []}
	bug_id_to_comp = {}
	bug_id_to_title = {}
	comp_bug_count = {}
	for issue in dataset["test"]:
	    issue_id = issue["bug_id"]
	    bug_id_to_time[issue_id] = pd.to_datetime(issue["knowledge_cutoff"])
	    issue_kind = issue["bug_type"]
	    bug_id_by_cat[issue_kind].append(issue_id)
	    bug_id_to_type[issue_id] = issue_kind
	    issue_components = issue["hints"]["components"]
	    bug_id_to_comp[issue_id] = issue_components
	    # An issue may list several components; each one is counted once per issue.
	    for comp in issue_components:
	        comp_bug_count[comp] = comp_bug_count.get(comp, 0) + 1
	    bug_id_to_title[issue_id] = "Issue " + issue_id + ": " + issue["issue"]["title"]
	    bug_id_to_patch[issue_id] = issue["patch"]
	# Build the scatter-plot timeline. Every point is a (bug, lane) pair; the
	# parallel lists below become the columns of `timeline_df`.
	timeline_xs = []
	timeline_ys = []
	timeline_cols = []
	timeline_bugids = []
	model_cnt = 0
	# Lane 0: every issue in the benchmark.
	for bug_id in bug_id_to_time:
	    timeline_ys.append(0)
	    timeline_cols.append("All")
	    timeline_bugids.append(bug_id)
	# Positive lanes: one per bug category, counting down from 3 in dict order.
	cat_cnt = 4
	for cat, bug_ids in bug_id_by_cat.items():
	    cat_cnt -= 1
	    for bug_id in bug_ids:
	        timeline_ys.append(cat_cnt)
	        timeline_cols.append(str(cat).capitalize())
	        timeline_bugids.append(bug_id)
	LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, total_issues)
	fixed_bug_ids = set()
	fixed_bug_ids_count = {}
	fixed_bug_ids_fast = set()
	# Negative lanes: one per leaderboard row (-1 for the first row, -2 for the
	# second, ...), holding the bugs that row reports as fixed.
	for row in LEADERBOARD_DF.itertuples():
	    print(row)
	    model_cnt += 1
	    for fix in row.fixed_bug_ids:
	        timeline_ys.append(-model_cnt)
	        timeline_cols.append(row.method_id)
	        timeline_bugids.append(fix)
	        fixed_bug_ids.add(fix)
	        fixed_bug_ids_count[fix] = fixed_bug_ids_count.get(fix, 0) + 1
	    fixed_bug_ids_fast.update(row.fixed_bug_ids_fast)
	# Bugs that appear in exactly one row's fixed list.
	unique_bug_ids = {bug_id for bug_id, count in fixed_bug_ids_count.items() if count == 1}
	# Resolve the x coordinate and bug type of every point from the dataset index.
	timeline_xs.extend(bug_id_to_time[bug_id] for bug_id in timeline_bugids)
	timeline_bugtypes = [bug_id_to_type[bug_id] for bug_id in timeline_bugids]
	timeline_df = pd.DataFrame(
	    {
	        "time": timeline_xs,
	        "model": timeline_ys,
	        "method_name": timeline_cols,
	        "bug_id": timeline_bugids,
	        "bug_type": timeline_bugtypes,
	    }
	)
	# Repair statistics per bug category, plus an "All" summary row.
	fixed_by_cat = {}
	for bug_id in fixed_bug_ids:
	    bug_type = bug_id_to_type[bug_id]
	    fixed_by_cat[bug_type] = fixed_by_cat.get(bug_type, 0) + 1
	fixed_by_cat_fast = {}
	for bug_id in fixed_bug_ids_fast:
	    bug_type = bug_id_to_type[bug_id]
	    fixed_by_cat_fast[bug_type] = fixed_by_cat_fast.get(bug_type, 0) + 1
	fixed_by_cat["All"] = len(fixed_bug_ids)
	# Placeholder list whose only purpose is that len(...) equals total_issues,
	# so the "All" row can share the per-category lookups below.
	bug_id_by_cat["All"] = [0] * total_issues
	fixed_by_cat_fast["All"] = len(fixed_bug_ids_fast)

	# Only categories with at least one repaired bug (and "All") get a row.
	cat_keys = list(fixed_by_cat)
	cat_totals = [len(bug_id_by_cat[cat]) for cat in cat_keys]
	cat_repaired = [fixed_by_cat[cat] for cat in cat_keys]
	cat_repaired_fast = [fixed_by_cat_fast.get(cat, 0) for cat in cat_keys]
	fixed_by_cat_df = pd.DataFrame(
	    {
	        "Category": [str(cat).capitalize() for cat in cat_keys],
	        "Total": cat_totals,
	        "Repaired": cat_repaired,
	        "Repair Rate (%)": [
	            round(repaired / total * 100, 1) for repaired, total in zip(cat_repaired, cat_totals)
	        ],
	        "Repaired (Fast)": cat_repaired_fast,
	        "Repair Rate (Fast) (%)": [
	            round(repaired / total * 100, 1) for repaired, total in zip(cat_repaired_fast, cat_totals)
	        ],
	    }
	)
	fixed_by_cat_df = fixed_by_cat_df.sort_values("Total", ascending=False)
	# Repair statistics per component. A bug counts once for every component it
	# is tagged with.
	fixed_by_comp = {}
	fixed_by_comp_fast = {}
	for tally, repaired_ids in ((fixed_by_comp, fixed_bug_ids), (fixed_by_comp_fast, fixed_bug_ids_fast)):
	    for bug_id in repaired_ids:
	        for comp in bug_id_to_comp[bug_id]:
	            tally[comp] = tally.get(comp, 0) + 1

	# One row per component seen in the dataset, including those never repaired.
	comp_names = list(comp_bug_count)
	comp_totals = [comp_bug_count[comp] for comp in comp_names]
	comp_repaired = [fixed_by_comp.get(comp, 0) for comp in comp_names]
	comp_repaired_fast = [fixed_by_comp_fast.get(comp, 0) for comp in comp_names]
	fixed_by_comp_df = pd.DataFrame(
	    {
	        "Component": comp_names,
	        "Total": comp_totals,
	        "Repaired": comp_repaired,
	        "Repair Rate (%)": [
	            round(repaired / total * 100, 1) for repaired, total in zip(comp_repaired, comp_totals)
	        ],
	        "Repaired (Fast)": comp_repaired_fast,
	        "Repair Rate (Fast) (%)": [
	            round(repaired / total * 100, 1) for repaired, total in zip(comp_repaired_fast, comp_totals)
	        ],
	    }
	)
	fixed_by_comp_df = fixed_by_comp_df.sort_values("Total", ascending=False)
	# For each leaderboard row, count how many of its fixed bugs no other row fixed.
	unique_model_ids = []
	unique_fix_counts = []
	for entry in LEADERBOARD_DF.itertuples():
	    unique_model_ids.append(entry.method_id)
	    unique_fix_counts.append(len(unique_bug_ids & set(entry.fixed_bug_ids)))
	unique_bugs_df = pd.DataFrame(
	    {
	        "Model": unique_model_ids,
	        "Unique Bugs Fixed": unique_fix_counts,
	    }
	)
	unique_bugs_df = unique_bugs_df.sort_values("Unique Bugs Fixed", ascending=False)


	def init_leaderboard(dataframe):
	    """Create the Leaderboard component that displays ``dataframe``.

	    Column types, default/locked/hidden columns and the search column are all
	    derived from the fields of ``AutoEvalColumn``; a checkbox-group filter is
	    added on the ``with_hint`` column.

	    Args:
	        dataframe: pandas DataFrame holding the leaderboard rows.

	    Returns:
	        The configured, non-interactive ``Leaderboard`` component.

	    Raises:
	        ValueError: if ``dataframe`` is ``None`` or has no rows/columns.
	    """
	    if dataframe is None or dataframe.empty:
	        raise ValueError("Leaderboard DataFrame is empty or None.")

	    columns = list(fields(AutoEvalColumn))
	    column_picker = SelectColumns(
	        default_selection=[col.name for col in columns if col.displayed_by_default],
	        cant_deselect=[col.name for col in columns if col.never_hidden],
	        label="Select Columns to Display:",
	    )
	    hint_filter = ColumnFilter(AutoEvalColumn.with_hint.name, type="checkboxgroup", label="Hint")
	    return Leaderboard(
	        value=dataframe,
	        datatype=[col.type for col in columns],
	        select_columns=column_picker,
	        search_columns=[AutoEvalColumn.method_name.name],
	        hide_columns=[col.name for col in columns if col.hidden],
	        filter_columns=[hint_filter],
	        bool_checkboxgroup_label="Hide models",
	        interactive=False,
	    )


	# Page layout: title and introduction, then two tabs (leaderboard with its
	# plots/tables, and submission instructions), then a collapsible citation box.
	demo = gr.Blocks(css=custom_css)
	with demo:
	    gr.HTML(TITLE)
	    gr.Markdown(INTRODUCTION_TEXT + f"\nTotal issues: {total_issues}\n", elem_classes="markdown-text")

	    with gr.Tabs(elem_classes="tab-buttons") as tabs:
	        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
	            leaderboard = init_leaderboard(LEADERBOARD_DF[COLS])
	            # y runs from one lane below the last model row up to the top
	            # category lane.
	            gr.ScatterPlot(
	                timeline_df,
	                x="time",
	                y="model",
	                color="method_name",
	                x_label="Time",
	                y_label="Model",
	                title="Timeline",
	                y_lim=(-model_cnt - 1, 4),
	                tooltip=["bug_id", "method_name", "time", "bug_type"],
	            )
	            gr.Dataframe(fixed_by_cat_df)
	            gr.Dataframe(fixed_by_comp_df)
	            gr.Dataframe(unique_bugs_df)
	            # Dropdown of (label, value) pairs: the issue title is shown, the
	            # bug id is passed to the change handler.
	            fixed_bug_title_id_pairs = [(bug_id_to_title[bug_id], bug_id) for bug_id in sorted(fixed_bug_ids)]
	            inspect_issue = gr.Dropdown(fixed_bug_title_id_pairs, label="Inspect Issue", interactive=True)
	            golden_patch = gr.Code("", language="cpp", label="Golden Patch")
	            # Show the reference patch of the selected issue, or a placeholder
	            # message when the id has no patch on record.
	            inspect_issue.change(
	                fn=lambda bug_id: bug_id_to_patch.get(bug_id, f"Not Available (bug_id = {bug_id})"),
	                inputs=inspect_issue,
	                outputs=golden_patch,
	            )

	        with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=1):
	            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

	    with gr.Row():
	        with gr.Accordion("📙 Citation", open=False):
	            citation_button = gr.Textbox(
	                value=CITATION_BUTTON_TEXT,
	                label=CITATION_BUTTON_LABEL,
	                lines=6,
	                elem_id="citation-button",
	                show_copy_button=True,
	            )

	# Restart the Space every 1800 seconds (30 minutes) from a background thread;
	# presumably so the start-up data loading is re-run periodically.
	scheduler = BackgroundScheduler()
	scheduler.add_job(restart_space, trigger="interval", seconds=1800)
	scheduler.start()

	# Enable request queuing, then start serving the app.
	demo.queue(default_concurrency_limit=40)
	demo.launch()