import copy as cp
import json
import sys
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_TASK


def listinstr(lst, s):
    """Return True if any item in `lst` occurs as a substring of `s`."""
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False
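
# Example (hypothetical inputs, for illustration only):
#   listinstr(['CTF', 'RedCode'], 'NYU CTF Bench')       # -> True
#   listinstr(['CTF', 'RedCode'], 'SWE-bench-verified')  # -> False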


def load_results():
    """Load the leaderboard data from the local `results.json` file."""
    with open('results.json', 'r') as file:
        data = json.load(file)
    return data
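
# Example (a sketch; assumes a `results.json` file sits next to this module):
#   data = load_results()
#   print(list(data.keys()))  # top-level keys of the leaderboard data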


def nth_large(val, vals):
    """Return the 1-based rank of `val` among `vals` (1 = largest)."""
    return sum(1 for v in vals if v > val) + 1
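
# Example (made-up scores): 0.7 is the third-largest value here.
#   nth_large(0.7, [0.9, 0.8, 0.7, 0.4])  # -> 3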


def format_timestamp(timestamp):
    """Insert dots before the last four and last two characters of a compact timestamp."""
    date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
    return date
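
# Example:
#   format_timestamp('20240501')  # -> '2024.05.01'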


def BUILD_L2_DF(results, benchmark):
    """Build the per-benchmark (L2) DataFrame plus the check-box metadata used by the UI."""
    results = results[benchmark]
    all_fields = list(results.keys())

    # Collect every model that appears in at least one task of this benchmark.
    model_list = []
    for task in results:
        model_list += list(results[task].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    # Benchmarks differ in how rows are indexed: plain models, agents,
    # or (model, method/agent) pairs.
    if benchmark not in ['RedCode', 'NYU CTF Bench', 'PrimeVul', 'SWE-bench-verified']:
        res['Model'] = model_list
    elif benchmark == 'SWE-bench-verified':
        res['Agent'] = model_list
    elif benchmark == 'PrimeVul':
        used = []
        for task in all_fields:
            for model in results[task]:
                for extra in results[task][model]:
                    if [model, extra] not in used:
                        res['Model'].append(model)
                        res['Method'].append(extra)
                        used.append([model, extra])
    else:
        used = []
        for task in all_fields:
            for model in results[task]:
                for extra in results[task][model]:
                    if [model, extra] not in used:
                        res['Model'].append(model)
                        res['Agent'].append(extra)
                        used.append([model, extra])

    # Fill one column per task, aligned with the rows built above.
    if benchmark not in ['RedCode', 'NYU CTF Bench', 'PrimeVul']:
        for task in all_fields:
            for model in model_list:
                if model in results[task]:
                    res[task].append(results[task][model])
                else:
                    res[task].append(None)
    else:
        for task in all_fields:
            for model, extra in used:
                if model in results[task] and extra in results[task][model]:
                    res[task].append(results[task][model][extra])
                else:
                    res[task].append(None)

    df = pd.DataFrame(res)
    # Rank by the first task: rows with a score come first (descending),
    # rows without one are appended at the end.
    rank_criteria = all_fields[0]
    valid, missing = df[~pd.isna(df[rank_criteria])], df[pd.isna(df[rank_criteria])]
    valid = valid.sort_values(rank_criteria, ascending=False)
    if len(all_fields):
        missing = missing.iloc[::-1]
        df = pd.concat([valid, missing])

    required_fields = all_fields

    check_box = {}
    if benchmark == 'SWE-bench-verified':
        check_box['essential'] = ['Agent']
    elif benchmark == 'PrimeVul':
        check_box['essential'] = ['Model', 'Method']
    elif benchmark in ['RedCode', 'NYU CTF Bench']:
        check_box['essential'] = ['Model', 'Agent']
    else:
        check_box['essential'] = ['Model']

    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map
    return df, check_box
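
# Example (a sketch with made-up benchmark/task/model names and scores; the real
# structure comes from results.json):
#   results = {'ToyBench': {'task_a': {'gpt-4': 0.61, 'llama-3': 0.47},
#                           'task_b': {'gpt-4': 0.55}}}
#   df, check_box = BUILD_L2_DF(results, 'ToyBench')
#   # df has columns ['Model', 'task_a', 'task_b'], ranked by 'task_a' (descending).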


def generate_table(results, fields):
    """Build the main (L1) table: one row per model, one column per selected task."""
    task_list = fields

    # Collect every model that appears in any benchmark of any task.
    model_list = []
    benchmark_list = []
    for task in results:
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list

    average_score = {}
    cnt = {}
    for task in task_list:
        task_score = []
        for model in model_list:
            score = []
            for benchmark in results[task]:
                if benchmark != 'category':
                    if model not in results[task][benchmark]:
                        score.append(None)
                    elif not isinstance(results[task][benchmark][model], (int, float)):
                        # Dict-valued entries hold separate autonomous / assisted
                        # scores; use their mean.
                        entry = results[task][benchmark][model]
                        score.append((entry['autonomous'] + entry['assisted']) / 2)
                    else:
                        score.append(results[task][benchmark][model])
            # A model's task score is the mean over the benchmarks it was evaluated on.
            if not any(item is not None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            # Maintain a running average across tasks, skipping tasks without a score
            # (mixing None into the running mean would raise a TypeError).
            if score is not None:
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = (average_score[model] * cnt[model] + score) / (cnt[model] + 1)
                    cnt[model] += 1
            task_score.append(score)
        res[task] = task_score

    df = pd.DataFrame(res)
    return df
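
# Example (a sketch with made-up data; the real input is what load_results() returns):
#   results = {'Threat Intelligence': {
#       'category': 'knowledge',
#       'bench_x': {'gpt-4': 0.72, 'llama-3': 0.55},
#       'bench_y': {'gpt-4': {'autonomous': 0.60, 'assisted': 0.80}},
#   }}
#   df = generate_table(results, fields=['Threat Intelligence'])
#   # df has columns ['Model', 'Threat Intelligence'];
#   # gpt-4's score is mean(0.72, 0.70) = 0.71, llama-3's is 0.55.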