import copy as cp
import json
import sys
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_TASK


def listinstr(lst, s):
    """Return True if any item in `lst` occurs as a substring of `s`."""
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results():
    # data = json.loads(urlopen(URL).read())
    with open('results.json', 'r') as file:
        data = json.load(file)
    return data


def nth_large(val, vals):
    """Return the 1-based rank of `val` within `vals` (larger is better)."""
    return sum(1 for v in vals if v > val) + 1


def format_timestamp(timestamp):
    """Convert a 'YYYYMMDD' timestamp into 'YYYY.MM.DD'."""
    return timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]


# def BUILD_L1_DF(results, fields):
#     check_box = {}
#     check_box['essential'] = ['Model']
#     # revise here to set the default dataset
#     check_box['required'] = DEFAULT_TASK
#     check_box['all'] = DEFAULT_TASK
#     type_map = defaultdict(lambda: 'number')
#     check_box['type_map'] = type_map
#     df = generate_table(results, fields)
#     return df, check_box


def BUILD_L2_DF(results, benchmark):
    """Build the leaderboard DataFrame and checkbox config for one benchmark."""
    results = results[benchmark]
    all_fields = list(results.keys())

    model_list = []
    for task in results:
        model_list += list(results[task].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    # Benchmarks differ in how rows are keyed: most use a bare model name,
    # SWE-bench-verified uses an agent name, and RedCode / NYU CTF Bench /
    # PrimeVul key each row by a (model, extra) pair.
    if benchmark not in ['RedCode', 'NYU CTF Bench', 'PrimeVul', 'SWE-bench-verified']:
        res['Model'] = model_list
    elif benchmark == 'SWE-bench-verified':
        res['Agent'] = model_list
    else:
        extra_col = 'Method' if benchmark == 'PrimeVul' else 'Agent'
        used = []
        for task in all_fields:
            for model in results[task]:
                for extra in results[task][model]:
                    if [model, extra] not in used:
                        res['Model'].append(model)
                        res[extra_col].append(extra)
                        used.append([model, extra])

    if benchmark not in ['RedCode', 'NYU CTF Bench', 'PrimeVul']:
        for task in all_fields:
            for model in model_list:
                res[task].append(results[task].get(model))
    else:
        for task in all_fields:
            for model, extra in used:
                if model in results[task] and extra in results[task][model]:
                    res[task].append(results[task][model][extra])
                else:
                    res[task].append(None)

    df = pd.DataFrame(res)
    # Sort by the first field; rows missing that score sink to the bottom.
    rank_criteria = all_fields[0]
    valid, missing = df[~pd.isna(df[rank_criteria])], df[pd.isna(df[rank_criteria])]
    valid = valid.sort_values(rank_criteria, ascending=False)
    if len(all_fields):
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])

    check_box = {}
    if benchmark == 'SWE-bench-verified':
        check_box['essential'] = ['Agent']
    elif benchmark == 'PrimeVul':
        check_box['essential'] = ['Model', 'Method']
    elif benchmark in ['RedCode', 'NYU CTF Bench']:
        check_box['essential'] = ['Model', 'Agent']
    else:
        check_box['essential'] = ['Model']
    check_box['required'] = all_fields
    check_box['all'] = all_fields
    check_box['type_map'] = defaultdict(lambda: 'number')
    return df, check_box
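# A minimal sketch of the nesting BUILD_L2_DF expects, inferred from the code
# above; the benchmark, field, model, and method names in this dict are
# hypothetical placeholders, not entries from the real results.json. Plain
# benchmarks map field -> model -> score, while PrimeVul (and likewise
# RedCode and NYU CTF Bench) add one more level for the method or agent.
_EXAMPLE_RESULTS = {
    'SomeBenchmark': {
        'Accuracy': {'model-a': 71.2, 'model-b': 68.9},
    },
    'PrimeVul': {
        'Pairwise Accuracy': {'model-a': {'zero-shot': 50.3, 'chain-of-thought': 55.1}},
    },
}
# df, check_box = BUILD_L2_DF(_EXAMPLE_RESULTS, 'PrimeVul')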
def generate_table(results, fields):
    """Aggregate per-benchmark scores into one row per model, one column per task."""
    task_list = fields
    model_list = []
    for task in results:
        for benchmark in results[task]:
            if benchmark != 'category':
                model_list += list(results[task][benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list
    average_score = {}
    cnt = {}
    for task in task_list:
        task_score = []
        for model in model_list:
            score = []
            for benchmark in results[task]:
                if benchmark == 'category':
                    continue
                if model not in results[task][benchmark]:
                    score.append(None)
                elif not isinstance(results[task][benchmark][model], (int, float)):
                    # Some entries are {'autonomous': x, 'assisted': y}; use the mean of the two modes.
                    entry = results[task][benchmark][model]
                    score.append((entry['autonomous'] + entry['assisted']) / 2)
                else:
                    score.append(results[task][benchmark][model])
            # Average over the benchmarks the model was evaluated on; None if it appears in none.
            if not any(item is not None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            if score is not None:  # skip missing scores so the running mean stays numeric
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
                    cnt[model] += 1
            task_score.append(score)
        res[task] = task_score

    # res['Avg Score'] = [average_score[model] for model in model_list]
    # res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]
    df = pd.DataFrame(res)
    # valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    # valid = valid.sort_values('Avg Score')
    # valid = valid.iloc[::-1]
    # if len(fields):
    #     missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
    #     missing = missing.iloc[::-1]
    # df = pd.concat([valid, missing])
    return df
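# A minimal usage sketch (assumptions, not verified here: results.json sits
# next to this module, 'PrimeVul' is one of its top-level benchmark keys, and
# DEFAULT_TASK lists the task names generate_table expects as `fields`).
if __name__ == '__main__':
    data = load_results()
    df, check_box = BUILD_L2_DF(data, 'PrimeVul')
    print(df.head())
    print(check_box['essential'])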