bhys committed on
Commit 774e9c9 · verified · 1 Parent(s): 956d5e9

Update app.py

Files changed (1)
  1. app.py +98 -57
app.py CHANGED
@@ -23,7 +23,7 @@ DATA_DATASET = f"{OWNER}/CTFAIA"
 INTERNAL_DATA_DATASET = f"{OWNER}/CTFAIA_internal"
 SUBMISSION_DATASET = f"{OWNER}/CTFAIA_submissions_internal"
 CONTACT_DATASET = f"{OWNER}/contact_info"
-RESULTS_DATASET = f"{OWNER}/CTFAIA_results_public"
+RESULTS_DATASET = f"{OWNER}/test_result"
 LEADERBOARD_PATH = f"{OWNER}/agent_ctf_leaderboard"
 api = HfApi()
 
@@ -31,13 +31,13 @@ YEAR_VERSION = "2024"
 
 os.makedirs("scored", exist_ok=True)
 
-all_version = ['20240423']
+all_version = ['20240602']
 
 contact_infos = load_dataset(
     CONTACT_DATASET,
     token=TOKEN,
-    download_mode="force_redownload",
-    ignore_verifications=True
+    # download_mode="force_redownload",
+    verification_mode="no_checks"
 )
 
 all_gold_dataset = {}
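Side note on the swap from `ignore_verifications` to `verification_mode` in this and the following hunks: the `datasets` library deprecated the former in favor of the latter, with the string "no_checks" mapping to `VerificationMode.NO_CHECKS`. A minimal sketch of the equivalent call, reusing `CONTACT_DATASET` and `TOKEN` from this file:

    from datasets import load_dataset, VerificationMode

    # "no_checks" and VerificationMode.NO_CHECKS are interchangeable;
    # both skip checksum/split verification when loading.
    contact_infos = load_dataset(
        CONTACT_DATASET,
        token=TOKEN,
        verification_mode=VerificationMode.NO_CHECKS,
    )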
@@ -49,7 +49,7 @@ for dataset_version in all_version:
         dataset_version,
         token=TOKEN,
         download_mode="force_redownload",
-        ignore_verifications=True,
+        verification_mode="no_checks",
         trust_remote_code=True
     )
     all_gold_results[dataset_version] = {
@@ -61,7 +61,7 @@ for dataset_version in all_version:
         dataset_version,
         token=TOKEN,
         download_mode="force_redownload",
-        ignore_verifications=True,
+        verification_mode="no_checks",
         trust_remote_code=True
     )
 
@@ -69,23 +69,25 @@ for dataset_version in all_version:
 def get_dataframe_from_results(eval_results, split):
     local_df = eval_results[split]
     local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
-    local_df = local_df.remove_columns(["system_prompt", "url"])
+    local_df = local_df.remove_columns(["url"])
     local_df = local_df.rename_column("model", "Model name")
     local_df = local_df.rename_column("model_family", "Model family")
-    local_df = local_df.rename_column("score", "Average score (%)")
-    for i in [1, 2, 3]:
-        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
+    # local_df = local_df.rename_column("score", "Average score (%)")
+    # for i in [1, 2, 3]:
+    #     local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
     df = pd.DataFrame(local_df)
-    df = df.sort_values(by=["Average score (%)"], ascending=False)
+    df = df.sort_values(by=["completion_level"], ascending=False)
 
-    numeric_cols = [c for c in local_df.column_names if "score" in c]
+    numeric_cols = [c for c in local_df.column_names if
+                    c in ["success_rate", "completion_level", "expertise", "reasoning", "comprehension"]]
     df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
-    # df = df.style.format("{:.2%}", subset=numeric_cols)
-
+    df = df.style.format("{:.2%}", subset=numeric_cols)
+    print(type(df))
     return df
 
 
 eval_dataframe = {}
+
 for dataset_version in all_version:
     eval_dataframe[dataset_version] = get_dataframe_from_results(
         eval_results=eval_results[dataset_version],
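Two things to watch in the reworked formatting: the values in `numeric_cols` are scaled with `.multiply(100)` and then formatted with `"{:.2%}"`, which multiplies by 100 again and appends a percent sign, and the now-uncommented `df.style.format(...)` makes the function return a `pandas.io.formats.style.Styler` rather than a `DataFrame` (presumably what the `print(type(df))` debug line checks). A sketch of the two self-consistent options, assuming the columns hold plain fractions:

    import pandas as pd

    df = pd.DataFrame({"success_rate": [0.5, 0.25]})
    # Option 1: scale once and keep a plain DataFrame of numbers.
    scaled = df["success_rate"].multiply(100).round(2)           # 50.0, 25.0
    # Option 2: keep the raw fractions and let the Styler render percentages.
    styled = df.style.format("{:.2%}", subset=["success_rate"])  # "50.00%", "25.00%"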
@@ -97,14 +99,28 @@ def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
 
-TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
+TYPES = ["markdown", "str", "str", "str", "number", "number", "number", "number"]
+LEVELS = ["all", 1, 2, 3]
+
+
+def round_and_pad(number, ndigits=2):
+    # Round to the requested number of decimal places
+    rounded_number = round(number, ndigits)
+    # Convert to a string
+    number_str = str(rounded_number)
+    # Split into integer and decimal parts
+    integer_part, decimal_part = number_str.split('.')
+    # Zero-pad the decimal part up to ndigits
+    while len(decimal_part) < ndigits:
+        decimal_part += '0'
+    # Join the parts back together and return as a string
+    return '.'.join([integer_part, decimal_part])
 
 
 def add_new_eval(
     dataset_version: str,
     model: str,
     model_family: str,
-    system_prompt: str,
     url: str,
     path_to_file: str,
     organisation: str,
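The hand-rolled `round_and_pad` helper rounds and zero-pads in two steps, and raises a `ValueError` for an `int` input such as `round_and_pad(5)`, since `str(5)` contains no '.' to split on. Python's format mini-language does the same job in one line; a drop-in sketch, assumed interchangeable with the version above:

    def round_and_pad(number, ndigits=2):
        # ".2f"-style formatting rounds and zero-pads in one step,
        # and handles whole numbers: round_and_pad(5) -> "5.00".
        return f"{number:.{ndigits}f}"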
@@ -118,7 +134,14 @@ def add_new_eval(
 
     print("Adding new eval")
 
-    # Check if the combination model/org already exists and prints a warning message if yes
+    # Check if the combination model/org already exists and print a warning message if it does
     if model.lower() in set(
             [m.lower() for m in eval_results[dataset_version][val_or_test]["model"]]) and organisation.lower() in set(
             [o.lower() for o in eval_results[dataset_version][val_or_test]["organisation"]]):
@@ -127,23 +150,19 @@ def add_new_eval(
     if path_to_file is None:
         return format_warning("Please attach a file.")
 
-    # Save submitted file
-    api.upload_file(
-        repo_id=SUBMISSION_DATASET,
-        path_or_fileobj=path_to_file.name,
-        path_in_repo=f"{organisation}/{model}/{dataset_version}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
-        repo_type="dataset",
-        token=TOKEN
-    )
-
     # Gold answers
     gold_results = all_gold_results[dataset_version]
+    print(gold_results)
 
     # Compute score
     file_path = path_to_file.name
-    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
-    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
-    total_scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+    success_rate = {'all': 0, 1: 0, 2: 0, 3: 0}
+    completion_level = {'all': 0, 1: 0, 2: 0, 3: 0}
+    expertise = {'all': 0, 1: 0, 2: 0, 3: 0}
+    reasoning = {'all': 0, 1: 0, 2: 0, 3: 0}
+    comprehension = {'all': 0, 1: 0, 2: 0, 3: 0}
+    num = {'all': 0, 1: 0, 2: 0, 3: 0}
+
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for ix, line in enumerate(f):
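The single `scores`/`num_questions`/`total_scores` trio is replaced by one accumulator per judging dimension. Reconstructing the contract from the `score[0]`..`score[3]` indexing below, `question_scorer` is now expected to return four sub-scores rather than one number; a sketch of the assumed interface (not the actual scorer, which lives elsewhere in this repo):

    from typing import Tuple

    # Assumed interface, inferred from the indexing in the next hunks:
    def question_scorer(task: dict, gold: dict) -> Tuple[float, float, float, float]:
        """Return (completion_level, expertise, reasoning, comprehension),
        each on a 0-10 scale; completion_level == 10 counts as a success."""
        ...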
@@ -158,12 +177,11 @@ def add_new_eval(
                 task_name = task["task_name"]
                 try:
                     level = int(gold_results[val_or_test][task_name]["Level"])
+                    score = question_scorer(task, gold_results[val_or_test][task_name])
                 except KeyError:
                     return format_error(
                         f"{task_name} not found in split {val_or_test}. Are you sure you submitted the correct file?")
 
-                score = question_scorer(task, gold_results[val_or_test][task_name])
-
                 scored_file.write(
                     json.dumps({
                         "id": task_name,
@@ -173,14 +191,39 @@ def add_new_eval(
                     }) + "\n"
                 )
 
-                scores["all"] += score
-                scores[level] += score
-                num_questions["all"] += 1
-                num_questions[level] += 1
-    for task_name, task in gold_results[val_or_test].items():
-        level = int(task['Level'])
-        total_scores["all"] += 10
-        total_scores[level] += 10
+                num[level] += 1
+                completion_level[level] += score[0]
+                expertise[level] += score[1]
+                reasoning[level] += score[2]
+                comprehension[level] += score[3]
+
+                num['all'] += 1
+                completion_level['all'] += score[0]
+                expertise['all'] += score[1]
+                reasoning['all'] += score[2]
+                comprehension['all'] += score[3]
+
+                if score[0] == 10:
+                    success_rate[level] += 1
+                    success_rate['all'] += 1
+
+    for key in LEVELS:
+        success_rate[key] = round_and_pad(success_rate[key] / num[key])
+        completion_level[key] = round_and_pad(completion_level[key] / num[key])
+        expertise[key] = round_and_pad(expertise[key] / num[key])
+        reasoning[key] = round_and_pad(reasoning[key] / num[key])
+        comprehension[key] = round_and_pad(comprehension[key] / num[key])
+
+    print(success_rate, completion_level, expertise, reasoning, comprehension)
+
+    # Save submitted file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=path_to_file.name,
+        path_in_repo=f"{organisation}/{model}/{dataset_version}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
 
     # Save scored file
     api.upload_file(
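The previous version guarded each per-level average with `if total_scores[level] else 0`; the new `for key in LEVELS:` loop divides by `num[key]` unconditionally, so a submission with no tasks at some level would hit a `ZeroDivisionError`. A guarded variant of the loop (a sketch; "0.00" as the placeholder for empty levels is an assumption):

    for key in LEVELS:
        metrics = (success_rate, completion_level, expertise, reasoning, comprehension)
        if num[key] == 0:
            # No tasks at this level: report 0.00 instead of dividing by zero.
            for metric in metrics:
                metric[key] = round_and_pad(0.0)
            continue
        for metric in metrics:
            metric[key] = round_and_pad(metric[key] / num[key])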
@@ -195,14 +238,15 @@ def add_new_eval(
     eval_entry = {
         "model": model,
         "model_family": model_family,
-        "system_prompt": system_prompt,
         "url": url,
         "organisation": organisation,
-        "score": scores["all"] / total_scores["all"],
-        "score_level1": scores[1] / total_scores[1] if total_scores[1] else 0,
-        "score_level2": scores[2] / total_scores[2] if total_scores[2] else 0,
-        "score_level3": scores[3] / total_scores[3] if total_scores[3] else 0,
+        "success_rate": success_rate["all"],
+        "completion_level": completion_level["all"],
+        "expertise": expertise["all"],
+        "reasoning": reasoning["all"],
+        "comprehension": comprehension["all"]
     }
+
     eval_results[dataset_version][val_or_test] = eval_results[dataset_version][val_or_test].add_item(eval_entry)
     eval_results[dataset_version].push_to_hub(RESULTS_DATASET, config_name=dataset_version, token=TOKEN)
 
@@ -228,22 +272,21 @@ def refresh():
         dataset_version,
         token=TOKEN,
         download_mode="force_redownload",
-        ignore_verifications=True
+        verification_mode="no_checks"
     )
-    leaderboard_tables = []
+
+    new_eval_dataframe = {}
+    new_leaderboard_tables = []
     for dataset_version in all_version:
-        eval_dataframe[dataset_version] = get_dataframe_from_results(
+        new_eval_dataframe[dataset_version] = get_dataframe_from_results(
            eval_results=eval_results[dataset_version],
            split="validation"
        )
-        with gr.Tab(dataset_version):
-            leaderboard_tables.append(
-                gr.components.Dataframe(
-                    value=eval_dataframe[dataset_version], datatype=TYPES, interactive=False,
-                    column_widths=["20%"]
-                )
-            )
-    return leaderboard_tables
+        new_leaderboard_tables.append(new_eval_dataframe[dataset_version])
+    if len(new_leaderboard_tables) == 1:
+        return new_leaderboard_tables[0]
+    else:
+        return new_leaderboard_tables
 
 
 def upload_file(files):
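`refresh` no longer instantiates `gr.components.Dataframe` inside `gr.Tab` blocks; it now returns the refreshed table data and relies on Gradio's convention that an event handler's return values update components that already exist in the layout (a single value for one output, a list for several). The wiring this implies, with `refresh_button` and `leaderboard_table` as assumed component names, not taken from this file:

    # One dataset version, one table: refresh() returns a single table value.
    refresh_button.click(refresh, inputs=[], outputs=leaderboard_table)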
@@ -286,7 +329,6 @@ with demo:
             level_of_test = gr.Radio(all_version, value=all_version[0], label="dataset_version")
             model_name_textbox = gr.Textbox(label="Model name", value='')
             model_family_textbox = gr.Textbox(label="Model family", value='')
-            system_prompt_textbox = gr.Textbox(label="System prompt example", value='')
             url_textbox = gr.Textbox(label="Url to model information", value='')
         with gr.Column():
             organisation = gr.Textbox(label="Organisation", value='')
@@ -303,7 +345,6 @@ with demo:
             level_of_test,
             model_name_textbox,
             model_family_textbox,
-            system_prompt_textbox,
             url_textbox,
             file_output,
             organisation,