Update app.py
app.py CHANGED
@@ -17,6 +17,7 @@ from content import format_error, format_warning, format_log, TITLE, INTRODUCTIO
     CITATION_BUTTON_TEXT, model_hyperlink
 
 TOKEN = os.environ.get("TOKEN", None)
+print(TOKEN)
 
 OWNER = "autogenCTF"
 DATA_DATASET = f"{OWNER}/CTFAIA"
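The added print writes the raw token value to the process logs. If it is only meant as a startup sanity check, a masked variant (a sketch, not part of this commit) would confirm the variable is set without exposing it:

    import os

    TOKEN = os.environ.get("TOKEN", None)
    # Sketch: log only whether the secret is present, never its value.
    print("TOKEN is set:", TOKEN is not None)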
@@ -78,11 +79,10 @@ def get_dataframe_from_results(eval_results, split):
     df = pd.DataFrame(local_df)
     df = df.sort_values(by=["completion_level"], ascending=False)
 
-    numeric_cols = [c for c in local_df.column_names
-                    c in ["success_rate", "completion_level"
+    numeric_cols = [c for c in local_df.column_names
+                    if c in ["success_rate", "completion_level"]]
     df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
     df = df.style.format("{:.2%}", subset=numeric_cols)
-    print(type(df))
     return df
 
 
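For reference, a minimal standalone sketch of the formatting step above, with toy data and the column names taken from the hunk. Note that the "{:.2%}" format spec multiplies by 100 on its own, so it expects fractional values such as 0.5:

    import pandas as pd

    # Toy frame standing in for the leaderboard results (values are made up).
    df = pd.DataFrame({"model": ["a", "b"],
                       "success_rate": [0.50, 0.25],
                       "completion_level": [0.75, 0.10]})
    numeric_cols = [c for c in df.columns if c in ["success_rate", "completion_level"]]
    styled = df.style.format("{:.2%}", subset=numeric_cols)  # 0.5 renders as "50.00%"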
@@ -134,6 +134,7 @@ def add_new_eval(
 
     print("Adding new eval")
 
+    # Check if the combination model/org already exists and prints a warning message if yes
     if model.lower() in set(
             [m.lower() for m in eval_results[dataset_version][val_or_test]["model"]]) and organisation.lower() in set(
             [o.lower() for o in eval_results[dataset_version][val_or_test]["organisation"]]):
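The check the new comment describes could also be factored into a small helper. A sketch, assuming the split's entries expose "model" and "organisation" lists as in the hunk (the helper name is hypothetical):

    def already_submitted(model, organisation, entries):
        # Hypothetical helper, not part of app.py: case-insensitive membership
        # test over previously recorded submissions.
        models = {m.lower() for m in entries["model"]}
        orgs = {o.lower() for o in entries["organisation"]}
        return model.lower() in models and organisation.lower() in orgs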
@@ -155,6 +156,7 @@ def add_new_eval(
     comprehension = {'all': 0, 1: 0, 2: 0, 3: 0}
     num = {'all': 0, 1: 0, 2: 0, 3: 0}
 
+    # with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for ix, line in enumerate(f):
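The block above streams the submission file line by line and writes a scored JSONL alongside it. A stripped-down sketch of that read/score/write shape (the paths, fields and function name here are placeholders, not the app's):

    import json

    def rescore(submission_path, scored_path):
        # Placeholder loop: parse each JSONL line and write a scored record back out.
        with open(scored_path, "w") as scored_file:
            with open(submission_path, "r") as f:
                for ix, line in enumerate(f):
                    task = json.loads(line)
                    record = {"ix": ix, "task_name": task.get("task_name")}
                    scored_file.write(json.dumps(record) + "\n")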
@@ -171,8 +173,7 @@ def add_new_eval(
                     level = int(gold_results[val_or_test][task_name]["Level"])
                     score = question_scorer(task, gold_results[val_or_test][task_name])
                 except KeyError:
-                    return format_error(
-                        f"{task_name} not found in split {val_or_test}. Are you sure you submitted the correct file?")
+                    return format_error(f"{task_name} not found in split {val_or_test}. Are you sure you submitted the correct file?")
 
                 scored_file.write(
                     json.dumps({
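Joining the error message onto one line does not change behaviour. The same guard could be written as an explicit lookup instead of catching KeyError; an illustration only, not the committed code:

    def lookup_gold(gold_results, val_or_test, task_name):
        # Illustration: return None when the task is missing from the split,
        # letting the caller produce the format_error message shown above.
        split = gold_results[val_or_test]
        if task_name not in split:
            return None
        return split[task_name]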
@@ -200,8 +201,8 @@ def add_new_eval(
                 success_rate['all'] += 1
 
     for key in LEVELS:
-        success_rate[key] = round_and_pad(success_rate[key] / num[key])
-        completion_level[key] = round_and_pad(completion_level[key] / num[key])
+        success_rate[key] = round_and_pad(success_rate[key] / num[key] / 100)
+        completion_level[key] = round_and_pad(completion_level[key] / num[key] / 1000)
         expertise[key] = round_and_pad(expertise[key] / num[key])
         reasoning[key] = round_and_pad(reasoning[key] / num[key])
         comprehension[key] = round_and_pad(comprehension[key] / num[key])
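round_and_pad itself is not defined in this diff; a hypothetical stand-in with the same name is sketched below so the averaging reads end to end. The extra /100 and /1000 divisors are taken verbatim from the commit; their scaling is the app's own choice:

    def round_and_pad(value, decimals=2):
        # Hypothetical helper with the same name as the app's: round to a fixed
        # number of decimals and keep trailing zeros when displayed.
        return f"{value:.{decimals}f}"

    # e.g. round_and_pad(0.5) -> "0.50", round_and_pad(12.3456) -> "12.35"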