Update app.py
Browse files
app.py
CHANGED
@@ -73,16 +73,17 @@ def get_dataframe_from_results(eval_results, split):
|
|
73 |
local_df = local_df.remove_columns(["url"])
|
74 |
local_df = local_df.rename_column("model", "Model name")
|
75 |
local_df = local_df.rename_column("model_family", "Model family")
|
76 |
-
# local_df = local_df.rename_column("score", "Average score (%)")
|
77 |
-
# for i in [1, 2, 3]:
|
78 |
-
# local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
|
79 |
df = pd.DataFrame(local_df)
|
80 |
df = df.sort_values(by=["completion_level"], ascending=False)
|
81 |
|
82 |
-
numeric_cols = [c for c in local_df.column_names
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
86 |
return df
|
87 |
|
88 |
|
@@ -103,20 +104,6 @@ TYPES = ["markdown", "str", "str", "str", "number", "number", "number", "number"
|
|
103 |
LEVELS = ["all", 1, 2, 3]
|
104 |
|
105 |
|
106 |
-
def round_and_pad(number, ndigits=2):
|
107 |
-
# Round to the specified number of decimal places
|
108 |
-
rounded_number = round(number, ndigits)
|
109 |
-
# Convert to a string
|
110 |
-
number_str = str(rounded_number)
|
111 |
-
# Split into the integer part and the decimal part
|
112 |
-
integer_part, decimal_part = number_str.split('.')
|
113 |
-
# If the decimal part has fewer digits than requested, pad it with zeros
|
114 |
-
while len(decimal_part) < ndigits:
|
115 |
-
decimal_part += '0'
|
116 |
-
# Join the parts back together (note: the result is returned as a string, not a number)
|
117 |
-
return '.'.join([integer_part, decimal_part])
|
118 |
-
|
119 |
-
|
120 |
def add_new_eval(
|
121 |
dataset_version: str,
|
122 |
model: str,
|
@@ -156,7 +143,6 @@ def add_new_eval(
|
|
156 |
comprehension = {'all': 0, 1: 0, 2: 0, 3: 0}
|
157 |
num = {'all': 0, 1: 0, 2: 0, 3: 0}
|
158 |
|
159 |
-
# with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
|
160 |
with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
|
161 |
with open(file_path, 'r') as f:
|
162 |
for ix, line in enumerate(f):
|
@@ -173,7 +159,8 @@ def add_new_eval(
|
|
173 |
level = int(gold_results[val_or_test][task_name]["Level"])
|
174 |
score = question_scorer(task, gold_results[val_or_test][task_name])
|
175 |
except KeyError:
|
176 |
-
return format_error(
|
|
|
177 |
|
178 |
scored_file.write(
|
179 |
json.dumps({
|
@@ -201,11 +188,11 @@ def add_new_eval(
|
|
201 |
success_rate['all'] += 1
|
202 |
|
203 |
for key in LEVELS:
|
204 |
-
success_rate[key] =
|
205 |
-
completion_level[key] =
|
206 |
-
expertise[key] =
|
207 |
-
reasoning[key] =
|
208 |
-
comprehension[key] =
|
209 |
|
210 |
print(success_rate, completion_level, expertise, reasoning, comprehension)
|
211 |
|
@@ -265,7 +252,8 @@ def refresh():
|
|
265 |
dataset_version,
|
266 |
token=TOKEN,
|
267 |
download_mode="force_redownload",
|
268 |
-
verification_mode="no_checks"
|
|
|
269 |
)
|
270 |
|
271 |
new_eval_dataframe = {}
|
|
|
73 |
local_df = local_df.remove_columns(["url"])
|
74 |
local_df = local_df.rename_column("model", "Model name")
|
75 |
local_df = local_df.rename_column("model_family", "Model family")
|
|
|
|
|
|
|
76 |
df = pd.DataFrame(local_df)
|
77 |
df = df.sort_values(by=["completion_level"], ascending=False)
|
78 |
|
79 |
+
numeric_cols = [c for c in local_df.column_names if c in ["expertise", "reasoning", "comprehension"]]
|
80 |
+
df[numeric_cols] = df[numeric_cols].round(decimals=2)
|
81 |
+
|
82 |
+
percent_cols = [c for c in local_df.column_names if c in ["success_rate", "completion_level"]]
|
83 |
+
df = df.style.format("{:.2%}", subset=percent_cols)
|
84 |
+
|
85 |
+
df = df[["Model name", "Model family", "organisation", "completion_level", "success_rate", "expertise", "reasoning",
|
86 |
+
"comprehension"]]
|
87 |
return df
|
88 |
|
89 |
|
|
|
104 |
LEVELS = ["all", 1, 2, 3]
|
105 |
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def add_new_eval(
|
108 |
dataset_version: str,
|
109 |
model: str,
|
|
|
143 |
comprehension = {'all': 0, 1: 0, 2: 0, 3: 0}
|
144 |
num = {'all': 0, 1: 0, 2: 0, 3: 0}
|
145 |
|
|
|
146 |
with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
|
147 |
with open(file_path, 'r') as f:
|
148 |
for ix, line in enumerate(f):
|
|
|
159 |
level = int(gold_results[val_or_test][task_name]["Level"])
|
160 |
score = question_scorer(task, gold_results[val_or_test][task_name])
|
161 |
except KeyError:
|
162 |
+
return format_error(
|
163 |
+
f"{task_name} not found in split {val_or_test}. Are you sure you submitted the correct file?")
|
164 |
|
165 |
scored_file.write(
|
166 |
json.dumps({
|
|
|
188 |
success_rate['all'] += 1
|
189 |
|
190 |
for key in LEVELS:
|
191 |
+
success_rate[key] = success_rate[key] / num[key]
|
192 |
+
completion_level[key] = completion_level[key] / num[key] / 10
|
193 |
+
expertise[key] = expertise[key] / num[key]
|
194 |
+
reasoning[key] = reasoning[key] / num[key]
|
195 |
+
comprehension[key] = comprehension[key] / num[key]
|
196 |
|
197 |
print(success_rate, completion_level, expertise, reasoning, comprehension)
|
198 |
|
|
|
252 |
dataset_version,
|
253 |
token=TOKEN,
|
254 |
download_mode="force_redownload",
|
255 |
+
verification_mode="no_checks",
|
256 |
+
trust_remote_code=True
|
257 |
)
|
258 |
|
259 |
new_eval_dataframe = {}
|