lixuejing committed · commit 6981fa7 · 1 parent: dba0a90

update

Files changed:
- app.py                         +22 -68
- src/display/utils.py           +1  -1
- src/leaderboard/read_evals.py  +2  -0
- src/populate.py                +3  -3
app.py
CHANGED
@@ -32,7 +32,7 @@ from src.display.utils import (
     BENCHMARK_QUOTACOLS
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_quota
 from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files
 from src.tools.collections import update_collections
@@ -77,6 +77,16 @@ def init_space():
     #update_collections(original_df.copy())
     leaderboard_df = original_df.copy()

+    raw_data_quota, original_df_quota = get_leaderboard_df(
+        results_path=EVAL_RESULTS_PATH,
+        requests_path=EVAL_REQUESTS_PATH,
+        dynamic_path=DYNAMIC_INFO_FILE_PATH,
+        cols=list(set(QUOTACOLS+COLS)),
+        benchmark_cols=list(set(BENCHMARK_QUOTACOLS+BENCHMARK_COLS))
+    )
+    #update_collections(original_df.copy())
+    leaderboard_df_quota = original_df_quota.copy()
+
     #plot_df = create_plot_df(create_scores_df(raw_data))

     (
@@ -86,12 +96,10 @@ def init_space():
     ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

     #return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-    return leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    return leaderboard_df, original_df, leaderboard_df_quota, original_df_quota,finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

-leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
-#return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+leaderboard_df, original_df, leaderboard_df_quota, original_df_quota, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()

-#leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()


 # Searching and filtering
@@ -231,6 +239,13 @@ leaderboard_df = filter_models(
     hide_models=[], # Deleted, merges, flagged, MoEs
 )

+leaderboard_df_quota = filter_models(
+    df=leaderboard_df_quota,
+    type_query=[t.to_str(" : ") for t in ModelType],
+    size_query=list(NUMERIC_INTERVALS.keys()),
+    precision_query=[i.value.name for i in Precision],
+    hide_models=[], # Deleted, merges, flagged, MoEs
+)


 demo = gr.Blocks(css=custom_css)
@@ -265,36 +280,6 @@ with demo:
                 elem_id="column-select",
                 interactive=True,
             )
-            #with gr.Row():
-            #    hide_models = gr.CheckboxGroup(
-            #        label="Hide models",
-            #        choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-            #        value=[],
-            #        interactive=True
-            #    )
-            #with gr.Column(min_width=320):
-            #    #with gr.Box(elem_id="box-filter"):
-            #    filter_columns_type = gr.CheckboxGroup(
-            #        label="Model types",
-            #        choices=[t.to_str() for t in ModelType],
-            #        value=[t.to_str() for t in ModelType],
-            #        interactive=True,
-            #        elem_id="filter-columns-type",
-            #    )
-            #    filter_columns_precision = gr.CheckboxGroup(
-            #        label="Precision",
-            #        choices=[i.value.name for i in Precision],
-            #        value=[i.value.name for i in Precision],
-            #        interactive=True,
-            #        elem_id="filter-columns-precision",
-            #    )
-            #    filter_columns_size = gr.CheckboxGroup(
-            #        label="Model sizes (in billions of parameters)",
-            #        choices=list(NUMERIC_INTERVALS.keys()),
-            #        value=list(NUMERIC_INTERVALS.keys()),
-            #        interactive=True,
-            #        elem_id="filter-columns-size",
-            #    )


             leaderboard_table = gr.components.Dataframe(
@@ -382,40 +367,10 @@ with demo:
                 elem_id="column-select",
                 interactive=True,
             )
-            #with gr.Row():
-            #    hide_models = gr.CheckboxGroup(
-            #        label="Hide models",
-            #        choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-            #        value=[],
-            #        interactive=True
-            #    )
-            #with gr.Column(min_width=320):
-            #    #with gr.Box(elem_id="box-filter"):
-            #    filter_columns_type = gr.CheckboxGroup(
-            #        label="Model types",
-            #        choices=[t.to_str() for t in ModelType],
-            #        value=[t.to_str() for t in ModelType],
-            #        interactive=True,
-            #        elem_id="filter-columns-type",
-            #    )
-            #    filter_columns_precision = gr.CheckboxGroup(
-            #        label="Precision",
-            #        choices=[i.value.name for i in Precision],
-            #        value=[i.value.name for i in Precision],
-            #        interactive=True,
-            #        elem_id="filter-columns-precision",
-            #    )
-            #    filter_columns_size = gr.CheckboxGroup(
-            #        label="Model sizes (in billions of parameters)",
-            #        choices=list(NUMERIC_INTERVALS.keys()),
-            #        value=list(NUMERIC_INTERVALS.keys()),
-            #        interactive=True,
-            #        elem_id="filter-columns-size",
-            #    )


             leaderboard_table = gr.components.Dataframe(
-                value=
+                value=leaderboard_df_quota[
                     [c.name for c in fields(AutoEvalColumnQuota) if c.never_hidden]
                     + shown_columns.value
                     + [AutoEvalColumnQuota.dummy.name]
@@ -430,8 +385,7 @@ with demo:

             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=
-                #value=leaderboard_df[QUOTACOLS],
+                value=original_df_quota[QUOTACOLS],
                 headers=QUOTACOLS,
                 datatype=QUOTATYPES,
                 visible=False,
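For readers skimming the app.py hunks above: the commit builds a second, "quota" leaderboard dataframe at startup, filters it with the same filter_models call, and hands it to both the visible and the hidden Gradio tables. The snippet below is a minimal, self-contained sketch of that pattern with hypothetical stand-ins (build_quota_df, the toy rows and QUOTACOLS values); it is not the Space's actual code.

import gradio as gr
import pandas as pd

QUOTACOLS = ["Model", "AverageSampled ⬆️"]  # hypothetical column subset

def build_quota_df() -> pd.DataFrame:
    # Stand-in for get_leaderboard_df(..., cols=list(set(QUOTACOLS+COLS)), ...).
    return pd.DataFrame(
        {"Model": ["model-a", "model-b"], "AverageSampled ⬆️": [71.3, 64.8]}
    )

original_df_quota = build_quota_df()
leaderboard_df_quota = original_df_quota.copy()  # the copy that filtering/search act on

with gr.Blocks() as demo:
    # Visible quota table: the (filtered) copy, restricted to the columns to show.
    gr.components.Dataframe(value=leaderboard_df_quota[QUOTACOLS], interactive=False)
    # Hidden twin used when the search box is cleared: the unfiltered frame, same slice.
    gr.components.Dataframe(value=original_df_quota[QUOTACOLS], visible=False)

if __name__ == "__main__":
    demo.launch()

One design note on the init_space hunk: because Python sets are unordered, cols=list(set(QUOTACOLS+COLS)) deduplicates the column list but does not preserve its order; the display order is re-imposed later, when the Dataframe value selects its columns explicitly.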
src/display/utils.py
CHANGED
@@ -51,7 +51,7 @@ auto_eval_column_quota_dict = []
 auto_eval_column_quota_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_quota_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("
+auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("AverageSampled ⬆️", "number", True)])
 for task in Quotas:
     auto_eval_column_quota_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
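The only change here gives the quota average column the "AverageSampled ⬆️" header. For context, the sketch below shows how a list of [field_name, type, default] triples like auto_eval_column_quota_dict is typically frozen into an AutoEvalColumnQuota dataclass; the make_dataclass step and the ColumnContent definition are assumptions mirroring the upstream Open LLM Leaderboard utils, since that code is not part of this diff.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # assumed shape, inferred from the constructor calls above
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

auto_eval_column_quota_dict = []
auto_eval_column_quota_dict.append(
    ["average_quota", ColumnContent, ColumnContent("AverageSampled ⬆️", "number", True)]
)

# Each [name, type, default] triple becomes one field of a frozen dataclass,
# so the ColumnContent instance is reachable as a class attribute.
AutoEvalColumnQuota = make_dataclass(
    "AutoEvalColumnQuota", auto_eval_column_quota_dict, frozen=True
)

print(AutoEvalColumnQuota.average_quota.name)  # -> AverageSampled ⬆️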
src/leaderboard/read_evals.py
CHANGED
@@ -168,6 +168,8 @@ class EvalResult:
         else:
             average_quota = average_quota/nums

+        print("AutoEvalColumn.average.name",AutoEvalColumn.average.name, average)
+        print("AutoEvalColumnQuota.average_quota.name",AutoEvalColumnQuota.average_quota.name,average_quota)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
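The two added lines are plain debug prints of the regular and quota averages. If they are meant to stay, a logging-based variant (shown only as an alternative, not what this commit does; the values below are hypothetical) is easier to silence later:

import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

average, average_quota = 71.3, 64.8  # hypothetical stand-ins for the computed averages
logger.debug("average=%s average_quota=%s", average, average_quota)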
src/populate.py
CHANGED
@@ -18,8 +18,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     #all_data_json.append(baseline_row)
     filter_models_flags(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
-
-
+    print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
@@ -37,7 +37,7 @@ def get_leaderboard_df_quota(results_path: str, requests_path: str, dynamic_path
     filter_models_flags(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
-    df = df.sort_values(by=[AutoEvalColumnQuota.
+    df = df.sort_values(by=[AutoEvalColumnQuota.average_quota.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
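Both functions now share the same sort, column-slice, and round sequence. The toy example below (invented data; the column name stands in for AutoEvalColumnQuota.average_quota.name) illustrates what the added sort_values line does to the frame built by pd.DataFrame.from_records:

import pandas as pd

average_col = "AverageSampled ⬆️"  # stand-in for AutoEvalColumnQuota.average_quota.name
cols = ["Model", average_col]

records = [
    {"Model": "model-a", average_col: 64.8321, "extra_field": 1},
    {"Model": "model-b", average_col: 71.2987, "extra_field": 2},
]

df = pd.DataFrame.from_records(records)
df = df.sort_values(by=[average_col], ascending=False)  # highest average first
df = df[cols].round(decimals=2)                         # keep display columns, 2 decimals
print(df)
# Rows come out ordered by the average column, rounded to two decimals:
#   model-b  71.30
#   model-a  64.83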