lixuejing committed · Commit 8b5b385
1 Parent(s): 6500fc4
update
Browse files
- src/leaderboard/read_evals.py +12 -1
- src/populate.py +1 -0
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Quotas
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -82,6 +82,17 @@ class EvalResult:
 
             mean_acc = np.mean(accs) if len(accs) > 0 else 0
             results[task.benchmark] = mean_acc
+
+        for task in Quotas:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) if len(accs) > 0 else 0
+            results[task.benchmark] = mean_acc
 
         return self(
             eval_name=result_key,
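Note: the added loop mirrors the existing Tasks aggregation directly above it: for each Quotas entry it gathers that benchmark's metric values from data["results"] and stores their mean under the benchmark name. A minimal, self-contained sketch of that behaviour follows; the Quota dataclass, the enum members, and the sample data dict are illustrative assumptions, not the Space's actual definitions.

# Sketch only: Quotas is assumed to be an Enum like Tasks, whose values carry
# a benchmark name and a metric name; the data dict below is made up.
from dataclasses import dataclass
from enum import Enum

import numpy as np


@dataclass
class Quota:
    benchmark: str
    metric: str


class Quotas(Enum):
    # hypothetical entries for illustration only
    quota0 = Quota("safety_bench", "acc")
    quota1 = Quota("fairness_bench", "acc")


data = {"results": {"safety_bench": {"acc": 0.71}, "fairness_bench": {"acc": 0.64}}}

results = {}
for task in Quotas:
    task = task.value
    # Average all scores of the task's metric (not every metric is present in every file)
    accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
    if accs.size == 0 or any(acc is None for acc in accs):
        continue
    results[task.benchmark] = np.mean(accs)

print(results)  # mean score per benchmark, e.g. safety_bench -> 0.71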
src/populate.py
CHANGED
@@ -24,6 +24,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     print("df", df)
     print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    print("cols", cols)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
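Note: the populate.py change only adds a debug print of cols before the frame is narrowed to the leaderboard columns and rounded. A toy illustration of that step, under made-up data and column names, for reference:

# Sketch only: the DataFrame and column list are invented; in the Space, cols
# comes from the AutoEvalColumn definitions.
import pandas as pd

df = pd.DataFrame({"model": ["m1", "m2"], "average": [0.53217, 0.48891], "extra": [1, 2]})
cols = ["model", "average"]

print("cols", cols)              # mirrors the added debug line
df = df[cols].round(decimals=2)  # keep only the leaderboard columns, round numeric scores
print(df)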