Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py CHANGED
@@ -80,6 +80,19 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
+        mix_accs = []
+        for task in MixTasks:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mix_accs.append(np.mean(accs) * 100.0)
+        results['Mix-fr'] = mean(mix_accs)
+
+
         return self(
             eval_name=result_key,
             full_model=full_model,
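The first hunk adds a composite "Mix-fr" score: for each task in MixTasks it averages the available values of that task's metric across the matching entries of `data["results"]`, skips tasks with no or missing scores, and then averages the per-task means. Below is a minimal standalone sketch of that averaging logic, not the file's actual code; `MixTask` and `mix_fr_score` are hypothetical names, since the diff does not show how MixTasks is defined:

```python
from dataclasses import dataclass

import numpy as np


# Hypothetical stand-in for the enum values iterated as MixTasks;
# only the two fields the diff reads are modeled here.
@dataclass
class MixTask:
    benchmark: str
    metric: str


def mix_fr_score(data: dict, mix_tasks: list) -> float | None:
    """Average each task's metric over matching result entries,
    then average those per-task means into one 'Mix-fr' score."""
    mix_accs = []
    for task in mix_tasks:
        # Collect the metric from every results entry whose key matches
        # the task's benchmark (not all metrics appear in all files).
        accs = np.array([
            v.get(task.metric, None)
            for k, v in data["results"].items()
            if task.benchmark == k
        ])
        if accs.size == 0 or any(acc is None for acc in accs):
            continue
        mix_accs.append(np.mean(accs) * 100.0)
    # The diff stores mean(mix_accs) unconditionally; guarding the empty
    # case avoids a nan (np.mean) or an exception (statistics.mean).
    return float(np.mean(mix_accs)) if mix_accs else None


# Example: one benchmark present with its metric, one missing entirely.
data = {"results": {"bench_a": {"acc": 0.8}, "bench_b": {"f1": 0.6}}}
tasks = [MixTask("bench_a", "acc"), MixTask("bench_c", "acc")]
print(mix_fr_score(data, tasks))  # 80.0 -- only bench_a contributes
```

Note that the committed line `results['Mix-fr'] = mean(mix_accs)` calls a bare `mean()`, which must come from an import not visible in this diff (e.g. `statistics.mean`); the sketch uses `np.mean`, which the surrounding code already relies on.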
@@ -134,6 +147,9 @@ class EvalResult:
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
+        for task in MixTasks:
+            data_dict['Mix-fr'] = self.results['Mix-fr']
+
         return data_dict
 
 
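In the second hunk, the loop body never uses the loop variable, so each iteration assigns the same key: for a non-empty MixTasks it is equivalent to the single line `data_dict['Mix-fr'] = self.results['Mix-fr']`, and for an empty MixTasks the 'Mix-fr' column is never set. Since the first hunk always writes `results['Mix-fr']`, the key should be present whenever that code has run.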