lixuejing committed
Commit 8b5b385 · 1 Parent(s): 6500fc4

Files changed (2):
  1. src/leaderboard/read_evals.py +12 -1
  2. src/populate.py +1 -0
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Quotas
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -82,6 +82,17 @@ class EvalResult:
 
             mean_acc = np.mean(accs) if len(accs) > 0 else 0
             results[task.benchmark] = mean_acc
+
+        for task in Quotas:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) if len(accs) > 0 else 0
+            results[task.benchmark] = mean_acc
 
         return self(
             eval_name=result_key,
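The new loop reads Quotas exactly the way the existing Tasks loop is read: each member's .value is expected to expose a benchmark key (matched against the keys of data["results"]) and a metric name (the score that gets averaged). The Quotas definition itself lives in src/display/utils.py and is not part of this commit; the sketch below is only a hypothetical illustration of the shape the loop relies on, with placeholder member names and keys.

# Hypothetical sketch: the real Quotas enum is defined in src/display/utils.py
# and is not shown in this diff. Names and keys below are placeholders.
from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str  # key expected under data["results"] in each result file
    metric: str     # metric whose values are averaged by the loop above
    col_name: str   # column title displayed on the leaderboard

class Quotas(Enum):
    quota0 = Task("example_quota", "acc", "Example Quota")  # placeholder member

With a shape like this, v.get(task.metric, None) pulls the quota score out of the matching benchmark entry, mirroring how the Tasks loop fills results[task.benchmark].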
src/populate.py CHANGED
@@ -24,6 +24,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     print("df", df)
     print("AutoEvalColumn.average.name",AutoEvalColumn.average.name)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    print("cols", cols)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced