lixuejing committed
Commit 9d5b710 · 1 Parent(s): b678721
src/display/utils.py CHANGED
@@ -27,7 +27,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-#auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
@@ -51,7 +51,7 @@ auto_eval_column_quota_dict = []
 auto_eval_column_quota_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_quota_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-#auto_eval_column_quota_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_quota_dict.append(["average_quota", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Quotas:
     auto_eval_column_quota_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
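For reference, the new ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)] entry only takes effect because the column lists are later fed to make_dataclass. Below is a minimal sketch of that wiring, assuming the stock Hugging Face leaderboard template's ColumnContent layout (the field names and frozen flag are assumptions, not copied from this repo):

# Sketch only: assumed ColumnContent definition and make_dataclass wiring.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                      # column header shown in the UI
    type: str                      # "str", "markdown", "number", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])

# Each [attr_name, type, default] triple becomes one field of a frozen dataclass,
# so the new score column is addressable as an attribute.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.average.name)   # Average ⬆️

This is why read_evals.py below can refer to the new columns as AutoEvalColumn.average.name and AutoEvalColumnQuota.average_quota.name.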
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Quotas
+from src.display.utils import AutoEvalColumn, AutoEvalColumnQuota, ModelType, Tasks, Precision, WeightType, Quotas
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -99,7 +99,11 @@ class EvalResult:
 
             mean_acc = np.mean(accs) if len(accs) > 0 else 0
             print("mean_acc", task.metric, mean_acc)
-            results[task.metric] = mean_acc
+            if task.metric == "overall":
+                results[task.benchmark] = mean_acc
+            else:
+                results[task.metric] = mean_acc
+
 
         return self(
             eval_name=result_key,
@@ -144,7 +148,7 @@ class EvalResult:
         average = 0
         nums = 0
         for k,v in self.results.items():
-            if k not in ["Visual Grounding","Counting","State & Activity Understanding","Dynamic","Relative direction","Multi-view matching","Relative distance","Depth estimation","Relative shape","Size estimation","Trajectory","Future prediction","Goal Decomposition","Navigation"]:
+            if k not in ["Perception","SpatialReasoning","Prediction","Planning","Visual Grounding","Counting","State & Activity Understanding","Dynamic","Relative direction","Multi-view matching","Relative distance","Depth estimation","Relative shape","Size estimation","Trajectory","Future prediction","Goal Decomposition","Navigation"]:
                 if v is not None and v != 0:
                     average += v
                     nums += 1
@@ -152,6 +156,17 @@ class EvalResult:
             average = 0
         else:
             average = average/nums
+
+        nums, average_quota = 0, 0
+        for k,v in self.results.items():
+            if k in ["Perception","SpatialReasoning","Prediction","Planning"]:
+                if v is not None and v != 0:
+                    average_quota += v
+                    nums += 1
+        if nums == 0:
+            average_quota = 0
+        else:
+            average_quota = average_quota/nums
 
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
@@ -163,7 +178,8 @@ class EvalResult:
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
-            #AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average.name: average,
+            AutoEvalColumnQuota.average_quota.name: average_quota,
 
             #AutoEvalColumn.license.name: self.license,
             #AutoEvalColumn.likes.name: self.likes,
@@ -186,13 +202,14 @@ class EvalResult:
 
         for task in Quotas:
             #data_dict[task.value.col_name] = self.results.get(task.value.metric, 0)
-            if task.value.col_name != "CLCC-H":
+            if task.value.metric != "overall":
                 data_dict[task.value.col_name] = self.results.get(task.value.metric, 0)
            else:
-                if self.results.get(task.value.metric, 0) == 0:
-                    data_dict[task.value.col_name] = "-"
-                else:
-                    data_dict[task.value.col_name] = "%.2f" % self.results.get(task.value.metric, 0)
+                data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
+                #if self.results.get(task.value.benchmark, 0) == 0:
+                #    data_dict[task.value.col_name] = "-"
+                #else:
+                #    data_dict[task.value.col_name] = "%.2f" % self.results.get(task.value.metric, 0)
 
         return data_dict
 
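The diff adds two averaging passes to to_dict: an overall average over the per-benchmark scores (every key that is neither a category nor a sub-task name) and a category-level average over the four quota keys. Here is a self-contained sketch of that logic; only the category and sub-task names are taken from the diff, the benchmark keys and scores in the example are hypothetical:

# Sketch of the two averages computed in EvalResult.to_dict (simplified).
CATEGORY_KEYS = {"Perception", "SpatialReasoning", "Prediction", "Planning"}
SUBTASK_KEYS = {
    "Visual Grounding", "Counting", "State & Activity Understanding", "Dynamic",
    "Relative direction", "Multi-view matching", "Relative distance",
    "Depth estimation", "Relative shape", "Size estimation", "Trajectory",
    "Future prediction", "Goal Decomposition", "Navigation",
}

def compute_averages(results: dict) -> tuple:
    """Return (overall average, category average), skipping None/zero scores."""
    overall = [v for k, v in results.items()
               if k not in CATEGORY_KEYS | SUBTASK_KEYS and v]
    by_category = [v for k, v in results.items() if k in CATEGORY_KEYS and v]
    average = sum(overall) / len(overall) if overall else 0
    average_quota = sum(by_category) / len(by_category) if by_category else 0
    return average, average_quota

# Hypothetical scores, just to show the shape of the result:
print(compute_averages({"benchmark_a": 72.0, "benchmark_b": 58.0, "Perception": 64.0}))
# -> (65.0, 64.0)

Both values then land in the row dict under AutoEvalColumn.average.name and AutoEvalColumnQuota.average_quota.name, which is what the two new leaderboard columns display.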
 
src/populate.py CHANGED
@@ -27,6 +27,23 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     return raw_data, df
 
 
+def get_leaderboard_df_quota(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path, dynamic_path)
+    for v in raw_data:
+        print(v.to_dict())
+    all_data_json = [v.to_dict() for v in raw_data]
+    #all_data_json.append(baseline_row)
+    filter_models_flags(all_data_json)
+    df = pd.DataFrame.from_records(all_data_json)
+    print("AutoEvalColumn.average.name", AutoEvalColumn.average.name)
+    df = df.sort_values(by=[AutoEvalColumnQuota.average_quota.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return raw_data, df
+
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queue requests"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
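get_leaderboard_df_quota mirrors get_leaderboard_df but sorts by the category-level average column before trimming to cols. The has_no_nan_values helper it relies on is not part of this diff; a minimal stand-in consistent with the stock leaderboard template (its actual location in this repo is not visible here) would be:

# Sketch only: assumed definition of the NaN filter used above.
import pandas as pd

def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
    """Boolean mask that is True for rows with a value in every benchmark column."""
    return df[columns].notna().all(axis=1)

# Rows missing any benchmark score are dropped before the table is shown:
df = pd.DataFrame({"Model": ["a", "b"], "Perception": [64.0, None]})
print(df[has_no_nan_values(df, ["Perception"])])  # keeps only model "a"

With that filter applied, the new function returns the same (raw_data, df) pair as get_leaderboard_df, just sorted by AutoEvalColumnQuota.average_quota.name.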