jasonshaoshun committed
Commit 06e8556 · 1 Parent(s): 9ebccf5
Files changed (5):
  1. app.py +32 -16
  2. src/about.py +1 -1
  3. src/display/utils.py +4 -4
  4. src/leaderboard/read_evals.py +61 -83
  5. src/populate.py +6 -34
app.py CHANGED
@@ -30,7 +30,7 @@ from src.display.utils import (
     fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
-from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
 from src.submission.submit import add_new_eval
 
 
@@ -49,15 +49,6 @@ try:
 except Exception:
     restart_space()
 
-# print("EVAL_RESULTS_PATH")
-# try:
-#     print(EVAL_RESULTS_PATH)
-#     snapshot_download(
-#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-
 
 try:
     print(RESULTS_REPO_MIB_SUBGRAPH)
@@ -78,8 +69,8 @@ except Exception:
 
 
 
-LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
-# LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causal(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
+LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
+LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
@@ -91,7 +82,32 @@ LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_P
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard_mib(dataframe, track):
+def init_leaderboard_mib_subgraph(dataframe, track):
+    print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
+
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    # filter for correct track
+    # dataframe = dataframe.loc[dataframe["Track"] == track]
+
+    print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn_mib)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn_mib) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn_mib) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=["Method"],  # Changed from AutoEvalColumn_mib.model.name to "Method"
+        hide_columns=[c.name for c in fields(AutoEvalColumn_mib) if c.hidden],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+def init_leaderboard_mib_causalgraph(dataframe, track):
     print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
 
     if dataframe is None or dataframe.empty:
@@ -116,6 +132,7 @@ def init_leaderboard_mib(dataframe, track):
         interactive=False,
     )
 
+
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -180,11 +197,10 @@ with demo:
             # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
-            leaderboard = init_leaderboard_mib(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-            # leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
+            leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
         # with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
-        #     leaderboard = init_leaderboard_mib_causal(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
+        #     leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
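Note on the Leaderboard wiring above: every keyword argument passed to Leaderboard (value, datatype, select_columns, search_columns, hide_columns, bool_checkboxgroup_label, interactive) comes from the gradio_leaderboard component this Space already uses. A self-contained toy sketch of the Subgraph tab, with placeholder rows standing in for the real LEADERBOARD_DF_MIB_SUBGRAPH (the method names and zero scores below are made up for illustration only):

import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns

# Placeholder rows only, not real MIB scores.
toy_df = pd.DataFrame({
    "Method": ["method_a", "method_b"],
    "ioi_gpt2": [0.0, 0.0],
    "mcqa_gpt2": [0.0, 0.0],
})

with gr.Blocks() as toy_demo:
    with gr.Tabs():
        with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
            Leaderboard(
                value=toy_df,
                datatype=["markdown", "number", "number"],  # one type per column, as in app.py
                select_columns=SelectColumns(
                    default_selection=list(toy_df.columns),
                    cant_deselect=["Method"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Method"],
                hide_columns=[],
                bool_checkboxgroup_label="Hide models",
                interactive=False,
            )

if __name__ == "__main__":
    toy_demo.launch()

Running it only renders a near-empty table; the point is to show how the tab and the component fit together, not to reproduce the Space.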
src/about.py CHANGED
@@ -27,7 +27,7 @@ class Tasks(Enum):
     task3 = Task("ewok", "acc", "EWoK")
 
 
-class TasksMIB(Enum):
+class TasksMib_Subgraph(Enum):
     task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
     task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
 
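This hunk only renames the enum; the TaskMIB container it instantiates is never shown in the commit. A minimal sketch of what TaskMIB presumably looks like, inferred from the fields read elsewhere in the diff (task.value.benchmark, task.value.models, task.value.col_name); the actual definition in src/about.py may differ, e.g. it could be a namedtuple:

from dataclasses import dataclass
from enum import Enum
from typing import List

@dataclass(frozen=True)
class TaskMIB:
    benchmark: str      # short task id, e.g. "ioi"; used as the column-name prefix
    models: List[str]   # models the task is scored on, e.g. ["meta_llama", "qwen", "gpt2"]
    col_name: str       # prefix used when building BENCHMARK_COLS_MIB
    metrics: List[str]  # per-run metrics, here ["edge_counts", "faithfulness"]

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])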
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, TasksMultimodal, TasksMIB
+from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -35,7 +35,7 @@ auto_eval_column_dict_mib = []
 auto_eval_column_dict_mib.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
 
 # For each task and model combination
-for task in TasksMIB:
+for task in TasksMib_Subgraph:
     for model in task.value.models:
         col_name = f"{task.value.benchmark}_{model}"  # ioi_meta_llama, mcqa_qwen, etc.
         auto_eval_column_dict_mib.append([
@@ -54,9 +54,9 @@ AutoEvalColumn_mib = make_dataclass("AutoEvalColumn_mib", auto_eval_column_dict_
 # Column selection for display
 COLS_MIB = [c.name for c in fields(AutoEvalColumn_mib) if not c.hidden]
 
-# BENCHMARK_COLS_MIB = [t.value.col_name for t in TasksMIB]
+# BENCHMARK_COLS_MIB = [t.value.col_name for t in TasksMib_Subgraph]
 BENCHMARK_COLS_MIB = []
-for task in TasksMIB:
+for task in TasksMib_Subgraph:
     for model in task.value.models:
         col_name = f"{task.value.col_name}_{model.replace('-', '_')}"
         BENCHMARK_COLS_MIB.append(col_name)
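As a quick sanity check on the loops above: one column is generated per (task, model) pair, so with the two TasksMib_Subgraph entries from src/about.py the benchmark columns come out as shown below. This standalone snippet only mirrors the string formatting in utils.py and does not import the repo's classes:

# Expected MIB subgraph column names for the tasks/models declared in src/about.py.
tasks = {"ioi": ["meta_llama", "qwen", "gpt2"], "mcqa": ["meta_llama", "qwen", "gpt2"]}

benchmark_cols = [f"{task}_{model.replace('-', '_')}" for task, models in tasks.items() for model in models]
print(benchmark_cols)
# ['ioi_meta_llama', 'ioi_qwen', 'ioi_gpt2', 'mcqa_meta_llama', 'mcqa_qwen', 'mcqa_gpt2']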
src/leaderboard/read_evals.py CHANGED
@@ -13,29 +13,9 @@ from src.submission.check_validity import is_model_on_hub
 
 
 from typing import List, Dict
-from src.about import TasksMIB
-
-
-# def compute_area(edge_counts, faithfulnesses, log_scale=True):
-#     percentages = [e / max(edge_counts) for e in edge_counts]
-#     area_under = 0.
-#     area_from_100 = 0.
-#     for i in range(len(faithfulnesses) - 1):
-#         i_1, i_2 = i, i+1
-#         x_1 = percentages[i_1]
-#         x_2 = percentages[i_2]
-#         # area from point to 100
-#         if log_scale:
-#             x_1 = math.log(x_1)
-#             x_2 = math.log(x_2)
-#         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
-#                       (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
-#         area_from_100 += trapezoidal
-
-#         trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
-#         area_under += trapezoidal
-#     average = sum(faithfulnesses) / len(faithfulnesses)
-#     return (area_under, area_from_100, average)
+
+
+
 def compute_area(edge_counts, faithfulnesses, log_scale=True):
     # Return None if either list is empty
     if not edge_counts or not faithfulnesses:
@@ -62,7 +42,7 @@ def compute_area(edge_counts, faithfulnesses, log_scale=True):
     return (area_under, area_from_100, average)
 
 @dataclass
-class EvalResult_MIB:
+class EvalResult_MIB_SUBGRAPH:
     """Represents one full evaluation for a method across all models in MIB."""
     eval_name: str    # method name as identifier
     method_name: str  # name of the interpretation method
@@ -104,63 +84,13 @@ class EvalResult_MIB:
                 "faithfulness": scores[task]["faithfulness"]
             }
 
-        return EvalResult_MIB(
+        return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
             method_name=method_name,
             results=results
         )
 
 
-
-#     def to_dict(self):
-#         """Converts the Eval Result to a dict for dataframe display"""
-#         data_dict = {
-#             "eval_name": self.eval_name,
-#             "Method": self.method_name,
-#         }
-
-#         all_scores = []
-#         required_entries = {
-#             'ioi_meta_llama': False,
-#             'ioi_qwen': False,
-#             'ioi_gpt2': False,
-#             'mcqa_meta_llama': False,
-#             'mcqa_qwen': False,
-#             'mcqa_gpt2': False
-#         }
-
-#         # For each task (ioi, mcqa)
-#         for task, task_results in self.results.items():
-#             # Get the models that have results for this task
-#             models = task_results.keys()
-
-#             for model in models:
-#                 col_name = f"{task}_{model}"
-#                 metrics = task_results[model]
-#                 if metrics:
-#                     edge_counts = metrics["edge_counts"]
-#                     faithfulness = metrics["faithfulness"]
-#                     if isinstance(faithfulness[0], list):
-#                         faithfulness = faithfulness[0]
-
-#                     # Use compute_area
-#                     area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
-#                     score = area_under * 100
-#                     data_dict[col_name] = round(score, 2)
-#                     all_scores.append(score)
-#                     required_entries[col_name] = True
-#                 else:
-#                     data_dict[col_name] = '-'
-
-#         # Only show average if all six required entries are present
-#         if all(required_entries.values()):
-#             data_dict["Average"] = round(np.mean(all_scores), 2)
-#         else:
-#             data_dict["Average"] = '-'
-
-#         return data_dict
-
-
 
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
@@ -211,13 +141,7 @@ class EvalResult_MIB:
         return data_dict
 
 
-
-
-
-
-
-
-def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
+def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
 
@@ -243,7 +167,7 @@ def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[Eval
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
-            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
+            eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
             print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
@@ -264,6 +188,60 @@ def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[Eval
 
 
 
+
+
+
+
+
+
+
+
+
+
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models in MIB."""
+    eval_name: str      # method name as identifier
+    method_name: str    # name of the interpretation method
+    results: Dict       # nested dict of results {task: {model: {metric: scores}}}
+
+
+    def init_from_json_file(self, json_filepath):
+
+
+
+
+    def to_dict(self):
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+
+    return eval_results
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib
+from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -42,39 +42,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
 
 
-# def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-#     """Creates a dataframe from all the individual experiment results"""
-#     print(f"results_path is {results_path}, requests_path is {requests_path}")
-#     raw_data = get_raw_eval_results(results_path, requests_path)
-#     print(f"raw_data is {raw_data}")
-#     all_data_json = [v.to_dict() for v in raw_data]
-#     print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
-#     all_data_json_filtered = []
-#     for item in all_data_json:
-#         item["Track"] = item["eval_name"].split("_")[-1]
-#         if "VQA" in benchmark_cols and "VQA" in item:
-#             all_data_json_filtered.append(item)
-#         if "VQA" not in benchmark_cols and "VQA" not in item:
-#             all_data_json_filtered.append(item)
-#             all_data_json_filtered.append(item)
-
-#     all_data_json = all_data_json_filtered
-
-#     df = pd.DataFrame.from_records(all_data_json)
-#     df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-
-#     print(f"df is {df}")
-
-#     df = df[cols].round(decimals=1)
-
-#     # filter out if any of the benchmarks have not been produced
-#     df = df[has_no_nan_values(df, benchmark_cols)]
-#     return df
-
-def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data = get_raw_eval_results_mib(results_path, requests_path)
+    raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
     print(f"raw_data is {raw_data}")
 
     # Convert each result to dict format
@@ -94,10 +65,11 @@ def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, be
 
     return df
 
-def get_leaderboard_df_mib_causal(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data = get_raw_eval_results_mib(results_path, requests_path)
+    raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+    # Implement the rest of the code
    return raw_data
 
 
75