Aaron Mueller committed
Commit 2817fcb · 1 Parent(s): 4493851

support all model/task combinations
app.py CHANGED
@@ -45,7 +45,7 @@ def restart_space():
 
 ### Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
+    # print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -54,7 +54,7 @@ except Exception:
 
 
 try:
-    print(RESULTS_REPO_MIB_SUBGRAPH)
+    # print(RESULTS_REPO_MIB_SUBGRAPH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -63,7 +63,7 @@ except Exception:
 
 
 try:
-    print(RESULTS_REPO_MIB_CAUSALGRAPH)
+    # print(RESULTS_REPO_MIB_CAUSALGRAPH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -95,7 +95,7 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
 
 
 def init_leaderboard_mib_subgraph(dataframe, track):
-    print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
+    # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
 
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -103,7 +103,7 @@ def init_leaderboard_mib_subgraph(dataframe, track):
     # filter for correct track
     # dataframe = dataframe.loc[dataframe["Track"] == track]
 
-    print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
+    # print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
 
     return Leaderboard(
         value=dataframe,
@@ -120,20 +120,20 @@ def init_leaderboard_mib_subgraph(dataframe, track):
     )
 
 def init_leaderboard_mib_causalgraph(dataframe, track):
-    print("Debugging column issues:")
-    print("\nActual DataFrame columns:")
-    print(dataframe.columns.tolist())
+    # print("Debugging column issues:")
+    # print("\nActual DataFrame columns:")
+    # print(dataframe.columns.tolist())
 
-    print("\nExpected columns for Leaderboard:")
+    # print("\nExpected columns for Leaderboard:")
     expected_cols = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)]
-    print(expected_cols)
+    # print(expected_cols)
 
-    print("\nMissing columns:")
+    # print("\nMissing columns:")
     missing_cols = [col for col in expected_cols if col not in dataframe.columns]
-    print(missing_cols)
+    # print(missing_cols)
 
-    print("\nSample of DataFrame content:")
-    print(dataframe.head().to_string())
+    # print("\nSample of DataFrame content:")
+    # print(dataframe.head().to_string())
 
     return Leaderboard(
         value=dataframe,
@@ -150,9 +150,9 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
     )
 
 def init_leaderboard_mib_causalgraph(dataframe, track):
-    print("Debugging column issues:")
-    print("\nActual DataFrame columns:")
-    print(dataframe.columns.tolist())
+    # print("Debugging column issues:")
+    # print("\nActual DataFrame columns:")
+    # print(dataframe.columns.tolist())
 
     # Create only necessary columns
     return Leaderboard(
caulsal_metric.py CHANGED
@@ -235,9 +235,9 @@ if __name__ == "__main__":
     folder_path = "./json_files"
     detailed_df, aggregated_df, intervention_averaged_df = process_json_folder(folder_path)
 
-    print("Detailed Results (including duplicates):")
-    print(detailed_df)
-    print("\nAggregated Results (max scores per method):")
-    print(aggregated_df)
-    print("\nIntervention-Averaged Results:")
-    print(intervention_averaged_df)
+    # print("Detailed Results (including duplicates):")
+    # print(detailed_df)
+    # print("\nAggregated Results (max scores per method):")
+    # print(aggregated_df)
+    # print("\nIntervention-Averaged Results:")
+    # print(intervention_averaged_df)
src/about.py CHANGED
@@ -40,8 +40,12 @@ class TaskMIB_Subgraph:
     metrics: list[str] # metrics to store (edge_counts, faithfulness)
 
 class TasksMib_Subgraph(Enum):
-    task0 = TaskMIB_Subgraph("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
-    task1 = TaskMIB_Subgraph("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
+    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
+    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
+    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
+    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
+    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
+    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])
 
 
 
src/display/utils.py CHANGED
@@ -68,7 +68,7 @@ auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnConten
 # For each task and model combination
 for task in TasksMib_Subgraph:
     for model in task.value.models:
-        col_name = f"{task.value.benchmark}_{model}" # ioi_meta_llama, mcqa_qwen, etc.
+        col_name = f"{task.value.benchmark}_{model}" # ioi_gpt2, mcqa_qwen2.5, etc.
         auto_eval_column_dict_mib_subgraph.append([
             col_name,
             ColumnContent,
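For reference, a minimal standalone sketch (not part of this commit) of the leaderboard columns that the loop above now produces, using the task names and model lists from the updated TasksMib_Subgraph enum in src/about.py. Names like `task_models` and `expected_columns` are illustrative only:

# Illustrative only: the task/model pairs are copied from the new
# TasksMib_Subgraph enum above; the variable names are not part of the codebase.
task_models = {
    "ioi": ["gpt2", "qwen2_5", "gemma2", "llama3"],
    "mcqa": ["qwen2_5", "gemma2", "llama3"],
    "arithmetic_addition": ["llama3"],
    "arithmetic_subtraction": ["llama3"],
    "arc_easy": ["gemma2", "llama3"],
    "arc_challenge": ["llama3"],
}
expected_columns = [f"{task}_{model}" for task, models in task_models.items() for model in models]
print(expected_columns)  # 12 columns, from 'ioi_gpt2' to 'arc_challenge_llama3'

These are the same twelve task/model combinations that the required_entries check in src/leaderboard/read_evals.py (below) expects before computing an Average.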
src/leaderboard/read_evals.py CHANGED
@@ -29,9 +29,9 @@ def compute_area(edge_counts, faithfulnesses, log_scale=True):
         x_1 = percentages[i_1]
         x_2 = percentages[i_2]
         # area from point to 100
-        if log_scale:
-            x_1 = math.log(x_1)
-            x_2 = math.log(x_2)
+        # if log_scale:
+        #     x_1 = math.log(x_1)
+        #     x_2 = math.log(x_2)
         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
             (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
         area_from_100 += trapezoidal
@@ -58,7 +58,7 @@ class EvalResult_MIB_SUBGRAPH:
 
         # Initialize results dictionary with the exact structure from JSON
         results = {}
-        for task in ["ioi", "mcqa"]: # Use exact task names from JSON
+        for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]: # Use exact task names from JSON
             results[task] = {}
 
         # Process each model's results maintaining original structure
@@ -67,17 +67,19 @@ class EvalResult_MIB_SUBGRAPH:
             if "/" in model_id:
                 org = model_id.split("/")[0]
                 if org == "meta-llama":
-                    model_name = "meta_llama"
+                    model_name = "llama3"
                 elif org == "Qwen":
-                    model_name = "qwen"
+                    model_name = "qwen2_5"
                 elif "gpt" in model_id.lower():
                     model_name = "gpt2"
+                elif org == "google":
+                    model_name = "gemma2"
                 else:
-                    model_name = model_id
+                    model_name = model_id.replace(".", "_")
 
             # Keep exact scores structure from JSON
             scores = model_result.get("scores", {})
-            for task in ["ioi", "mcqa"]:
+            for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
                 if task in scores:
                     results[task][model_name] = {
                         "edge_counts": scores[task]["edge_counts"],
@@ -100,10 +102,16 @@ class EvalResult_MIB_SUBGRAPH:
         }
 
         # Initialize all possible columns with '-'
-        expected_models = ["meta_llama", "qwen", "gpt2"]
-        expected_tasks = ["ioi", "mcqa"]
+        expected_models = ["llama3", "qwen2_5", "gpt2", "gemma2"]
+        expected_tasks = ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
         for task in expected_tasks:
             for model in expected_models:
+                if model == "gpt2" and task != "ioi":
+                    continue
+                if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
+                    continue
+                if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
+                    continue
                 data_dict[f"{task}_{model}"] = '-'
 
         all_scores = []
@@ -117,24 +125,30 @@ class EvalResult_MIB_SUBGRAPH:
                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
-
+
                 result = compute_area(metrics["edge_counts"], faithfulness)
                 if result is None or result[0] is None:
                     continue
 
                 area_under, _, _ = result
-                score = area_under * 100
+                score = area_under
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
 
         # All entries must be present for average
         required_entries = [
-            data_dict['ioi_meta_llama'] != '-',
-            data_dict['ioi_qwen'] != '-',
+            data_dict['ioi_llama3'] != '-',
+            data_dict['ioi_qwen2_5'] != '-',
             data_dict['ioi_gpt2'] != '-',
-            data_dict['mcqa_meta_llama'] != '-',
-            data_dict['mcqa_qwen'] != '-',
-            data_dict['mcqa_gpt2'] != '-'
+            data_dict['ioi_gemma2'] != '-',
+            data_dict['mcqa_llama3'] != '-',
+            data_dict['mcqa_qwen2_5'] != '-',
+            data_dict['mcqa_gemma2'] != '-',
+            data_dict['arithmetic_addition_llama3'] != '-',
+            data_dict['arithmetic_subtraction_llama3'] != '-',
+            data_dict['arc_easy_gemma2'] != '-',
+            data_dict['arc_easy_llama3'] != '-',
+            data_dict['arc_challenge_llama3'] != '-'
        ]
 
         data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
@@ -145,10 +159,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
 
-    print(f"results_path is {results_path}")
+    # print(f"results_path is {results_path}")
 
     for root, dirnames, files in os.walk(results_path):
-        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # print(f"root is {root}, dirnames is {dirnames}, files is {files}")
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
@@ -162,14 +176,14 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
-    print(f"model_result_filepaths is {model_result_filepaths}")
+    # print(f"model_result_filepaths is {model_result_filepaths}")
 
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_SUBGRAPH("", "", {}) # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
-            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
             result.to_dict()
             eval_results.append(result)
@@ -424,10 +438,10 @@ class EvalResult_MIB_CAUSALGRAPH:
 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
     model_result_filepaths = []
 
-    print(f"Scanning directory: {results_path}")
+    # print(f"Scanning directory: {results_path}")
     for root, dirnames, files in os.walk(results_path):
-        print(f"Current directory: {root}")
-        print(f"Found files: {files}")
+        # print(f"Current directory: {root}")
+        # print(f"Found files: {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
@@ -439,21 +453,21 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
-    print(f"Found json files: {model_result_filepaths}")
+    # print(f"Found json files: {model_result_filepaths}")
 
     eval_results = []
     for filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
             result = eval_result.init_from_json_file(filepath)
-            print(f"Processed file {filepath}")
-            print(f"Got result: {result}")
+            # print(f"Processed file {filepath}")
+            # print(f"Got result: {result}")
             eval_results.append(result)
         except Exception as e:
             print(f"Error processing {filepath}: {e}")
             continue
 
-    print(f"Total results processed: {len(eval_results)}")
+    # print(f"Total results processed: {len(eval_results)}")
     return eval_results
 
 
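As context for the compute_area hunk above: each task/model cell is scored by the area between the faithfulness curve and 1.0, accumulated with the trapezoidal rule over the edge-count percentages (with the log-scaling branch now commented out, and the score kept as the raw area rather than area * 100). A minimal standalone sketch, assuming `percentages` and `faithfulnesses` are parallel, ordered lists; the helper name and the pairwise iteration are illustrative, not the full function from read_evals.py:

# Hypothetical sketch of the trapezoidal accumulation shown in the hunk above.
def area_from_100_sketch(percentages, faithfulnesses):
    area_from_100 = 0.0
    for i_1, i_2 in zip(range(len(percentages) - 1), range(1, len(percentages))):
        # interval width times the average distance of faithfulness from 1.0
        trapezoidal = (percentages[i_2] - percentages[i_1]) * \
            ((abs(1. - faithfulnesses[i_1]) + abs(1. - faithfulnesses[i_2])) / 2)
        area_from_100 += trapezoidal
    return area_from_100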
 
src/populate.py CHANGED
@@ -10,11 +10,11 @@ from src.about import TasksMib_Causalgraph
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    print(f"results_path is {results_path}, requests_path is {requests_path}")
+    # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
-    print(f"raw_data is {raw_data}")
+    # print(f"raw_data is {raw_data}")
     all_data_json = [v.to_dict() for v in raw_data]
-    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
+    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
     all_data_json_filtered = []
     for item in all_data_json:
         item["Track"] = item["eval_name"].split("_")[-1]
@@ -32,7 +32,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
     # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
 
-    print(f"df is {df}")
+    # print(f"df is {df}")
 
     # df = df[cols].round(decimals=1)
 
@@ -44,13 +44,13 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
 def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
-    print(f"results_path is {results_path}, requests_path is {requests_path}")
+    # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
-    print(f"raw_data is {raw_data}")
+    # print(f"raw_data is {raw_data}")
 
     # Convert each result to dict format
     all_data_json = [v.to_dict() for v in raw_data]
-    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
+    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
 
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
@@ -242,7 +242,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     # return detailed_df, aggregated_df, intervention_averaged_df
 
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    print(f"results_path is {results_path}, requests_path is {requests_path}")
+    # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
 
     # Convert each result to dict format for detailed df
@@ -250,7 +250,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
     detailed_df = pd.DataFrame.from_records(all_data_json)
 
     # Print the actual columns for debugging
-    print("Original columns:", detailed_df.columns.tolist())
+    # print("Original columns:", detailed_df.columns.tolist())
 
     # Rename columns to match schema
     column_mapping = {}
@@ -271,7 +271,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
     # Create intervention-averaged df
     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
 
-    print("Transformed columns:", detailed_df.columns.tolist())
+    # print("Transformed columns:", detailed_df.columns.tolist())
 
     return detailed_df, aggregated_df, intervention_averaged_df
 