jasonshaoshun committed on
Commit e46e945 · 1 Parent(s): 61542b8
Files changed (5)
  1. app.py +3 -3
  2. src/about.py +38 -11
  3. src/display/utils.py +72 -24
  4. src/leaderboard/read_evals.py +97 -0
  5. src/populate.py +15 -3
app.py CHANGED
@@ -74,7 +74,7 @@ except Exception:
 
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
-# LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
+LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
@@ -203,8 +203,8 @@ with demo:
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
             leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
-        # with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
-        #     leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
+        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
+            leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
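init_leaderboard_mib_causalgraph is referenced here but defined elsewhere in the repo, outside this diff. As a rough, hypothetical sketch of what such an initializer can look like (names and behavior are assumptions, not the actual implementation), it only needs to render the precomputed dataframe inside the tab:

import gradio as gr

def init_leaderboard_mib_causalgraph(dataframe, track):
    # Hypothetical sketch only; the real function may add filtering,
    # column selection, and styling on top of the raw scores.
    if dataframe is None or dataframe.empty:
        raise ValueError("Causal graph leaderboard DataFrame is empty or None.")
    return gr.Dataframe(value=dataframe, interactive=False)
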
src/about.py CHANGED
@@ -8,13 +8,6 @@ class Task:
     col_name: str
 
 
-@dataclass
-class TaskMIB:
-    benchmark: str      # task name in json (ioi/arithmetic)
-    models: list[str]   # list of models to show as sub-columns
-    col_name: str       # display name in leaderboard
-    metrics: list[str]  # metrics to store (edge_counts, faithfulness)
-
 
 
 # Select your tasks here
@@ -27,10 +20,6 @@ class Tasks(Enum):
     task3 = Task("ewok", "acc", "EWoK")
 
 
-class TasksMib_Subgraph(Enum):
-    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
-    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
-
 
 class TasksMultimodal(Enum):
     task0 = Task("blimp", "acc", "BLiMP")
@@ -41,6 +30,44 @@ class TasksMultimodal(Enum):
     task5 = Task("winoground", "acc", "Winoground")
     task6 = Task("devbench", "acc", "DevBench")
 
+
+@dataclass
+class TaskMIB_Subgraph:
+    benchmark: str      # task name in json (ioi/arithmetic)
+    models: list[str]   # list of models to show as sub-columns
+    col_name: str       # display name in leaderboard
+    metrics: list[str]  # metrics to store (edge_counts, faithfulness)
+
+class TasksMib_Subgraph(Enum):
+    task0 = TaskMIB_Subgraph("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
+    task1 = TaskMIB_Subgraph("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
+
+
+@dataclass
+class TaskMIB_Causalgraph:
+    benchmark: str              # MCQA
+    models: list[str]           # LlamaForCausalLM
+    layers: list[str]           # 0-31
+    col_name: str               # display name in leaderboard
+    interventions: list[str]    # output_token, output_location
+    counterfactuals: list[str]  # symbol_counterfactual, randomLetter_counterfactual, etc.
+    metrics: list[str]          # score
+
+class TasksMib_Causalgraph(Enum):
+    task0 = TaskMIB_Causalgraph(
+        "MCQA",
+        ["LlamaForCausalLM"],
+        [str(i) for i in range(32)],  # layers 0-31
+        "mcqa",
+        ["output_token", "output_location"],
+        ["symbol_counterfactual", "randomLetter_counterfactual",
+         "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
+        ["score"]
+    )
+
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
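Like Tasks and TasksMultimodal above, the two new enums are meant to be iterated, with the per-task spec read from .value. A minimal sketch using only the definitions in this file:

from src.about import TasksMib_Subgraph, TasksMib_Causalgraph

for task in TasksMib_Subgraph:
    spec = task.value
    print(spec.benchmark, spec.models, spec.metrics)
    # ioi  ['meta_llama', 'qwen', 'gpt2']  ['edge_counts', 'faithfulness']
    # mcqa ['meta_llama', 'qwen', 'gpt2']  ['edge_counts', 'faithfulness']

mcqa = TasksMib_Causalgraph.task0.value
print(len(mcqa.layers), mcqa.interventions, len(mcqa.counterfactuals))
# 32 ['output_token', 'output_location'] 4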
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph
+from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph, TasksMib_Causalgraph
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -21,10 +21,41 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
-auto_eval_column_dict_mib_subgraph = []
 auto_eval_column_dict = []
 auto_eval_column_dict_multimodal = []
 
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
+auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
+#Scores
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
+auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
+for task in TasksMultimodal:
+    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order
+        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)
+
 
 
@@ -67,6 +98,45 @@ COLS_MIB_CAUSALGRAPH = []
 BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
+# Initialize the MIB causal graph columns
+auto_eval_column_dict_mib_causalgraph = []
+
+# Method name column
+auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# One column per layer-intervention-counterfactual combination
+for task in TasksMib_Causalgraph:
+    for model in task.value.models:
+        for layer in task.value.layers:
+            for intervention in task.value.interventions:
+                for counterfactual in task.value.counterfactuals:
+                    # Create column names like "layer0_output_token_symbol_counterfactual"
+                    col_name = f"layer{layer}_{intervention}_{counterfactual}"
+                    field_name = col_name.lower()
+                    auto_eval_column_dict_mib_causalgraph.append([
+                        field_name,
+                        ColumnContent,
+                        ColumnContent(col_name, "number", True)
+                    ])
+
+# Create the dataclass for MIB causal graph columns
+AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)
+
+# Column selection for display
+COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"layer{layer}_{intervention}_{counterfactual}"
+                                  for task in TasksMib_Causalgraph
+                                  for model in task.value.models
+                                  for layer in task.value.layers
+                                  for intervention in task.value.interventions
+                                  for counterfactual in task.value.counterfactuals]
+
 
 
@@ -76,32 +146,10 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
-auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-#Scores
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
-auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
-auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-for task in TasksMultimodal:
-    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-    if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order
-        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
-auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
-auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
src/leaderboard/read_evals.py CHANGED
@@ -182,7 +182,104 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 
 
 
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method in MIB causalgraph."""
+    eval_name: str      # method name as identifier
+    method_name: str    # name of the interpretation method (e.g., "baseline_patching")
+    results: Dict       # nested dict of results {model_id: {task_scores: [{layer, scores}]}}
+
+    def init_from_json_file(self, json_filepath):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+        results = {}
+
+        # Get results for each model
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")
+            task_scores = model_result.get("task_scores", {})
+
+            # Process MCQA task scores
+            mcqa_scores = {}
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                layer_scores = layer_data.get("layer_scores", [])
+
+                # Store scores for each intervention and counterfactual
+                for intervention_data in layer_scores:
+                    intervention = intervention_data["intervention"][0]  # e.g., "output_token"
+                    counterfactual_scores = intervention_data["counterfactual_scores"]
+
+                    for cf_score in counterfactual_scores:
+                        counterfactual = cf_score["counterfactual"][0]  # e.g., "symbol_counterfactual"
+                        score = cf_score["score"]
+
+                        # Create key for this combination
+                        key = f"layer{layer}_{intervention}_{counterfactual}"
+                        mcqa_scores[key] = score
+
+            results[model_id] = mcqa_scores
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict for dataframe display"""
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+        }
+
+        # For each model, add all layer/intervention/counterfactual combinations
+        for model_id, scores in self.results.items():
+            for score_key, score_value in scores.items():
+                data_dict[score_key] = score_value
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB causalgraph results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+    for root, dirnames, files in os.walk(results_path):
+        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date - keeping original sorting logic
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    print(f"model_result_filepaths is {model_result_filepaths}")
+
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
+        try:
+            eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
+        except Exception as e:
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
+
+    return eval_results
 
 
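Reading init_from_json_file backwards, a causal-graph results file is expected to follow roughly this layout (an illustrative example; the model id and score are made up):

example = {
    "method_name": "baseline_patching",
    "results": [
        {
            "model_id": "meta-llama/Llama-3.1-8B",  # hypothetical
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.42}
                                ],
                            }
                        ],
                    }
                ]
            },
        }
    ],
}
# init_from_json_file flattens this into
# results == {"meta-llama/Llama-3.1-8B": {"layer0_output_token_symbol_counterfactual": 0.42}}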
 
src/populate.py CHANGED
@@ -66,11 +66,23 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
     return df
 
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the MIB experiment results"""
+    """Creates a dataframe from all the MIB causal graph experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-    # Implement the rest of the code
-    return raw_data
+    print(f"raw_data is {raw_data}")
+
+    # Convert each result to dict format
+    all_data_json = [v.to_dict() for v in raw_data]
+    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
+
+    # Convert to dataframe
+    df = pd.DataFrame.from_records(all_data_json)
+
+    # Round numeric columns to 2 decimal places
+    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
+    df[numeric_cols] = df[numeric_cols].round(2)
+
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
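The cols and benchmark_cols arguments are accepted but not yet used to filter or reorder the result. The dataframe returned above carries an "eval_name" and a "Method" column plus one numeric column per layer/intervention/counterfactual key emitted by EvalResult_MIB_CAUSALGRAPH.to_dict(), rounded to two decimal places. A sketch of the same call app.py makes above (column order shown is illustrative):

df = get_leaderboard_df_mib_causalgraph(
    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
    EVAL_REQUESTS_PATH,
    COLS_MIB_CAUSALGRAPH,
    BENCHMARK_COLS_MIB_CAUSALGRAPH,
)
print(df.columns[:3].tolist())  # e.g. ['eval_name', 'Method', 'layer0_output_token_symbol_counterfactual']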