jasonshaoshun committed · Commit e46e945
1 Parent(s): 61542b8

debug
Files changed:
- app.py +3 -3
- src/about.py +38 -11
- src/display/utils.py +72 -24
- src/leaderboard/read_evals.py +97 -0
- src/populate.py +15 -3
app.py CHANGED

@@ -74,7 +74,7 @@ except Exception:
 
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
-
+LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)

@@ -203,8 +203,8 @@ with demo:
     with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
         leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
-
-
+    with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
+        leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
     # with gr.Row():
     #     with gr.Accordion("π Citation", open=False):
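Note: the hunks above only show the dataframe load and the two tab bodies; the enclosing Blocks/Tabs context lives elsewhere in app.py. Below is a self-contained sketch of the same two-tab layout, with a stand-in for the Space's init_leaderboard_* helpers and toy dataframes; none of it is part of the commit.

    # Standalone sketch of the two-tab layout added here (gradio only; show_leaderboard
    # is a stand-in for init_leaderboard_mib_subgraph / init_leaderboard_mib_causalgraph).
    import gradio as gr
    import pandas as pd

    def show_leaderboard(df: pd.DataFrame, title: str):
        # Stand-in: just render the dataframe under a heading.
        gr.Markdown(f"## {title}")
        return gr.Dataframe(value=df)

    subgraph_df = pd.DataFrame({"Method": ["m1"], "ioi": [0.8]})  # toy data
    causalgraph_df = pd.DataFrame({"Method": ["m1"], "layer0_output_token_symbol_counterfactual": [0.5]})  # toy data

    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
                show_leaderboard(subgraph_df, "Subgraph")
            with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
                show_leaderboard(causalgraph_df, "Causal Graph")

    # demo.launch()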
src/about.py CHANGED

@@ -8,13 +8,6 @@ class Task:
     col_name: str
 
 
-@dataclass
-class TaskMIB:
-    benchmark: str        # task name in json (ioi/arithmetic)
-    models: list[str]     # list of models to show as sub-columns
-    col_name: str         # display name in leaderboard
-    metrics: list[str]    # metrics to store (edge_counts, faithfulness)
-
 
 
 # Select your tasks here

@@ -27,10 +20,6 @@ class Tasks(Enum):
     task3 = Task("ewok", "acc", "EWoK")
 
 
-class TasksMib_Subgraph(Enum):
-    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
-    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
-
 
 class TasksMultimodal(Enum):
     task0 = Task("blimp", "acc", "BLiMP")

@@ -41,6 +30,44 @@ class TasksMultimodal(Enum):
     task5 = Task("winoground", "acc", "Winoground")
     task6 = Task("devbench", "acc", "DevBench")
 
+
+
+@dataclass
+class TaskMIB_Subgraph:
+    benchmark: str        # task name in json (ioi/arithmetic)
+    models: list[str]     # list of models to show as sub-columns
+    col_name: str         # display name in leaderboard
+    metrics: list[str]    # metrics to store (edge_counts, faithfulness)
+
+class TasksMib_Subgraph(Enum):
+    task0 = TaskMIB_Subgraph("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
+    task1 = TaskMIB_Subgraph("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
+
+
+@dataclass
+class TaskMIB_Causalgraph:
+    benchmark: str                 # MCQA
+    models: list[str]              # LlamaForCausalLM
+    layers: list[str]              # 0-31
+    col_name: str                  # display name in leaderboard
+    interventions: list[str]       # output_token, output_location
+    counterfactuals: list[str]     # symbol_counterfactual, randomLetter_counterfactual, etc.
+    metrics: list[str]             # score <- Added this field
+
+class TasksMib_Causalgraph(Enum):
+    task0 = TaskMIB_Causalgraph(
+        "MCQA",
+        ["LlamaForCausalLM"],
+        [str(i) for i in range(32)],  # 0-31 layers
+        "mcqa",
+        ["output_token", "output_location"],
+        ["symbol_counterfactual", "randomLetter_counterfactual",
+         "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
+        ["score"]  # Added this
+    )
+
+
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
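As a quick sanity check of what the new enum implies downstream: the MCQA entry of TasksMib_Causalgraph enumerates 32 layers, 2 interventions, and 4 counterfactuals, i.e. 32 × 2 × 4 = 256 score combinations per model. A small illustrative sketch that derives this from the enum (not part of the commit; assumes src.about is importable as in this Space):

    # Count the layer/intervention/counterfactual combinations implied by the enum above.
    from src.about import TasksMib_Causalgraph

    for task in TasksMib_Causalgraph:
        combos = (len(task.value.layers)
                  * len(task.value.interventions)
                  * len(task.value.counterfactuals))
        print(task.value.benchmark, combos)  # MCQA 256  (32 layers * 2 interventions * 4 counterfactuals)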
src/display/utils.py CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph
+from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph, TasksMib_Causalgraph
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

@@ -21,10 +21,41 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
-auto_eval_column_dict_mib_subgraph = []
 auto_eval_column_dict = []
 auto_eval_column_dict_multimodal = []
 
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
+auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
+#Scores
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
+auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
+for task in TasksMultimodal:
+    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    if task.value.col_name in ("ewok", "EWoK"):  # make sure this appears in the right order
+        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)
+
+
 
 

@@ -67,6 +98,45 @@ COLS_MIB_CAUSALGRAPH = []
 BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
+# Initialize the MIB causal graph columns
+auto_eval_column_dict_mib_causalgraph = []
+
+# Method name column
+auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# For each layer-intervention-counterfactual combination
+for task in TasksMib_Causalgraph:
+    for model in task.value.models:
+        for layer in task.value.layers:
+            for intervention in task.value.interventions:
+                for counterfactual in task.value.counterfactuals:
+                    # Create column name like "layer0_output_token_symbol_counterfactual"
+                    col_name = f"layer{layer}_{intervention}_{counterfactual}"
+                    field_name = col_name.lower()
+                    auto_eval_column_dict_mib_causalgraph.append([
+                        field_name,
+                        ColumnContent,
+                        ColumnContent(col_name, "number", True)
+                    ])
+
+# Create the dataclass for MIB causal graph columns
+AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)
+
+# Column selection for display
+COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"layer{layer}_{intervention}_{counterfactual}"
+                                  for task in TasksMib_Causalgraph
+                                  for model in task.value.models
+                                  for layer in task.value.layers
+                                  for intervention in task.value.interventions
+                                  for counterfactual in task.value.counterfactuals]
+
+
 
 

@@ -76,32 +146,10 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
-auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-#Scores
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
-auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
-auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-for task in TasksMultimodal:
-    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-    if task.value.col_name in ("ewok", "EWoK"):  # make sure this appears in the right order
-        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
-auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
-auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
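The nested loops added above expand the same 256-way combination into leaderboard columns, one per layer/intervention/counterfactual, named like layer0_output_token_symbol_counterfactual; since those generated names are already lowercase, field_name = col_name.lower() leaves them unchanged. An illustrative standalone snippet of the names being generated, with the enum values inlined (not part of the commit):

    # Standalone illustration of the column names produced by the loop above
    # (values copied from TasksMib_Causalgraph).
    layers = [str(i) for i in range(32)]
    interventions = ["output_token", "output_location"]
    counterfactuals = ["symbol_counterfactual", "randomLetter_counterfactual",
                       "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"]

    cols = [f"layer{l}_{i}_{c}"
            for l in layers for i in interventions for c in counterfactuals]
    print(len(cols))   # 256
    print(cols[0])     # layer0_output_token_symbol_counterfactual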
src/leaderboard/read_evals.py CHANGED

@@ -182,7 +182,104 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
 
 
 
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method in MIB causalgraph."""
+    eval_name: str        # method name as identifier
+    method_name: str      # name of the interpretation method (e.g., "baseline_patching")
+    results: Dict         # nested dict of results {model_id: {task_scores: [{layer, scores}]}}
+
+    def init_from_json_file(self, json_filepath):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+        results = {}
+
+        # Get results for each model
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")
+            task_scores = model_result.get("task_scores", {})
+
+            # Process MCQA task scores
+            mcqa_scores = {}
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                layer_scores = layer_data.get("layer_scores", [])
+
+                # Store scores for each intervention and counterfactual
+                for intervention_data in layer_scores:
+                    intervention = intervention_data["intervention"][0]  # e.g., "output_token"
+                    counterfactual_scores = intervention_data["counterfactual_scores"]
+
+                    for cf_score in counterfactual_scores:
+                        counterfactual = cf_score["counterfactual"][0]  # e.g., "symbol_counterfactual"
+                        score = cf_score["score"]
+
+                        # Create key for this combination
+                        key = f"layer{layer}_{intervention}_{counterfactual}"
+                        mcqa_scores[key] = score
+
+            results[model_id] = mcqa_scores
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict for dataframe display"""
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+        }
+
+        # For each model, add all layer/intervention/counterfactual combinations
+        for model_id, scores in self.results.items():
+            for score_key, score_value in scores.items():
+                data_dict[score_key] = score_value
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB causalgraph results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+    for root, dirnames, files in os.walk(results_path):
+        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date - keeping original sorting logic
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
 
+    print(f"model_result_filepaths is {model_result_filepaths}")
+
+    eval_results = []
+    for model_result_filepath in model_result_filepaths:
+        try:
+            eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})  # Create empty instance
+            result = eval_result.init_from_json_file(model_result_filepath)
+            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
+            # Verify the result can be converted to dict format
+            result.to_dict()
+            eval_results.append(result)
+        except Exception as e:
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
+
+    return eval_results
 
 
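For reference, this is the result-file shape init_from_json_file appears to expect, inferred from the parsing code above; the concrete values are placeholders, not real results. Note also that init_from_json_file returns a fresh EvalResult_MIB_CAUSALGRAPH rather than mutating self, which is why the loader builds an empty instance first and keeps the returned object.

    # Inferred input layout for init_from_json_file (placeholder values only).
    example_result_file = {
        "method_name": "baseline_patching",
        "results": [
            {
                "model_id": "some-model",          # placeholder
                "task_scores": {
                    "MCQA": [
                        {
                            "layer": 0,
                            "layer_scores": [
                                {
                                    "intervention": ["output_token"],
                                    "counterfactual_scores": [
                                        {"counterfactual": ["symbol_counterfactual"], "score": 0.42},
                                    ],
                                },
                            ],
                        },
                    ],
                },
            },
        ],
    }
    # Parsing this yields results["some-model"]["layer0_output_token_symbol_counterfactual"] == 0.42.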
src/populate.py CHANGED

@@ -66,11 +66,23 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:
     return df
 
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the MIB experiment results"""
+    """Creates a dataframe from all the MIB causal graph experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-
-
+    print(f"raw_data is {raw_data}")
+
+    # Convert each result to dict format
+    all_data_json = [v.to_dict() for v in raw_data]
+    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
+
+    # Convert to dataframe
+    df = pd.DataFrame.from_records(all_data_json)
+
+    # Round numeric columns to 2 decimal places
+    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
+    df[numeric_cols] = df[numeric_cols].round(2)
+
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
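A minimal usage sketch of the new loader, wiring it to the column lists defined in src/display/utils.py; the two path arguments are placeholders for the Space's configured results and requests directories, not the real constants.

    # Usage sketch (paths are placeholders).
    from src.display.utils import COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH
    from src.populate import get_leaderboard_df_mib_causalgraph

    df = get_leaderboard_df_mib_causalgraph(
        "eval-results/mib-causalgraph",   # placeholder results dir
        "eval-queue",                     # placeholder requests dir
        COLS_MIB_CAUSALGRAPH,
        BENCHMARK_COLS_MIB_CAUSALGRAPH,
    )
    print(df.head())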