Commit 53c7136 ("debug") · jasonshaoshun · parent: 202dbe2

Files changed:
- app.py +2 -2
- src/about.py +3 -3
- src/display/utils.py +30 -8
- src/leaderboard/read_evals.py +26 -31
- src/populate.py +51 -18
app.py
CHANGED

@@ -75,7 +75,6 @@
 LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 
 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
-
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
     EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
@@ -84,6 +83,7 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
     BENCHMARK_COLS_MIB_CAUSALGRAPH
 )
 
+
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
 
@@ -210,7 +210,7 @@ with demo:
 
         with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
             leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-
+
         # Then modify the Causal Graph tab section
         with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            with gr.Tabs() as causalgraph_tabs:
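For context, the headline change in app.py is the three-way unpack from get_leaderboard_df_mib_causalgraph. Below is a minimal sketch of that return shape, with toy data and a hypothetical _sketch helper standing in for the real loader in src/populate.py, which reads evaluation files from EVAL_RESULTS_MIB_CAUSALGRAPH_PATH.

import pandas as pd

def get_leaderboard_df_mib_causalgraph_sketch():
    # Toy stand-in for the detailed per-layer/per-counterfactual frame.
    detailed = pd.DataFrame({
        "Method": ["DAS"],
        "layer0_output_token_symbol_counterfactual": [0.71],
        "layer0_output_token_randomLetter_counterfactual": [0.64],
    })
    score_cols = [c for c in detailed.columns if c != "Method"]
    # Aggregated view: collapse counterfactuals into one column per intervention.
    aggregated = detailed[["Method"]].copy()
    aggregated["layer0_output_token"] = detailed[score_cols].mean(axis=1).round(3)
    # Averaged view: with a single intervention in this toy case, it coincides.
    averaged = aggregated.copy()
    return detailed, aggregated, averaged

detailed_df, aggregated_df, averaged_df = get_leaderboard_df_mib_causalgraph_sketch()
print(aggregated_df)  # Method column plus one averaged score column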
src/about.py
CHANGED

@@ -47,7 +47,7 @@ class TasksMib_Subgraph(Enum):
 @dataclass
 class TaskMIB_Causalgraph:
     benchmark: str  # MCQA
-    models: list[str]  #
+    models: list[str]  # List of all models
     layers: list[str]  # 0-31
     col_name: str  # display name in leaderboard
     interventions: list[str]  # output_token, output_location
@@ -57,8 +57,8 @@ class TaskMIB_Causalgraph:
 class TasksMib_Causalgraph(Enum):
     task0 = TaskMIB_Causalgraph(
         "MCQA",
-        ["LlamaForCausalLM"],
-        [str(i) for i in range(32)],
+        ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],  # Updated model list
+        [str(i) for i in range(32)],  # 0-31 layers
         "mcqa",
         ["output_token", "output_location"],
         ["symbol_counterfactual", "randomLetter_counterfactual",
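The enum's positional arguments imply the dataclass field order. A hedged sketch of the full class, assuming a trailing counterfactuals field (src/display/utils.py reads task.value.counterfactuals); the counterfactual list is truncated in the diff, so only the two visible entries appear here.

from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str              # e.g. "MCQA"
    models: list[str]           # architectures evaluated
    layers: list[str]           # "0" through "31"
    col_name: str               # display name in leaderboard
    interventions: list[str]    # e.g. output_token, output_location
    counterfactuals: list[str]  # assumed final field; utils.py reads task.value.counterfactuals

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph(
        "MCQA",
        ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],
        [str(i) for i in range(32)],
        "mcqa",
        ["output_token", "output_location"],
        ["symbol_counterfactual", "randomLetter_counterfactual"],  # truncated in the diff
    )

print(TasksMib_Causalgraph.task0.value.models)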
src/display/utils.py
CHANGED

@@ -102,7 +102,22 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-# Initialize the MIB causal graph columns
+# # Initialize the MIB causal graph columns
+# auto_eval_column_dict_mib_causalgraph = []
+
+# # Method name column
+# auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# # For each model-task-intervention combination
+# for task in TasksMib_Causalgraph:
+#     for model in task.value.models:
+#         for intervention in task.value.interventions:
+#             col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
+#             auto_eval_column_dict_mib_causalgraph.append([
+#                 col_name,
+#                 ColumnContent,
+#                 ColumnContent(col_name, "number", True)
+#             ])
 auto_eval_column_dict_mib_causalgraph = []
 
 # Method name column
@@ -111,13 +126,20 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
 # For each model-task-intervention combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:
-        for intervention in task.value.interventions:
-            col_name = f"{model}_{task.value.benchmark}_{intervention}".lower()
-            auto_eval_column_dict_mib_causalgraph.append([
-                col_name,
-                ColumnContent,
-                ColumnContent(col_name, "number", True)
-            ])
+        for layer in task.value.layers:
+            for intervention in task.value.interventions:
+                for counterfactual in task.value.counterfactuals:
+                    col_name = f"layer{layer}_{intervention}_{counterfactual}"
+                    field_name = col_name.lower()
+                    auto_eval_column_dict_mib_causalgraph.append([
+                        field_name,
+                        ColumnContent,
+                        ColumnContent(col_name, "number", True)
+                    ])
+
+
+
+
 
 # Create the dataclass
 AutoEvalColumn_mib_causalgraph = make_dataclass("AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True)
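To see what the new triple loop produces, here is a self-contained sketch of the make_dataclass pattern. ColumnContent is stubbed (the real helper is defined earlier in src/display/utils.py and is assumed to be a frozen dataclass, since make_dataclass defaults must be hashable), and the loop is shrunk to two layers.

from dataclasses import dataclass, make_dataclass

# Stub of ColumnContent; an assumption, not the repo's exact definition.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])

# Same triple loop as the diff, shrunk to 2 layers instead of 32.
for layer in range(2):
    for intervention in ["output_token", "output_location"]:
        for counterfactual in ["symbol_counterfactual"]:
            col_name = f"layer{layer}_{intervention}_{counterfactual}"
            auto_eval_column_dict.append([col_name.lower(), ColumnContent, ColumnContent(col_name, "number", True)])

AutoEvalColumn_sketch = make_dataclass("AutoEvalColumn_sketch", auto_eval_column_dict, frozen=True)
print(list(AutoEvalColumn_sketch.__dataclass_fields__))
# ['method', 'layer0_output_token_symbol_counterfactual', ...]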
src/leaderboard/read_evals.py
CHANGED

@@ -187,46 +187,41 @@ class EvalResult_MIB_CAUSALGRAPH:
     """Represents one full evaluation for a method in MIB causalgraph."""
     eval_name: str
     method_name: str
-    results: Dict
-
+    results: Dict
+
     def init_from_json_file(self, json_filepath):
         """Inits results from the method result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
-
+
         method_name = data.get("method_name")
         results = {}
 
-        # Get results for each model
-        for model_result in data.get("results", []):
-            model_id = model_result.get("model_id", "")
-            task_scores = model_result.get("task_scores", {})
+        # Get results for each model
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")  # Will be one of the three models
+            task_scores = model_result.get("task_scores", {})
 
-            # Process each layer's scores
-            layer_scores = []
-            for layer_data in task_scores.get("MCQA", []):
-                layer = layer_data.get("layer")
-                layer_scores_data = []
+            # Process MCQA task scores
+            mcqa_scores = {}
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                layer_scores = layer_data.get("layer_scores", [])
 
-                for intervention_data in layer_data.get("layer_scores", []):
-                    # Average across counterfactuals
-                    avg_score = np.mean([cf['score'] for cf in intervention_data['counterfactual_scores']])
-                    if np.isnan(avg_score):
-                        avg_score = 0.0
-                    layer_scores_data.append({
-                        'intervention': intervention_data['intervention'][0],
-                        'score': avg_score
-                    })
-
-                layer_scores.append({
-                    'layer': layer,
-                    'scores': layer_scores_data
-                })
-
-            results[model_id] = layer_scores
-
-
-
+                # Store scores for each intervention and counterfactual
+                for intervention_data in layer_scores:
+                    intervention = intervention_data["intervention"][0]
+                    counterfactual_scores = intervention_data["counterfactual_scores"]
+
+                    for cf_score in counterfactual_scores:
+                        counterfactual = cf_score["counterfactual"][0]
+                        score = cf_score["score"]
+
+                        # Create key for this combination
+                        key = f"layer{layer}_{intervention}_{counterfactual}"
+                        mcqa_scores[key] = score
+
+            results[model_id] = mcqa_scores
 
         return EvalResult_MIB_CAUSALGRAPH(
             eval_name=method_name,
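A hedged sketch of the result-file shape this parser now expects, inferred from the keys it reads; the payload values and method name are invented for illustration, and the flattening below mirrors the new init_from_json_file body.

# Invented example payload matching the keys the parser reads.
data = {
    "method_name": "DAS",
    "results": [
        {
            "model_id": "LlamaForCausalLM",
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.73},
                                    {"counterfactual": ["randomLetter_counterfactual"], "score": 0.68},
                                ],
                            }
                        ],
                    }
                ]
            },
        }
    ],
}

# Same flattening as the new parsing code: one scalar per
# layer/intervention/counterfactual key, grouped by model.
results = {}
for model_result in data.get("results", []):
    model_id = model_result.get("model_id", "")
    mcqa_scores = {}
    for layer_data in model_result.get("task_scores", {}).get("MCQA", []):
        layer = layer_data.get("layer")
        for intervention_data in layer_data.get("layer_scores", []):
            intervention = intervention_data["intervention"][0]
            for cf_score in intervention_data["counterfactual_scores"]:
                key = f"layer{layer}_{intervention}_{cf_score['counterfactual'][0]}"
                mcqa_scores[key] = cf_score["score"]
    results[model_id] = mcqa_scores

print(results["LlamaForCausalLM"])
# {'layer0_output_token_symbol_counterfactual': 0.73,
#  'layer0_output_token_randomLetter_counterfactual': 0.68}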
src/populate.py
CHANGED

@@ -133,37 +133,70 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
 
     # return averaged_df
 
+# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
+#     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
+#     df_copy = df.copy()
+
+#     # Store Method column if it exists
+#     method_col = None
+#     if 'Method' in df_copy.columns:
+#         method_col = df_copy['Method']
+#         df_copy = df_copy.drop('Method', axis=1)
+
+#     # Remove eval_name if present
+#     if 'eval_name' in df_copy.columns:
+#         df_copy = df_copy.drop('eval_name', axis=1)
+
+#     # Group columns by model_task
+#     model_task_groups = {}
+#     for col in df_copy.columns:
+#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
+#         if model_task not in model_task_groups:
+#             model_task_groups[model_task] = []
+#         model_task_groups[model_task].append(col)
+
+#     # Create new DataFrame with averaged intervention scores
+#     averaged_df = pd.DataFrame({
+#         model_task: df_copy[cols].mean(axis=1).round(3)
+#         for model_task, cols in model_task_groups.items()
+#     })
+
+#     # Add Method column back
+#     if method_col is not None:
+#         averaged_df.insert(0, 'Method', method_col)
+
+#     return averaged_df
+
 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
     df_copy = df.copy()
 
-    # Store Method column
+    # Store Method column
     method_col = None
     if 'Method' in df_copy.columns:
         method_col = df_copy['Method']
         df_copy = df_copy.drop('Method', axis=1)
 
-    # Remove eval_name if present
     if 'eval_name' in df_copy.columns:
         df_copy = df_copy.drop('eval_name', axis=1)
 
-    # Group columns by model_task
-    model_task_groups = {}
-    for col in df_copy.columns:
-        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
-
-    # Create new DataFrame with averaged intervention scores
-    averaged_df = pd.DataFrame({
-        model_task: df_copy[cols].mean(axis=1).round(3)
-        for model_task, cols in model_task_groups.items()
-    })
-
-    # Add Method column back
+    # Group columns by model and task
+    result_cols = {}
+    for task in TasksMib_Causalgraph:
+        for model in task.value.models:  # Will iterate over all three models
+            model = model.lower()
+            for intervention in task.value.interventions:
+                col_name = f"{model}_{task.value.benchmark.lower()}_{intervention}"
+                matching_cols = [c for c in df_copy.columns if c.startswith(col_name)]
+                if matching_cols:
+                    result_cols[col_name] = matching_cols
+
+    averaged_df = pd.DataFrame()
     if method_col is not None:
-        averaged_df.insert(0, 'Method', method_col)
+        averaged_df['Method'] = method_col
+
+    for col_name, cols in result_cols.items():
+        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(3)
 
     return averaged_df
 
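A usage sketch of the new grouping logic on a toy frame; the column names and scores are invented, and TasksMib_Causalgraph is replaced by a single hard-coded prefix to keep the example self-contained.

import pandas as pd

# Toy detailed frame: two counterfactual columns for one model/task/intervention.
df = pd.DataFrame({
    "Method": ["DAS", "EAP"],
    "llamaforcausallm_mcqa_output_token_symbol_counterfactual": [0.70, 0.55],
    "llamaforcausallm_mcqa_output_token_randomLetter_counterfactual": [0.60, 0.45],
})

# One averaged column per model_task_intervention prefix, as in the diff.
prefix = "llamaforcausallm_mcqa_output_token"
cols = [c for c in df.columns if c.startswith(prefix)]
averaged_df = pd.DataFrame({"Method": df["Method"]})
averaged_df[prefix] = df[cols].mean(axis=1).round(3)
print(averaged_df)
#   Method  llamaforcausallm_mcqa_output_token
# 0    DAS                                0.65
# 1    EAP                                0.50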