Commit 2ba536b by jasonshaoshun (parent: a5eab2c)

fix: Align task & model names in CausalGraph

Files changed:
- app.py (+38 -5)
- src/about.py (+6 -4)
- src/leaderboard/read_evals.py (+250 -301)
app.py
CHANGED
@@ -399,18 +399,48 @@ def init_leaderboard_mib_subgraph(dataframe, track):



+# @dataclass
+# class TaskMIB_Causalgraph:
+#     benchmark: str # task name in json (ioi/arithmetic)
+#     models: list[str] # list of models to show as sub-columns
+#     col_name: str # display name in leaderboard
+#     metrics: list[str] # metrics to store (average_score)
+
+# class TasksMib_Causalgraph(Enum):
+#     task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "ioi_task", ["average_score"])
+#     task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
+#     task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+#     task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+
+#     @classmethod
+#     def get_all_tasks(cls):
+#         """Returns a list of all task benchmarks"""
+#         return [task.value.benchmark for task in cls]
+
+#     @classmethod
+#     def get_all_models(cls):
+#         """Returns a list of all unique models across all tasks"""
+#         models = set()
+#         for task in cls:
+#             models.update(task.value.models)
+#         return sorted(list(models))
+
+# ioi_task
+# 4_answer_MCQA
+

def init_leaderboard_mib_causalgraph(dataframe, track):
    model_name_mapping = {
        "Qwen2ForCausalLM": "Qwen-2.5",
        "GPT2ForCausalLM": "GPT-2",
+       "GPT2LMHeadModel": "GPT-2",
        "Gemma2ForCausalLM": "Gemma-2",
        "LlamaForCausalLM": "Llama-3.1"
    }

    benchmark_mapping = {
-       "
-       "
+       "ioi_task": "IOI",
+       "4_answer_MCQA": "MCQA",
        "arithmetic_addition": "Arithmetic (+)",
        "arithmetic_subtraction": "Arithmetic (-)",
        "arc_easy": "ARC (Easy)",
@@ -420,13 +450,16 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
    display_mapping = {}
    for task in TasksMib_Causalgraph:
        for model in task.value.models:
-
+           # print(f"Task: {task.value.benchmark}, Model: {model}")
+           field_name = f"{model}_{task.value.col_name}"
            display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
            display_mapping[field_name] = display_name

+   # print("\nDebugging display_mapping:", display_mapping)
+
    renamed_df = dataframe.rename(columns=display_mapping)
-
-   print(renamed_df)
+
+   # print("\nDebugging DataFrame columns:", renamed_df.columns.tolist())

    # Create only necessary columns
    return Leaderboard(
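Note: the renaming logic above builds one "{model}_{col_name}" key per (model, task) pair and maps it to a human-readable column label. A minimal standalone sketch of that mechanism follows; the DataFrame contents and the "example_method" row are invented for illustration and are not leaderboard data.

# Sketch of the column-renaming step in init_leaderboard_mib_causalgraph (illustrative only).
import pandas as pd

model_name_mapping = {"Qwen2ForCausalLM": "Qwen-2.5", "GPT2LMHeadModel": "GPT-2"}
benchmark_mapping = {"ioi_task": "IOI", "4_answer_MCQA": "MCQA"}

# Raw leaderboard columns follow the "{model}_{col_name}" convention used in the diff above.
df = pd.DataFrame({
    "Method": ["example_method"],
    "GPT2LMHeadModel_ioi_task": [0.91],
    "Qwen2ForCausalLM_4_answer_MCQA": [0.78],
})

display_mapping = {}
for col_name, bench_label in benchmark_mapping.items():
    for model, model_label in model_name_mapping.items():
        display_mapping[f"{model}_{col_name}"] = f"{bench_label} - {model_label}"

renamed_df = df.rename(columns=display_mapping)
print(renamed_df.columns.tolist())  # ['Method', 'IOI - GPT-2', 'MCQA - Qwen-2.5']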
src/about.py
CHANGED
@@ -79,11 +79,13 @@ class TaskMIB_Causalgraph:
    col_name: str # display name in leaderboard
    metrics: list[str] # metrics to store (average_score)

+
+
class TasksMib_Causalgraph(Enum):
-    task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "
-    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "
-    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
-    task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
+    task0 = TaskMIB_Subgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"])
+    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"])
+    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+    task3 = TaskMIB_Subgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])

    @classmethod
    def get_all_tasks(cls):
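Note: these enum entries are exactly what init_leaderboard_mib_causalgraph iterates over, so every model string and col_name must match the raw result columns. A small self-contained sketch of how the expected column names fall out of the enum; TaskStub and the two task entries are simplified stand-ins, not the app's actual classes.

# Sketch of how the enum's models/col_name fields determine the expected columns (illustrative only).
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskStub:
    benchmark: str
    models: list
    col_name: str

class Tasks(Enum):
    task0 = TaskStub("ioi", ["GPT2LMHeadModel", "Qwen2ForCausalLM"], "ioi_task")
    task1 = TaskStub("mcqa", ["Qwen2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA")

# Each (model, task) pair yields one "{model}_{col_name}" column; these strings
# have to line up with the results files, which is what this commit aligns.
expected = [f"{m}_{t.value.col_name}" for t in Tasks for m in t.value.models]
print(expected)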
src/leaderboard/read_evals.py
CHANGED
@@ -2,19 +2,22 @@ import glob
import json
import math
import os
+import re
+import ast
from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
+from collections import defaultdict

import dateutil
import numpy as np
+import pandas as pd

from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, AutoEvalColumn_mib_causalgraph
from src.submission.check_validity import is_model_on_hub
-from src.about import TasksMib_Subgraph
+from src.about import TasksMib_Subgraph, TasksMib_Causalgraph

-from typing import List, Dict, Any
-from collections import defaultdict
-import pandas as pd



@@ -205,226 +208,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_



-# def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
-#     """
-#     Process a single JSON file and convert it to a DataFrame.
-
-#     Args:
-#         json_file: Dictionary containing the analysis results
-#         method_counter: Counter for handling duplicate method names
-
-#     Returns:
-#         pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
-#     """
-#     method_name = json_file['method_name']
-#     unique_method_name = f"{method_name}_{method_counter}"
-#     method_scores = []
-
-#     for result in json_file['results']:
-#         model = result['model_id']
-
-#         for task, scores in result['task_scores'].items():
-#             # Process each layer's data
-#             intervention_scores = defaultdict(list)
-
-#             for layer_data in scores:
-#                 for intervention_data in layer_data['layer_scores']:
-#                     # Calculate average score for counterfactuals
-#                     avg_cf_score = np.mean([
-#                         cf['score']
-#                         for cf in intervention_data['counterfactual_scores']
-#                     ])
-
-#                     if np.isnan(avg_cf_score):
-#                         avg_cf_score = 0.0
-
-#                     # Group scores by intervention
-#                     intervention_key = '_'.join(intervention_data['intervention'])
-#                     intervention_scores[intervention_key].append(avg_cf_score)
-
-#             # Average across layers for each intervention
-#             for intervention, layer_scores in intervention_scores.items():
-#                 column = f"{model}_{task}_{intervention}"
-#                 avg_score = np.mean(layer_scores) if layer_scores else 0.0
-#                 method_scores.append((column, f"{avg_score:.3f}"))
-
-#     # Sort by column names for consistency
-#     method_scores.sort(key=lambda x: x[0])
-#     data = {
-#         unique_method_name: {
-#             col: score for col, score in method_scores
-#         }
-#     }
-
-#     return pd.DataFrame.from_dict(data, orient='index')
-
-# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
-#     model_result_filepaths = []
-
-#     # print(f"Scanning directory: {results_path}")
-#     for root, dirnames, files in os.walk(results_path):
-#         # print(f"Current directory: {root}")
-#         # print(f"Found files: {files}")
-#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-#             continue
-
-#         try:
-#             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-#         except dateutil.parser._parser.ParserError:
-#             files = [files[-1]]
-
-#         for file in files:
-#             model_result_filepaths.append(os.path.join(root, file))
-
-#     # print(f"Found json files: {model_result_filepaths}")
-
-#     method_counters = defaultdict(int)
-#     dataframes = []
-
-#     for json_file in model_result_filepaths:
-#         try:
-#             with open(filepath, 'r') as f:
-#                 json_data = json.load(f)
-#             method_name = json_data['method_name']
-#             method_counters[method_name] += 1
-
-#             # Process single JSON file
-#             df = process_single_json(json_data, method_counters[method_name])
-#             dataframes.append(df)
-#         except Exception as e:
-#             print(f"Error processing {json_file}: {e}")
-#             continue
-
-#     return dataframes
-
-
-
-
-from dataclasses import dataclass
-import json
-import numpy as np
-import pandas as pd
-from typing import Dict, List, Any
-import os
-from datetime import datetime
-import dateutil
-from collections import defaultdict
-
-@dataclass
-class EvalResult_MIB_CAUSALGRAPH:
-    """Represents one full evaluation for a method across all models in MIB for causal graph track."""
-    method_name: str  # name of the interpretation method
-    results: Dict  # nested dict of results for each model and task
-
-    def init_from_json_file(self, json_filepath: str):
-        """Inits results from the method result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        method_name = data.get("method_name")
-
-        # Initialize results dictionary
-        results = {}
-        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
-            results[task] = {}
-
-        print(f"Processing file: {json_filepath}")
-        # Process each model's results
-        for result in data.get("results", []):
-            model_id = result.get("model_id", "")
-            model_name = model_id.replace(".", "_")
-
-            for task, scores in result.get("task_scores", {}).items():
-                intervention_scores = defaultdict(list)
-
-                for layer_data in scores:
-                    for intervention_data in layer_data['layer_scores']:
-                        # Calculate average score for counterfactuals
-                        avg_cf_score = np.mean([
-                            cf['score'] if 'score' in cf else 0
-                            for cf in intervention_data['counterfactual_scores']
-                        ])
-
-                        if np.isnan(avg_cf_score):
-                            avg_cf_score = 0.0
-
-                        intervention_key = '_'.join(intervention_data['intervention'])
-                        intervention_scores[intervention_key].append(avg_cf_score)
-                        print(f"intervention_key is {intervention_key}, avg_cf_score is {avg_cf_score}")
-
-                # Average across layers for each intervention
-                results[task][model_name] = {
-                    interv: np.mean(scores) if scores else 0.0
-                    for interv, scores in intervention_scores.items()
-                }
-
-        return EvalResult_MIB_CAUSALGRAPH(
-            method_name=method_name,
-            results=results
-        )
-
-    def to_dict(self, metric_type="average"):
-        """Converts the Eval Result to a dict for dataframe display"""
-        data_dict = {
-            "Method": self.method_name,
-            "Average": "-"  # Initialize first to make the order consistent
-        }
-
-        # Initialize columns for all task-model combinations
-        all_scores = []
-        for task, task_results in self.results.items():
-            for model, intervention_scores in task_results.items():
-                if not intervention_scores:
-                    continue
-
-                col_name = f"{task}_{model}"
-                scores = list(intervention_scores.values())
-                if not scores:
-                    data_dict[col_name] = '-'
-                    continue
-
-                avg_score = np.mean(scores)
-                data_dict[col_name] = f"{avg_score:.3f}"
-                all_scores.append(avg_score)
-
-        data_dict["Average"] = f"{np.mean(all_scores):.3f}"
-        return data_dict
-
-

-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """
-#     Aggregates rows with the same base method name by taking the max value for each column.
-#     Works with Method as a regular column instead of index.
-#     """
-#     df_copy = df.copy()
-#     print("\nBase methods extraction:")
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                    else name for name in df_copy['Method']]
-#     print(f"Original methods: {df_copy['Method'].tolist()}")
-#     print(f"Base methods: {base_methods}")


-#     df_copy['base_method'] = base_methods
-
-#     # Convert scores to numeric values
-#     score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
-#     for col in score_columns:
-#         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) else x)
-
-#     # Group by base method name and take the max
-#     aggregated_df = df_copy.groupby('base_method')[score_columns].max().round(3)
-
-#     # Reset index to make base_method a regular column and rename it to Method
-#     aggregated_df = aggregated_df.reset_index()
-#     aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})
-
-#     # Convert back to string format
-#     for col in score_columns:
-#         aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}")
-
-#     return aggregated_df
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates rows with the same base method name by taking the max value for each column.
@@ -446,21 +233,21 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    # Convert scores to numeric values
    score_columns = [col for col in df_copy.columns if col not in ['Method', 'base_method']]
    for col in score_columns:
-        df_copy[col] = df_copy[col]
+        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')

    # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(3)

-    #
+    # Reset index to make base_method a regular column and rename it to Method
    aggregated_df = aggregated_df.reset_index()
    aggregated_df = aggregated_df.rename(columns={'base_method': 'Method'})

-    # Convert numeric values back to strings with 3 decimal places
-    for col in score_columns:
-        aggregated_df[col] = aggregated_df[col].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else x)
-
    return aggregated_df

+
+
+
+
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates a DataFrame where columns are model_task and cells are averaged over interventions.
@@ -469,99 +256,261 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the DataFrame
    df_copy = df.copy()

-    #
-
-    df_copy = df_copy.drop('Average', axis=1)
-
-    # Get score columns (excluding Method)
-    score_columns = [col for col in df_copy.columns if col != 'Method']
+    # Get all columns except Method and Average
+    columns_to_process = [col for col in df_copy.columns if col not in ['Method', 'Average']]

-    #
-        if model_task not in model_task_groups:
-            model_task_groups[model_task] = []
-        model_task_groups[model_task].append(col)
+    # Extract model and task information from column names
+    model_task_groups = defaultdict(list)
+    for col in columns_to_process:
+        # Split by underscore and extract model, task
+        parts = col.split('_')
+        if len(parts) >= 2:
+            model_task = f"{parts[0]}_{parts[1]}"
+            model_task_groups[model_task].append(col)

-    # Create new DataFrame with Method
+    # Create new DataFrame with Method and averaged columns
    averaged_data = []
    for _, row in df_copy.iterrows():
-
+        new_row = {'Method': row['Method']}
+
+        # Calculate average for each model_task group
        for model_task, cols in model_task_groups.items():
-
-
-
+            values = [row[col] for col in cols if pd.notna(row[col])]
+            if values:
+                new_row[model_task] = round(np.mean(values), 3)
+            else:
+                new_row[model_task] = np.nan
+
+        # Calculate overall average
+        model_task_values = [v for k, v in new_row.items() if k != 'Method' and pd.notna(v)]
+        if model_task_values:
+            new_row['Average'] = round(np.mean(model_task_values), 3)
+        else:
+            new_row['Average'] = np.nan
+
+        averaged_data.append(new_row)

+    # Create DataFrame and sort by Average
    averaged_df = pd.DataFrame(averaged_data)
-
-
-    averaged_df = averaged_df.sort_values('Average', ascending=False)
+    if 'Average' in averaged_df.columns:
+        averaged_df = averaged_df.sort_values('Average', ascending=False)

    return averaged_df


-    #
-    if
-        data_dict["Method"] = f"{method_name}_{method_counters[method_name]}"

-    # print(f"detailed_df coluns are {detailed_df.columns.tolist()}")
-    # if "eval_name" in detailed_df.columns:
-    #     detailed_df.drop("eval_name", axis=1, inplace=True)

-    print("Before aggregation:")
-    print(detailed_df)

-    # Create intervention-averaged DataFrame
-    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)

-    return detailed_df, aggregated_df, intervention_averaged_df

+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models for causal variable localization."""
+    eval_name: str  # method name as identifier
+    method_name: str  # name of the interpretation method
+    model_name: str  # name of the model
+    task_name: str  # name of the task
+    target_variables: str  # target variables (e.g., "answer", "answer_pointer")
+    average_accuracy: float  # average accuracy score
+    highest_accuracy: float  # highest accuracy score
+
+    @staticmethod
+    def init_from_consolidated_json(json_data: Dict):
+        """
+        Initialize results from the consolidated JSON format, treating each entry as a separate result
+
+        Args:
+            json_data: The parsed JSON data with tuple keys
+
+        Returns:
+            List of EvalResult_MIB_CAUSALGRAPH objects
+        """
+        results = []
+
+        for key, entry in json_data.items():
+            try:
+                # Parse tuple key: "('method', 'model', 'task', 'variable')"
+                try:
+                    key_tuple = ast.literal_eval(key)
+                    method_name, model_name, task_name, target_variable = key_tuple
+                except:
+                    # Alternative parsing with regex
+                    pattern = r"\('([^']+)', '([^']+)', '([^']+)', '([^']+)'\)"
+                    match = re.match(pattern, key)
+                    if match:
+                        method_name, model_name, task_name, target_variable = match.groups()
+                    else:
+                        print(f"Couldn't parse key: {key}")
+                        continue
+
+                # Get average and highest accuracy
+                average_accuracy = entry.get("average_accuracy", 0.0)
+                highest_accuracy = entry.get("highest_accuracy", 0.0)
+
+                # Create a result object for this entry
+                result = EvalResult_MIB_CAUSALGRAPH(
+                    eval_name=f"{method_name}_{model_name}_{task_name}_{target_variable}",
+                    method_name=method_name,
+                    model_name=model_name,
+                    task_name=task_name,
+                    target_variables=target_variable,
+                    average_accuracy=average_accuracy,
+                    highest_accuracy=highest_accuracy
+                )
+
+                results.append(result)
+
+            except Exception as e:
+                print(f"Error processing entry {key}: {e}")
+                continue
+
+        return results
+
+    def to_dict(self, metric_type="Highest"):
+        """
+        Converts the Eval Result to a dict for dataframe display
+
+        Args:
+            metric_type: Either "Mean" to use average_accuracy or "Highest" to use highest_accuracy
+        """
+        # Create column name in the exact format requested
+        # col_name = f"{self.model_name}_{self.task_name}_{self.target_variables}"
+        col_name = f"{self.model_name}_{self.task_name}"
+        print(f"col_name is {col_name}")
+
+        # Select the appropriate accuracy metric based on metric_type
+        score = self.average_accuracy if metric_type == "Mean" else self.highest_accuracy
+
+        # Create data dictionary with method name and the score
+        data_dict = {
+            "eval_name": self.eval_name,
+            "Method": self.method_name,
+            col_name: score
+        }
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Processes the consolidated JSON format for causal variable localization results
+    Treats each entry as a separate result and then combines them by method
+
+    Args:
+        results_path: Path to the directory containing results
+
+    Returns:
+        Tuple of four DataFrames:
+        - detailed_df_highest: Detailed view with highest accuracy scores
+        - detailed_df_mean: Detailed view with mean accuracy scores
+        - intervention_averaged_highest_df: Averaged by intervention using highest accuracy
+        - intervention_averaged_mean_df: Averaged by intervention using mean accuracy
+    """
+    # Find the consolidated JSON file
+    json_files = []
+    for root, _, files in os.walk(results_path):
+        for file in files:
+            if file.endswith('.json'):
+                json_files.append(os.path.join(root, file))
+
+    if not json_files:
+        print(f"No JSON files found in {results_path}")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Load and process the consolidated JSON format
+    raw_data = None
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r') as f:
+                data = json.load(f)
+
+            # Check if this is the consolidated format by examining a sample key
+            sample_key = next(iter(data), None)
+            if sample_key and isinstance(sample_key, str) and '(' in sample_key and ')' in sample_key:
+                raw_data = data
+                print(f"Found consolidated data file: {json_file}")
+                break
+        except Exception as e:
+            print(f"Error reading {json_file}: {e}")
+
+    if raw_data is None:
+        print("No valid consolidated JSON file found")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Get all results
+    eval_results = EvalResult_MIB_CAUSALGRAPH.init_from_consolidated_json(raw_data)
+
+    if not eval_results:
+        print("No results could be extracted from the JSON data")
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Create two sets of dictionaries - one for highest accuracy and one for mean accuracy
+    highest_results = [result.to_dict(metric_type="Highest") for result in eval_results]
+    mean_results = [result.to_dict(metric_type="Mean") for result in eval_results]
+
+    # Process highest accuracy results
+    # Group results by method
+    highest_method_groups = {}
+    for result_dict in highest_results:
+        method = result_dict["Method"]
+        if method not in highest_method_groups:
+            highest_method_groups[method] = {
+                "eval_name": method,
+                "Method": method
+            }
+
+        # Copy all score columns to the method's group
+        for key, value in result_dict.items():
+            if key not in ["eval_name", "Method"]:
+                highest_method_groups[method][key] = value
+
+    # Create the detailed DataFrame for highest accuracy
+    highest_records = list(highest_method_groups.values())
+    detailed_df_highest = pd.DataFrame(highest_records)
+
+    # Process mean accuracy results
+    # Group results by method
+    mean_method_groups = {}
+    for result_dict in mean_results:
+        method = result_dict["Method"]
+        if method not in mean_method_groups:
+            mean_method_groups[method] = {
+                "eval_name": method,
+                "Method": method
+            }
+
+        # Copy all score columns to the method's group
+        for key, value in result_dict.items():
+            if key not in ["eval_name", "Method"]:
+                mean_method_groups[method][key] = value
+
+    # Create the detailed DataFrame for mean accuracy
+    mean_records = list(mean_method_groups.values())
+    detailed_df_mean = pd.DataFrame(mean_records)
+
+    if detailed_df_highest.empty or detailed_df_mean.empty:
+        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+    # Calculate and add Average column for both DataFrames
+    score_columns_highest = [col for col in detailed_df_highest.columns if col not in ["eval_name", "Method"]]
+    score_columns_mean = [col for col in detailed_df_mean.columns if col not in ["eval_name", "Method"]]
+
+    if score_columns_highest:
+        detailed_df_highest["Average"] = detailed_df_highest[score_columns_highest].mean(axis=1).round(3)
+
+    if score_columns_mean:
+        detailed_df_mean["Average"] = detailed_df_mean[score_columns_mean].mean(axis=1).round(3)
+
+    # Sort by Average descending
+    if "Average" in detailed_df_highest.columns:
+        detailed_df_highest = detailed_df_highest.sort_values("Average", ascending=False)
+
+    if "Average" in detailed_df_mean.columns:
+        detailed_df_mean = detailed_df_mean.sort_values("Average", ascending=False)
+
+    # # Create intervention-averaged DataFrames for both metrics
+    # intervention_averaged_highest_df = create_intervention_averaged_df(detailed_df_highest)
+    # intervention_averaged_mean_df = create_intervention_averaged_df(detailed_df_mean)
+
+    # return detailed_df_highest, detailed_df_mean, intervention_averaged_highest_df
+    return detailed_df_highest, detailed_df_mean, detailed_df_mean



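Note: init_from_consolidated_json above expects the consolidated results file to use stringified tuple keys of the form "('method', 'model', 'task', 'variable')". A short sketch of that parsing step in isolation follows; the sample entry, its scores, and the "ExampleMethod" name are invented for illustration.

# Sketch of the tuple-string key parsing used by init_from_consolidated_json (illustrative only).
import ast

sample = {
    "('ExampleMethod', 'Qwen2ForCausalLM', 'ioi_task', 'answer')": {
        "average_accuracy": 0.71,
        "highest_accuracy": 0.84,
    }
}

for key, entry in sample.items():
    # ast.literal_eval turns the stringified tuple back into a real tuple of four strings.
    method, model, task, variable = ast.literal_eval(key)
    col_name = f"{model}_{task}"  # same column convention as to_dict() in the diff
    print(method, col_name, entry["highest_accuracy"])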