from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
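
# Illustrative sketch (not used elsewhere in this file): how the Tasks entries
# above can map a parsed results dict onto leaderboard display columns. The
# {benchmark: {metric: value}} shape of `results` here is an assumption for
# demonstration, not the actual results-file schema.
def _example_text_track_row(results: dict) -> dict:
    """Map a {benchmark: {metric: value}} dict onto {display column: value}."""
    return {
        task.value.col_name: results.get(task.value.benchmark, {}).get(task.value.metric)
        for task in Tasks
    }
# e.g. _example_text_track_row({"blimp": {"acc": 0.81}})
#      -> {"BLiMP": 0.81, "BLiMP Supplement": None, "(Super)GLUE": None, "EWoK": None}
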
@dataclass
class TaskMIB_Subgraph:
    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (edge_counts, faithfulness)

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)
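
# For reference, with the enum members defined above these helpers evaluate to:
#   TasksMib_Subgraph.get_all_tasks()
#       -> ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
#   TasksMib_Subgraph.get_all_models()
#       -> ["gemma2", "gpt2", "llama3", "qwen2_5"]
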
# @dataclass
# class TaskMIB_Causalgraph:
#     benchmark: str
#     models: list[str]
#     layers: dict[str, list[str]]  # Different layers for each model
#     col_name: str
#     interventions: list[str]
#     counterfactuals: list[str]
#     metrics: list[str]

# class TasksMib_Causalgraph(Enum):
#     task0 = TaskMIB_Causalgraph("MCQA",
#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],
#         {
#             "qwen2forcausallm": [str(i) for i in range(24)],   # 0-23
#             "gemma2forcausallm": [str(i) for i in range(26)],  # 0-25
#             "llamaforcausallm": [str(i) for i in range(32)]    # 0-31
#         },
#         "mcqa",
#         ["output_token", "output_location"],
#         ["randomLetter_counterfactual", "answerPosition_counterfactual",
#          "answerPosition_randomLetter_counterfactual"],
#         ["score"]
#     )

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (average_score)

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
    task2 = TaskMIB_Causalgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
    task3 = TaskMIB_Causalgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark Leaderboards</h1>"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The leaderboards for each track of the 2024 Mechanistic Interpretability Benchmark.
"""
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
"""
EVALUATION_QUEUE_TEXT = """
## Circuit localization track:
You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per task/model combination with binary membership scores for each edge or node.
If (ii), then for each critical threshold k, the circuit should contain no more than k% of edges. See [here]() for examples of each valid circuit format.
Create a folder in a HuggingFace repository to hold your circuits. At the URL you provide, there should be one folder per task/model combination; these folders
should contain your circuit(s). As long as the folders contain the model and task names, you do not need to worry about the circuit filenames.
If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.
For specifications about the file format for a circuit, see the README on our project GitHub: TODO
Once your submission has been validated and makes it to the front of the evaluation queue, we'll evaluate your submission on the private test set.
## Causal variable localization track:
You'll need to provide a link to a HuggingFace repository containing your trained featurizer, the layer on which the featurizer was trained, and the code needed to load and run your featurizer.
See TODO for an example.
"""
CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
CITATION_BUTTON_TEXT = r"""
@article{mib-2025,
    title = {{MIB}: A Mechanistic Interpretability Benchmark},
    author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
    year = {2025},
    note = {To appear},
    journal = {arXiv preprint}
}
"""