from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard 
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
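    # Illustrative assumption (not guaranteed by this file): downstream leaderboard code
    # presumably reads each score as results[task.value.benchmark][task.value.metric]
    # from a submission's JSON and displays it under task.value.col_name.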

@dataclass
class TaskMIB_Subgraph:
    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (edge_counts, faithfulness)

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]
    
    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)
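    # Example usage (values derived from the definitions above):
    #   TasksMib_Subgraph.get_all_tasks()
    #     -> ['ioi', 'mcqa', 'arithmetic_addition', 'arithmetic_subtraction',
    #         'arc_easy', 'arc_challenge']
    #   TasksMib_Subgraph.get_all_models()
    #     -> ['gemma2', 'gpt2', 'llama3', 'qwen2_5']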




# @dataclass 
# class TaskMIB_Causalgraph:
#     benchmark: str      
#     models: list[str]   
#     layers: dict[str, list[str]]  # Different layers for each model
#     col_name: str      
#     interventions: list[str]  
#     counterfactuals: list[str]  
#     metrics: list[str]  


# class TasksMib_Causalgraph(Enum):
#     task0 = TaskMIB_Causalgraph("MCQA", 
#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],
#         {
#             "qwen2forcausallm": [str(i) for i in range(24)],    # 0-23
#             "gemma2forcausallm": [str(i) for i in range(26)],   # 0-25
#             "llamaforcausallm": [str(i) for i in range(32)]     # 0-31
#         },
#         "mcqa",
#         ["output_token", "output_location"],
#         ["randomLetter_counterfactual", "answerPosition_counterfactual", 
#          "answerPosition_randomLetter_counterfactual"],
#         ["score"]
#     )


@dataclass
class TaskMIB_Causalgraph:
    benchmark: str      # task name in json (e.g., ioi, mcqa, ravel, arc_easy)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (average_score)
    target_variables: list[str]  # causal variables evaluated for this task



class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph("ioi", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ioi_task", ["average_score"], ["output_token", "output_position"])
    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "4_answer_MCQA", ["average_score"], ["answer_pointer", "answer"])
    task2 = TaskMIB_Causalgraph("ravel", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "RAVEL", ["average_score"], ["Continent", "Language", "Country"])
    task3 = TaskMIB_Causalgraph("arc_easy", ["Qwen2ForCausalLM", "GPT2ForCausalLM", "GPT2LMHeadModel", "Gemma2ForCausalLM", "LlamaForCausalLM"], "ARC_easy", ["average_score"], ["answer_pointer", "answer"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]
    
    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)
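    # Example usage (values derived from the definitions above):
    #   TasksMib_Causalgraph.get_all_tasks()
    #     -> ['ioi', 'mcqa', 'ravel', 'arc_easy']
    #   TasksMib_Causalgraph.task1.value.target_variables
    #     -> ['answer_pointer', 'answer']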



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark Leaderboards</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The leaderboards for each track of the Mechanistic Interpretability Benchmark.
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
"""

EVALUATION_QUEUE_TEXT_SUBGRAPH = """
## Circuit localization track

### 1. Collect your circuits
You'll need either (i) 1 circuit per task/model combination with floating-point importance scores for each edge or node, or (ii) 9 circuits per task/model combination with binary membership scores for each edge or node.
For specifications about the file formats we accept, see the README on [our project GitHub](https://github.com/hannamw/MIB-subgraph-track).

### 2. Upload your circuits
Create a HuggingFace repository, and create a folder in that repository that will hold all of your circuit folders.
At the URL you provide, there should be one folder per task/model combination; these folders
should contain your circuit(s). As long as the folder names contain the model and task names, you do not need to worry about the circuit filenames.
If you provide more circuits than needed, our evaluation script will take the first 9, in lexicographic order, from a given folder. We provide examples of valid
submissions: see [here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json) for a submission using importance scores and
[here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/multiple_circuits/pt) for a submission uploading multiple circuits.

### 3. Manage your submission in the queue
If your submission passes all checks, it will be added to the queue, and you will receive a submission ID; be sure to save it!
The ID allows you to remove your submission from the queue (e.g., if you find a bug in your circuits), so you won't need to wait
until next week to resubmit.

Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
It will keep the PENDING status until it has been run on the private test set.
"""

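# Illustrative sketch (not authoritative) of a circuit-localization submission layout;
# folder names are hypothetical, and the linked mib-circuits-example repo is the
# reference for accepted formats:
#
#   <your-hf-repo>/circuits/
#       ioi_gpt2/            # one folder per task/model combination
#       mcqa_llama3/
#       ...                  # each folder contains that pair's circuit file(s)
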
EVALUATION_QUEUE_TEXT_CAUSALVARIABLE = """
## Causal variable localization track

### 1. Collect your materials
You'll need the following:
* Trained featurizer, inverse featurizer, and indices objects.
* A Python file containing the implementation of your featurizer and inverse featurizer.
* (Optional) Dynamic token alignment functions, provided in another Python file.

### 2. Upload your materials
Create a HuggingFace repository, and create a folder in that repository that will hold all of your materials.
At the URL you provide (we'll call this the "root"), each of the above materials should be present. We will take the first
Python script, in lexicographic order, at the root as the featurizer script. The root should also contain one subfolder per
model/task/causal variable triplet; each subfolder should contain the trained featurizer, inverse featurizer, and indices.

### 3. Manage your submission in the queue
If your submission passes all checks, it will be added to the queue, and you will receive a submission ID; be sure to save it!
The ID allows you to remove your submission from the queue (e.g., if you find a bug), so you won't need to wait until
next week to resubmit.

Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
It will keep the PENDING status until it has been run on the private test set.
"""

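# Illustrative sketch (not authoritative) of a causal-variable submission layout;
# names are hypothetical, and the text above defines the actual requirements:
#
#   <your-hf-repo>/materials/
#       featurizer.py                      # first .py at the root is used as the featurizer script
#       token_alignment.py                 # optional dynamic token alignment functions
#       mcqa_LlamaForCausalLM_answer/      # one subfolder per model/task/causal-variable triplet
#           ...                            # trained featurizer, inverse featurizer, and indices
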
CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
CITATION_BUTTON_TEXT = r"""@article{mib-2025,
	title = {{MIB}: A Mechanistic Interpretability Benchmark},
	author = {Aaron Mueller and Atticus Geiger and Sarah Wiegreffe and Dana Arad and Iv{\'a}n Arcuschin and Adam Belfki and Yik Siu Chan and Jaden Fiotto-Kaufman and Tal Haklay and Michael Hanna and Jing Huang and Rohan Gupta and Yaniv Nikankin and Hadas Orgad and Nikhil Prakash and Anja Reusch and Aruna Sankaranarayanan and Shun Shao and Alessandro Stolfo and Martin Tutek and Amir Zur and David Bau and Yonatan Belinkov},
	year = {2025},
	journal = {CoRR},
	volume = {arXiv:2504.13151},
	url = {https://arxiv.org/abs/2504.13151v1}
}
"""