from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str




# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard 
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")


    
class TasksMultimodal(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
    task4 = Task("vqa", "acc", "VQA")
    task5 = Task("winoground", "acc", "Winoground")
    task6 = Task("devbench", "acc", "DevBench")



@dataclass
class TaskMIB_Subgraph:
    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (edge_counts, faithfulness)

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]
    
    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(list(models))
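
# For reference, with the tasks defined above these helpers return:
#   TasksMib_Subgraph.get_all_tasks()  -> ["ioi", "mcqa", "arithmetic_addition",
#                                          "arithmetic_subtraction", "arc_easy", "arc_challenge"]
#   TasksMib_Subgraph.get_all_models() -> ["gemma2", "gpt2", "llama3", "qwen2_5"]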




# @dataclass 
# class TaskMIB_Causalgraph:
#     benchmark: str      
#     models: list[str]   
#     layers: dict[str, list[str]]  # Different layers for each model
#     col_name: str      
#     interventions: list[str]  
#     counterfactuals: list[str]  
#     metrics: list[str]  


# class TasksMib_Causalgraph(Enum):
#     task0 = TaskMIB_Causalgraph("MCQA", 
#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],
#         {
#             "qwen2forcausallm": [str(i) for i in range(24)],    # 0-23
#             "gemma2forcausallm": [str(i) for i in range(26)],   # 0-25
#             "llamaforcausallm": [str(i) for i in range(32)]     # 0-31
#         },
#         "mcqa",
#         ["output_token", "output_location"],
#         ["randomLetter_counterfactual", "answerPosition_counterfactual", 
#          "answerPosition_randomLetter_counterfactual"],
#         ["score"]
#     )


@dataclass
class TaskMIB_Causalgraph:
    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (average_score)

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
    task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]
    
    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(list(models))





NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark 2024 Leaderboards</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The leaderboards for each track of the 2024 Mechanistic Interpretability Benchmark.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has its own tab.
"""

EVALUATION_QUEUE_TEXT = """
## Circuit localization track:

You'll need 10 circuits per task/model combination. For each threshold k_i and the previous threshold k_{i-1},
the corresponding circuit should contain at least k_{i-1}% of components and at most k_i% of components. Create a HuggingFace
dataset or model repository to house your circuits, and place the circuits (and *only* the circuits) in a single folder.
Do not worry about the ordering of the files; our evaluation script will read the circuits and sort them by size.
Provide a link to this folder below.

For specifications about the file format for a circuit, see the README on our project GitHub: TODO

Once your submission reaches the front of the evaluation queue, we'll submit it for evaluation on the private test set.
The evaluations are handled by the National Deep Inference Framework (NDIF).

## Causal variable localization track:
"""

CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
CITATION_BUTTON_TEXT = r"""
@article{hu2024findingssecondbabylmchallenge,
      title={Findings of the Second BabyLM Challenge: Sample-Efficient Pretraining on Developmentally Plausible Corpora}, 
      author={Michael Y. Hu and Aaron Mueller and Candace Ross and Adina Williams and Tal Linzen and Chengxu Zhuang and Ryan Cotterell and Leshem Choshen and Alex Warstadt and Ethan Gotlieb Wilcox},
      year={2024},
      journal={Computing Research Repository},
      volume={arXiv:2412.05149},
      url={https://arxiv.org/abs/2412.05149}, 
}
"""