from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str




# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # Each Task: task_key in the results JSON, metric_key in the JSON, display name in the leaderboard
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")


    
class TasksMultimodal(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
    task4 = Task("vqa", "acc", "VQA")
    task5 = Task("winoground", "acc", "Winoground")
    task6 = Task("devbench", "acc", "DevBench")



@dataclass
class TaskMIB_Subgraph:
    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (edge_counts, faithfulness)

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks."""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a sorted list of all unique models across all tasks."""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)
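
# Illustrative usage of the helpers above (values follow from the task definitions;
# the consuming leaderboard code is not shown here):
#   >>> TasksMib_Subgraph.get_all_tasks()
#   ['ioi', 'mcqa', 'arithmetic_addition', 'arithmetic_subtraction', 'arc_easy', 'arc_challenge']
#   >>> TasksMib_Subgraph.get_all_models()
#   ['gemma2', 'gpt2', 'llama3', 'qwen2_5']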


# Earlier iterations of TaskMIB_Causalgraph used a flat `layers: list[str]` and
# mixed-case model names (e.g., "LlamaForCausalLM"); the current definitions below
# key the layers per model and use lowercase model names to match the result columns.

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str                  # task name in json (MCQA)
    models: list[str]               # list of models to show as sub-columns
    layers: dict[str, list[str]]    # layer indices per model
    col_name: str                   # display name in leaderboard
    interventions: list[str]        # output_token, output_location
    counterfactuals: list[str]      # counterfactual settings to report
    metrics: list[str]              # score

class TasksMib_Causalgraph(Enum):
    task0 = TaskMIB_Causalgraph(
        "MCQA", 
        ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],  # Use lowercase names to match actual columns
        {
            "qwen2forcausallm": [str(i) for i in range(24)],    # 0-23
            "gemma2forcausallm": [str(i) for i in range(26)],   # 0-25
            "llamaforcausallm": [str(i) for i in range(32)]     # 0-31
        },
        "mcqa",
        ["output_token", "output_location"],
        ["randomLetter_counterfactual", "answerPosition_counterfactual", 
         "answerPosition_randomLetter_counterfactual"],
        ["score"]
    )
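
# Illustrative sketch of the per-model layer columns implied by the definition above
# (the consuming leaderboard code is not shown here):
#   >>> t = TasksMib_Causalgraph.task0.value
#   >>> [(m, len(t.layers[m])) for m in t.models]
#   [('qwen2forcausallm', 24), ('gemma2forcausallm', 26), ('llamaforcausallm', 32)]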


NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark 2024 Leaderboards</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The leaderboards for each track of the 2024 Mechanistic Interpretability Benchmark.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has its own tab.
"""

EVALUATION_QUEUE_TEXT = """
## Some good practices before requesting a predictions upload:

Make sure you can get scores from your predictions file using the `score_predictions.py` script.
```bash
git clone https://github.com/babylm/evaluation-pipeline-2024/
cd evaluation-pipeline-2024
python score_predictions.py path/to/your/predictions.json.gz
```
If this step fails, follow the error messages to debug your predictions before getting in touch. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.

Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model.

Once these steps have been followed, send the organizers your predictions file(s) and the scores you've obtained.
We'll verify that we can match your scores, and then upload them to the leaderboard. Optionally, you can give us your preferred model display name for the leaderboard, and a link to your model on HuggingFace.
"""

CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
CITATION_BUTTON_TEXT = r"""
@article{hu2024findingssecondbabylmchallenge,
      title={Findings of the Second BabyLM Challenge: Sample-Efficient Pretraining on Developmentally Plausible Corpora}, 
      author={Michael Y. Hu and Aaron Mueller and Candace Ross and Adina Williams and Tal Linzen and Chengxu Zhuang and Ryan Cotterell and Leshem Choshen and Alex Warstadt and Ethan Gotlieb Wilcox},
      year={2024},
      journal={Computing Research Repository},
      volume={arXiv:2412.05149},
      url={https://arxiv.org/abs/2412.05149}, 
}
"""