from dataclasses import dataclass, make_dataclass from enum import Enum import pandas as pd from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph, TasksMib_Causalgraph def fields(raw_class): return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] # These classes are for user facing column names, # to avoid having to change them all around the code # when a modif is needed @dataclass class ColumnContent: name: str type: str displayed_by_default: bool hidden: bool = False never_hidden: bool = False ## Leaderboard columns auto_eval_column_dict = [] auto_eval_column_dict_multimodal = [] auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)]) auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)]) #Scores for task in Tasks: auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) # Model information auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)]) auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)]) auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)]) for task in TasksMultimodal: auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) if task.value.col_name in ("ewok", "EWoK"): # make sure this appears in the right order auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)]) auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)]) auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True) ############################################################################################################## # Version 1 # auto_eval_column_dict_mib_subgraph = [] # # Method name column # auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # # For each task and model combination # for task in TasksMib_Subgraph: # for model in task.value.models: # col_name = f"{task.value.benchmark}_{model}" # ioi_gpt2, mcqa_qwen2.5, etc. # auto_eval_column_dict_mib_subgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # # Average column # auto_eval_column_dict_mib_subgraph.append(["average", ColumnContent, ColumnContent("Average", "number", True)]) # ############################################################################################################## # # Version 2 # auto_eval_column_dict_mib_subgraph = [] # # Method name column # auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # # Add task filter column # task_values = list(set(task.value.benchmark for task in TasksMib_Subgraph)) # auto_eval_column_dict_mib_subgraph.append( # ["task_filter", ColumnContent, ColumnContent("Task", "str", True, never_hidden=True)] # ) # # Add model filter column # model_values = list(set( # model # for task in TasksMib_Subgraph # for model in task.value.models # )) # auto_eval_column_dict_mib_subgraph.append( # ["model_filter", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)] # ) # # For each task and model combination # for task in TasksMib_Subgraph: # for model in task.value.models: # col_name = f"{task.value.benchmark}_{model}" # auto_eval_column_dict_mib_subgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # # Average column # auto_eval_column_dict_mib_subgraph.append(["average", ColumnContent, ColumnContent("Average", "number", True)]) ############################################################################################################## # Version 3 # First, let's create field names that are valid Python identifiers auto_eval_column_dict_mib_subgraph = [] # Method name column (always present) auto_eval_column_dict_mib_subgraph.append( ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)] ) # Add columns for each task-model combination for task in TasksMib_Subgraph: for model in task.value.models: # Create a valid field name by using underscores field_name = f"{task.value.benchmark}_{model}" # The display name can be more descriptive display_name = f"{task.value.benchmark}({model})" auto_eval_column_dict_mib_subgraph.append([ field_name, # This must be a valid Python identifier ColumnContent, ColumnContent(display_name, "number", True) ]) # Add the Average column auto_eval_column_dict_mib_subgraph.append( ["average", ColumnContent, ColumnContent("Average", "number", True)] ) print("Debug - Column field names:") for field in auto_eval_column_dict_mib_subgraph: print(f"Field name: {field[0]}, Display name: {field[2].name}") # Create the dataclass for MIB columns AutoEvalColumn_mib_subgraph = make_dataclass("AutoEvalColumn_mib_subgraph", auto_eval_column_dict_mib_subgraph, frozen=True) # Column selection for display COLS_MIB_SUBGRAPH = [c.name for c in fields(AutoEvalColumn_mib_subgraph) if not c.hidden] BENCHMARK_COLS_MIB_SUBGRAPH = [] for task in TasksMib_Subgraph: for model in task.value.models: col_name = f"{task.value.col_name}_{model.replace('-', '_')}" BENCHMARK_COLS_MIB_SUBGRAPH.append(col_name) # Implement the same for causal graph, auto_eval_column_dict_mib_causalgraph, AutoEvalColumn_mib_causalgraph AutoEvalColumn_mib_causalgraph = [] COLS_MIB_CAUSALGRAPH = [] BENCHMARK_COLS_MIB_CAUSALGRAPH = [] # # Initialize the MIB causal graph columns # auto_eval_column_dict_mib_causalgraph = [] # # Method name column # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # # For each model-task-intervention combination # for task in TasksMib_Causalgraph: # for model in task.value.models: # for intervention in task.value.interventions: # col_name = f"{model}_{task.value.benchmark}_{intervention}".lower() # auto_eval_column_dict_mib_causalgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # auto_eval_column_dict_mib_causalgraph = [] # # Method name column # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # # For each model-task-intervention combination # for task in TasksMib_Causalgraph: # for model in task.value.models: # model_name = model.lower() # Convert model name to lowercase # for layer in task.value.layers: # for intervention in task.value.interventions: # for counterfactual in task.value.counterfactuals: # # Include model name in the column name # col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}" # field_name = col_name.lower() # auto_eval_column_dict_mib_causalgraph.append([ # field_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # # In utils.py, modify auto_eval_column_dict_mib_causalgraph: # auto_eval_column_dict_mib_causalgraph = [] # # Method name column # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # # For each model-task-intervention-counterfactual combination # for task in TasksMib_Causalgraph: # for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names # for layer in task.value.layers: # for intervention in task.value.interventions: # for counterfactual in task.value.counterfactuals: # # Match the exact format from the data # col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower() # auto_eval_column_dict_mib_causalgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # auto_eval_column_dict_mib_causalgraph = [] # # Method name column # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # # Add eval_name column # auto_eval_column_dict_mib_causalgraph.append(["eval_name", ColumnContent, ColumnContent("eval_name", "str", True)]) # # For each model-task-intervention-counterfactual combination # for task in TasksMib_Causalgraph: # for model in task.value.models: # Use exact model names with correct casing # model_name = model # Don't convert to lowercase # for layer in task.value.layers: # for intervention in task.value.interventions: # for counterfactual in task.value.counterfactuals: # # Match exact format from the actual data # col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}" # # Use the exact column name as both the field name and display name # auto_eval_column_dict_mib_causalgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # auto_eval_column_dict_mib_causalgraph = [] # # Method name column # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # auto_eval_column_dict_mib_causalgraph.append(["eval_name", ColumnContent, ColumnContent("eval_name", "str", True)]) # # For each model-task-intervention-counterfactual combination # for task in TasksMib_Causalgraph: # for model in task.value.models: # for layer in task.value.layers[model]: # Use model-specific layers # for intervention in task.value.interventions: # for counterfactual in task.value.counterfactuals: # col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}" # auto_eval_column_dict_mib_causalgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) # auto_eval_column_dict_mib_causalgraph = [] # # Method name column # auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # auto_eval_column_dict_mib_causalgraph.append(["eval_name", ColumnContent, ColumnContent("eval_name", "str", True)]) # # For each model-task-intervention-counterfactual combination # for task in TasksMib_Causalgraph: # for model in task.value.models: # model will already be lowercase # for layer in task.value.layers[model]: # for intervention in task.value.interventions: # for counterfactual in task.value.counterfactuals: # # Use exactly the same format as in DataFrame # col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}" # auto_eval_column_dict_mib_causalgraph.append([ # col_name, # ColumnContent, # ColumnContent(col_name, "number", True) # ]) auto_eval_column_dict_mib_causalgraph = [] # Only include Method column as required auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)]) # For each model-task-intervention-counterfactual combination for task in TasksMib_Causalgraph: for model in task.value.models: # model will be lowercase for layer in task.value.layers[model]: for intervention in task.value.interventions: for counterfactual in task.value.counterfactuals: col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}" auto_eval_column_dict_mib_causalgraph.append([ col_name, ColumnContent, ColumnContent(col_name, "number", True) ]) # Create the dataclass AutoEvalColumn_mib_causalgraph = make_dataclass( "AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True ) # Column selection for display COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden] BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower() for task in TasksMib_Causalgraph for model in task.value.models for intervention in task.value.interventions] ## For the queue columns in the submission tab @dataclass(frozen=True) class EvalQueueColumn: # Queue column model = ColumnContent("model", "markdown", True) track = ColumnContent("track", "str", True) revision = ColumnContent("revision", "str", True) private = ColumnContent("private", "bool", True) status = ColumnContent("status", "str", True) ## All the model information that we might need @dataclass class ModelDetails: name: str display_name: str = "" symbol: str = "" # emoji # Column selection COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] COLS_MULTIMODAL = [c.name for c in fields(AutoEvalColumnMultimodal) if not c.hidden] EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] BENCHMARK_COLS = [t.value.col_name for t in Tasks] BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal] TEXT_TASKS = { "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte", "boolq", "multirc", "wsc"], # Lots of BLiMP tasks – use verifier function below to see if you've included everything. "blimp": ["adjunct_island","anaphor_gender_agreement","anaphor_number_agreement","animate_subject_passive","animate_subject_trans", "causative","complex_NP_island","coordinate_structure_constraint_complex_left_branch","coordinate_structure_constraint_object_extraction","determiner_noun_agreement_1", "determiner_noun_agreement_2","determiner_noun_agreement_irregular_1","determiner_noun_agreement_irregular_2","determiner_noun_agreement_with_adjective_1", "determiner_noun_agreement_with_adj_2","determiner_noun_agreement_with_adj_irregular_1","determiner_noun_agreement_with_adj_irregular_2","distractor_agreement_relational_noun", "distractor_agreement_relative_clause","drop_argument","ellipsis_n_bar_1","ellipsis_n_bar_2", "existential_there_object_raising", "existential_there_quantifiers_1", "existential_there_quantifiers_2", "existential_there_subject_raising", "expletive_it_object_raising", "inchoative", "intransitive","irregular_past_participle_adjectives", "irregular_past_participle_verbs", "irregular_plural_subject_verb_agreement_1", "irregular_plural_subject_verb_agreement_2", "left_branch_island_echo_question", "left_branch_island_simple_question", "matrix_question_npi_licensor_present", "npi_present_1", "npi_present_2", "only_npi_licensor_present", "only_npi_scope", "passive_1", "passive_2", "principle_A_case_1", "principle_A_case_2", "principle_A_c_command", "principle_A_domain_1", "principle_A_domain_2", "principle_A_domain_3", "principle_A_reconstruction", "regular_plural_subject_verb_agreement_1", "regular_plural_subject_verb_agreement_2", "sentential_negation_npi_licensor_present", "sentential_negation_npi_scope", "sentential_subject_island", "superlative_quantifiers_1", "superlative_quantifiers_2", "tough_vs_raising_1", "tough_vs_raising_2", "transitive", "wh_island", "wh_questions_object_gap", "wh_questions_subject_gap", "wh_questions_subject_gap_long_distance", "wh_vs_that_no_gap", "wh_vs_that_no_gap_long_distance", "wh_vs_that_with_gap", "wh_vs_that_with_gap_long_distance" ], "blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky", "subject_aux_inversion", "turn_taking"], "ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics", "physical-interactions", "physical-relations", "quantitative-properties", "social-interactions", "social-properties", "social-relations", "spatial-relations"] } VISION_TASKS = { "vqa": ["vqa"], "winoground": ["winoground"], "devbench": ["lex-viz_vocab", "gram-trog", "sem-things"] } NUM_EXPECTED_EXAMPLES = { "glue": { "cola": 522, "sst2": 436, "mrpc": 204, "qqp": 20215, "mnli": 4908, "mnli-mm": 4916, "qnli": 2732, "rte": 139, "boolq": 1635, "multirc": 2424, "wsc": 52 }, "blimp": { "adjunct_island": 928, "anaphor_gender_agreement": 971, "anaphor_number_agreement": 931, "animate_subject_passive": 895, "animate_subject_trans": 923, "causative": 818, "complex_NP_island": 846, "coordinate_structure_constraint_complex_left_branch": 906, "coordinate_structure_constraint_object_extraction": 949, "determiner_noun_agreement_1": 929, "determiner_noun_agreement_2": 931, "determiner_noun_agreement_irregular_1": 681, "determiner_noun_agreement_irregular_2": 820, "determiner_noun_agreement_with_adjective_1": 933, "determiner_noun_agreement_with_adj_2": 941, "determiner_noun_agreement_with_adj_irregular_1": 718, "determiner_noun_agreement_with_adj_irregular_2": 840, "distractor_agreement_relational_noun": 788, "distractor_agreement_relative_clause": 871, "drop_argument": 920, "ellipsis_n_bar_1": 802, "ellipsis_n_bar_2": 828, "existential_there_object_raising": 812, "existential_there_quantifiers_1": 930, "existential_there_quantifiers_2": 911, "existential_there_subject_raising": 924, "expletive_it_object_raising": 759, "inchoative": 855, "intransitive": 868, "irregular_past_participle_adjectives": 961, "irregular_past_participle_verbs": 942, "irregular_plural_subject_verb_agreement_1": 804, "irregular_plural_subject_verb_agreement_2": 892, "left_branch_island_echo_question": 947, "left_branch_island_simple_question": 951, "matrix_question_npi_licensor_present": 929, "npi_present_1": 909, "npi_present_2": 914, "only_npi_licensor_present": 882, "only_npi_scope": 837, "passive_1": 840, "passive_2": 903, "principle_A_case_1": 912, "principle_A_case_2": 915, "principle_A_c_command": 946, "principle_A_domain_1": 914, "principle_A_domain_2": 915, "principle_A_domain_3": 941, "principle_A_reconstruction": 967, "regular_plural_subject_verb_agreement_1": 890, "regular_plural_subject_verb_agreement_2": 945, "sentential_negation_npi_licensor_present": 919, "sentential_negation_npi_scope": 871, "sentential_subject_island": 961, "superlative_quantifiers_1": 979, "superlative_quantifiers_2": 986, "tough_vs_raising_1": 948, "tough_vs_raising_2": 920, "transitive": 868, "wh_island": 960, "wh_questions_object_gap": 859, "wh_questions_subject_gap": 898, "wh_questions_subject_gap_long_distance": 857, "wh_vs_that_no_gap": 861, "wh_vs_that_no_gap_long_distance": 875, "wh_vs_that_with_gap": 919, "wh_vs_that_with_gap_long_distance": 910 }, "blimp_supplement": { "hypernym": 842, "qa_congruence_easy": 64, "qa_congruence_tricky": 165, "subject_aux_inversion": 3867, "turn_taking": 280 }, "ewok": { "agent-properties": 2210, "material-dynamics": 770, "material-properties": 170, "physical-dynamics": 120, "physical-interactions": 556, "physical-relations": 818, "quantitative-properties": 314, "social-interactions": 294, "social-properties": 328, "social-relations": 1548, "spatial-relations": 490 }, "vqa": { "vqa": 25230 }, "winoground": { "winoground": 746 }, "devbench": { "lex-viz_vocab": 119, "gram-trog": 76, "sem-things": 1854 } }