Spaces:

mib-bench
/

leaderboard

Running

jasonshaoshun committed on Jan 29

Commit

b7ce23f

1 Parent(s): 5051ffe

debug

Files changed (1) hide show

src/leaderboard/read_evals.py CHANGED Viewed

@@ -84,7 +84,6 @@ class EvalResult_MIB_SUBGRAPH:
             scores = model_result.get("scores", {})
             # for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
-            print(f"TasksMib_Subgraph.get_all_tasks() is {TasksMib_Subgraph.get_all_tasks()}")
             for task in TasksMib_Subgraph.get_all_tasks():
                 if task in scores:
                     results[task][model_name] = {
@@ -108,16 +107,16 @@ class EvalResult_MIB_SUBGRAPH:
         }
         # Initialize all possible columns with '-'
-        expected_models = ["llama3", "qwen2_5", "gpt2", "gemma2"]
-        expected_tasks = ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
         for task in expected_tasks:
             for model in expected_models:
-                if model == "gpt2" and task != "ioi":
-                    continue
-                if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
-                    continue
-                if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
-                    continue
                 data_dict[f"{task}_{model}"] = '-'
         all_scores = []

             scores = model_result.get("scores", {})
             # for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
             for task in TasksMib_Subgraph.get_all_tasks():
                 if task in scores:
                     results[task][model_name] = {
         }
         # Initialize all possible columns with '-'
+        expected_models = TasksMib_Subgraph.get_all_models()
+        expected_tasks = TasksMib_Subgraph.get_all_tasks()
         for task in expected_tasks:
             for model in expected_models:
+                # if model == "gpt2" and task != "ioi":
+                #     continue
+                # if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
+                #     continue
+                # if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
+                #     continue
                 data_dict[f"{task}_{model}"] = '-'
         all_scores = []