jasonshaoshun committed · Commit 2fc77f5 · 1 Parent(s): bf334c6

first commit
Files changed:
- Makefile (+13, -0)
- README.md (+39, -8)
- app.py (+196, -0)
- pyproject.toml (+13, -0)
- requirements.txt (+17, -0)
- src/about.py (+89, -0)
- src/display/css_html_js.py (+105, -0)
- src/display/formatting.py (+27, -0)
- src/display/utils.py (+291, -0)
- src/envs.py (+30, -0)
- src/leaderboard/read_evals.py (+607, -0)
- src/populate.py (+135, -0)
- src/submission/check_validity.py (+167, -0)
- src/submission/submit.py (+111, -0)
Makefile
ADDED
@@ -0,0 +1,13 @@
.PHONY: style format


style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
README.md
CHANGED
@@ -1,14 +1,45 @@
 ---
-title: Leaderboard
-emoji:
-colorFrom:
-colorTo:
+title: Leaderboard 2024
+emoji: 🥇
+colorFrom: green
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.8.0
 app_file: app.py
-pinned:
+pinned: true
 license: apache-2.0
-short_description:
+short_description: Leaderboard for the 2024 BabyLM Challenge
 ---
 
-
+# Start the configuration
+
+Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the paths for your leaderboard) and `src/about.py` (for tasks).
+
+Results files should have the following format and be stored as json files:
+```json
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
+
+If you encounter problems on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+# Code logic for more complex edits
+
+You'll find:
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
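As a quick aid for the results format documented above, a file can be sanity-checked with a few lines of Python before it is added to the results dataset. This is a minimal sketch, not part of the commit; the helper name and the example path are illustrative.

import json

REQUIRED_CONFIG_KEYS = {"model_dtype", "model_name", "model_sha"}

def check_results_file(path: str) -> dict:
    """Load a results JSON and check it follows the schema documented in the README."""
    with open(path) as fp:
        data = json.load(fp)

    missing = REQUIRED_CONFIG_KEYS - set(data.get("config", {}))
    if missing:
        raise ValueError(f"config is missing keys: {sorted(missing)}")

    results = data.get("results", {})
    if not results:
        raise ValueError("results section is empty")

    for task_name, metrics in results.items():
        for metric_name, score in metrics.items():
            if not isinstance(score, (int, float)):
                raise ValueError(f"{task_name}/{metric_name} is not numeric: {score!r}")
    return data

# Hypothetical usage:
# data = check_results_file("eval-results/org/model/results.json")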
app.py
ADDED
@@ -0,0 +1,196 @@
import json
import gzip
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from io import StringIO

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    BENCHMARK_COLS_MULTIMODAL,
    BENCHMARK_COLS_MIB,
    COLS,
    COLS_MIB,
    COLS_MULTIMODAL,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    AutoEvalColumn_mib,
    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib
from src.submission.submit import add_new_eval

print("restart_space ")

def restart_space():
    API.restart_space(repo_id=REPO_ID)

print("end restart_space")


print("Space initialisation ")
### Space initialisation
print("EVAL_REQUESTS_PATH")
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()

print("EVAL_RESULTS_PATH")
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()

print("RESULTS_REPO_MIB_SUBGRAPH")
try:
    print(RESULTS_REPO_MIB_SUBGRAPH)
    snapshot_download(
        repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()

print("End Space initialisation ")


LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)

# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
# LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard_mib(dataframe, track):
    print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")

    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # filter for correct track
    # dataframe = dataframe.loc[dataframe["Track"] == track]

    print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn_mib)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn_mib) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn_mib) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=["Method"],  # Changed from AutoEvalColumn_mib.model.name to "Method"
        hide_columns=[c.name for c in fields(AutoEvalColumn_mib) if c.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )

def init_leaderboard(dataframe, track):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    # filter for correct track
    dataframe = dataframe.loc[dataframe["Track"] == track]

    # print(f"\n\n\n dataframe is {dataframe}\n\n\n")

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )

def process_json(temp_file):
    if temp_file is None:
        return {}

    # Handle file upload
    try:
        file_path = temp_file.name
        if file_path.endswith('.gz'):
            with gzip.open(file_path, 'rt') as f:
                data = json.load(f)
        else:
            with open(file_path, 'r') as f:
                data = json.load(f)
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}")

    gr.Markdown("Upload successful!")
    return data


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
        #     leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
        # with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
        #     leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
        # with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
        #     leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")

        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=5):
        #     with gr.Column():
        #         with gr.Row():
        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
            leaderboard = init_leaderboard_mib(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
            # leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")

        with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
            leaderboard = init_leaderboard_mib(LEADERBOARD_DF_MIB_SUBGRAPH, "Causal Graph")

    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.launch(share=True, ssr_mode=False)
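init_leaderboard_mib above expects get_leaderboard_df_mib to hand it a pandas DataFrame whose columns line up with the MIB column definitions in src/display/utils.py: a "Method" column, one numeric column per task/model pair, and an "Average". A hypothetical two-row frame of that shape, with made-up method names and scores (not real results), would look like:

import pandas as pd

# Made-up scores, purely to illustrate the shape the Leaderboard widget receives.
df = pd.DataFrame(
    [
        {"Method": "method-A", "ioi_meta_llama": 71.2, "ioi_qwen": 68.4, "ioi_gpt2": 74.9,
         "mcqa_meta_llama": 63.1, "mcqa_qwen": 60.7, "mcqa_gpt2": 58.2, "Average": 66.08},
        {"Method": "method-B", "ioi_meta_llama": 69.5, "ioi_qwen": 65.0, "ioi_gpt2": 72.3,
         "mcqa_meta_llama": 61.8, "mcqa_qwen": 59.4, "mcqa_gpt2": 55.1, "Average": 63.85},
    ]
)
print(df.columns.tolist())  # matches COLS_MIB from src/display/utils.py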
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E501"] # line too long (black is taking care of this)
line-length = 119
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]

[tool.isort]
profile = "black"
line_length = 119

[tool.black]
line-length = 119
requirements.txt
ADDED
@@ -0,0 +1,17 @@
APScheduler
black
datasets
fastapi==0.112.2
gradio
gradio[oauth]
gradio_leaderboard==0.0.13
gradio_client
huggingface-hub>=0.18.0
matplotlib
numpy
pandas
python-dateutil
tqdm
transformers
tokenizers>=0.15.0
sentencepiece
src/about.py
ADDED
@@ -0,0 +1,89 @@
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


@dataclass
class TaskMIB:
    benchmark: str     # task name in json (ioi/arithmetic)
    models: list[str]  # list of models to show as sub-columns
    col_name: str      # display name in leaderboard
    metrics: list[str] # metrics to store (edge_counts, faithfulness)



# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")


class TasksMIB(Enum):
    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])


class TasksMultimodal(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
    task4 = Task("vqa", "acc", "VQA")
    task5 = Task("winoground", "acc", "Winoground")
    task6 = Task("devbench", "acc", "DevBench")

NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Benchmark 2024 Leaderboards</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The leaderboards for each track of the 2024 Mechanistic Interpretability Benchmark.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has its own tab.
"""

EVALUATION_QUEUE_TEXT = """
## Some good practices before requesting a predictions upload:

Make sure you can get scores from your predictions file using the `score_predictions.py` script.
```bash
git clone https://github.com/babylm/evaluation-pipeline-2024/
cd evaluation-pipeline-2024
python score_predictions.py path/to/your/predictions.json.gz
```
If this step fails, follow the error messages to debug your predictions before getting in touch. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.

Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model.

Once these steps have been followed, get in touch with the organizers with your predictions file(s), and the scores you've obtained.
We'll verify that we can match your scores, and then upload to the leaderboard. Optionally, you can give us your preferred model display name for the leaderboard, and a link to your model on HuggingFace.
"""

CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
CITATION_BUTTON_TEXT = r"""
@article{hu2024findingssecondbabylmchallenge,
      title={Findings of the Second BabyLM Challenge: Sample-Efficient Pretraining on Developmentally Plausible Corpora},
      author={Michael Y. Hu and Aaron Mueller and Candace Ross and Adina Williams and Tal Linzen and Chengxu Zhuang and Ryan Cotterell and Leshem Choshen and Alex Warstadt and Ethan Gotlieb Wilcox},
      year={2024},
      journal={Computing Research Repository},
      volume={arXiv:2412.05149},
      url={https://arxiv.org/abs/2412.05149},
}
"""
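The Tasks and TasksMIB enums above are what the rest of the code iterates over to build columns and read scores. A small sketch of what that iteration yields, with the expected output reproduced from the definitions above:

from src.about import Tasks, TasksMIB

# Text-track tasks: benchmark key in the results JSON, metric key, display name.
for task in Tasks:
    print(task.value.benchmark, task.value.metric, task.value.col_name)
# blimp acc BLiMP
# blimp_supplement acc BLiMP Supplement
# glue acc (Super)GLUE
# ewok acc EWoK

# MIB tasks additionally carry the models that become per-model sub-columns.
for task in TasksMIB:
    print(task.value.benchmark, task.value.models, task.value.metrics)
# ioi ['meta_llama', 'qwen', 'gpt2'] ['edge_counts', 'faithfulness']
# mcqa ['meta_llama', 'qwen', 'gpt2'] ['edge_counts', 'faithfulness']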
src/display/css_html_js.py
ADDED
@@ -0,0 +1,105 @@
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type {
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span {
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap {
    width: 103px;
}
#filter_type label > .wrap .wrap-inner {
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input {
    width: 1px
}
#filter-columns-type {
    border: 0;
    padding: 0.5;
}
#filter-columns-size {
    border: 0;
    padding: 0.5;
}
#box-filter > .form {
    border: 0
}
"""

get_window_url_params = """
    function(url_params) {
        const params = new URLSearchParams(window.location.search);
        url_params = Object.fromEntries(params);
        return url_params;
    }
    """
src/display/formatting.py
ADDED
@@ -0,0 +1,27 @@
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_repo, model_name):
    link = f"https://huggingface.co/{model_repo}"
    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
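For context, these helpers are used when rendering the leaderboard table and submission messages. A quick illustration with a made-up repo id (purely hypothetical):

from src.display.formatting import make_clickable_model, styled_message

link_html = make_clickable_model("org/my-model", "My Model")
# -> an <a target="_blank" href="https://huggingface.co/org/my-model" ...>My Model</a> link

print(styled_message("Your request has been submitted to the evaluation queue!"))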
src/display/utils.py
ADDED
@@ -0,0 +1,291 @@
from dataclasses import dataclass, make_dataclass
from enum import Enum

import pandas as pd

from src.about import Tasks, TasksMultimodal, TasksMIB

def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modif is needed
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

## Leaderboard columns
auto_eval_column_dict_mib = []
auto_eval_column_dict = []
auto_eval_column_dict_multimodal = []




auto_eval_column_dict_mib = []

# Method name column
auto_eval_column_dict_mib.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])

# For each task and model combination
for task in TasksMIB:
    for model in task.value.models:
        col_name = f"{task.value.benchmark}_{model}"  # ioi_meta_llama, mcqa_qwen, etc.
        auto_eval_column_dict_mib.append([
            col_name,
            ColumnContent,
            ColumnContent(col_name, "number", True)
        ])

# Average column
auto_eval_column_dict_mib.append(["average", ColumnContent, ColumnContent("Average", "number", True)])


# Create the dataclass for MIB columns
AutoEvalColumn_mib = make_dataclass("AutoEvalColumn_mib", auto_eval_column_dict_mib, frozen=True)

# Column selection for display
COLS_MIB = [c.name for c in fields(AutoEvalColumn_mib) if not c.hidden]

# BENCHMARK_COLS_MIB = [t.value.col_name for t in TasksMIB]
BENCHMARK_COLS_MIB = []
for task in TasksMIB:
    for model in task.value.models:
        col_name = f"{task.value.col_name}_{model.replace('-', '_')}"
        BENCHMARK_COLS_MIB.append(col_name)











# Init

auto_eval_column_dict_mib.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# auto_eval_column_dict_mib.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
# auto_eval_column_dict_mib.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])

# Scores
for task in TasksMIB:
    auto_eval_column_dict_mib.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])



auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
# Scores
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
# Model information
auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
for task in TasksMultimodal:
    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
    if task.value.col_name in ("ewok", "EWoK"):  # make sure this appears in the right order
        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])



AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_column_dict_multimodal, frozen=True)

## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    track = ColumnContent("track", "str", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    status = ColumnContent("status", "str", True)

## All the model information that we might need
@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji

# Column selection

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
COLS_MULTIMODAL = [c.name for c in fields(AutoEvalColumnMultimodal) if not c.hidden]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

BENCHMARK_COLS = [t.value.col_name for t in Tasks]
BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]

TEXT_TASKS = {
    "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
             "boolq", "multirc", "wsc"],
    # Lots of BLiMP tasks – use verifier function below to see if you've included everything.
    "blimp": ["adjunct_island", "anaphor_gender_agreement", "anaphor_number_agreement", "animate_subject_passive", "animate_subject_trans",
              "causative", "complex_NP_island", "coordinate_structure_constraint_complex_left_branch", "coordinate_structure_constraint_object_extraction", "determiner_noun_agreement_1",
              "determiner_noun_agreement_2", "determiner_noun_agreement_irregular_1", "determiner_noun_agreement_irregular_2", "determiner_noun_agreement_with_adjective_1",
              "determiner_noun_agreement_with_adj_2", "determiner_noun_agreement_with_adj_irregular_1", "determiner_noun_agreement_with_adj_irregular_2", "distractor_agreement_relational_noun",
              "distractor_agreement_relative_clause", "drop_argument", "ellipsis_n_bar_1", "ellipsis_n_bar_2",
              "existential_there_object_raising", "existential_there_quantifiers_1",
              "existential_there_quantifiers_2", "existential_there_subject_raising", "expletive_it_object_raising",
              "inchoative", "intransitive", "irregular_past_participle_adjectives", "irregular_past_participle_verbs",
              "irregular_plural_subject_verb_agreement_1", "irregular_plural_subject_verb_agreement_2", "left_branch_island_echo_question", "left_branch_island_simple_question",
              "matrix_question_npi_licensor_present", "npi_present_1", "npi_present_2", "only_npi_licensor_present", "only_npi_scope", "passive_1", "passive_2",
              "principle_A_case_1", "principle_A_case_2", "principle_A_c_command", "principle_A_domain_1",
              "principle_A_domain_2", "principle_A_domain_3", "principle_A_reconstruction", "regular_plural_subject_verb_agreement_1",
              "regular_plural_subject_verb_agreement_2", "sentential_negation_npi_licensor_present", "sentential_negation_npi_scope", "sentential_subject_island",
              "superlative_quantifiers_1", "superlative_quantifiers_2", "tough_vs_raising_1", "tough_vs_raising_2",
              "transitive", "wh_island", "wh_questions_object_gap", "wh_questions_subject_gap",
              "wh_questions_subject_gap_long_distance", "wh_vs_that_no_gap", "wh_vs_that_no_gap_long_distance", "wh_vs_that_with_gap",
              "wh_vs_that_with_gap_long_distance"
              ],
    "blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
                         "subject_aux_inversion", "turn_taking"],
    "ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
             "physical-interactions", "physical-relations", "quantitative-properties",
             "social-interactions", "social-properties", "social-relations", "spatial-relations"]
}

VISION_TASKS = {
    "vqa": ["vqa"],
    "winoground": ["winoground"],
    "devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
}

NUM_EXPECTED_EXAMPLES = {
    "glue": {
        "cola": 522,
        "sst2": 436,
        "mrpc": 204,
        "qqp": 20215,
        "mnli": 4908,
        "mnli-mm": 4916,
        "qnli": 2732,
        "rte": 139,
        "boolq": 1635,
        "multirc": 2424,
        "wsc": 52
    },
    "blimp": {
        "adjunct_island": 928,
        "anaphor_gender_agreement": 971,
        "anaphor_number_agreement": 931,
        "animate_subject_passive": 895,
        "animate_subject_trans": 923,
        "causative": 818,
        "complex_NP_island": 846,
        "coordinate_structure_constraint_complex_left_branch": 906,
        "coordinate_structure_constraint_object_extraction": 949,
        "determiner_noun_agreement_1": 929,
        "determiner_noun_agreement_2": 931,
        "determiner_noun_agreement_irregular_1": 681,
        "determiner_noun_agreement_irregular_2": 820,
        "determiner_noun_agreement_with_adjective_1": 933,
        "determiner_noun_agreement_with_adj_2": 941,
        "determiner_noun_agreement_with_adj_irregular_1": 718,
        "determiner_noun_agreement_with_adj_irregular_2": 840,
        "distractor_agreement_relational_noun": 788,
        "distractor_agreement_relative_clause": 871,
        "drop_argument": 920,
        "ellipsis_n_bar_1": 802,
        "ellipsis_n_bar_2": 828,
        "existential_there_object_raising": 812,
        "existential_there_quantifiers_1": 930,
        "existential_there_quantifiers_2": 911,
        "existential_there_subject_raising": 924,
        "expletive_it_object_raising": 759,
        "inchoative": 855,
        "intransitive": 868,
        "irregular_past_participle_adjectives": 961,
        "irregular_past_participle_verbs": 942,
        "irregular_plural_subject_verb_agreement_1": 804,
        "irregular_plural_subject_verb_agreement_2": 892,
        "left_branch_island_echo_question": 947,
        "left_branch_island_simple_question": 951,
        "matrix_question_npi_licensor_present": 929,
        "npi_present_1": 909,
        "npi_present_2": 914,
        "only_npi_licensor_present": 882,
        "only_npi_scope": 837,
        "passive_1": 840,
        "passive_2": 903,
        "principle_A_case_1": 912,
        "principle_A_case_2": 915,
        "principle_A_c_command": 946,
        "principle_A_domain_1": 914,
        "principle_A_domain_2": 915,
        "principle_A_domain_3": 941,
        "principle_A_reconstruction": 967,
        "regular_plural_subject_verb_agreement_1": 890,
        "regular_plural_subject_verb_agreement_2": 945,
        "sentential_negation_npi_licensor_present": 919,
        "sentential_negation_npi_scope": 871,
        "sentential_subject_island": 961,
        "superlative_quantifiers_1": 979,
        "superlative_quantifiers_2": 986,
        "tough_vs_raising_1": 948,
        "tough_vs_raising_2": 920,
        "transitive": 868,
        "wh_island": 960,
        "wh_questions_object_gap": 859,
        "wh_questions_subject_gap": 898,
        "wh_questions_subject_gap_long_distance": 857,
        "wh_vs_that_no_gap": 861,
        "wh_vs_that_no_gap_long_distance": 875,
        "wh_vs_that_with_gap": 919,
        "wh_vs_that_with_gap_long_distance": 910
    },
    "blimp_supplement": {
        "hypernym": 842,
        "qa_congruence_easy": 64,
        "qa_congruence_tricky": 165,
        "subject_aux_inversion": 3867,
        "turn_taking": 280
    },
    "ewok": {
        "agent-properties": 2210,
        "material-dynamics": 770,
        "material-properties": 170,
        "physical-dynamics": 120,
        "physical-interactions": 556,
        "physical-relations": 818,
        "quantitative-properties": 314,
        "social-interactions": 294,
        "social-properties": 328,
        "social-relations": 1548,
        "spatial-relations": 490
    },
    "vqa": {
        "vqa": 25230
    },
    "winoground": {
        "winoground": 746
    },
    "devbench": {
        "lex-viz_vocab": 119,
        "gram-trog": 76,
        "sem-things": 1854
    }
}
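With the two MIB tasks and three models defined in src/about.py, the loops above expand into one numeric column per task/model pair (note that the "model" and per-task entries appended after make_dataclass has already run never make it into AutoEvalColumn_mib). The resulting selections, reproduced from the definitions above:

from src.display.utils import COLS_MIB, BENCHMARK_COLS_MIB

print(COLS_MIB)
# ['Method',
#  'ioi_meta_llama', 'ioi_qwen', 'ioi_gpt2',
#  'mcqa_meta_llama', 'mcqa_qwen', 'mcqa_gpt2',
#  'Average']

print(BENCHMARK_COLS_MIB)
# ['ioi_meta_llama', 'ioi_qwen', 'ioi_gpt2',
#  'mcqa_meta_llama', 'mcqa_qwen', 'mcqa_gpt2']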
src/envs.py
ADDED
@@ -0,0 +1,30 @@
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "shunshao"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

REPO_ID = f"{OWNER}/mib-test"
QUEUE_REPO = f"{OWNER}/requests-mib-test"
RESULTS_REPO = f"{OWNER}/results-mib-test"
RESULTS_REPO_MIB_SUBGRAPH = f"{OWNER}/results-mib-subgraph"

# If you setup a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_RESULTS_MIB_SUBGRAPH_PATH = os.path.join(CACHE_PATH, "eval-results-mib-subgraph")



EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

API = HfApi(token=TOKEN)
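Before the Space starts snapshotting these repositories in app.py, it can help to confirm that the token and the dataset repos resolve. A minimal sanity-check sketch (not part of the commit, and assuming HF_TOKEN is set in the environment):

from src.envs import API, QUEUE_REPO, RESULTS_REPO_MIB_SUBGRAPH

# Raises if the token is invalid or a dataset repo is missing.
print(API.whoami()["name"])
print(API.dataset_info(QUEUE_REPO).id)
print(API.dataset_info(RESULTS_REPO_MIB_SUBGRAPH).id)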
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,607 @@
import glob
import json
import math
import os
from dataclasses import dataclass

import dateutil
import numpy as np

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, Tasks, TasksMultimodal
from src.submission.check_validity import is_model_on_hub


from typing import List, Dict
from src.about import TasksMIB


def compute_area(edge_counts, faithfulnesses, log_scale=True):
    percentages = [e / max(edge_counts) for e in edge_counts]
    area_under = 0.
    area_from_100 = 0.
    for i in range(len(faithfulnesses) - 1):
        i_1, i_2 = i, i+1
        x_1 = percentages[i_1]
        x_2 = percentages[i_2]
        # area from point to 100
        if log_scale:
            x_1 = math.log(x_1)
            x_2 = math.log(x_2)
        trapezoidal = (percentages[i_2] - percentages[i_1]) * \
                      (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
        area_from_100 += trapezoidal

        trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
        area_under += trapezoidal
    average = sum(faithfulnesses) / len(faithfulnesses)
    return (area_under, area_from_100, average)

@dataclass
class EvalResult_MIB:
    """Represents one full evaluation for a method across all models in MIB."""
    eval_name: str    # method name as identifier
    method_name: str  # name of the interpretation method
    results: Dict     # nested dict of results {task: {model: {metric: scores}}}

    # def init_from_json_file(self, json_filepath):
    #     """Inits results from the method result file"""
    #     with open(json_filepath) as fp:
    #         data = json.load(fp)

    #     method_name = data.get("method_name")

    #     def _get_task_metrics(scores, task_name):
    #         """Extract both edge_counts and faithfulness scores"""
    #         task_scores = scores.get(task_name, {})
    #         if not task_scores:
    #             return None

    #         edge_counts = task_scores.get("edge_counts", [])
    #         faithfulness = task_scores.get("faithfulness", [])

    #         if not edge_counts or not faithfulness:
    #             return None

    #         # Handle case where faithfulness is a list of lists
    #         if isinstance(faithfulness[0], list):
    #             faithfulness = faithfulness[0]

    #         return {
    #             "edge_counts": edge_counts,
    #             "faithfulness": faithfulness
    #         }

    #     # Process results for each model
    #     results = {}
    #     for task in TasksMIB:
    #         results[task.value.benchmark] = {}
    #         for model_result in data.get("results", []):
    #             # model_id = model_result.get("model_id", "").split('/')[-1]  # Get last part of model path
    #             model_id = model_result.get("model_id", "").split('/')[0]
    #             scores = model_result.get("scores", {})
    #             metrics = _get_task_metrics(scores, task.value.benchmark)
    #             if metrics is not None:
    #                 results[task.value.benchmark][model_id] = metrics

    #     return EvalResult_MIB(
    #         eval_name=method_name,
    #         method_name=method_name,
    #         results=results
    #     )
    def init_from_json_file(self, json_filepath):
        """Inits results from the method result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)

        method_name = data.get("method_name")

        # Initialize results dictionary with the exact structure from JSON
        results = {}
        for task in ["ioi", "mcqa"]:  # Use exact task names from JSON
            results[task] = {}

        # Process each model's results maintaining original structure
        for model_result in data.get("results", []):
            model_id = model_result.get("model_id", "")
            if "/" in model_id:
                org = model_id.split("/")[0]
                if org == "meta-llama":
                    model_name = "meta_llama"
                elif org == "Qwen":
                    model_name = "qwen"
            elif "gpt" in model_id.lower():
                model_name = "gpt2"
            else:
                model_name = model_id

            # Keep exact scores structure from JSON
            scores = model_result.get("scores", {})
            for task in ["ioi", "mcqa"]:
                if task in scores:
                    results[task][model_name] = {
                        "edge_counts": scores[task]["edge_counts"],
                        "faithfulness": scores[task]["faithfulness"]
                    }

        return EvalResult_MIB(
            eval_name=method_name,
            method_name=method_name,
            results=results
        )


    # def to_dict(self):
    #     """Converts the Eval Result to a dict for dataframe display"""
    #     data_dict = {
    #         "eval_name": self.eval_name,
    #         "Method": self.method_name,
    #     }

    #     all_scores = []
    #     expected_entries = 0  # Count how many entries we expect
    #     actual_entries = 0    # Count how many entries we actually got

    #     # For each task (ioi, mcqa)
    #     for task, task_results in self.results.items():
    #         # Get the models that have results for this task
    #         models = task_results.keys()

    #         for model in models:
    #             expected_entries += 1
    #             col_name = f"{task}_{model}"
    #             metrics = task_results[model]
    #             if metrics:
    #                 edge_counts = metrics["edge_counts"]
    #                 faithfulness = metrics["faithfulness"]
    #                 if isinstance(faithfulness[0], list):
    #                     faithfulness = faithfulness[0]

    #                 # Use compute_area instead of simple averaging
    #                 area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
    #                 score = area_under * 100  # Scale up for readability
    #                 data_dict[col_name] = round(score, 2)
    #                 all_scores.append(score)
    #                 actual_entries += 1
    #             else:
    #                 data_dict[col_name] = '-'

    #     # Only show average if all entries are present
    #     if actual_entries == expected_entries:
    #         data_dict["Average"] = round(np.mean(all_scores), 2)
    #     else:
    #         data_dict["Average"] = '-'

    #     return data_dict
    def to_dict(self):
        """Converts the Eval Result to a dict for dataframe display"""
        data_dict = {
            "eval_name": self.eval_name,
            "Method": self.method_name,
        }

        all_scores = []
        required_entries = {
            'ioi_meta_llama': False,
            'ioi_qwen': False,
            'ioi_gpt2': False,
            'mcqa_meta_llama': False,
            'mcqa_qwen': False,
            'mcqa_gpt2': False
        }

        # For each task (ioi, mcqa)
        for task, task_results in self.results.items():
            # Get the models that have results for this task
            models = task_results.keys()

            for model in models:
                col_name = f"{task}_{model}"
                metrics = task_results[model]
                if metrics:
                    edge_counts = metrics["edge_counts"]
                    faithfulness = metrics["faithfulness"]
                    if isinstance(faithfulness[0], list):
                        faithfulness = faithfulness[0]

                    # Use compute_area
                    area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
                    score = area_under * 100
                    data_dict[col_name] = round(score, 2)
                    all_scores.append(score)
                    required_entries[col_name] = True
                else:
                    data_dict[col_name] = '-'

        # Only show average if all six required entries are present
        if all(required_entries.values()):
            data_dict["Average"] = round(np.mean(all_scores), 2)
        else:
            data_dict["Average"] = '-'

        return data_dict




@dataclass
class EvalResult:
    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
    """
    eval_name: str   # org_model_track (uid)
    full_model: str  # org/model (name of model)
    repo_id: str     # org/model (path to model on HF)
    track: str
    org: str
    model: str
    revision: str    # commit hash, "" if main
    results: dict
    date: str = ""   # submission date of request file
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(self, json_filepath):
        """Inits the result from the specific model result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config")
        track = data.get("track")

        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))
        repo_id = config.get("hf_repo", config.get("hf_repo", None))
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
        else:
            org = org_and_model[0]
            model = org_and_model[1]
        full_model = "/".join(org_and_model)
        eval_name = "_".join(org_and_model) + f"_{track}"

        still_on_hub, _, model_config = is_model_on_hub(
            repo_id, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )

        def _get_task_results(task):
            # We average all scores of a given metric (not all metrics are present in all files)
            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
            if accs.size == 0 or any([acc is None for acc in accs]):
                return None

            mean_acc = np.mean(accs) * 100.0
            return mean_acc

        # Extract results available in this file (some results are split in several files)
        results = {}
        if track.lower() == "multimodal":
            for task in TasksMultimodal:
                task = task.value
                task_result = _get_task_results(task)
                if task_result is not None:
                    results[task.benchmark] = task_result
        else:
            for task in Tasks:
                task = task.value
                task_result = _get_task_results(task)
                if task_result is not None:
                    results[task.benchmark] = task_result

        return self(
            eval_name=eval_name,
            full_model=full_model,
            repo_id=repo_id,
            track=track,
            org=org,
            model=model,
            results=results,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
        )

    # Q: not sure what to do with this
    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.track)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.date = request.get("submitted_time", "")
        except Exception:
            print(f"Could not find request file for {self.org}/{self.model}")

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        eval_column = AutoEvalColumnMultimodal if self.track.lower() == "multimodal" else AutoEvalColumn
        vision_tasks = ("VQA", "Winoground", "DevBench", "vqa", "winoground", "devbench")
        num_text_tasks = len(Tasks)
        text_average = sum([v for k, v in self.results.items() if v is not None and k not in vision_tasks]) / num_text_tasks
        if self.still_on_hub:
            model_display_name = make_clickable_model(self.repo_id, self.full_model)
        else:
            model_display_name = self.full_model
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            eval_column.model.name: model_display_name,
            eval_column.hf_repo.name: self.repo_id,
            eval_column.revision.name: self.revision,
            eval_column.text_average.name: text_average,
            eval_column.still_on_hub.name: self.still_on_hub,
        }

        if self.track.lower() == "multimodal":
            taskset = TasksMultimodal
            num_vision_tasks = len(TasksMultimodal) - len(Tasks)
            vision_average = sum([v for k, v in self.results.items() if v is not None and k in vision_tasks]) / num_vision_tasks
            data_dict[eval_column.vision_average.name] = vision_average
        else:
            taskset = Tasks
        for task in taskset:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict



def get_request_file_for_model(requests_path, model_name, track):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
    request_files = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)

    # Select correct request file (track)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
            if (
                req_content["status"] in ["FINISHED"]
            ):
                request_file = tmp_request_file
    return request_file



def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []

    print(f"results_path is {results_path}")

    for root, dirnames, files in os.walk(results_path):
        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        try:
            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except dateutil.parser._parser.ParserError:
            files = [files[-1]]

        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

    print(f"model_result_filepaths is {model_result_filepaths}")

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
        eval_result.update_with_request_file(requests_path)

        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results.keys():
            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
        else:
            eval_results[eval_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue

    return results





# def get_raw_eval_results_mib(results_path: str) -> List[EvalResult_MIB]:
#     """Extract all evaluation results from the results folder"""
#     model_result_filepaths = []

#     print(f"results_path is {results_path}")

#     for root, dirnames, files in os.walk(results_path):
#         print(f"root is {root}, dirnames is {dirnames}, files is {files}")
#         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
#             continue

#         files.sort()
#         for file in files:
#             model_result_filepaths.append(os.path.join(root, file))

#     print(f"model_result_filepaths is {model_result_filepaths}")

#     eval_results = []
#     for model_result_filepath in model_result_filepaths:
#         try:
#             eval_result = EvalResult_MIB("", "", {})  # Create empty instance
#             result = eval_result.init_from_json_file(model_result_filepath)
#             # Verify the result can be converted to dict format
#             result.to_dict()
#             eval_results.append(result)
#         except Exception as e:
#             print(f"Error processing {model_result_filepath}: {e}")
#             continue

#     return eval_results

def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
    """From the path of the results folder root, extract all needed info for MIB results"""
    model_result_filepaths = []

    print(f"results_path is {results_path}")

    for root, dirnames, files in os.walk(results_path):
        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date - keeping original sorting logic
        try:
            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except dateutil.parser._parser.ParserError:
            files = [files[-1]]

        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

    print(f"model_result_filepaths is {model_result_filepaths}")

    eval_results = []
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = EvalResult_MIB("", "", {})  # Create empty instance
            result = eval_result.init_from_json_file(model_result_filepath)
            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
            # Verify the result can be converted to dict format
            result.to_dict()
            eval_results.append(result)
        except Exception as e:
            print(f"Error processing {model_result_filepath}: {e}")
            continue

    return eval_results
+
# from dataclasses import dataclass
|
511 |
+
# from enum import Enum
|
512 |
+
# from typing import Dict, List, Any
|
513 |
+
|
514 |
+
# @dataclass
|
515 |
+
# class Task:
|
516 |
+
# benchmark: str
|
517 |
+
# metrics: list[str]
|
518 |
+
# col_name: str
|
519 |
+
|
520 |
+
# def get_model_ids(self, results: Dict) -> List[str]:
|
521 |
+
# """Extract model IDs from results"""
|
522 |
+
# try:
|
523 |
+
# return [result["model_id"] for result in results["results"]]
|
524 |
+
# except (KeyError, TypeError):
|
525 |
+
# return []
|
526 |
+
|
527 |
+
# class TasksMIB(Enum):
|
528 |
+
# task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
|
529 |
+
# task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
|
530 |
+
|
531 |
+
# @classmethod
|
532 |
+
# def get_models(cls, results: Dict) -> List[str]:
|
533 |
+
# """Class method to get model IDs using any task"""
|
534 |
+
# # Since model IDs are common across tasks, we can use any task to extract them
|
535 |
+
# return cls.task0.value.get_model_ids(results)
|
536 |
+
|
537 |
+
# # Example usage:
|
538 |
+
# results = {
|
539 |
+
# "method_name": "EAP-IG (mean)",
|
540 |
+
# "results": [
|
541 |
+
# {"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
|
542 |
+
# {"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
|
543 |
+
# ]
|
544 |
+
# }
|
545 |
+
|
546 |
+
# # Get models using TasksMIB
|
547 |
+
# model_ids = TasksMIB.get_models(results)
|
548 |
+
# print(model_ids) # ['meta-llama/Llama-3.1-8B', 'Qwen/Qwen2-1.5B']
|
549 |
+
|
550 |
+
|
551 |
+
from dataclasses import dataclass
|
552 |
+
from enum import Enum
|
553 |
+
from typing import Dict, List, Tuple
|
554 |
+
|
555 |
+
@dataclass
|
556 |
+
class Task:
|
557 |
+
benchmark: str
|
558 |
+
metrics: list[str]
|
559 |
+
col_name: str
|
560 |
+
|
561 |
+
def get_method_results(self, results: Dict) -> List[Tuple[str, str, Dict]]:
|
562 |
+
"""
|
563 |
+
Extract (method_name, model_id, scores) tuples from results
|
564 |
+
|
565 |
+
Args:
|
566 |
+
results (Dict): Results dictionary containing method_name and results
|
567 |
+
|
568 |
+
Returns:
|
569 |
+
List[Tuple[str, str, Dict]]: List of (method_name, model_id, scores) tuples
|
570 |
+
"""
|
571 |
+
method_name = results.get("method_name", "unknown")
|
572 |
+
try:
|
573 |
+
return [
|
574 |
+
(method_name, result["model_id"], result["scores"])
|
575 |
+
for result in results["results"]
|
576 |
+
]
|
577 |
+
except (KeyError, TypeError):
|
578 |
+
return []
|
579 |
+
|
580 |
+
class TasksMIB(Enum):
|
581 |
+
task0 = Task("ioi", ["edge_counts", "faithfulness"], "Indirect Object Identification")
|
582 |
+
task1 = Task("mcqa", ["edge_counts", "faithfulness"], "Multiple Choice QA")
|
583 |
+
|
584 |
+
@classmethod
|
585 |
+
def get_method_model_pairs(cls, results: Dict) -> List[Tuple[str, str]]:
|
586 |
+
"""Get all (method_name, model_id) pairs from results"""
|
587 |
+
return [(pair[0], pair[1]) for pair in cls.task0.value.get_method_results(results)]
|
588 |
+
|
589 |
+
# Example usage:
|
590 |
+
results = {
|
591 |
+
"method_name": "EAP-IG (mean)",
|
592 |
+
"results": [
|
593 |
+
{"model_id": "meta-llama/Llama-3.1-8B", "scores": {}},
|
594 |
+
{"model_id": "Qwen/Qwen2-1.5B", "scores": {}}
|
595 |
+
]
|
596 |
+
}
|
597 |
+
|
598 |
+
# Get method-model pairs
|
599 |
+
method_model_pairs = TasksMIB.get_method_model_pairs(results)
|
600 |
+
print(method_model_pairs)
|
601 |
+
# [('EAP-IG (mean)', 'meta-llama/Llama-3.1-8B'), ('EAP-IG (mean)', 'Qwen/Qwen2-1.5B')]
|
602 |
+
|
603 |
+
# Get full results including scores
|
604 |
+
full_results = TasksMIB.task0.value.get_method_results(results)
|
605 |
+
for method_name, model_id, scores in full_results:
|
606 |
+
print(f"Method: {method_name}, Model: {model_id}")
|
607 |
+
print(f"Scores: {scores}")
|
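Note on the file sorting above: the `files.sort(key=...)` call assumes result files are named `results_<timestamp>.json`, with a microsecond suffix that the `[:-7]` slice strips off. A minimal, self-contained sketch of what the key extracts (the filenames below are made up, not taken from the repo):

```python
# Sketch only: illustrates the sort key used in get_raw_eval_results, under the
# assumption that result files follow the "results_<ISO timestamp>.<microseconds>.json" pattern.
def sort_key(name: str) -> str:
    # "results_2024-11-20T10-15-30.123456.json" -> "2024-11-20T10-15-30"
    return name.removesuffix(".json").removeprefix("results_")[:-7]

filenames = [
    "results_2024-11-20T10-15-30.123456.json",
    "results_2024-11-19T08-02-11.000001.json",
]

for name in filenames:
    print(sort_key(name))          # timestamp with the 7-character microsecond suffix removed

print(sorted(filenames, key=sort_key))  # oldest first, so the newest file ends up last
```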
src/populate.py
ADDED
@@ -0,0 +1,135 @@
import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    print(f"results_path is {results_path}, requests_path is {requests_path}")
    raw_data = get_raw_eval_results(results_path, requests_path)
    print(f"raw_data is {raw_data}")
    all_data_json = [v.to_dict() for v in raw_data]
    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
    all_data_json_filtered = []
    for item in all_data_json:
        item["Track"] = item["eval_name"].split("_")[-1]
        item["ioi"] = 0
        item["mcqa"] = 0
        # Keep multimodal entries only for the multimodal table, and text-only entries otherwise
        if "VQA" in benchmark_cols and "VQA" in item:
            all_data_json_filtered.append(item)
        if "VQA" not in benchmark_cols and "VQA" not in item:
            all_data_json_filtered.append(item)

    all_data_json = all_data_json_filtered

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
    # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
    # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)

    print(f"df is {df}")

    # df = df[cols].round(decimals=1)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df


# def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
#     """Creates a dataframe from all the individual experiment results"""
#     print(f"results_path is {results_path}, requests_path is {requests_path}")
#     raw_data = get_raw_eval_results(results_path, requests_path)
#     print(f"raw_data is {raw_data}")
#     all_data_json = [v.to_dict() for v in raw_data]
#     print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
#     all_data_json_filtered = []
#     for item in all_data_json:
#         item["Track"] = item["eval_name"].split("_")[-1]
#         if "VQA" in benchmark_cols and "VQA" in item:
#             all_data_json_filtered.append(item)
#         if "VQA" not in benchmark_cols and "VQA" not in item:
#             all_data_json_filtered.append(item)
#         all_data_json_filtered.append(item)

#     all_data_json = all_data_json_filtered

#     df = pd.DataFrame.from_records(all_data_json)
#     df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)

#     print(f"df is {df}")

#     df = df[cols].round(decimals=1)

#     # filter out if any of the benchmarks have not been produced
#     df = df[has_no_nan_values(df, benchmark_cols)]
#     return df

def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the MIB experiment results"""
    print(f"results_path is {results_path}, requests_path is {requests_path}")
    raw_data = get_raw_eval_results_mib(results_path, requests_path)
    print(f"raw_data is {raw_data}")

    # Convert each result to dict format
    all_data_json = [v.to_dict() for v in raw_data]
    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")

    # Convert to dataframe
    df = pd.DataFrame.from_records(all_data_json)

    # Sort by Average score descending
    if 'Average' in df.columns:
        # Convert '-' to NaN for sorting purposes
        df['Average'] = pd.to_numeric(df['Average'], errors='coerce')
        df = df.sort_values(by=['Average'], ascending=False, na_position='last')
        # Convert NaN back to '-'
        df['Average'] = df['Average'].fillna('-')

    return df


def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queues requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            if "still_on_hub" in data and data["still_on_hub"]:
                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            else:
                data[EvalQueueColumn.model.name] = data["model"]
                data[EvalQueueColumn.revision.name] = "N/A"

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder: check files inside the folder, not the current working directory
            sub_entries = [
                e
                for e in os.listdir(f"{save_path}/{entry}")
                if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
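The `'-'` handling in `get_leaderboard_df_mib` can be exercised on its own; a small pandas sketch with made-up rows (not real leaderboard data) showing why the column is coerced to numeric before sorting and then restored:

```python
import pandas as pd

# Hypothetical rows: "-" marks a method without a score yet.
df = pd.DataFrame.from_records([
    {"Method": "EAP-IG (mean)", "Average": 0.71},
    {"Method": "Random", "Average": "-"},
    {"Method": "EAP", "Average": 0.64},
])

# Same steps as get_leaderboard_df_mib: coerce "-" to NaN, sort descending
# with missing scores last, then put the "-" placeholder back for display.
df["Average"] = pd.to_numeric(df["Average"], errors="coerce")
df = df.sort_values(by=["Average"], ascending=False, na_position="last")
df["Average"] = df["Average"].fillna("-")
print(df)
```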
src/submission/check_validity.py
ADDED
@@ -0,0 +1,167 @@
import json
import os
import re
import numpy as np
from collections import defaultdict
from datetime import datetime, timedelta, timezone

import huggingface_hub
from huggingface_hub import ModelCard
from huggingface_hub.hf_api import ModelInfo
from transformers import AutoConfig
from transformers.models.auto.tokenization_auto import AutoTokenizer

from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES

def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Checks if the model card and license exist and have been filled."""
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata
    if card.data.license is None:
        if not ("license_name" in card.data and "license_link" in card.data):
            return False, (
                "License not found. Please add a license to your model card using the `license` metadata or a"
                " `license_name`/`license_link` pair."
            )

    # Enforce card content
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""

def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, object]:
    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses. Returns (ok, error_message, config)."""
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
        if test_tokenizer:
            try:
                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
            except ValueError as e:
                return (
                    False,
                    f"uses a tokenizer which is not in a transformers release: {e}",
                    None
                )
            except Exception as e:
                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
        return True, None, config

    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None
        )

    except Exception as e:
        return False, "was not found on hub!", None


def get_model_size(model_info: ModelInfo, precision: str):
    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    model_size = size_factor * model_size
    return model_size

def get_model_arch(model_info: ModelInfo):
    """Gets the model architecture from the configuration"""
    return model_info.config.get("architectures", "Unknown")

def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict]:
    """Gathers the already submitted models to avoid duplicates. Returns (submission ids, per-organisation submission dates)."""
    depth = 1
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)
                    file_names.append(f"{info['model']}_{info['revision']}_{info['track']}")

                    # Select organisation
                    if info["model"].count("/") == 0 or "submitted_time" not in info:
                        continue
                    organisation, _ = info["model"].split("/")
                    users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates

def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
    out_msg = ""
    for task in TEXT_TASKS:
        if task not in predictions:
            out_msg = f"Error: {task} not present"
            break
        for subtask in TEXT_TASKS[task]:
            if subtask not in predictions[task]:
                out_msg = f"Error: {subtask} not present under {task}"
                break
        if out_msg != "":
            break
    if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
        for task in VISION_TASKS:
            if task not in predictions:
                out_msg = f"Error: {task} not present"
                break
            for subtask in VISION_TASKS[task]:
                if subtask not in predictions[task]:
                    out_msg = f"Error: {subtask} not present under {task}"
                    break
            if out_msg != "":
                break

    # Make sure all examples have predictions, and that predictions are the correct type
    for task in predictions:
        for subtask in predictions[task]:
            if task == "devbench":
                a = np.array(predictions[task][subtask]["predictions"])
                if subtask == "sem-things":
                    required_shape = (1854, 1854)
                elif subtask == "gram-trog":
                    required_shape = (76, 4, 1)
                elif subtask == "lex-viz_vocab":
                    required_shape = (119, 4, 1)
                if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
                    out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
                    break
                if not str(a.dtype).startswith("float"):
                    out_msg = f"Error: Results for `{subtask}` ({task}) should be floats but aren't."
                    break
                continue

            num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
            if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
                out_msg = f"Error: {subtask} has the wrong number of examples."
                break

            if task == "glue":
                if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
                    out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
                    break
            else:
                if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
                    out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
                    break

        if out_msg != "":
            break

    if out_msg != "":
        return False, out_msg
    return True, "Upload successful."
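For reference, `is_valid_predictions` walks a nested `task -> subtask -> {"predictions": [{"pred": ...}]}` structure. A sketch of that shape with placeholder names (the real task and subtask names, and the expected example counts, come from `TEXT_TASKS`, `VISION_TASKS` and `NUM_EXPECTED_EXAMPLES` in `src/display/utils.py`):

```python
# Placeholder predictions payload: the task/subtask names below are illustrative only
# and would need to match TEXT_TASKS / VISION_TASKS for validation to pass.
predictions = {
    "blimp": {                                      # a text task
        "anaphor_agreement": {                      # one of its subtasks
            "predictions": [
                {"pred": "The cat saw itself"},     # non-glue predictions are strings
            ],
        },
    },
    "glue": {
        "boolq": {
            "predictions": [
                {"pred": 1},                        # glue predictions must be ints
            ],
        },
    },
}
```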
src/submission/submit.py
ADDED
@@ -0,0 +1,111 @@
import json
import os
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
from src.submission.check_validity import (
    already_submitted_models,
    check_model_card,
    get_model_size,
    is_model_on_hub,
    is_valid_predictions,
)

REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None

def add_new_eval(
    model_name: str,
    model_id: str,
    revision: str,
    track: str,
    predictions: dict,
):
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    out_message = ""

    user_name = ""
    model_path = model_name
    if "/" in model_name:
        user_name = model_name.split("/")[0]
        model_path = model_name.split("/")[1]

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if track is None:
        return styled_error("Please select a track.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    out_message = ""

    # Is the model info correctly filled?
    print("Made it before 1")
    try:
        model_info = API.model_info(repo_id=model_id, revision=revision)
    except Exception:
        out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
    print("Made it after 1")

    try:
        predictions_OK, error_msg = is_valid_predictions(predictions)
        if not predictions_OK:
            return styled_error(error_msg) + "<br>"
    except Exception as e:
        return styled_error(f"Could not validate the predictions file: {e}") + "<br>"

    print("Made it after 3")

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model_name": model_name,
        "hf_repo": model_id,
        "revision": revision,
        "track": track,
        "predictions": predictions,
        "status": "PENDING",
        "submitted_time": current_time,
    }

    print("Made it after 4")

    # Check for duplicate submission
    if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
        return styled_error("A model with this name has been already submitted.")

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"

    print("Made it after 5")

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    API.upload_file(
        path_or_fileobj=out_path,
        path_in_repo=out_path.split("eval-queue/")[1],
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model_name} to eval queue",
    )

    print("Made it after 6")

    # Remove the local file
    os.remove(out_path)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the request to show in the PENDING list."
    )
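A rough sketch of calling `add_new_eval` directly; in the Space it is normally wired to the Gradio submission form in `app.py`, and the model id, track name and predictions file below are placeholders:

```python
# Sketch only: requires the Space's environment (src.envs with API and QUEUE_REPO
# configured) and a predictions file produced by the evaluation pipeline.
import json

from src.submission.submit import add_new_eval

with open("predictions.json") as f:           # hypothetical path
    predictions = json.load(f)

message = add_new_eval(
    model_name="my-org/my-babylm-model",      # hypothetical hub id
    model_id="my-org/my-babylm-model",
    revision="main",
    track="strict",                           # placeholder track name
    predictions=predictions,
)
print(message)                                # styled HTML message returned to the UI
```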