Remove some unused code/imports.
Files changed:
- README.md +1 -1
- app.py +7 -104
- scripts/upload_f1_dataset.py +20 -5
- src/about.py +6 -12
- src/datamodel/data.py +9 -3
- src/display/css_html_js.py +1 -1
- src/leaderboard/read_evals.py +0 -196
- src/logger.py +1 -0
- src/populate.py +23 -48
- src/submission/check_validity.py +22 -9
- src/submission/submit.py +6 -38
README.md
CHANGED
@@ -42,7 +42,7 @@ If you encounter problem on the space, don't hesitate to restart it to remove the ...
 
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/...`
+- the logic to read all results and request files, then convert them in dataframe lines, in `src/populate.py`
 - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
app.py
CHANGED
@@ -1,39 +1,15 @@
-from functools import partial
-
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 
-
-
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 from src.datamodel.data import F1Data
-
 from src.display.css_html_js import custom_css
-
-from src.display.utils import (
-    # BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision,
-)
-from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
+from src.display.utils import AutoEvalColumn, ModelType, fields
+from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
 from src.logger import get_logger
-
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_solutions
 
 logger = get_logger(__name__)
@@ -52,16 +28,12 @@ leaderboard_df = get_leaderboard_df(RESULTS_REPO)
 
 logger.info("Initialized LBDB")
 
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe: pd.DataFrame):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -74,24 +46,12 @@ def init_leaderboard(dataframe):
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
-            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            # ColumnFilter(
-            #     AutoEvalColumn.params.name,
-            #     type="slider",
-            #     min=0.01,
-            #     max=150,
-            #     label="Select the number of parameters (B)",
-            # ),
-            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
 
-# Display image using Markdown
-# banner = ""
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.Image(
@@ -102,7 +62,6 @@ with demo:
         container=False,
     )
 
-    # gr.Markdown(banner)
     gr.HTML(
         """
         <style>
@@ -131,51 +90,12 @@ with demo:
         with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
             leaderboard = init_leaderboard(leaderboard_df)
 
-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
-        #     logger.info("Tab about")
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
             logger.info("Tab submission")
            with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-            # with gr.Column():
-            #     with gr.Accordion(
-            #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             finished_eval_table = gr.components.Dataframe(
-            #                 value=finished_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-            #     with gr.Accordion(
-            #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             running_eval_table = gr.components.Dataframe(
-            #                 value=running_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-
-            #     with gr.Accordion(
-            #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             pending_eval_table = gr.components.Dataframe(
-            #                 value=pending_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your solutions here!", elem_classes="markdown-text")
 
@@ -183,7 +103,6 @@ with demo:
             with gr.Column():
                 system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
                 org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
-                # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                 sys_type_dropdown = gr.Dropdown(
                     choices=[t.to_str(" ") for t in ModelType],
                     label=AutoEvalColumn.system_type.name,
@@ -192,23 +111,7 @@
                     interactive=True,
                 )
 
-                # with gr.Column():
                 submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
-                # precision = gr.Dropdown(
-                #     choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                #     label="Precision",
-                #     multiselect=False,
-                #     value="float16",
-                #     interactive=True,
-                # )
-                # weight_type = gr.Dropdown(
-                #     choices=[i.value.name for i in WeightType],
-                #     label="Weights type",
-                #     multiselect=False,
-                #     value="Original",
-                #     interactive=True,
-                # )
-                # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
                 logger.info("Submit button")
                 submit_button = gr.Button("Submit")
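For orientation, a minimal sketch (not part of the commit) of how the pieces kept in app.py fit together after this cleanup; the Tabs container and the launch call are assumptions based on the hunks above, not code shown in this diff.

# Sketch only: mirrors the imports and calls visible in the diff above.
import gradio as gr

from src.display.css_html_js import custom_css
from src.envs import RESULTS_REPO
from src.populate import get_leaderboard_df

leaderboard_df = get_leaderboard_df(RESULTS_REPO)  # as in the hunk header "@@ -52,16 +28,12 @@"

demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Tabs():  # assumed container; only the TabItem calls appear in the hunks
        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
            leaderboard = init_leaderboard(leaderboard_df)  # init_leaderboard is defined in app.py above

demo.launch()  # assumed; the launch/scheduler wiring is outside the hunks shown here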
scripts/upload_f1_dataset.py
CHANGED
@@ -2,6 +2,7 @@ import argparse
 import fnmatch
 import json
 import os
+from typing import Iterator
 
 from datasets import Dataset
 
@@ -13,9 +14,23 @@ logger = get_logger(__name__)
 
 def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument(
-        ...
-        ...
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        help="Dir with .json files",
+        required=True,
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=f"{CODE_PROBLEMS_REPO}",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        choices=["hard", "warmup"],
+        default="hard",
+    )
     return parser.parse_args()
 
 
@@ -26,7 +41,7 @@ def main(args: argparse.Namespace) -> None:
         raise ValueError(f"No .json files in input dir {args.input_dir}")
     logger.info("Found %d code problems in %s", len(input_files), args.input_dir)
 
-    def ds_generator():
+    def ds_generator() -> Iterator[dict]:
         for fname in sorted(input_files):
             formula_name = os.path.splitext(fname)[0]
             cp_path = os.path.join(args.input_dir, fname)
@@ -35,7 +50,7 @@ def main(args: argparse.Namespace) -> None:
             logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
             yield dict(id=code_problem["id"], code_problem=code_problem)
 
-    ds = Dataset.from_generator(ds_generator)
+    ds: Dataset = Dataset.from_generator(ds_generator)  # type: ignore
     logger.info("Created dataset")
 
     ds.push_to_hub(args.dataset_name, split=args.split, private=True)
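A brief usage note, not part of the diff: with the arguments spelled out above, the uploader would typically be driven from the command line, and each input file becomes one dataset row via ds_generator(). The file name below is a hypothetical example.

# e.g.  python scripts/upload_f1_dataset.py --input_dir ./problems --split warmup
# Record shape produced per <formula_name>.json file:
import json

with open("problems/ExampleFormula.json") as fh:  # hypothetical input file
    code_problem = json.load(fh)

row = dict(id=code_problem["id"], code_problem=code_problem)  # same shape the generator yields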
src/about.py
CHANGED
@@ -14,16 +14,11 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
-    # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
-
-# Your leaderboard name
-# TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
-
 TITLE = """
 <h1 id="space-title" style="
     text-align: center;
@@ -44,14 +39,13 @@ INTRODUCTION_TEXT = """
 Welcome to the official leaderboard for the paper:
 
 **FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
-*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
+*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Nadav Schweiger, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
 **AAI, July 2025**
 
 FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
 """
 
-
-LLM_BENCHMARKS_TEXT = f"""
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 
 ## Reproducibility
@@ -95,7 +89,7 @@ Submissions must:
 - **Organization**
 - **System Type**
 - Click **Submit**.
-
+
 ### ⏱️ After Submission
 
 Submissions are validated and evaluated within ~24 hours. Results will appear on the leaderboard once processed.
@@ -105,12 +99,12 @@
 CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
 CITATION_BUTTON_TEXT = r"""
 @misc{beniamini2025formulaonemeasuringdepthalgorithmic,
-      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
-      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
+      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
+      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Nadav Schweiger and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
       year={2025},
       eprint={2507.13337},
       archivePrefix={arXiv},
       primaryClass={cs.AI},
-      url={https://arxiv.org/abs/2507.13337},
+      url={https://arxiv.org/abs/2507.13337},
 }
 """
src/datamodel/data.py
CHANGED
@@ -3,14 +3,20 @@ import time
 
 from datasets import load_dataset
 
-from src.envs import ...
+from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO, TOKEN
 from src.logger import get_logger
 
 logger = get_logger(__name__)
 
 
 class F1Data:
-    def __init__(...
+    def __init__(
+        self,
+        cp_ds_name: str,
+        sub_ds_name: str,
+        res_ds_name: str,
+        split: str = "hard",
+    ):
         self.cp_dataset_name = cp_ds_name
         self.submissions_dataset_name = sub_ds_name
         self.results_dataset_name = res_ds_name
@@ -19,7 +25,7 @@ class F1Data:
         self._initialize()
 
     def _initialize(self):
-        logger.info("Initialize F1Data TOKEN='...
+        logger.info(f"Initialize F1Data TOKEN='{TOKEN}'")
         start_time = time.monotonic()
         cp_ds = load_dataset(self.cp_dataset_name, split=self.split, token=TOKEN)
         logger.info(
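A minimal sketch (an assumed call site, not shown in this diff) of constructing F1Data with the keyword names introduced by the expanded __init__ above:

from src.datamodel.data import F1Data
from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO

lbdb = F1Data(
    cp_ds_name=CODE_PROBLEMS_REPO,  # code-problems dataset repo
    sub_ds_name=SUBMISSIONS_REPO,   # submissions dataset repo
    res_ds_name=RESULTS_REPO,       # results dataset repo
    split="warmup",                 # optional; defaults to "hard"
)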
src/display/css_html_js.py
CHANGED
@@ -33,7 +33,7 @@ custom_css = """
     background: none;
     border: none;
 }
-
+
 #search-bar {
     padding: 0px;
 }
src/leaderboard/read_evals.py
DELETED
@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.LLM  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    return results
src/logger.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
 import sys
 
+
 def get_logger(filename: str, level=logging.INFO) -> logging.Logger:
     new_logger = logging.getLogger(filename)
     fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
src/populate.py
CHANGED
@@ -1,27 +1,29 @@
-import json
-import os
-
 import pandas as pd
-from datasets import ...
+from datasets import get_dataset_config_names, load_dataset
 from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm
 
-from src.display....
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn
 from src.envs import TOKEN
-from src.leaderboard.read_evals import get_raw_eval_results
 from src.logger import get_logger
 
 logger = get_logger(__name__)
 
 
 def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
-    """...
+    """
+    @brief Creates a dataframe from all the individual experiment results.
+    """
 
     try:
-        configs = get_dataset_config_names(...
+        configs = get_dataset_config_names(
+            results_dataset_name,
+            token=TOKEN,
+        )
     except (DatasetNotFoundError, FileNotFoundError):
+
         # Return an empty DataFrame with expected columns
+        logger.warning("Failed to load configuration", exc_info=True)
         return pd.DataFrame(
             columns=[
                 "System Name",
@@ -34,8 +36,17 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     )
 
     rows = []
-    for submission_id in tqdm(...
-        ...
+    for submission_id in tqdm(
+        configs,
+        total=len(configs),
+        desc="Processing Submission Results",
+    ):
+        submission_ds = load_dataset(
+            results_dataset_name,
+            submission_id,
+            split="train",
+            token=TOKEN,
+        )
         submission_df = pd.DataFrame(submission_ds)
 
         if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
@@ -59,7 +70,7 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
 
     full_df = pd.DataFrame(rows)
 
-    # TODO: ...
+    # TODO: Forbid multiple submissions under the same name?
     # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
     final_df = (
         full_df.sort_values("Submitted On", ascending=False)
@@ -72,39 +83,3 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
 
     return final_df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
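The per-submission aggregation between the hunks above is not part of this diff; as a hedged illustration only, the `did_pass` check suggests each result config reduces to a success-rate row roughly along these lines (the output column name is an assumption):

import pandas as pd

def summarize_submission(submission_df: pd.DataFrame) -> dict:
    # Assumes one boolean `did_pass` entry per evaluated problem.
    success_rate = 100.0 * submission_df["did_pass"].mean()
    return {"Success Rate (%)": round(success_rate, 2)}

# Example: summarize_submission(pd.DataFrame({"did_pass": [True, True, False]}))
# -> {"Success Rate (%)": 66.67}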
src/submission/check_validity.py
CHANGED
@@ -4,8 +4,8 @@ import re
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 
-from datasets import get_dataset_config_names
 import huggingface_hub
+from datasets import get_dataset_config_names
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
@@ -13,6 +13,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from src.envs import SUBMISSIONS_REPO
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -34,28 +35,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(...) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(...
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                AutoTokenizer.from_pretrained(...
+                AutoTokenizer.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    token=token,
+                )
             except ValueError as e:
-                return (
-                    False,
-                    ...
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception as e:
+                return (
+                    False,
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
+                )
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )
 
     except Exception as e:
@@ -73,10 +84,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
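An example call against the expanded is_model_on_hub signature above; a sketch only, with a placeholder repo id. Note that, as the body shows, the function actually returns a third element (the loaded config) on success.

from src.submission.check_validity import is_model_on_hub

ok, error_msg, config = is_model_on_hub(
    model_name="some-org/some-model",  # hypothetical Hub repo id
    revision="main",
    token=None,                        # or a read token for private repos
    trust_remote_code=False,
    test_tokenizer=True,               # also try loading the tokenizer
)
if not ok:
    print(f"Model check failed: {error_msg}")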
src/submission/submit.py
CHANGED
@@ -1,25 +1,16 @@
-import json
-import os
-from datetime import datetime, timezone
 import time
+from datetime import datetime, timezone
 
-from datasets import Dataset, DatasetDict
 import pandas as pd
-from ...
+from datasets import Dataset
+from pandas.api.types import is_integer_dtype
 
 from src.datamodel.data import F1Data
-from src.display.formatting import styled_error, styled_message
+from src.display.formatting import styled_error, styled_message
 from src.display.utils import ModelType
-from src.envs import ...
+from src.envs import SUBMISSIONS_REPO
 from src.logger import get_logger
 
-# from src.submission.check_validity import (
-#     already_submitted_models,
-#     check_model_card,
-#     get_model_size,
-#     is_model_on_hub,
-# )
-
 logger = get_logger(__name__)
 
 
@@ -33,7 +24,7 @@ def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
     if not is_integer_dtype(pd_ds["problem_id"]):
         return "problem_id must be str convertible to int"
 
-    if any(type(v) ...
+    if any(type(v) is not str for v in pd_ds["solution"]):
         return "solution must be of type str"
 
     submitted_ids = set(pd_ds.problem_id.astype(str))
@@ -96,30 +87,7 @@ def add_new_solutions(
     }
 
     ds = Dataset.from_pandas(submission_df).map(add_info)
-
-    # dsdict = DatasetDict({submission_id: ds})
-    # dsdict.push_to_hub(SUBMISSIONS_REPO, private=True)
-
     ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)
-    # print("Creating eval file")
-    # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    # os.makedirs(OUT_DIR, exist_ok=True)
-    # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    # with open(out_path, "w") as f:
-    #     f.write(json.dumps(eval_entry))
-
-    # print("Uploading eval file")
-    # API.upload_file(
-    #     path_or_fileobj=out_path,
-    #     path_in_repo=out_path.split("eval-queue/")[1],
-    #     repo_id=QUEUE_REPO,
-    #     repo_type="dataset",
-    #     commit_message=f"Add {model} to eval queue",
-    # )
-
-    # # Remove the local file
-    # os.remove(out_path)
 
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nResults may take up to 24 hours to be processed and shown in the leaderboard."