galb-dai committed
Commit 416ebf1 · Parent(s): cc4e1bd

Remove some unused code/imports.
README.md CHANGED
@@ -42,7 +42,7 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
+- the logic to read all results and request files, then convert them in dataframe lines, in `src/populate.py`
 - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
 
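Orientation note (not part of the diff): after this change the README points the results-reading logic at `src/populate.py` alone. A minimal sketch of that flow, using only names visible in the app.py hunks below; treat it as a sketch of the wiring, not the exact code in the Space:

# Sketch only: mirrors the calls visible in app.py below.
from src.envs import RESULTS_REPO
from src.populate import get_leaderboard_df

# Build the leaderboard dataframe from the results dataset, then hand it to the
# gradio_leaderboard component via init_leaderboard(), which app.py defines.
leaderboard_df = get_leaderboard_df(RESULTS_REPO)
leaderboard = init_leaderboard(leaderboard_df)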
app.py CHANGED
@@ -1,39 +1,15 @@
-from functools import partial
-
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 
-# from huggingface_hub import snapshot_download
-
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 from src.datamodel.data import F1Data
-
 from src.display.css_html_js import custom_css
-
-from src.display.utils import (
-    # BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision,
-)
-from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
+from src.display.utils import AutoEvalColumn, ModelType, fields
+from src.envs import API, CODE_PROBLEMS_REPO, REPO_ID, RESULTS_REPO, SUBMISSIONS_REPO
 from src.logger import get_logger
-
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_solutions
 
 logger = get_logger(__name__)
@@ -52,16 +28,12 @@ leaderboard_df = get_leaderboard_df(RESULTS_REPO)
 
 logger.info("Initialized LBDB")
 
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe: pd.DataFrame):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -74,24 +46,12 @@ def init_leaderboard(dataframe):
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
            ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
-            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            # ColumnFilter(
-            #     AutoEvalColumn.params.name,
-            #     type="slider",
-            #     min=0.01,
-            #     max=150,
-            #     label="Select the number of parameters (B)",
-            # ),
-            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
 
 
-# Display image using Markdown
-# banner = "![Leaderboard Banner](file/assets/banner.png)"
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.Image(
@@ -102,7 +62,6 @@ with demo:
         container=False,
     )
 
-    # gr.Markdown(banner)
     gr.HTML(
         """
         <style>
@@ -131,51 +90,12 @@ with demo:
         with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
             leaderboard = init_leaderboard(leaderboard_df)
 
-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
-        #     logger.info("Tab about")
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
             logger.info("Tab submission")
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-            # with gr.Column():
-            #     with gr.Accordion(
-            #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             finished_eval_table = gr.components.Dataframe(
-            #                 value=finished_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-            #     with gr.Accordion(
-            #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             running_eval_table = gr.components.Dataframe(
-            #                 value=running_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-
-            #     with gr.Accordion(
-            #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             pending_eval_table = gr.components.Dataframe(
-            #                 value=pending_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your solutions here!", elem_classes="markdown-text")
 
@@ -183,7 +103,6 @@ with demo:
                 with gr.Column():
                     system_name_textbox = gr.Textbox(label=AutoEvalColumn.system.name)
                     org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
-                    # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     sys_type_dropdown = gr.Dropdown(
                         choices=[t.to_str(" ") for t in ModelType],
                         label=AutoEvalColumn.system_type.name,
@@ -192,23 +111,7 @@
                         interactive=True,
                     )
 
-                # with gr.Column():
                     submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
-                    # precision = gr.Dropdown(
-                    #     choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                    #     label="Precision",
-                    #     multiselect=False,
-                    #     value="float16",
-                    #     interactive=True,
-                    # )
-                    # weight_type = gr.Dropdown(
-                    #     choices=[i.value.name for i in WeightType],
-                    #     label="Weights type",
-                    #     multiselect=False,
-                    #     value="Original",
-                    #     interactive=True,
-                    # )
-                    # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
                 logger.info("Submit button")
                 submit_button = gr.Button("Submit")
scripts/upload_f1_dataset.py CHANGED
@@ -2,6 +2,7 @@ import argparse
 import fnmatch
 import json
 import os
+from typing import Iterator
 
 from datasets import Dataset
 
@@ -13,9 +14,23 @@ logger = get_logger(__name__)
 
 def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--input_dir", type=str, help="Dir with .json files", required=True)
-    parser.add_argument("--dataset_name", type=str, default=f"{CODE_PROBLEMS_REPO}")
-    parser.add_argument("--split", type=str, choices=["hard", "warmup"], default="hard")
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        help="Dir with .json files",
+        required=True,
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=f"{CODE_PROBLEMS_REPO}",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        choices=["hard", "warmup"],
+        default="hard",
+    )
     return parser.parse_args()
 
 
@@ -26,7 +41,7 @@ def main(args: argparse.Namespace) -> None:
         raise ValueError(f"No .json files in input dir {args.input_dir}")
     logger.info("Found %d code problems in %s", len(input_files), args.input_dir)
 
-    def ds_generator():
+    def ds_generator() -> Iterator[dict]:
        for fname in sorted(input_files):
            formula_name = os.path.splitext(fname)[0]
            cp_path = os.path.join(args.input_dir, fname)
@@ -35,7 +50,7 @@ def main(args: argparse.Namespace) -> None:
            logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
            yield dict(id=code_problem["id"], code_problem=code_problem)
 
-    ds = Dataset.from_generator(ds_generator)
+    ds: Dataset = Dataset.from_generator(ds_generator)  # type: ignore
     logger.info("Created dataset")
 
     ds.push_to_hub(args.dataset_name, split=args.split, private=True)
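Context note (not part of the diff): the dataset this script pushes is read back by `F1Data` in `src/datamodel/data.py` through `load_dataset`. A minimal sketch of that round trip, assuming the same `CODE_PROBLEMS_REPO` and `TOKEN` values used elsewhere in the repo:

# Sketch only: mirrors the load path visible in src/datamodel/data.py.
from datasets import load_dataset

from src.envs import CODE_PROBLEMS_REPO, TOKEN

# Each record carries the fields yielded by ds_generator(): "id" and "code_problem".
cp_ds = load_dataset(CODE_PROBLEMS_REPO, split="hard", token=TOKEN)
print(cp_ds[0]["id"])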
src/about.py CHANGED
@@ -14,16 +14,11 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
-    # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
-
-# Your leaderboard name
-# TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
-
 TITLE = """
 <h1 id="space-title" style="
     text-align: center;
@@ -44,14 +39,13 @@ INTRODUCTION_TEXT = """
 Welcome to the official leaderboard for the paper:
 
 **FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming** <br>
-*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
+*Gal Beniamini, Yuval Dor, Alon Vinnikov, Shir Granot Peled, Or Weinstein, Or Sharir, Noam Wies, Tomer Nussbaum, Nadav Schweiger, Ido Ben Shaul, Tomer Zekharya, Yoav Levine, Shai Shalev-Shwartz, Amnon Shashua* <br>
 **AAI, July 2025**
 
 FormulaOne is a new benchmark designed to challenge frontier AI models. The benchmark is constructed from a vast and conceptually diverse family of dynamic programming problems derived from Monadic Second-Order (MSO) logic on graphs, a framework with profound connections to theoretical computer science.
 """
 
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 
 ## Reproducibility
@@ -95,7 +89,7 @@ Submissions must:
 - **Organization**
 - **System Type**
 - Click **Submit**.
-
+
 ### ⏱️ After Submission
 
 Submissions are validated and evaluated within ~24 hours. Results will appear on the leaderboard once processed.
@@ -105,12 +99,12 @@ Submissions are validated and evaluated within ~24 hours. Results will appear on
 CITATION_BUTTON_LABEL = """📚 How to cite FormulaOne"""
 CITATION_BUTTON_TEXT = r"""
 @misc{beniamini2025formulaonemeasuringdepthalgorithmic,
-      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
-      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
+      title={FormulaOne: Measuring the Depth of Algorithmic Reasoning Beyond Competitive Programming},
+      author={Gal Beniamini and Yuval Dor and Alon Vinnikov and Shir Granot Peled and Or Weinstein and Or Sharir and Noam Wies and Tomer Nussbaum and Nadav Schweiger and Ido Ben Shaul and Tomer Zekharya and Yoav Levine and Shai Shalev-Shwartz and Amnon Shashua},
       year={2025},
       eprint={2507.13337},
       archivePrefix={arXiv},
       primaryClass={cs.AI},
-      url={https://arxiv.org/abs/2507.13337},
+      url={https://arxiv.org/abs/2507.13337},
 }
 """
src/datamodel/data.py CHANGED
@@ -3,14 +3,20 @@ import time
 
 from datasets import load_dataset
 
-from src.envs import TOKEN, CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO
+from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO, TOKEN
 from src.logger import get_logger
 
 logger = get_logger(__name__)
 
 
 class F1Data:
-    def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str, split: str = "hard"):
+    def __init__(
+        self,
+        cp_ds_name: str,
+        sub_ds_name: str,
+        res_ds_name: str,
+        split: str = "hard",
+    ):
         self.cp_dataset_name = cp_ds_name
         self.submissions_dataset_name = sub_ds_name
         self.results_dataset_name = res_ds_name
@@ -19,7 +25,7 @@ class F1Data:
         self._initialize()
 
     def _initialize(self):
-        logger.info("Initialize F1Data TOKEN='%s'", TOKEN)
+        logger.info(f"Initialize F1Data TOKEN='{TOKEN}'")
         start_time = time.monotonic()
         cp_ds = load_dataset(self.cp_dataset_name, split=self.split, token=TOKEN)
         logger.info(
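Orientation note (not part of the diff): the reformatted constructor above is presumably invoked with the dataset repos imported at the top of the file; the actual call site sits outside the visible hunks, so this is only a sketch:

from src.datamodel.data import F1Data
from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO

# Keyword names follow the __init__ signature shown in the hunk above.
lbdb = F1Data(
    cp_ds_name=CODE_PROBLEMS_REPO,
    sub_ds_name=SUBMISSIONS_REPO,
    res_ds_name=RESULTS_REPO,
    split="hard",
)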
src/display/css_html_js.py CHANGED
@@ -33,7 +33,7 @@ custom_css = """
   background: none;
   border: none;
 }
-
+
 #search-bar {
   padding: 0px;
 }
src/leaderboard/read_evals.py DELETED
@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.LLM  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    return results
src/logger.py CHANGED
@@ -1,6 +1,7 @@
 import logging
 import sys
 
+
 def get_logger(filename: str, level=logging.INFO) -> logging.Logger:
     new_logger = logging.getLogger(filename)
     fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
src/populate.py CHANGED
@@ -1,27 +1,29 @@
-import json
-import os
-
 import pandas as pd
-from datasets import load_dataset, get_dataset_config_names
+from datasets import get_dataset_config_names, load_dataset
 from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm
 
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn
 from src.envs import TOKEN
-from src.leaderboard.read_evals import get_raw_eval_results
 from src.logger import get_logger
 
 logger = get_logger(__name__)
 
 
 def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
+    """
+    @brief Creates a dataframe from all the individual experiment results.
+    """
 
     try:
-        configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
+        configs = get_dataset_config_names(
+            results_dataset_name,
+            token=TOKEN,
+        )
     except (DatasetNotFoundError, FileNotFoundError):
+
         # Return an empty DataFrame with expected columns
+        logger.warning("Failed to load configuration", exc_info=True)
         return pd.DataFrame(
             columns=[
                 "System Name",
@@ -34,8 +36,17 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     )
 
     rows = []
-    for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
-        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
+    for submission_id in tqdm(
+        configs,
+        total=len(configs),
+        desc="Processing Submission Results",
+    ):
+        submission_ds = load_dataset(
+            results_dataset_name,
+            submission_id,
+            split="train",
+            token=TOKEN,
+        )
         submission_df = pd.DataFrame(submission_ds)
 
         if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
@@ -59,7 +70,7 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
 
     full_df = pd.DataFrame(rows)
 
-    # TODO: forbid multiple submissions under the same name?
+    # TODO: Forbid multiple submissions under the same name?
     # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
     final_df = (
         full_df.sort_values("Submitted On", ascending=False)
@@ -72,39 +83,3 @@ def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
 
     return final_df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py CHANGED
@@ -4,8 +4,8 @@ import re
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 
-from datasets import get_dataset_config_names
 import huggingface_hub
+from datasets import get_dataset_config_names
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
@@ -13,6 +13,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from src.envs import SUBMISSIONS_REPO
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -34,28 +35,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                AutoTokenizer.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    token=token,
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception as e:
                 return (
                     False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
        )
 
     except Exception as e:
@@ -73,10 +84,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
         model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
src/submission/submit.py CHANGED
@@ -1,25 +1,16 @@
-import json
-import os
-from datetime import datetime, timezone
 import time
+from datetime import datetime, timezone
 
-from datasets import Dataset, DatasetDict
 import pandas as pd
-from pandas.api.types import is_integer_dtype, is_string_dtype
+from datasets import Dataset
+from pandas.api.types import is_integer_dtype
 
 from src.datamodel.data import F1Data
-from src.display.formatting import styled_error, styled_message, styled_warning
+from src.display.formatting import styled_error, styled_message
 from src.display.utils import ModelType
-from src.envs import API, SUBMISSIONS_REPO, TOKEN
+from src.envs import SUBMISSIONS_REPO
 from src.logger import get_logger
 
-# from src.submission.check_validity import (
-#     already_submitted_models,
-#     check_model_card,
-#     get_model_size,
-#     is_model_on_hub,
-# )
-
 logger = get_logger(__name__)
 
 
@@ -33,7 +24,7 @@ def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
     if not is_integer_dtype(pd_ds["problem_id"]):
         return "problem_id must be str convertible to int"
 
-    if any(type(v) != str for v in pd_ds["solution"]):
+    if any(type(v) is not str for v in pd_ds["solution"]):
         return "solution must be of type str"
 
     submitted_ids = set(pd_ds.problem_id.astype(str))
@@ -96,30 +87,7 @@ def add_new_solutions(
     }
 
     ds = Dataset.from_pandas(submission_df).map(add_info)
-
-    # dsdict = DatasetDict({submission_id: ds})
-    # dsdict.push_to_hub(SUBMISSIONS_REPO, private=True)
-
     ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)
-    # print("Creating eval file")
-    # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    # os.makedirs(OUT_DIR, exist_ok=True)
-    # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    # with open(out_path, "w") as f:
-    #     f.write(json.dumps(eval_entry))
-
-    # print("Uploading eval file")
-    # API.upload_file(
-    #     path_or_fileobj=out_path,
-    #     path_in_repo=out_path.split("eval-queue/")[1],
-    #     repo_id=QUEUE_REPO,
-    #     repo_type="dataset",
-    #     commit_message=f"Add {model} to eval queue",
-    # )
-
-    # # Remove the local file
-    # os.remove(out_path)
 
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nResults may take up to 24 hours to be processed and shown in the leaderboard."