KurtMica committed on
Commit
236bb17
·
1 Parent(s): f171a05

Model output submission.

Browse files
app.py CHANGED
@@ -19,14 +19,13 @@ from src.display.utils import (
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
22
- ModelType,
23
  fields,
24
- WeightType,
25
- Precision
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
 
31
 
32
  def restart_space():
@@ -71,8 +70,15 @@ def init_leaderboard(dataframe):
71
  search_columns=[AutoEvalColumn.model.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
  ColumnFilter(AutoEvalColumn.maltese_training.name, type="checkboxgroup", label="Maltese training"),
 
 
 
 
 
 
 
76
  ColumnFilter(
77
  AutoEvalColumn.params.name,
78
  type="slider",
@@ -80,6 +86,8 @@ def init_leaderboard(dataframe):
80
  max=150,
81
  label="Select the number of parameters (B)",
82
  ),
 
 
83
  ColumnFilter(
84
  AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
  ),
@@ -145,45 +153,71 @@ with demo:
145
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
 
147
  with gr.Row():
 
 
 
 
 
 
 
148
  with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.NK],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
 
 
 
 
 
 
157
  )
158
 
159
  with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
 
163
  multiselect=False,
164
- value="float16",
165
  interactive=True,
166
  )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
 
170
  multiselect=False,
171
- value="Original",
 
 
 
 
 
 
172
  interactive=True,
173
  )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
 
176
  submit_button = gr.Button("Submit Eval")
177
  submission_result = gr.Markdown()
 
 
 
 
 
 
 
178
  submit_button.click(
179
  add_new_eval,
180
  [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
  ],
188
  submission_result,
189
  )
@@ -201,4 +235,4 @@ with demo:
201
  scheduler = BackgroundScheduler()
202
  scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
22
+ ModelTraining,
23
  fields,
24
+ MalteseTraining
 
25
  )
26
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
27
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
28
+ from src.submission.submit import add_new_eval, read_configuration
29
 
30
 
31
  def restart_space():
 
70
  search_columns=[AutoEvalColumn.model.name],
71
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
72
  filter_columns=[
73
+ ColumnFilter(AutoEvalColumn.model_training.name, type="checkboxgroup", label="Model types"),
74
  ColumnFilter(AutoEvalColumn.maltese_training.name, type="checkboxgroup", label="Maltese training"),
75
+ ColumnFilter(
76
+ AutoEvalColumn.language_count.name,
77
+ type="slider",
78
+ min=1,
79
+ max=1000,
80
+ label="Number of languages during training",
81
+ ),
82
  ColumnFilter(
83
  AutoEvalColumn.params.name,
84
  type="slider",
 
86
  max=150,
87
  label="Select the number of parameters (B)",
88
  ),
89
+ ColumnFilter(AutoEvalColumn.prompt_version.name, type="checkboxgroup", label="Prompt Version"),
90
+ ColumnFilter(AutoEvalColumn.n_shot.name, type="slider", min=0, max=100, label="Number of Shots"),
91
  ColumnFilter(
92
  AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
93
  ),
 
153
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
154
 
155
  with gr.Row():
156
+ files = gr.File(
157
+ label="Files (Configuration File & Prediction Outputs)",
158
+ file_count="directory",
159
+ type="filepath",
160
+ )
161
+
162
+ with gr.Row(equal_height=True):
163
  with gr.Column():
164
+ model_name = gr.Textbox(
165
+ label="Model name",
166
+ info="Read automatically from the results file.",
167
+ interactive=False,
168
+ )
169
+ version = gr.Textbox(
170
+ label="Prompt Version",
171
+ info="Read automatically from the results file.",
172
+ interactive=False,
173
+ )
174
+ n_shots = gr.Number(
175
+ label="Number of Shots",
176
+ info="Read automatically from the results file.",
177
+ interactive=False,
178
  )
179
 
180
  with gr.Column():
181
+ model_training = gr.Dropdown(
182
+ choices=[t.to_str(": ") for t in ModelTraining if t != ModelTraining.NK],
183
+ label="Model Training",
184
+ info="How to model is trained.",
185
  multiselect=False,
186
+ value=None,
187
  interactive=True,
188
  )
189
+ maltese_training = gr.Dropdown(
190
+ choices=[t.to_str(": ") for t in MalteseTraining if t != ModelTraining.NK],
191
+ label="Maltese Training",
192
+ info="The last stage of training in which Maltese was included.",
193
  multiselect=False,
194
+ value=None,
195
+ interactive=True,
196
+ )
197
+ language_count = gr.Number(
198
+ label="Number of languages",
199
+ info="Include languages for all training stages. Set to 0 if unknown.",
200
+ minimum=0,
201
  interactive=True,
202
  )
 
203
 
204
  submit_button = gr.Button("Submit Eval")
205
  submission_result = gr.Markdown()
206
+
207
+ configuration = gr.State()
208
+ file_paths = gr.State()
209
+ files.change(read_configuration,
210
+ files,
211
+ [configuration, file_paths, model_name, version, n_shots, submission_result])
212
+
213
  submit_button.click(
214
  add_new_eval,
215
  [
216
+ model_training,
217
+ maltese_training,
218
+ language_count,
219
+ configuration,
220
+ file_paths
 
221
  ],
222
  submission_result,
223
  )
 
235
  scheduler = BackgroundScheduler()
236
  scheduler.add_job(restart_space, "interval", seconds=1800)
237
  scheduler.start()
238
+ demo.queue(default_concurrency_limit=40).launch()
requirements.txt CHANGED
@@ -9,6 +9,8 @@ huggingface-hub>=0.18.0
9
  matplotlib
10
  numpy
11
  pandas
 
 
12
  python-dateutil
13
  tqdm
14
  transformers
 
9
  matplotlib
10
  numpy
11
  pandas
12
+ protobuf
13
+ pydantic==2.10.6
14
  python-dateutil
15
  tqdm
16
  transformers
src/about.py CHANGED
@@ -26,22 +26,22 @@ class Task:
26
  # ---------------------------------------------------
27
  class Tasks(Enum):
28
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
29
- task0 = Task("sentiment", "f1,none", "Sentiment Analysis (F1)", TaskType.NLU)
30
- task1 = Task("sib200", "f1,none", "SIB200 (F1)", TaskType.NLU)
31
- task2 = Task("taxi1500", "f1,none", "Taxi1500 (F1)", TaskType.NLU)
32
- task3 = Task("maltese_news_categories", "loglikelihood,none", "Maltese News Categories (F1)", TaskType.NLU)
33
- task4 = Task("multi_eurlex", "loglikelihood,none", "MultiEURLEX (F1)", TaskType.NLU)
34
- task5 = Task("belebele", "acc,none", "Belebele (Accuracy)", TaskType.NLU)
35
- task6 = Task("opus100_en-mt", "bleu,none", "OPUS-100 EN→MT (BLEU)", TaskType.NLG, False)
36
- task7 = Task("opus100_en-mt", "chrf,none", "OPUS-100 EN→MT (ChrF)", TaskType.NLG)
37
- task8 = Task("flores200_en-mt", "bleu,none", "Flores-200 EN→MT (BLEU)", TaskType.NLG, False)
38
- task9 = Task("flores200_en-mt", "chrf,none", "Flores-200 EN→MT (ChrF)", TaskType.NLG)
39
- task10 = Task("webnlg", "chrf,none", "WebNLG (ChrF)", TaskType.NLG)
40
- task11 = Task("webnlg", "rouge,none", "WebNLG (Rouge-L)", TaskType.NLG, False)
41
- task12 = Task("eurlex_sum", "chrf,none", "EUR-Lex-Sum (ChrF)", TaskType.NLG, False)
42
- task13 = Task("eurlex_sum", "rouge,none", "EUR-Lex-Sum (Rouge-L)", TaskType.NLG)
43
- task14 = Task("maltese_news_headlines", "chrf,none", "Maltese News Headlines (ChrF)", TaskType.NLG, False)
44
- task15 = Task("maltese_news_headlines", "rouge,none", "Maltese News Headlines (Rouge-L)", TaskType.NLG)
45
 
46
  NUM_FEWSHOT = 0 # Change with your few shot
47
  # ---------------------------------------------------
@@ -66,33 +66,9 @@ To reproduce our results, here is the commands you can run:
66
  """
67
 
68
  EVALUATION_QUEUE_TEXT = """
69
- ## Some good practices before submitting a model
70
-
71
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
72
- ```python
73
- from transformers import AutoConfig, AutoModel, AutoTokenizer
74
- config = AutoConfig.from_pretrained("your model name", revision=revision)
75
- model = AutoModel.from_pretrained("your model name", revision=revision)
76
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
77
- ```
78
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
79
-
80
- Note: make sure your model is public!
81
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
82
-
83
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
84
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
85
-
86
- ### 3) Make sure your model has an open license!
87
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
88
-
89
- ### 4) Fill up your model card
90
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
91
-
92
- ## In case of model failure
93
- If your model is displayed in the `FAILED` category, its execution stopped.
94
- Make sure you have followed the above steps first.
95
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
96
  """
97
 
98
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
26
  # ---------------------------------------------------
27
  class Tasks(Enum):
28
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
29
+ task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", TaskType.NLU)
30
+ task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", TaskType.NLU)
31
+ task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", TaskType.NLU)
32
+ task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", TaskType.NLU)
33
+ task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", TaskType.NLU)
34
+ task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", TaskType.NLU)
35
+ task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", TaskType.NLG, False)
36
+ task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", TaskType.NLG)
37
+ task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", TaskType.NLG, False)
38
+ task9 = Task("flores200_eng-mlt", "chrf", "Flores-200 EN→MT (ChrF)", TaskType.NLG)
39
+ task10 = Task("webnlg_mlt", "chrf", "WebNLG (ChrF)", TaskType.NLG)
40
+ task11 = Task("webnlg_mlt", "rouge", "WebNLG (Rouge-L)", TaskType.NLG, False)
41
+ task12 = Task("eurlexsum_mlt", "chrf", "EUR-Lex-Sum (ChrF)", TaskType.NLG, False)
42
+ task13 = Task("eurlexsum_mlt", "rouge", "EUR-Lex-Sum (Rouge-L)", TaskType.NLG)
43
+ task14 = Task("maltese_news_headlines", "chrf", "Maltese News Headlines (ChrF)", TaskType.NLG, False)
44
+ task15 = Task("maltese_news_headlines", "rouge", "Maltese News Headlines (Rouge-L)", TaskType.NLG)
45
 
46
  NUM_FEWSHOT = 0 # Change with your few shot
47
  # ---------------------------------------------------
 
66
  """
67
 
68
  EVALUATION_QUEUE_TEXT = """
69
+ To include new results on this benchmark, follow the instructions on our [GitHub Repository](https://github.com/MLRS/MELABench/tree/main/prompting).
70
+ You can then upload the output files, which should include the configuration/results file and all the prediction files.
71
+ We also ask for some additional metadata about how the model was trained.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  """
73
 
74
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
src/display/utils.py CHANGED
@@ -26,6 +26,8 @@ auto_eval_column_dict = []
26
  # Init
27
  auto_eval_column_dict.append(["model_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 
 
29
  #Scores
30
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average (All) ⬆️", "number", True)])
31
  for task_type in TaskType:
@@ -33,9 +35,9 @@ for task_type in TaskType:
33
  for task in Tasks:
34
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric)])
35
  # Model information
36
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
37
  auto_eval_column_dict.append(["maltese_training", ColumnContent, ColumnContent("Maltese Training", "str", False)])
38
- auto_eval_column_dict.append(["num_languages", ColumnContent, ColumnContent("#Languages", "number", False)])
39
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
40
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
41
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -53,9 +55,10 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
53
  class EvalQueueColumn: # Queue column
54
  model = ColumnContent("model", "markdown", True)
55
  revision = ColumnContent("revision", "str", True)
56
- private = ColumnContent("private", "bool", True)
57
  precision = ColumnContent("precision", "str", True)
58
- weight_type = ColumnContent("weight_type", "str", "Original")
 
 
59
  status = ColumnContent("status", "str", True)
60
 
61
  ## All the model information that we might need
@@ -66,7 +69,7 @@ class ModelDetails:
66
  symbol: str = "" # emoji
67
 
68
 
69
- class ModelType(Enum):
70
  PT = ModelDetails(name="pre-trained", symbol="PT")
71
  FT = ModelDetails(name="fine-tuned", symbol="FT")
72
  IT = ModelDetails(name="instruction-tuned", symbol="IT")
@@ -78,13 +81,13 @@ class ModelType(Enum):
78
  @staticmethod
79
  def from_str(type):
80
  type = type or ""
81
- if type == "PT":
82
- return ModelType.PT
83
- if type == "FT":
84
- return ModelType.FT
85
- if type == "IT":
86
- return ModelType.IT
87
- return ModelType.NK
88
 
89
 
90
  class MalteseTraining(Enum):
@@ -100,13 +103,13 @@ class MalteseTraining(Enum):
100
  @staticmethod
101
  def from_str(type):
102
  type = type or ""
103
- if type == "NO":
104
  return MalteseTraining.NO
105
- if type == "PT":
106
  return MalteseTraining.PT
107
- if type == "FT":
108
  return MalteseTraining.FT
109
- if type == "IT":
110
  return MalteseTraining.IT
111
  return MalteseTraining.NK
112
 
 
26
  # Init
27
  auto_eval_column_dict.append(["model_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("N-Shot", "number", False)])
30
+ auto_eval_column_dict.append(["prompt_version", ColumnContent, ColumnContent("Version", "str", False)])
31
  #Scores
32
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average (All) ⬆️", "number", True)])
33
  for task_type in TaskType:
 
35
  for task in Tasks:
36
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric)])
37
  # Model information
38
+ auto_eval_column_dict.append(["model_training", ColumnContent, ColumnContent("Type", "str", False)])
39
  auto_eval_column_dict.append(["maltese_training", ColumnContent, ColumnContent("Maltese Training", "str", False)])
40
+ auto_eval_column_dict.append(["language_count", ColumnContent, ColumnContent("#Languages", "number", False)])
41
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
42
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
43
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 
55
  class EvalQueueColumn: # Queue column
56
  model = ColumnContent("model", "markdown", True)
57
  revision = ColumnContent("revision", "str", True)
 
58
  precision = ColumnContent("precision", "str", True)
59
+ n_shot = ColumnContent("n_shot", "int", True)
60
+ prompt_version = ColumnContent("prompt_version", "str", True)
61
+ seed = ColumnContent("seed", "int", True)
62
  status = ColumnContent("status", "str", True)
63
 
64
  ## All the model information that we might need
 
69
  symbol: str = "" # emoji
70
 
71
 
72
+ class ModelTraining(Enum):
73
  PT = ModelDetails(name="pre-trained", symbol="PT")
74
  FT = ModelDetails(name="fine-tuned", symbol="FT")
75
  IT = ModelDetails(name="instruction-tuned", symbol="IT")
 
81
  @staticmethod
82
  def from_str(type):
83
  type = type or ""
84
+ if "PT" in type:
85
+ return ModelTraining.PT
86
+ if "FT" in type:
87
+ return ModelTraining.FT
88
+ if "IT" in type:
89
+ return ModelTraining.IT
90
+ return ModelTraining.NK
91
 
92
 
93
  class MalteseTraining(Enum):
 
103
  @staticmethod
104
  def from_str(type):
105
  type = type or ""
106
+ if "NO" in type:
107
  return MalteseTraining.NO
108
+ if "PT" in type:
109
  return MalteseTraining.PT
110
+ if "FT" in type:
111
  return MalteseTraining.FT
112
+ if "IT" in type:
113
  return MalteseTraining.IT
114
  return MalteseTraining.NK
115
 
src/envs.py CHANGED
@@ -4,17 +4,21 @@ from huggingface_hub import HfApi
4
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "MLRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 
 
 
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/MELABench"
13
  QUEUE_REPO = f"{OWNER}/MELABench_requests"
 
14
  RESULTS_REPO = f"{OWNER}/MELABench_results"
15
 
16
  # If you set up a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 
4
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ PROMPT_VERSIONS = [version.strip() for version in
10
+ os.environ.get("PROMPT_VERSIONS", "1.0_english,1.0_maltese").split(",")]
11
+
12
+ OWNER = "MLRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
13
  # ----------------------------------
14
 
15
  REPO_ID = f"{OWNER}/MELABench"
16
  QUEUE_REPO = f"{OWNER}/MELABench_requests"
17
+ PREDICTIONS_REPO = f"{OWNER}/MELABench_predictions"
18
  RESULTS_REPO = f"{OWNER}/MELABench_results"
19
 
20
  # If you set up a cache later, just change HF_HOME
21
+ CACHE_PATH = os.getenv("HF_HOME", ".")
22
 
23
  # Local caches
24
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/leaderboard/read_evals.py CHANGED
@@ -3,12 +3,12 @@ import json
3
  import os
4
  from collections import defaultdict
5
  from dataclasses import dataclass
 
6
 
7
- import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, MalteseTraining
12
  from src.envs import TOKEN, API
13
  from src.submission.check_validity import is_model_on_hub, get_model_size
14
 
@@ -24,9 +24,12 @@ class EvalResult:
24
  revision: str # commit hash, "" if main
25
  results: dict
26
  precision: Precision = Precision.Unknown
27
- model_type: ModelType = ModelType.NK # Pretrained, fine tuned, ...
 
 
 
28
  maltese_training: MalteseTraining = MalteseTraining.NK # none, pre-training, ...
29
- num_languages: int = None
30
  weight_type: WeightType = WeightType.Original # Original or Adapter
31
  architecture: str = "Unknown"
32
  license: str = "?"
@@ -36,46 +39,39 @@ class EvalResult:
36
  still_on_hub: bool = False
37
 
38
  @classmethod
39
- def init_from_json_file(self, json_filepath):
40
  """Inits the result from the specific model result file"""
41
- with open(json_filepath) as fp:
42
  data = json.load(fp)
43
 
44
  config = data.get("config")
45
- metadata = data.get("metadata")
46
-
47
  precision = Precision.from_str(config.get("model_dtype"))
48
 
49
- model_type = ModelType.from_str(metadata.get("model_type"))
 
 
 
 
50
 
51
- maltese_training = MalteseTraining.from_str(metadata.get("maltese_training"))
52
 
53
- num_languages = metadata.get("num_languages")
 
 
54
 
55
  model_size = config.get("model_num_parameters")
56
 
57
  # Get model and org
58
- org_and_model = config.get("model_name", None)
59
  org_and_model = org_and_model.split("/", 1)
60
-
61
- if len(org_and_model) == 1:
62
- org = None
63
- model = org_and_model[0]
64
- result_key = f"{model}_{precision.value.name}"
65
- else:
66
- org = org_and_model[0]
67
- model = org_and_model[1]
68
- result_key = f"{org}_{model}_{precision.value.name}"
69
  full_model = "/".join(org_and_model)
70
 
71
  revision = config.get("model_sha", config.get("model_revision", "main"))
72
 
73
- model_args = {
74
- **dict({tuple(arg.split("=")) for arg in config.get("model_args", "").split(",") if len(arg) > 0}),
75
- "revision": revision,
76
- "trust_remote_code": True,
77
- "cache_dir": None
78
- }
79
  base_model = None
80
  if "pretrained" in model_args:
81
  base_model = model_args.pop("pretrained")
@@ -100,17 +96,30 @@ class EvalResult:
100
  pass
101
 
102
  # Extract results available in this file (some results are split in several files)
103
- results = {}
104
- for task in Tasks:
105
- task = task.value
106
-
107
- # We average all scores of a given metric (not all metrics are present in all files)
108
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
109
- if accs.size == 0 or any([acc is None for acc in accs]):
110
- continue
 
 
 
 
 
 
 
 
111
 
112
- mean_acc = np.mean(accs)
113
- results[task.benchmark] = mean_acc
 
 
 
 
 
114
 
115
  return self(
116
  eval_name=result_key,
@@ -118,15 +127,18 @@ class EvalResult:
118
  org=org,
119
  model=model,
120
  results=results,
121
- model_type=model_type,
122
  maltese_training=maltese_training,
123
- num_languages=num_languages or "?",
124
- precision=precision,
125
  revision=revision,
 
 
 
126
  still_on_hub=still_on_hub,
127
  architecture=architecture,
128
  likes=likes or "?",
129
- num_params=round(model_size / 1e9, 3),
130
  license=license,
131
  )
132
 
@@ -137,7 +149,7 @@ class EvalResult:
137
  try:
138
  with open(request_file, "r") as f:
139
  request = json.load(f)
140
- self.model_type = ModelType.from_str(request.get("model_type", ""))
141
  self.weight_type = WeightType[request.get("weight_type", "Original")]
142
  self.license = request.get("license", "?")
143
  self.likes = request.get("likes", 0)
@@ -152,10 +164,12 @@ class EvalResult:
152
  data_dict = {
153
  "eval_name": self.eval_name, # not a column, just a save name,
154
  AutoEvalColumn.precision.name: self.precision.value.name,
155
- AutoEvalColumn.model_type.name: self.model_type.value.name,
 
 
156
  AutoEvalColumn.maltese_training.name: self.maltese_training.value.name,
157
- AutoEvalColumn.model_symbol.name: self.model_type.value.symbol + "/" + self.maltese_training.value.symbol,
158
- AutoEvalColumn.num_languages.name: self.num_languages,
159
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
160
  AutoEvalColumn.architecture.name: self.architecture,
161
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
@@ -169,7 +183,7 @@ class EvalResult:
169
 
170
  results_by_task_type = defaultdict(list)
171
  for task in Tasks:
172
- result = self.results[task.value.benchmark]
173
  data_dict[task.value.col_name] = result
174
  if task.value.is_primary_metric:
175
  results_by_task_type[task.value.task_type].append(result)
@@ -205,28 +219,19 @@ def get_request_file_for_model(requests_path, model_name, precision):
205
  return request_file
206
 
207
 
208
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
209
  """From the path of the results folder root, extract all needed info for results"""
210
- model_result_filepaths = []
211
-
212
- for root, _, files in os.walk(results_path):
213
- # We should only have json files in model results
214
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
215
- continue
216
-
217
- # Sort the files by date
218
- try:
219
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
220
- except dateutil.parser._parser.ParserError:
221
- files = [files[-1]]
222
 
223
- for file in files:
224
- model_result_filepaths.append(os.path.join(root, file))
 
 
225
 
226
  eval_results = {}
227
- for model_result_filepath in model_result_filepaths:
228
  # Creation of result
229
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
230
 
231
  # Store results of same eval together
232
  eval_name = eval_result.eval_name
 
3
  import os
4
  from collections import defaultdict
5
  from dataclasses import dataclass
6
+ from pathlib import Path
7
 
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelTraining, Tasks, Precision, WeightType, MalteseTraining
12
  from src.envs import TOKEN, API
13
  from src.submission.check_validity import is_model_on_hub, get_model_size
14
 
 
24
  revision: str # commit hash, "" if main
25
  results: dict
26
  precision: Precision = Precision.Unknown
27
+ n_shot: int = 0
28
+ prompt_version: str = "1.0_english"
29
+ seed: int = 0
30
+ model_training: ModelTraining = ModelTraining.NK # Pretrained, fine tuned, ...
31
  maltese_training: MalteseTraining = MalteseTraining.NK # none, pre-training, ...
32
+ language_count: int = None
33
  weight_type: WeightType = WeightType.Original # Original or Adapter
34
  architecture: str = "Unknown"
35
  license: str = "?"
 
39
  still_on_hub: bool = False
40
 
41
  @classmethod
42
+ def init_from_json_files(self, seed_directory):
43
  """Inits the result from the specific model result file"""
44
+ with open(list(seed_directory.values())[0][0]) as fp:
45
  data = json.load(fp)
46
 
47
  config = data.get("config")
 
 
48
  precision = Precision.from_str(config.get("model_dtype"))
49
 
50
+ n_shot = config.get("n_shot")
51
+
52
+ prompt_version = config.get("prompt_version")
53
+
54
+ seed = config.get("seed")
55
 
56
+ model_training = ModelTraining.from_str(config.get("model_training"))
57
 
58
+ maltese_training = MalteseTraining.from_str(config.get("maltese_training"))
59
+
60
+ language_count = config.get("language_count")
61
 
62
  model_size = config.get("model_num_parameters")
63
 
64
  # Get model and org
65
+ org_and_model = config.get("model", None)
66
  org_and_model = org_and_model.split("/", 1)
 
 
 
 
 
 
 
 
 
67
  full_model = "/".join(org_and_model)
68
 
69
  revision = config.get("model_sha", config.get("model_revision", "main"))
70
 
71
+ model_args = config.get("model_args")
72
+ model_args["revision"] = revision
73
+ model_args["trust_remote_code"] = True
74
+ model_args["cache_dir"] = None
 
 
75
  base_model = None
76
  if "pretrained" in model_args:
77
  base_model = model_args.pop("pretrained")
 
96
  pass
97
 
98
  # Extract results available in this file (some results are split in several files)
99
+ results = defaultdict(dict)
100
+ for seed, file_paths in seed_directory.items():
101
+ for file_path in file_paths:
102
+ with open(file_path) as file:
103
+ data = json.load(file)["results"]
104
+
105
+ for task in Tasks:
106
+ task = task.value
107
+ if task.benchmark not in data or task.metric not in data[task.benchmark]:
108
+ continue
109
+ score = data[task.benchmark][task.metric]
110
+ if task.metric in ("accuracy", "f1", "loglikelihood", "rouge"):
111
+ score *= 100
112
+ results[task.benchmark + "_" + task.metric][seed] = score
113
+
114
+ results = {task: np.mean(list(seed_results.values())) for task, seed_results in results.items()}
115
 
116
+ if len(org_and_model) == 1:
117
+ org = None
118
+ model = org_and_model[0]
119
+ else:
120
+ org = org_and_model[0]
121
+ model = org_and_model[1]
122
+ result_key = f"{'_'.join(org_and_model)}_{revision}_{precision.value.name}_{n_shot}_{prompt_version}_{seed}"
123
 
124
  return self(
125
  eval_name=result_key,
 
127
  org=org,
128
  model=model,
129
  results=results,
130
+ model_training=model_training,
131
  maltese_training=maltese_training,
132
+ language_count=language_count or "?",
133
+ precision=precision,
134
  revision=revision,
135
+ n_shot=n_shot,
136
+ prompt_version=prompt_version,
137
+ seed=seed,
138
  still_on_hub=still_on_hub,
139
  architecture=architecture,
140
  likes=likes or "?",
141
+ num_params=model_size and round(model_size / 1e9, 3),
142
  license=license,
143
  )
144
 
 
149
  try:
150
  with open(request_file, "r") as f:
151
  request = json.load(f)
152
+ self.model_training = ModelTraining.from_str(request.get("model_training", ""))
153
  self.weight_type = WeightType[request.get("weight_type", "Original")]
154
  self.license = request.get("license", "?")
155
  self.likes = request.get("likes", 0)
 
164
  data_dict = {
165
  "eval_name": self.eval_name, # not a column, just a save name,
166
  AutoEvalColumn.precision.name: self.precision.value.name,
167
+ AutoEvalColumn.n_shot.name: self.n_shot,
168
+ AutoEvalColumn.prompt_version.name: self.prompt_version,
169
+ AutoEvalColumn.model_training.name: self.model_training.value.name,
170
  AutoEvalColumn.maltese_training.name: self.maltese_training.value.name,
171
+ AutoEvalColumn.model_symbol.name: self.model_training.value.symbol + "/" + self.maltese_training.value.symbol,
172
+ AutoEvalColumn.language_count.name: self.language_count,
173
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
174
  AutoEvalColumn.architecture.name: self.architecture,
175
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
 
183
 
184
  results_by_task_type = defaultdict(list)
185
  for task in Tasks:
186
+ result = self.results.get(task.value.benchmark + "_" + task.value.metric)
187
  data_dict[task.value.col_name] = result
188
  if task.value.is_primary_metric:
189
  results_by_task_type[task.value.task_type].append(result)
 
219
  return request_file
220
 
221
 
222
+ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
223
  """From the path of the results folder root, extract all needed info for results"""
224
+ model_result_filepaths = defaultdict(lambda: defaultdict(list))
 
 
 
 
 
 
 
 
 
 
 
225
 
226
+ for directory_path in Path(results_path).rglob("*-shot/*/*/"):
227
+ for file_path in directory_path.rglob("*-seed/results_*.json"):
228
+ seed = file_path.parent.name.removesuffix("-seed")
229
+ model_result_filepaths[directory_path.relative_to(results_path)][seed].append(file_path)
230
 
231
  eval_results = {}
232
+ for model_result_filepath in model_result_filepaths.values():
233
  # Creation of result
234
+ eval_result = EvalResult.init_from_json_files(model_result_filepath)
235
 
236
  # Store results of same eval together
237
  eval_name = eval_result.eval_name
src/populate.py CHANGED
@@ -1,5 +1,5 @@
1
  import json
2
- import os
3
 
4
  import pandas as pd
5
 
@@ -10,44 +10,28 @@ from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
  df = df[cols].round(decimals=2)
19
 
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
  return df
23
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
  """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
  all_evals = []
29
 
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
 
52
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
 
1
  import json
2
+ from pathlib import Path
3
 
4
  import pandas as pd
5
 
 
10
 
11
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Create the leaderboard dataframe from all the individual experiment results.

    Args:
        results_path: Root folder handed to ``get_raw_eval_results``.
        requests_path: Not used by this function; kept so existing callers
            keep working.
        cols: Columns to keep in the returned dataframe.
        benchmark_cols: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A dataframe built from ``to_dict()`` of every raw result, sorted by
        the average column in descending order, restricted to ``cols`` and
        rounded to two decimals. When there are no results at all, an empty
        dataframe that only carries the requested columns.
    """
    records = [result.to_dict() for result in get_raw_eval_results(results_path)]
    if not records:
        # A frame built from zero records has no columns, so sorting by the
        # average column below would raise a KeyError.
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame.from_records(records)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    return df[cols].round(decimals=2)
21
 
22
 
23
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
24
  """Creates the different dataframes for the evaluation queues requestes"""
 
25
  all_evals = []
26
 
27
+ for file_path in Path(save_path).rglob("requests_*.json"):
28
+ with open(file_path) as fp:
29
+ data = json.load(fp)["leaderboard"]
30
+
31
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
32
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
33
+
34
+ all_evals.append(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
37
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
src/submission/check_validity.py CHANGED
@@ -1,6 +1,5 @@
1
  import json
2
  import os
3
- from collections import defaultdict
4
  from typing import Any
5
 
6
  import huggingface_hub
@@ -75,11 +74,19 @@ def get_model_arch(model_info: ModelInfo):
75
  """Gets the model architecture from the configuration"""
76
  return model_info.config.get("architectures", "Unknown")
77
 
 
 
 
 
 
 
 
 
 
78
  def already_submitted_models(requested_models_dir: str) -> set[str]:
79
  """Gather a list of already submitted models to avoid duplicates"""
80
  depth = 1
81
- file_names = []
82
- users_to_submission_dates = defaultdict(list)
83
 
84
  for root, _, files in os.walk(requested_models_dir):
85
  current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
@@ -89,12 +96,8 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
89
  continue
90
  with open(os.path.join(root, file), "r") as f:
91
  info = json.load(f)
92
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
93
 
94
- # Select organisation
95
- if info["model"].count("/") == 0 or "submitted_time" not in info:
96
- continue
97
- organisation, _ = info["model"].split("/")
98
- users_to_submission_dates[organisation].append(info["submitted_time"])
99
 
100
- return set(file_names), users_to_submission_dates
 
1
  import json
2
  import os
 
3
  from typing import Any
4
 
5
  import huggingface_hub
 
74
  """Gets the model architecture from the configuration"""
75
  return model_info.config.get("architectures", "Unknown")
76
 
77
def get_model_properties(configuration: dict) -> tuple[str, str, str, int, str, int]:
    """Extract the values that identify a run from a configuration dictionary.

    Args:
        configuration: Dictionary providing ``model_name_sanitized``, a
            ``config`` mapping (``model_revision``, ``model_dtype``,
            ``random_seed``) and the ``n-shot`` and ``versions`` mappings.

    Returns:
        ``(model_name, revision, precision, seed, prompt_version, n_shot)``.

    Raises:
        KeyError: If one of the keys listed above is missing.
        IndexError: If the ``n-shot`` or ``versions`` mapping is empty.
    """
    model_name = configuration["model_name_sanitized"]
    run_config = configuration["config"]
    revision = run_config["model_revision"]
    # Only the part after the last dot of the dtype string is kept.
    *_, precision = run_config["model_dtype"].split(".")
    seed = run_config["random_seed"]
    # Both values are taken from the first entry of their mapping.
    n_shot = [*configuration["n-shot"].values()][0]
    prompt_version = [*configuration["versions"].values()][0]
    return model_name, revision, precision, seed, prompt_version, n_shot
85
+
86
  def already_submitted_models(requested_models_dir: str) -> set[str]:
87
  """Gather a list of already submitted models to avoid duplicates"""
88
  depth = 1
89
+ run_names = []
 
90
 
91
  for root, _, files in os.walk(requested_models_dir):
92
  current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
 
96
  continue
97
  with open(os.path.join(root, file), "r") as f:
98
  info = json.load(f)
 
99
 
100
+ properties = get_model_properties(info)
101
+ run_names.append("_".join([str(property) for property in properties]))
 
 
 
102
 
103
+ return set(run_names)
src/submission/submit.py CHANGED
@@ -1,117 +1,157 @@
1
  import json
2
  import os
 
3
  from datetime import datetime, timezone
 
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
 
14
  REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
 
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
 
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
 
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
 
 
 
41
 
42
- model_args = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Does the model actually exist?
45
- if revision == "":
46
- revision = "main"
47
- model_args["revision"] = revision
48
 
49
- # Is the model on the hub?
50
- if weight_type in ["Delta", "Adapter"]:
51
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, model_args=model_args, token=TOKEN, test_tokenizer=True)
52
- if not base_model_on_hub:
53
- return styled_error(f'Base model "{base_model}" {error}')
 
 
 
 
 
54
 
55
- if not weight_type == "Adapter":
56
- model_on_hub, error, _ = is_model_on_hub(model_name=model, model_args=model_args, token=TOKEN, test_tokenizer=True)
57
- if not model_on_hub:
58
- return styled_error(f'Model "{model}" {error}')
59
 
60
- # Is the model info correctly filled?
61
- try:
62
- model_info = API.model_info(repo_id=model, revision=revision)
63
- except Exception:
64
- return styled_error("Could not get your model information. Please fill it up properly.")
65
 
66
- model_size = get_model_size(model_info=model_info, precision=precision)
 
67
 
68
- # Were the model card and license filled?
69
- try:
70
- license = model_info.cardData["license"]
71
- except Exception:
72
- return styled_error("Please select a license for your model")
73
 
74
- modelcard_OK, error_msg = check_model_card(model)
75
- if not modelcard_OK:
76
- return styled_error(error_msg)
 
 
77
 
78
  # Seems good, creating the eval
79
  print("Adding new eval")
80
 
81
- eval_entry = {
82
- "model": model,
83
- "base_model": base_model,
 
 
 
 
84
  "revision": revision,
85
  "precision": precision,
86
- "weight_type": weight_type,
87
- "status": "PENDING",
 
 
 
 
 
88
  "submitted_time": current_time,
89
- "model_type": model_type,
90
- "likes": model_info.likes,
91
- "params": model_size,
92
- "license": license,
93
- "private": False,
94
  }
95
 
96
- # Check for duplicate submission
97
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
98
- return styled_warning("This model has been already submitted.")
99
-
100
- print("Creating eval file")
101
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
 
 
 
 
 
 
102
  os.makedirs(OUT_DIR, exist_ok=True)
103
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
104
 
105
  with open(out_path, "w") as f:
106
- f.write(json.dumps(eval_entry))
107
 
108
- print("Uploading eval file")
109
  API.upload_file(
110
  path_or_fileobj=out_path,
111
  path_in_repo=out_path.split("eval-queue/")[1],
112
  repo_id=QUEUE_REPO,
113
  repo_type="dataset",
114
- commit_message=f"Add {model} to eval queue",
115
  )
116
 
117
  # Remove the local file
 
1
  import json
2
  import os
3
+ import re
4
  from datetime import datetime, timezone
5
+ from pathlib import Path
6
 
7
  from src.display.formatting import styled_error, styled_message, styled_warning
8
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, PROMPT_VERSIONS, PREDICTIONS_REPO
9
+ from src.submission.check_validity import already_submitted_models, is_model_on_hub, get_model_properties
 
 
 
 
 
10
 
11
  REQUESTED_MODELS = None
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
def read_configuration(file_paths):
    """Parse an uploaded output folder into its configuration and prediction files.

    Exactly one ``.json`` file is expected among the uploads; it is read as
    the configuration. For every task listed under ``configs`` the first
    upload whose name matches ``samples_<task>_*.jsonl`` is selected as that
    task's prediction file.

    Args:
        file_paths: Uploaded files, or ``None``. Each entry is accessed
            through its ``.name`` attribute and through ``str()``. The list
            itself is not modified.

    Returns:
        A 6-tuple ``(configuration, prediction_files, model_name, version,
        n_shot, message)``. On failure ``configuration`` and
        ``prediction_files`` are ``None`` and ``message`` is a styled error;
        on success ``message`` is a styled message, or a styled warning
        listing the uploads that were not used.
    """
    # Work on a copy so the caller's list is left untouched.
    remaining = list(file_paths or [])

    configuration_files = [file_path for file_path in remaining if file_path.name.endswith(".json")]
    if len(configuration_files) != 1:
        return None, None, None, None, None, styled_error(f"Expected exactly one configuration file but found {len(configuration_files)}!")

    configuration_file = configuration_files[0]
    remaining.remove(configuration_file)

    # Any failure to open or decode the upload is reported the same way.
    try:
        with open(configuration_file.name, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return None, None, None, None, None, styled_error("Failed to read configuration file!")

    # Bound before the try block so the KeyError handler can always use it,
    # even when "model_name" itself is the missing key.
    model_name = None

    if not isinstance(data, dict):
        # Valid JSON that is not an object cannot hold the expected keys.
        return None, None, model_name, None, None, styled_error("Wrong configuration file format!")

    try:
        model_name = data["model_name"]

        model_args = {}
        for arg in data["config"].get("model_args", "").split(","):
            if not arg:
                continue
            # Split on the first "=" only, so values may contain "=" too.
            key, separator, value = arg.partition("=")
            if not separator:
                return None, None, model_name, None, None, styled_error("Wrong configuration file format!")
            model_args[key] = value
        model_args["revision"] = data["config"]["model_revision"]
        model_args["trust_remote_code"] = True
        model_args["cache_dir"] = None

        base_model = model_args.pop("pretrained")
        model_on_hub, error, _ = is_model_on_hub(model_name=base_model, model_args=model_args, token=TOKEN, test_tokenizer=True)
        if not model_on_hub:
            return None, None, model_name, None, None, styled_error(f"Model {model_name} {error}")

        limit = data["config"]["limit"]
        if limit is not None:
            return None, None, model_name, None, None, styled_error(f"Only full results are accepted but found a specified limit of {limit}!")

        prediction_files = {}
        versions = {}
        n_shots = {}
        for task_name in data["configs"]:
            # Escape the task name so it is matched literally.
            pattern = re.compile(rf"samples_{re.escape(task_name)}_.*\.jsonl")
            sample_file = next((file_path for file_path in remaining if pattern.search(file_path.name)), None)
            if sample_file is None:
                return None, None, model_name, None, None, styled_error(f"No prediction file found for configured task {task_name}!")

            remaining.remove(sample_file)
            prediction_files[task_name] = str(sample_file)

            versions[task_name] = data["versions"][task_name]
            n_shots[task_name] = data["n-shot"][task_name]
        if len(prediction_files) == 0:
            return None, None, model_name, None, None, styled_error("No tasks found in configuration!")

        versions = set(versions.values())
        if len(versions) != 1:
            return None, None, model_name, None, None, styled_error(f"All tasks should have the same version but found {versions}!")
        version = next(iter(versions))
        if version not in PROMPT_VERSIONS:
            return None, None, model_name, None, None, styled_error(f"Unknown version {version}, should be one of {PROMPT_VERSIONS}!")

        n_shots = set(n_shots.values())
        if len(n_shots) != 1:
            return None, None, model_name, version, None, styled_error(f"All tasks should have the same number of shots but found {n_shots}!")
        n_shot = next(iter(n_shots))
    except KeyError:
        return None, None, model_name, None, None, styled_error("Wrong configuration file format!")

    if len(remaining) > 0:
        ignored_files = [Path(file_path).name for file_path in remaining]
        return data, prediction_files, model_name, version, n_shot, styled_warning(f"The following files will be ignored: {ignored_files}")
    return data, prediction_files, model_name, version, n_shot, styled_message("Files parsed successfully, verify that read metadata is correct before submitting")
77
 
 
 
 
 
78
 
79
+ def add_new_eval(
80
+ model_training: str,
81
+ maltese_training: str,
82
+ language_count: int,
83
+ configuration: dict,
84
+ prediction_files: dict[str, str],
85
+ ):
86
+ global REQUESTED_MODELS
87
+ if not REQUESTED_MODELS:
88
+ REQUESTED_MODELS = already_submitted_models(EVAL_REQUESTS_PATH)
89
 
90
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S.%f")
 
 
 
91
 
92
+ if configuration is None or configuration == {} or prediction_files is None or prediction_files == {}:
93
+ return styled_error("No files selected for upload, please upload an output folder (or wait for the files to finish uploading).")
 
 
 
94
 
95
+ if model_training is None or model_training == "":
96
+ return styled_error("Please select the model's overall training.")
97
 
98
+ if maltese_training is None or maltese_training == "":
99
+ return styled_error("Please select the model's Maltese training.")
 
 
 
100
 
101
+ if language_count is None or language_count < 1:
102
+ language_count = None
103
+
104
+ model_name, revision, precision, seed, prompt_version, n_shot = get_model_properties(configuration)
105
+ model_id = configuration["model_name"]
106
 
107
  # Seems good, creating the eval
108
  print("Adding new eval")
109
 
110
+ # Check for duplicate submission
111
+ if f"{model_name}_{revision}_{precision}_{seed}_{prompt_version}_{n_shot}" in REQUESTED_MODELS:
112
+ return styled_warning("This model has been already submitted.")
113
+
114
+ request = {
115
+ "model": model_id,
116
+ "model_args": dict({tuple(arg.split("=")) for arg in configuration["config"].get("model_args", "").split(",") if len(arg) > 0}),
117
  "revision": revision,
118
  "precision": precision,
119
+ "seed": seed,
120
+ "n_shot": n_shot,
121
+ "prompt_version": prompt_version,
122
+ "tasks": list(configuration["configs"].keys()),
123
+ "model_training": model_training,
124
+ "maltese_training": maltese_training,
125
+ "language_count": language_count,
126
  "submitted_time": current_time,
127
+ "status": "PENDING",
 
 
 
 
128
  }
129
 
130
+ for task_name, file_path in prediction_files.items():
131
+ print(f"Uploading {model_id} {task_name} prediction file")
132
+ API.upload_file(
133
+ path_or_fileobj=file_path,
134
+ path_in_repo=f"{n_shot}-shot_{prompt_version}/{model_name}_{revision}_{precision}/{seed}-seed/samples_{task_name}_{current_time}.jsonl",
135
+ repo_id=PREDICTIONS_REPO,
136
+ repo_type="dataset",
137
+ commit_message=f"Add {configuration['model_name']} {task_name} {n_shot}-shot outputs",
138
+ )
139
+
140
+ print(f"Creating {model_id} configruation file")
141
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{model_name}"
142
  os.makedirs(OUT_DIR, exist_ok=True)
143
+ out_path = f"{OUT_DIR}/requests_{model_name}_{revision}_{precision}_{n_shot}shot_{prompt_version}_{seed}seed_{current_time}.json"
144
 
145
  with open(out_path, "w") as f:
146
+ f.write(json.dumps({"leaderboard": request, "configuration": configuration}, ensure_ascii=False, indent=2))
147
 
148
+ print(f"Uploading {model_id} configuration file")
149
  API.upload_file(
150
  path_or_fileobj=out_path,
151
  path_in_repo=out_path.split("eval-queue/")[1],
152
  repo_id=QUEUE_REPO,
153
  repo_type="dataset",
154
+ commit_message=f"Add {configuration['model_name']} {n_shot}-shot to eval queue",
155
  )
156
 
157
  # Remove the local file