Model output submission.
Browse files
- app.py +63 -29
- requirements.txt +2 -0
- src/about.py +19 -43
- src/display/utils.py +19 -16
- src/envs.py +7 -3
- src/leaderboard/read_evals.py +68 -63
- src/populate.py +10 -26
- src/submission/check_validity.py +13 -10
- src/submission/submit.py +117 -77
app.py
CHANGED
@@ -19,14 +19,13 @@ from src.display.utils import (
|
|
19 |
EVAL_COLS,
|
20 |
EVAL_TYPES,
|
21 |
AutoEvalColumn,
|
22 |
-
|
23 |
fields,
|
24 |
-
|
25 |
-
Precision
|
26 |
)
|
27 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
29 |
-
from src.submission.submit import add_new_eval
|
30 |
|
31 |
|
32 |
def restart_space():
|
@@ -71,8 +70,15 @@ def init_leaderboard(dataframe):
|
|
71 |
search_columns=[AutoEvalColumn.model.name],
|
72 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
73 |
filter_columns=[
|
74 |
-
ColumnFilter(AutoEvalColumn.
|
75 |
ColumnFilter(AutoEvalColumn.maltese_training.name, type="checkboxgroup", label="Maltese training"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
ColumnFilter(
|
77 |
AutoEvalColumn.params.name,
|
78 |
type="slider",
|
@@ -80,6 +86,8 @@ def init_leaderboard(dataframe):
|
|
80 |
max=150,
|
81 |
label="Select the number of parameters (B)",
|
82 |
),
|
|
|
|
|
83 |
ColumnFilter(
|
84 |
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
85 |
),
|
@@ -145,45 +153,71 @@ with demo:
|
|
145 |
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
146 |
|
147 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
with gr.Column():
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
)
|
158 |
|
159 |
with gr.Column():
|
160 |
-
|
161 |
-
choices=[
|
162 |
-
label="
|
|
|
163 |
multiselect=False,
|
164 |
-
value=
|
165 |
interactive=True,
|
166 |
)
|
167 |
-
|
168 |
-
choices=[
|
169 |
-
label="
|
|
|
170 |
multiselect=False,
|
171 |
-
value=
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
interactive=True,
|
173 |
)
|
174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
175 |
|
176 |
submit_button = gr.Button("Submit Eval")
|
177 |
submission_result = gr.Markdown()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
submit_button.click(
|
179 |
add_new_eval,
|
180 |
[
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
model_type,
|
187 |
],
|
188 |
submission_result,
|
189 |
)
|
@@ -201,4 +235,4 @@ with demo:
|
|
201 |
scheduler = BackgroundScheduler()
|
202 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
203 |
scheduler.start()
|
204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
19 |
EVAL_COLS,
|
20 |
EVAL_TYPES,
|
21 |
AutoEvalColumn,
|
22 |
+
ModelTraining,
|
23 |
fields,
|
24 |
+
MalteseTraining
|
|
|
25 |
)
|
26 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
27 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
28 |
+
from src.submission.submit import add_new_eval, read_configuration
|
29 |
|
30 |
|
31 |
def restart_space():
|
|
|
70 |
search_columns=[AutoEvalColumn.model.name],
|
71 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
72 |
filter_columns=[
|
73 |
+
ColumnFilter(AutoEvalColumn.model_training.name, type="checkboxgroup", label="Model types"),
|
74 |
ColumnFilter(AutoEvalColumn.maltese_training.name, type="checkboxgroup", label="Maltese training"),
|
75 |
+
ColumnFilter(
|
76 |
+
AutoEvalColumn.language_count.name,
|
77 |
+
type="slider",
|
78 |
+
min=1,
|
79 |
+
max=1000,
|
80 |
+
label="Number of languages during training",
|
81 |
+
),
|
82 |
ColumnFilter(
|
83 |
AutoEvalColumn.params.name,
|
84 |
type="slider",
|
|
|
86 |
max=150,
|
87 |
label="Select the number of parameters (B)",
|
88 |
),
|
89 |
+
ColumnFilter(AutoEvalColumn.prompt_version.name, type="checkboxgroup", label="Prompt Version"),
|
90 |
+
ColumnFilter(AutoEvalColumn.n_shot.name, type="slider", min=0, max=100, label="Number of Shots"),
|
91 |
ColumnFilter(
|
92 |
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
93 |
),
|
|
|
153 |
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
154 |
|
155 |
with gr.Row():
|
156 |
+
files = gr.File(
|
157 |
+
label="Files (Configuration File & Prediction Outputs)",
|
158 |
+
file_count="directory",
|
159 |
+
type="filepath",
|
160 |
+
)
|
161 |
+
|
162 |
+
with gr.Row(equal_height=True):
|
163 |
with gr.Column():
|
164 |
+
model_name = gr.Textbox(
|
165 |
+
label="Model name",
|
166 |
+
info="Read automatically from the results file.",
|
167 |
+
interactive=False,
|
168 |
+
)
|
169 |
+
version = gr.Textbox(
|
170 |
+
label="Prompt Version",
|
171 |
+
info="Read automatically from the results file.",
|
172 |
+
interactive=False,
|
173 |
+
)
|
174 |
+
n_shots = gr.Number(
|
175 |
+
label="Number of Shots",
|
176 |
+
info="Read automatically from the results file.",
|
177 |
+
interactive=False,
|
178 |
)
|
179 |
|
180 |
with gr.Column():
|
181 |
+
model_training = gr.Dropdown(
|
182 |
+
choices=[t.to_str(": ") for t in ModelTraining if t != ModelTraining.NK],
|
183 |
+
label="Model Training",
|
184 |
+
info="How the model is trained.",
|
185 |
multiselect=False,
|
186 |
+
value=None,
|
187 |
interactive=True,
|
188 |
)
|
189 |
+
maltese_training = gr.Dropdown(
|
190 |
+
choices=[t.to_str(": ") for t in MalteseTraining if t != ModelTraining.NK],
|
191 |
+
label="Maltese Training",
|
192 |
+
info="The last stage of training in which Maltese was included.",
|
193 |
multiselect=False,
|
194 |
+
value=None,
|
195 |
+
interactive=True,
|
196 |
+
)
|
197 |
+
language_count = gr.Number(
|
198 |
+
label="Number of languages",
|
199 |
+
info="Include languages for all training stages. Set to 0 if unknown.",
|
200 |
+
minimum=0,
|
201 |
interactive=True,
|
202 |
)
|
|
|
203 |
|
204 |
submit_button = gr.Button("Submit Eval")
|
205 |
submission_result = gr.Markdown()
|
206 |
+
|
207 |
+
configuration = gr.State()
|
208 |
+
file_paths = gr.State()
|
209 |
+
files.change(read_configuration,
|
210 |
+
files,
|
211 |
+
[configuration, file_paths, model_name, version, n_shots, submission_result])
|
212 |
+
|
213 |
submit_button.click(
|
214 |
add_new_eval,
|
215 |
[
|
216 |
+
model_training,
|
217 |
+
maltese_training,
|
218 |
+
language_count,
|
219 |
+
configuration,
|
220 |
+
file_paths
|
|
|
221 |
],
|
222 |
submission_result,
|
223 |
)
|
|
|
235 |
scheduler = BackgroundScheduler()
|
236 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
237 |
scheduler.start()
|
238 |
+
demo.queue(default_concurrency_limit=40).launch()
|
requirements.txt
CHANGED
@@ -9,6 +9,8 @@ huggingface-hub>=0.18.0
|
|
9 |
matplotlib
|
10 |
numpy
|
11 |
pandas
|
|
|
|
|
12 |
python-dateutil
|
13 |
tqdm
|
14 |
transformers
|
|
|
9 |
matplotlib
|
10 |
numpy
|
11 |
pandas
|
12 |
+
protobuf
|
13 |
+
pydantic==2.10.6
|
14 |
python-dateutil
|
15 |
tqdm
|
16 |
transformers
|
src/about.py
CHANGED
@@ -26,22 +26,22 @@ class Task:
|
|
26 |
# ---------------------------------------------------
|
27 |
class Tasks(Enum):
|
28 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
29 |
-
task0 = Task("
|
30 |
-
task1 = Task("
|
31 |
-
task2 = Task("
|
32 |
-
task3 = Task("maltese_news_categories", "loglikelihood
|
33 |
-
task4 = Task("
|
34 |
-
task5 = Task("
|
35 |
-
task6 = Task("
|
36 |
-
task7 = Task("
|
37 |
-
task8 = Task("
|
38 |
-
task9 = Task("
|
39 |
-
task10 = Task("
|
40 |
-
task11 = Task("
|
41 |
-
task12 = Task("
|
42 |
-
task13 = Task("
|
43 |
-
task14 = Task("maltese_news_headlines", "chrf
|
44 |
-
task15 = Task("maltese_news_headlines", "rouge
|
45 |
|
46 |
NUM_FEWSHOT = 0 # Change with your few shot
|
47 |
# ---------------------------------------------------
|
@@ -66,33 +66,9 @@ To reproduce our results, here is the commands you can run:
|
|
66 |
"""
|
67 |
|
68 |
EVALUATION_QUEUE_TEXT = """
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
```python
|
73 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
74 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
75 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
76 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
77 |
-
```
|
78 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
79 |
-
|
80 |
-
Note: make sure your model is public!
|
81 |
-
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
82 |
-
|
83 |
-
### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
84 |
-
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
85 |
-
|
86 |
-
### 3) Make sure your model has an open license!
|
87 |
-
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
88 |
-
|
89 |
-
### 4) Fill up your model card
|
90 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
91 |
-
|
92 |
-
## In case of model failure
|
93 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
94 |
-
Make sure you have followed the above steps first.
|
95 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
96 |
"""
|
97 |
|
98 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
26 |
# ---------------------------------------------------
|
27 |
class Tasks(Enum):
|
28 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
29 |
+
task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", TaskType.NLU)
|
30 |
+
task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", TaskType.NLU)
|
31 |
+
task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", TaskType.NLU)
|
32 |
+
task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", TaskType.NLU)
|
33 |
+
task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", TaskType.NLU)
|
34 |
+
task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", TaskType.NLU)
|
35 |
+
task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", TaskType.NLG, False)
|
36 |
+
task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", TaskType.NLG)
|
37 |
+
task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", TaskType.NLG, False)
|
38 |
+
task9 = Task("flores200_eng-mlt", "chrf", "Flores-200 EN→MT (ChrF)", TaskType.NLG)
|
39 |
+
task10 = Task("webnlg_mlt", "chrf", "WebNLG (ChrF)", TaskType.NLG)
|
40 |
+
task11 = Task("webnlg_mlt", "rouge", "WebNLG (Rouge-L)", TaskType.NLG, False)
|
41 |
+
task12 = Task("eurlexsum_mlt", "chrf", "EUR-Lex-Sum (ChrF)", TaskType.NLG, False)
|
42 |
+
task13 = Task("eurlexsum_mlt", "rouge", "EUR-Lex-Sum (Rouge-L)", TaskType.NLG)
|
43 |
+
task14 = Task("maltese_news_headlines", "chrf", "Maltese News Headlines (ChrF)", TaskType.NLG, False)
|
44 |
+
task15 = Task("maltese_news_headlines", "rouge", "Maltese News Headlines (Rouge-L)", TaskType.NLG)
|
45 |
|
46 |
NUM_FEWSHOT = 0 # Change with your few shot
|
47 |
# ---------------------------------------------------
|
|
|
66 |
"""
|
67 |
|
68 |
EVALUATION_QUEUE_TEXT = """
|
69 |
+
To include new results on this benchmark, follow the instructions on our [GitHub Repository](https://github.com/MLRS/MELABench/tree/main/prompting).
|
70 |
+
You can then upload the output files which should include the configuration/results file and all the prediction files.
|
71 |
+
We also ask for some additional metadata about model training.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"""
|
73 |
|
74 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
src/display/utils.py
CHANGED
@@ -26,6 +26,8 @@ auto_eval_column_dict = []
|
|
26 |
# Init
|
27 |
auto_eval_column_dict.append(["model_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
28 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
|
|
|
|
29 |
#Scores
|
30 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average (All) ⬆️", "number", True)])
|
31 |
for task_type in TaskType:
|
@@ -33,9 +35,9 @@ for task_type in TaskType:
|
|
33 |
for task in Tasks:
|
34 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric)])
|
35 |
# Model information
|
36 |
-
auto_eval_column_dict.append(["
|
37 |
auto_eval_column_dict.append(["maltese_training", ColumnContent, ColumnContent("Maltese Training", "str", False)])
|
38 |
-
auto_eval_column_dict.append(["
|
39 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
40 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
41 |
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
@@ -53,9 +55,10 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
|
|
53 |
class EvalQueueColumn: # Queue column
|
54 |
model = ColumnContent("model", "markdown", True)
|
55 |
revision = ColumnContent("revision", "str", True)
|
56 |
-
private = ColumnContent("private", "bool", True)
|
57 |
precision = ColumnContent("precision", "str", True)
|
58 |
-
|
|
|
|
|
59 |
status = ColumnContent("status", "str", True)
|
60 |
|
61 |
## All the model information that we might need
|
@@ -66,7 +69,7 @@ class ModelDetails:
|
|
66 |
symbol: str = "" # emoji
|
67 |
|
68 |
|
69 |
-
class
|
70 |
PT = ModelDetails(name="pre-trained", symbol="PT")
|
71 |
FT = ModelDetails(name="fine-tuned", symbol="FT")
|
72 |
IT = ModelDetails(name="instruction-tuned", symbol="IT")
|
@@ -78,13 +81,13 @@ class ModelType(Enum):
|
|
78 |
@staticmethod
|
79 |
def from_str(type):
|
80 |
type = type or ""
|
81 |
-
if
|
82 |
-
return
|
83 |
-
if
|
84 |
-
return
|
85 |
-
if
|
86 |
-
return
|
87 |
-
return
|
88 |
|
89 |
|
90 |
class MalteseTraining(Enum):
|
@@ -100,13 +103,13 @@ class MalteseTraining(Enum):
|
|
100 |
@staticmethod
|
101 |
def from_str(type):
|
102 |
type = type or ""
|
103 |
-
if
|
104 |
return MalteseTraining.NO
|
105 |
-
if
|
106 |
return MalteseTraining.PT
|
107 |
-
if
|
108 |
return MalteseTraining.FT
|
109 |
-
if
|
110 |
return MalteseTraining.IT
|
111 |
return MalteseTraining.NK
|
112 |
|
|
|
26 |
# Init
|
27 |
auto_eval_column_dict.append(["model_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
28 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
29 |
+
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("N-Shot", "number", False)])
|
30 |
+
auto_eval_column_dict.append(["prompt_version", ColumnContent, ColumnContent("Version", "str", False)])
|
31 |
#Scores
|
32 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average (All) ⬆️", "number", True)])
|
33 |
for task_type in TaskType:
|
|
|
35 |
for task in Tasks:
|
36 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric)])
|
37 |
# Model information
|
38 |
+
auto_eval_column_dict.append(["model_training", ColumnContent, ColumnContent("Type", "str", False)])
|
39 |
auto_eval_column_dict.append(["maltese_training", ColumnContent, ColumnContent("Maltese Training", "str", False)])
|
40 |
+
auto_eval_column_dict.append(["language_count", ColumnContent, ColumnContent("#Languages", "number", False)])
|
41 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
42 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
43 |
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
|
|
55 |
class EvalQueueColumn: # Queue column
|
56 |
model = ColumnContent("model", "markdown", True)
|
57 |
revision = ColumnContent("revision", "str", True)
|
|
|
58 |
precision = ColumnContent("precision", "str", True)
|
59 |
+
n_shot = ColumnContent("n_shot", "int", True)
|
60 |
+
prompt_version = ColumnContent("prompt_version", "str", True)
|
61 |
+
seed = ColumnContent("seed", "int", True)
|
62 |
status = ColumnContent("status", "str", True)
|
63 |
|
64 |
## All the model information that we might need
|
|
|
69 |
symbol: str = "" # emoji
|
70 |
|
71 |
|
72 |
+
class ModelTraining(Enum):
|
73 |
PT = ModelDetails(name="pre-trained", symbol="PT")
|
74 |
FT = ModelDetails(name="fine-tuned", symbol="FT")
|
75 |
IT = ModelDetails(name="instruction-tuned", symbol="IT")
|
|
|
81 |
@staticmethod
|
82 |
def from_str(type):
|
83 |
type = type or ""
|
84 |
+
if "PT" in type:
|
85 |
+
return ModelTraining.PT
|
86 |
+
if "FT" in type:
|
87 |
+
return ModelTraining.FT
|
88 |
+
if "IT" in type:
|
89 |
+
return ModelTraining.IT
|
90 |
+
return ModelTraining.NK
|
91 |
|
92 |
|
93 |
class MalteseTraining(Enum):
|
|
|
103 |
@staticmethod
|
104 |
def from_str(type):
|
105 |
type = type or ""
|
106 |
+
if "NO" in type:
|
107 |
return MalteseTraining.NO
|
108 |
+
if "PT" in type:
|
109 |
return MalteseTraining.PT
|
110 |
+
if "FT" in type:
|
111 |
return MalteseTraining.FT
|
112 |
+
if "IT" in type:
|
113 |
return MalteseTraining.IT
|
114 |
return MalteseTraining.NK
|
115 |
|
src/envs.py
CHANGED
@@ -4,17 +4,21 @@ from huggingface_hub import HfApi
|
|
4 |
|
5 |
# Info to change for your repository
|
6 |
# ----------------------------------
|
7 |
-
TOKEN = os.environ.get("HF_TOKEN")
|
8 |
|
9 |
-
|
|
|
|
|
|
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/MELABench"
|
13 |
QUEUE_REPO = f"{OWNER}/MELABench_requests"
|
|
|
14 |
RESULTS_REPO = f"{OWNER}/MELABench_results"
|
15 |
|
16 |
# If you setup a cache later, just change HF_HOME
|
17 |
-
CACHE_PATH=os.getenv("HF_HOME", ".")
|
18 |
|
19 |
# Local caches
|
20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
|
|
4 |
|
5 |
# Info to change for your repository
|
6 |
# ----------------------------------
|
7 |
+
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
8 |
|
9 |
+
PROMPT_VERSIONS = [version.strip() for version in
|
10 |
+
os.environ.get("PROMPT_VERSIONS", "1.0_english,1.0_maltese").split(",")]
|
11 |
+
|
12 |
+
OWNER = "MLRS" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
13 |
# ----------------------------------
|
14 |
|
15 |
REPO_ID = f"{OWNER}/MELABench"
|
16 |
QUEUE_REPO = f"{OWNER}/MELABench_requests"
|
17 |
+
PREDICTIONS_REPO = f"{OWNER}/MELABench_predictions"
|
18 |
RESULTS_REPO = f"{OWNER}/MELABench_results"
|
19 |
|
20 |
# If you setup a cache later, just change HF_HOME
|
21 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
22 |
|
23 |
# Local caches
|
24 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
src/leaderboard/read_evals.py
CHANGED
@@ -3,12 +3,12 @@ import json
|
|
3 |
import os
|
4 |
from collections import defaultdict
|
5 |
from dataclasses import dataclass
|
|
|
6 |
|
7 |
-
import dateutil
|
8 |
import numpy as np
|
9 |
|
10 |
from src.display.formatting import make_clickable_model
|
11 |
-
from src.display.utils import AutoEvalColumn,
|
12 |
from src.envs import TOKEN, API
|
13 |
from src.submission.check_validity import is_model_on_hub, get_model_size
|
14 |
|
@@ -24,9 +24,12 @@ class EvalResult:
|
|
24 |
revision: str # commit hash, "" if main
|
25 |
results: dict
|
26 |
precision: Precision = Precision.Unknown
|
27 |
-
|
|
|
|
|
|
|
28 |
maltese_training: MalteseTraining = MalteseTraining.NK # none, pre-training, ...
|
29 |
-
|
30 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
31 |
architecture: str = "Unknown"
|
32 |
license: str = "?"
|
@@ -36,46 +39,39 @@ class EvalResult:
|
|
36 |
still_on_hub: bool = False
|
37 |
|
38 |
@classmethod
|
39 |
-
def
|
40 |
"""Inits the result from the specific model result file"""
|
41 |
-
with open(
|
42 |
data = json.load(fp)
|
43 |
|
44 |
config = data.get("config")
|
45 |
-
metadata = data.get("metadata")
|
46 |
-
|
47 |
precision = Precision.from_str(config.get("model_dtype"))
|
48 |
|
49 |
-
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
|
52 |
|
53 |
-
|
|
|
|
|
54 |
|
55 |
model_size = config.get("model_num_parameters")
|
56 |
|
57 |
# Get model and org
|
58 |
-
org_and_model = config.get("
|
59 |
org_and_model = org_and_model.split("/", 1)
|
60 |
-
|
61 |
-
if len(org_and_model) == 1:
|
62 |
-
org = None
|
63 |
-
model = org_and_model[0]
|
64 |
-
result_key = f"{model}_{precision.value.name}"
|
65 |
-
else:
|
66 |
-
org = org_and_model[0]
|
67 |
-
model = org_and_model[1]
|
68 |
-
result_key = f"{org}_{model}_{precision.value.name}"
|
69 |
full_model = "/".join(org_and_model)
|
70 |
|
71 |
revision = config.get("model_sha", config.get("model_revision", "main"))
|
72 |
|
73 |
-
model_args =
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
"cache_dir": None
|
78 |
-
}
|
79 |
base_model = None
|
80 |
if "pretrained" in model_args:
|
81 |
base_model = model_args.pop("pretrained")
|
@@ -100,17 +96,30 @@ class EvalResult:
|
|
100 |
pass
|
101 |
|
102 |
# Extract results available in this file (some results are split in several files)
|
103 |
-
results =
|
104 |
-
for
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
return self(
|
116 |
eval_name=result_key,
|
@@ -118,15 +127,18 @@ class EvalResult:
|
|
118 |
org=org,
|
119 |
model=model,
|
120 |
results=results,
|
121 |
-
|
122 |
maltese_training=maltese_training,
|
123 |
-
|
124 |
-
precision=precision,
|
125 |
revision=revision,
|
|
|
|
|
|
|
126 |
still_on_hub=still_on_hub,
|
127 |
architecture=architecture,
|
128 |
likes=likes or "?",
|
129 |
-
num_params=round(model_size / 1e9, 3),
|
130 |
license=license,
|
131 |
)
|
132 |
|
@@ -137,7 +149,7 @@ class EvalResult:
|
|
137 |
try:
|
138 |
with open(request_file, "r") as f:
|
139 |
request = json.load(f)
|
140 |
-
self.
|
141 |
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
142 |
self.license = request.get("license", "?")
|
143 |
self.likes = request.get("likes", 0)
|
@@ -152,10 +164,12 @@ class EvalResult:
|
|
152 |
data_dict = {
|
153 |
"eval_name": self.eval_name, # not a column, just a save name,
|
154 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
155 |
-
AutoEvalColumn.
|
|
|
|
|
156 |
AutoEvalColumn.maltese_training.name: self.maltese_training.value.name,
|
157 |
-
AutoEvalColumn.model_symbol.name: self.
|
158 |
-
AutoEvalColumn.
|
159 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
160 |
AutoEvalColumn.architecture.name: self.architecture,
|
161 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
@@ -169,7 +183,7 @@ class EvalResult:
|
|
169 |
|
170 |
results_by_task_type = defaultdict(list)
|
171 |
for task in Tasks:
|
172 |
-
result = self.results
|
173 |
data_dict[task.value.col_name] = result
|
174 |
if task.value.is_primary_metric:
|
175 |
results_by_task_type[task.value.task_type].append(result)
|
@@ -205,28 +219,19 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
205 |
return request_file
|
206 |
|
207 |
|
208 |
-
def get_raw_eval_results(results_path: str
|
209 |
"""From the path of the results folder root, extract all needed info for results"""
|
210 |
-
model_result_filepaths =
|
211 |
-
|
212 |
-
for root, _, files in os.walk(results_path):
|
213 |
-
# We should only have json files in model results
|
214 |
-
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
215 |
-
continue
|
216 |
-
|
217 |
-
# Sort the files by date
|
218 |
-
try:
|
219 |
-
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
220 |
-
except dateutil.parser._parser.ParserError:
|
221 |
-
files = [files[-1]]
|
222 |
|
223 |
-
|
224 |
-
|
|
|
|
|
225 |
|
226 |
eval_results = {}
|
227 |
-
for model_result_filepath in model_result_filepaths:
|
228 |
# Creation of result
|
229 |
-
eval_result = EvalResult.
|
230 |
|
231 |
# Store results of same eval together
|
232 |
eval_name = eval_result.eval_name
|
|
|
3 |
import os
|
4 |
from collections import defaultdict
|
5 |
from dataclasses import dataclass
|
6 |
+
from pathlib import Path
|
7 |
|
|
|
8 |
import numpy as np
|
9 |
|
10 |
from src.display.formatting import make_clickable_model
|
11 |
+
from src.display.utils import AutoEvalColumn, ModelTraining, Tasks, Precision, WeightType, MalteseTraining
|
12 |
from src.envs import TOKEN, API
|
13 |
from src.submission.check_validity import is_model_on_hub, get_model_size
|
14 |
|
|
|
24 |
revision: str # commit hash, "" if main
|
25 |
results: dict
|
26 |
precision: Precision = Precision.Unknown
|
27 |
+
n_shot: int = 0
|
28 |
+
prompt_version: str = "1.0_english"
|
29 |
+
seed: int = 0
|
30 |
+
model_training: ModelTraining = ModelTraining.NK # Pretrained, fine tuned, ...
|
31 |
maltese_training: MalteseTraining = MalteseTraining.NK # none, pre-training, ...
|
32 |
+
language_count: int = None
|
33 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
34 |
architecture: str = "Unknown"
|
35 |
license: str = "?"
|
|
|
39 |
still_on_hub: bool = False
|
40 |
|
41 |
@classmethod
|
42 |
+
def init_from_json_files(self, seed_directory):
|
43 |
"""Inits the result from the specific model result file"""
|
44 |
+
with open(list(seed_directory.values())[0][0]) as fp:
|
45 |
data = json.load(fp)
|
46 |
|
47 |
config = data.get("config")
|
|
|
|
|
48 |
precision = Precision.from_str(config.get("model_dtype"))
|
49 |
|
50 |
+
n_shot = config.get("n_shot")
|
51 |
+
|
52 |
+
prompt_version = config.get("prompt_version")
|
53 |
+
|
54 |
+
seed = config.get("seed")
|
55 |
|
56 |
+
model_training = ModelTraining.from_str(config.get("model_training"))
|
57 |
|
58 |
+
maltese_training = MalteseTraining.from_str(config.get("maltese_training"))
|
59 |
+
|
60 |
+
language_count = config.get("language_count")
|
61 |
|
62 |
model_size = config.get("model_num_parameters")
|
63 |
|
64 |
# Get model and org
|
65 |
+
org_and_model = config.get("model", None)
|
66 |
org_and_model = org_and_model.split("/", 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
full_model = "/".join(org_and_model)
|
68 |
|
69 |
revision = config.get("model_sha", config.get("model_revision", "main"))
|
70 |
|
71 |
+
model_args = config.get("model_args")
|
72 |
+
model_args["revision"] = revision
|
73 |
+
model_args["trust_remote_code"] = True
|
74 |
+
model_args["cache_dir"] = None
|
|
|
|
|
75 |
base_model = None
|
76 |
if "pretrained" in model_args:
|
77 |
base_model = model_args.pop("pretrained")
|
|
|
96 |
pass
|
97 |
|
98 |
# Extract results available in this file (some results are split in several files)
|
99 |
+
results = defaultdict(dict)
|
100 |
+
for seed, file_paths in seed_directory.items():
|
101 |
+
for file_path in file_paths:
|
102 |
+
with open(file_path) as file:
|
103 |
+
data = json.load(file)["results"]
|
104 |
+
|
105 |
+
for task in Tasks:
|
106 |
+
task = task.value
|
107 |
+
if task.benchmark not in data or task.metric not in data[task.benchmark]:
|
108 |
+
continue
|
109 |
+
score = data[task.benchmark][task.metric]
|
110 |
+
if task.metric in ("accuracy", "f1", "loglikelihood", "rouge"):
|
111 |
+
score *= 100
|
112 |
+
results[task.benchmark + "_" + task.metric][seed] = score
|
113 |
+
|
114 |
+
results = {task: np.mean(list(seed_results.values())) for task, seed_results in results.items()}
|
115 |
|
116 |
+
if len(org_and_model) == 1:
|
117 |
+
org = None
|
118 |
+
model = org_and_model[0]
|
119 |
+
else:
|
120 |
+
org = org_and_model[0]
|
121 |
+
model = org_and_model[1]
|
122 |
+
result_key = f"{'_'.join(org_and_model)}_{revision}_{precision.value.name}_{n_shot}_{prompt_version}_{seed}"
|
123 |
|
124 |
return self(
|
125 |
eval_name=result_key,
|
|
|
127 |
org=org,
|
128 |
model=model,
|
129 |
results=results,
|
130 |
+
model_training=model_training,
|
131 |
maltese_training=maltese_training,
|
132 |
+
language_count=language_count or "?",
|
133 |
+
precision=precision,
|
134 |
revision=revision,
|
135 |
+
n_shot=n_shot,
|
136 |
+
prompt_version=prompt_version,
|
137 |
+
seed=seed,
|
138 |
still_on_hub=still_on_hub,
|
139 |
architecture=architecture,
|
140 |
likes=likes or "?",
|
141 |
+
num_params=model_size and round(model_size / 1e9, 3),
|
142 |
license=license,
|
143 |
)
|
144 |
|
|
|
149 |
try:
|
150 |
with open(request_file, "r") as f:
|
151 |
request = json.load(f)
|
152 |
+
self.model_training = ModelTraining.from_str(request.get("model_training", ""))
|
153 |
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
154 |
self.license = request.get("license", "?")
|
155 |
self.likes = request.get("likes", 0)
|
|
|
164 |
data_dict = {
|
165 |
"eval_name": self.eval_name, # not a column, just a save name,
|
166 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
167 |
+
AutoEvalColumn.n_shot.name: self.n_shot,
|
168 |
+
AutoEvalColumn.prompt_version.name: self.prompt_version,
|
169 |
+
AutoEvalColumn.model_training.name: self.model_training.value.name,
|
170 |
AutoEvalColumn.maltese_training.name: self.maltese_training.value.name,
|
171 |
+
AutoEvalColumn.model_symbol.name: self.model_training.value.symbol + "/" + self.maltese_training.value.symbol,
|
172 |
+
AutoEvalColumn.language_count.name: self.language_count,
|
173 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
174 |
AutoEvalColumn.architecture.name: self.architecture,
|
175 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
|
183 |
|
184 |
results_by_task_type = defaultdict(list)
|
185 |
for task in Tasks:
|
186 |
+
result = self.results.get(task.value.benchmark + "_" + task.value.metric)
|
187 |
data_dict[task.value.col_name] = result
|
188 |
if task.value.is_primary_metric:
|
189 |
results_by_task_type[task.value.task_type].append(result)
|
|
|
219 |
return request_file
|
220 |
|
221 |
|
222 |
+
def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
223 |
"""From the path of the results folder root, extract all needed info for results"""
|
224 |
+
model_result_filepaths = defaultdict(lambda: defaultdict(list))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
|
226 |
+
for directory_path in Path(results_path).rglob("*-shot/*/*/"):
|
227 |
+
for file_path in directory_path.rglob("*-seed/results_*.json"):
|
228 |
+
seed = file_path.parent.name.removesuffix("-seed")
|
229 |
+
model_result_filepaths[directory_path.relative_to(results_path)][seed].append(file_path)
|
230 |
|
231 |
eval_results = {}
|
232 |
+
for model_result_filepath in model_result_filepaths.values():
|
233 |
# Creation of result
|
234 |
+
eval_result = EvalResult.init_from_json_files(model_result_filepath)
|
235 |
|
236 |
# Store results of same eval together
|
237 |
eval_name = eval_result.eval_name
|
src/populate.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import json
|
2 |
-
import
|
3 |
|
4 |
import pandas as pd
|
5 |
|
@@ -10,44 +10,28 @@ from src.leaderboard.read_evals import get_raw_eval_results
|
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
-
raw_data = get_raw_eval_results(results_path
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
17 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
18 |
df = df[cols].round(decimals=2)
|
19 |
|
20 |
-
# filter out if any of the benchmarks have not been produced
|
21 |
-
df = df[has_no_nan_values(df, benchmark_cols)]
|
22 |
return df
|
23 |
|
24 |
|
25 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
26 |
"""Creates the different dataframes for the evaluation queues requestes"""
|
27 |
-
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
|
28 |
all_evals = []
|
29 |
|
30 |
-
for
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
all_evals.append(data)
|
40 |
-
elif ".md" not in entry:
|
41 |
-
# this is a folder
|
42 |
-
sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
|
43 |
-
for sub_entry in sub_entries:
|
44 |
-
file_path = os.path.join(save_path, entry, sub_entry)
|
45 |
-
with open(file_path) as fp:
|
46 |
-
data = json.load(fp)
|
47 |
-
|
48 |
-
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
49 |
-
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
50 |
-
all_evals.append(data)
|
51 |
|
52 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
53 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
|
|
1 |
import json
|
2 |
+
from pathlib import Path
|
3 |
|
4 |
import pandas as pd
|
5 |
|
|
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Build the leaderboard table from the evaluation results under ``results_path``.

    Every result returned by ``get_raw_eval_results`` is converted with ``to_dict()``.
    The rows are ordered by the average column (highest first), restricted to ``cols``
    and rounded to two decimals. ``requests_path`` and ``benchmark_cols`` are accepted
    but not used by this function.
    """
    records = [eval_result.to_dict() for eval_result in get_raw_eval_results(results_path)]

    leaderboard = pd.DataFrame.from_records(records)
    leaderboard = leaderboard.sort_values(by=[AutoEvalColumn.average.name], ascending=False)

    return leaderboard[cols].round(decimals=2)
|
21 |
|
22 |
|
23 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
24 |
"""Creates the different dataframes for the evaluation queues requestes"""
|
|
|
25 |
all_evals = []
|
26 |
|
27 |
+
for file_path in Path(save_path).rglob("requests_*.json"):
|
28 |
+
with open(file_path) as fp:
|
29 |
+
data = json.load(fp)["leaderboard"]
|
30 |
+
|
31 |
+
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
32 |
+
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
33 |
+
|
34 |
+
all_evals.append(data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
37 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
src/submission/check_validity.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
-
from collections import defaultdict
|
4 |
from typing import Any
|
5 |
|
6 |
import huggingface_hub
|
@@ -75,11 +74,19 @@ def get_model_arch(model_info: ModelInfo):
|
|
75 |
"""Gets the model architecture from the configuration"""
|
76 |
return model_info.config.get("architectures", "Unknown")
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
79 |
"""Gather a list of already submitted models to avoid duplicates"""
|
80 |
depth = 1
|
81 |
-
|
82 |
-
users_to_submission_dates = defaultdict(list)
|
83 |
|
84 |
for root, _, files in os.walk(requested_models_dir):
|
85 |
current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
|
@@ -89,12 +96,8 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
89 |
continue
|
90 |
with open(os.path.join(root, file), "r") as f:
|
91 |
info = json.load(f)
|
92 |
-
file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
continue
|
97 |
-
organisation, _ = info["model"].split("/")
|
98 |
-
users_to_submission_dates[organisation].append(info["submitted_time"])
|
99 |
|
100 |
-
return set(
|
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
from typing import Any
|
4 |
|
5 |
import huggingface_hub
|
|
|
74 |
"""Gets the model architecture from the configuration"""
|
75 |
return model_info.config.get("architectures", "Unknown")
|
76 |
|
77 |
+
def get_model_properties(configuration: dict) -> tuple[str, str, str, int, str, int]:
    """Extract the properties that identify a run from a results configuration.

    Returns ``(model_name, revision, precision, seed, prompt_version, n_shot)``.
    The shot count and prompt version are taken from the first entry of the
    ``"n-shot"`` and ``"versions"`` mappings respectively.
    """
    model_name = configuration["model_name_sanitized"]

    run_config = configuration["config"]
    revision = run_config["model_revision"]
    # Keep only the text after the last "." of the recorded dtype.
    dtype_parts = run_config["model_dtype"].split(".")
    precision = dtype_parts[-1]
    seed = run_config["random_seed"]

    shots_per_task = list(configuration["n-shot"].values())
    n_shot = shots_per_task[0]
    versions_per_task = list(configuration["versions"].values())
    prompt_version = versions_per_task[0]

    return model_name, revision, precision, seed, prompt_version, n_shot
|
85 |
+
|
86 |
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
87 |
"""Gather a list of already submitted models to avoid duplicates"""
|
88 |
depth = 1
|
89 |
+
run_names = []
|
|
|
90 |
|
91 |
for root, _, files in os.walk(requested_models_dir):
|
92 |
current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
|
|
|
96 |
continue
|
97 |
with open(os.path.join(root, file), "r") as f:
|
98 |
info = json.load(f)
|
|
|
99 |
|
100 |
+
properties = get_model_properties(info)
|
101 |
+
run_names.append("_".join([str(property) for property in properties]))
|
|
|
|
|
|
|
102 |
|
103 |
+
return set(run_names)
|
src/submission/submit.py
CHANGED
@@ -1,117 +1,157 @@
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
from datetime import datetime, timezone
|
|
|
4 |
|
5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
-
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
-
from src.submission.check_validity import
|
8 |
-
already_submitted_models,
|
9 |
-
check_model_card,
|
10 |
-
get_model_size,
|
11 |
-
is_model_on_hub,
|
12 |
-
)
|
13 |
|
14 |
REQUESTED_MODELS = None
|
15 |
-
USERS_TO_SUBMISSION_DATES = None
|
16 |
|
17 |
-
def add_new_eval(
|
18 |
-
model: str,
|
19 |
-
base_model: str,
|
20 |
-
revision: str,
|
21 |
-
precision: str,
|
22 |
-
weight_type: str,
|
23 |
-
model_type: str,
|
24 |
-
):
|
25 |
-
global REQUESTED_MODELS
|
26 |
-
global USERS_TO_SUBMISSION_DATES
|
27 |
-
if not REQUESTED_MODELS:
|
28 |
-
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
if
|
33 |
-
|
34 |
-
model_path = model.split("/")[1]
|
35 |
|
36 |
-
|
37 |
-
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
38 |
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
41 |
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
# Does the model actually exist?
|
45 |
-
if revision == "":
|
46 |
-
revision = "main"
|
47 |
-
model_args["revision"] = revision
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
|
56 |
-
model_on_hub, error, _ = is_model_on_hub(model_name=model, model_args=model_args, token=TOKEN, test_tokenizer=True)
|
57 |
-
if not model_on_hub:
|
58 |
-
return styled_error(f'Model "{model}" {error}')
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
model_info = API.model_info(repo_id=model, revision=revision)
|
63 |
-
except Exception:
|
64 |
-
return styled_error("Could not get your model information. Please fill it up properly.")
|
65 |
|
66 |
-
|
|
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
license = model_info.cardData["license"]
|
71 |
-
except Exception:
|
72 |
-
return styled_error("Please select a license for your model")
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
77 |
|
78 |
# Seems good, creating the eval
|
79 |
print("Adding new eval")
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
"
|
|
|
|
|
|
|
|
|
84 |
"revision": revision,
|
85 |
"precision": precision,
|
86 |
-
"
|
87 |
-
"
|
|
|
|
|
|
|
|
|
|
|
88 |
"submitted_time": current_time,
|
89 |
-
"
|
90 |
-
"likes": model_info.likes,
|
91 |
-
"params": model_size,
|
92 |
-
"license": license,
|
93 |
-
"private": False,
|
94 |
}
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
os.makedirs(OUT_DIR, exist_ok=True)
|
103 |
-
out_path = f"{OUT_DIR}/{
|
104 |
|
105 |
with open(out_path, "w") as f:
|
106 |
-
f.write(json.dumps(
|
107 |
|
108 |
-
print("Uploading
|
109 |
API.upload_file(
|
110 |
path_or_fileobj=out_path,
|
111 |
path_in_repo=out_path.split("eval-queue/")[1],
|
112 |
repo_id=QUEUE_REPO,
|
113 |
repo_type="dataset",
|
114 |
-
commit_message=f"Add {
|
115 |
)
|
116 |
|
117 |
# Remove the local file
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import re
|
4 |
from datetime import datetime, timezone
|
5 |
+
from pathlib import Path
|
6 |
|
7 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
8 |
+
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, PROMPT_VERSIONS, PREDICTIONS_REPO
|
9 |
+
from src.submission.check_validity import already_submitted_models, is_model_on_hub, get_model_properties
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
REQUESTED_MODELS = None
|
|
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
+
def _parse_model_args(raw_model_args):
    """Turn a ``key=value,key=value`` string into a dict.

    Each entry is split on its first ``=`` only, so a value may itself contain ``=``.
    Empty entries are skipped; an entry without ``=`` raises ``ValueError``.
    """
    parsed = {}
    for arg in (raw_model_args or "").split(","):
        if not arg:
            continue
        key, separator, value = arg.partition("=")
        if not separator:
            raise ValueError(f"Malformed model argument: {arg}")
        parsed[key] = value
    return parsed


def read_configuration(file_paths):
    """Validate an uploaded output folder and extract its submission metadata.

    ``file_paths`` is the list of uploaded file objects (each exposing ``.name``);
    it must contain exactly one ``.json`` configuration file plus one
    ``samples_<task>_*.jsonl`` prediction file per configured task.
    The list passed in is not modified.

    Returns a 6-tuple ``(data, prediction_files, model_name, version, n_shot, message)``.
    On failure ``data`` and ``prediction_files`` are ``None`` and ``message`` is a styled
    error. On success ``message`` is a styled confirmation, or a styled warning that
    lists uploaded files which were not used.
    """
    # Work on a copy so the caller's list is left untouched.
    remaining_files = list(file_paths or [])

    configuration_files = [file_path for file_path in remaining_files if file_path.name.endswith(".json")]
    if len(configuration_files) != 1:
        return None, None, None, None, None, styled_error(f"Expected exactly one configuration file but found {len(configuration_files)}!")

    configuration_file_path = configuration_files[0]
    remaining_files.remove(configuration_file_path)

    try:
        with open(configuration_file_path.name, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return None, None, None, None, None, styled_error("Failed to read configuration file!")

    # Bound before the try so the error handler can always reference it,
    # even when "model_name" itself is the missing key.
    model_name = None
    try:
        model_name = data["model_name"]
        model_args = {
            **_parse_model_args(data["config"].get("model_args", "")),
            "revision": data["config"]["model_revision"],
            "trust_remote_code": True,
            "cache_dir": None,
        }
        base_model = model_args.pop("pretrained")
        model_on_hub, error, _ = is_model_on_hub(model_name=base_model, model_args=model_args, token=TOKEN, test_tokenizer=True)
        if not model_on_hub:
            return None, None, model_name, None, None, styled_error(f"Model {model_name} {error}")

        limit = data["config"]["limit"]
        if limit is not None:
            return None, None, model_name, None, None, styled_error(f"Only full results are accepted but found a specified limit of {limit}!")

        prediction_files = {}
        versions = {}
        n_shots = {}
        for task_name in data["configs"]:
            # Escape the task name so regex metacharacters in it are matched literally.
            sample_pattern = re.compile(rf"samples_{re.escape(task_name)}_.*\.jsonl")
            sample_files = [file_path for file_path in remaining_files if sample_pattern.search(file_path.name)]
            if len(sample_files) == 0:
                return None, None, model_name, None, None, styled_error(f"No prediction file found for configured task {task_name}!")

            # Claim the first match so it is not reported as ignored later.
            remaining_files.remove(sample_files[0])
            prediction_files[task_name] = str(sample_files[0])

            versions[task_name] = data["versions"][task_name]
            n_shots[task_name] = data["n-shot"][task_name]
        if len(prediction_files) == 0:
            return None, None, model_name, None, None, styled_error("No tasks found in configuration!")

        versions = set(versions.values())
        if len(versions) != 1:
            return None, None, model_name, None, None, styled_error(f"All tasks should have the same version but found {versions}!")
        version = list(versions)[0]
        if version not in PROMPT_VERSIONS:
            return None, None, model_name, None, None, styled_error(f"Unknown version {version}, should be one of {PROMPT_VERSIONS}!")

        n_shots = set(n_shots.values())
        if len(n_shots) != 1:
            return None, None, model_name, version, None, styled_error(f"All tasks should have the same number of shots but found {n_shots}!")
        n_shot = list(n_shots)[0]
    except (KeyError, TypeError, ValueError):
        # Missing keys, a non-object JSON document, or malformed model arguments.
        return None, None, model_name, None, None, styled_error("Wrong configuration file format!")

    if len(remaining_files) > 0:
        ignored_files = [Path(file_path).name for file_path in remaining_files]
        return data, prediction_files, model_name, version, n_shot, styled_warning(f"The following files will be ignored: {ignored_files}")
    return data, prediction_files, model_name, version, n_shot, styled_message("Files parsed successfully, verify that read metadata is correct before submitting")
|
77 |
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
def add_new_eval(
|
80 |
+
model_training: str,
|
81 |
+
maltese_training: str,
|
82 |
+
language_count: int,
|
83 |
+
configuration: dict,
|
84 |
+
prediction_files: dict[str, str],
|
85 |
+
):
|
86 |
+
global REQUESTED_MODELS
|
87 |
+
if not REQUESTED_MODELS:
|
88 |
+
REQUESTED_MODELS = already_submitted_models(EVAL_REQUESTS_PATH)
|
89 |
|
90 |
+
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S.%f")
|
|
|
|
|
|
|
91 |
|
92 |
+
if configuration is None or configuration == {} or prediction_files is None or prediction_files == {}:
|
93 |
+
return styled_error("No files selected for upload, please upload an output folder (or wait for the files to finish uploading).")
|
|
|
|
|
|
|
94 |
|
95 |
+
if model_training is None or model_training == "":
|
96 |
+
return styled_error("Please select the model's overall training.")
|
97 |
|
98 |
+
if maltese_training is None or maltese_training == "":
|
99 |
+
return styled_error("Please select the model's Maltese training.")
|
|
|
|
|
|
|
100 |
|
101 |
+
if language_count is None or language_count < 1:
|
102 |
+
language_count = None
|
103 |
+
|
104 |
+
model_name, revision, precision, seed, prompt_version, n_shot = get_model_properties(configuration)
|
105 |
+
model_id = configuration["model_name"]
|
106 |
|
107 |
# Seems good, creating the eval
|
108 |
print("Adding new eval")
|
109 |
|
110 |
+
# Check for duplicate submission
|
111 |
+
if f"{model_name}_{revision}_{precision}_{seed}_{prompt_version}_{n_shot}" in REQUESTED_MODELS:
|
112 |
+
return styled_warning("This model has been already submitted.")
|
113 |
+
|
114 |
+
request = {
|
115 |
+
"model": model_id,
|
116 |
+
"model_args": dict({tuple(arg.split("=")) for arg in configuration["config"].get("model_args", "").split(",") if len(arg) > 0}),
|
117 |
"revision": revision,
|
118 |
"precision": precision,
|
119 |
+
"seed": seed,
|
120 |
+
"n_shot": n_shot,
|
121 |
+
"prompt_version": prompt_version,
|
122 |
+
"tasks": list(configuration["configs"].keys()),
|
123 |
+
"model_training": model_training,
|
124 |
+
"maltese_training": maltese_training,
|
125 |
+
"language_count": language_count,
|
126 |
"submitted_time": current_time,
|
127 |
+
"status": "PENDING",
|
|
|
|
|
|
|
|
|
128 |
}
|
129 |
|
130 |
+
for task_name, file_path in prediction_files.items():
|
131 |
+
print(f"Uploading {model_id} {task_name} prediction file")
|
132 |
+
API.upload_file(
|
133 |
+
path_or_fileobj=file_path,
|
134 |
+
path_in_repo=f"{n_shot}-shot_{prompt_version}/{model_name}_{revision}_{precision}/{seed}-seed/samples_{task_name}_{current_time}.jsonl",
|
135 |
+
repo_id=PREDICTIONS_REPO,
|
136 |
+
repo_type="dataset",
|
137 |
+
commit_message=f"Add {configuration['model_name']} {task_name} {n_shot}-shot outputs",
|
138 |
+
)
|
139 |
+
|
140 |
+
print(f"Creating {model_id} configruation file")
|
141 |
+
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{model_name}"
|
142 |
os.makedirs(OUT_DIR, exist_ok=True)
|
143 |
+
out_path = f"{OUT_DIR}/requests_{model_name}_{revision}_{precision}_{n_shot}shot_{prompt_version}_{seed}seed_{current_time}.json"
|
144 |
|
145 |
with open(out_path, "w") as f:
|
146 |
+
f.write(json.dumps({"leaderboard": request, "configuration": configuration}, ensure_ascii=False, indent=2))
|
147 |
|
148 |
+
print(f"Uploading {model_id} configuration file")
|
149 |
API.upload_file(
|
150 |
path_or_fileobj=out_path,
|
151 |
path_in_repo=out_path.split("eval-queue/")[1],
|
152 |
repo_id=QUEUE_REPO,
|
153 |
repo_type="dataset",
|
154 |
+
commit_message=f"Add {configuration['model_name']} {n_shot}-shot to eval queue",
|
155 |
)
|
156 |
|
157 |
# Remove the local file
|