Simplify leaderboard to EMEA-sen and MEDLINE tasks only
- Update Tasks enum to only include emea_ner and medline_ner
- Simplify interface text and remove unnecessary complexity
- Update French localization
- Remove hardcoded dataset information
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
- app.py +5 -29
- pyproject.toml +36 -0
- src/about.py +51 -52
- src/display/utils.py +13 -28
- src/leaderboard/read_evals.py +28 -12
- src/submission/submit.py +4 -19
- test_fixes.py +199 -0
app.py
CHANGED
@@ -19,9 +19,7 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
@@ -72,7 +70,6 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
@@ -100,7 +97,6 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
@@ -171,48 +167,28 @@ with demo:
                                 row_count=5,
                             )
             with gr.Row():
-                gr.Markdown("# ✉️✨
+                gr.Markdown("# ✉️✨ Soumettez votre modèle ici !", elem_classes="markdown-text")
 
             with gr.Row():
                 with gr.Column():
-                    model_name_textbox = gr.Textbox(label="
-                    revision_name_textbox = gr.Textbox(label="
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Nom du modèle")
+                    revision_name_textbox = gr.Textbox(label="Révision commit", placeholder="main")
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="
+                        label="Précision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
-            submit_button = gr.Button("
+            submit_button = gr.Button("Soumettre l'évaluation")
             submission_result = gr.Markdown()
             submit_button.click(
                 add_new_eval,
                 [
                     model_name_textbox,
-                    base_model_name_textbox,
                     revision_name_textbox,
                     precision,
-                    weight_type,
-                    model_type,
                 ],
                 submission_result,
             )
pyproject.toml
CHANGED
@@ -1,3 +1,39 @@
+[tool.poetry]
+name = "french-medical-nlp-leaderboard"
+version = "0.1.0"
+description = "Leaderboard for French medical NLP models"
+authors = ["rntc <[email protected]>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.8"
+APScheduler = "*"
+black = "*"
+datasets = "*"
+gradio = "*"
+gradio-leaderboard = "0.0.13"
+gradio-client = "*"
+huggingface-hub = ">=0.18.0"
+matplotlib = "*"
+numpy = "*"
+pandas = "*"
+python-dateutil = "*"
+tqdm = "*"
+transformers = "*"
+tokenizers = ">=0.15.0"
+sentencepiece = "*"
+torch = ">=2.6.0"
+seqeval = ">=1.2.2"
+scikit-learn = ">=1.3.0"
+
+[tool.poetry.group.dev.dependencies]
+ruff = "*"
+isort = "*"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
 select = ["E", "F"]
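With the added `[tool.poetry]` and `[build-system]` tables, the Space can be installed as a regular Poetry project: assuming a standard Poetry setup, `poetry install` resolves the dependencies listed above (including `seqeval`, which the NER evaluation relies on), and `poetry run python app.py` should launch the Gradio app locally.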
src/about.py
CHANGED
@@ -12,8 +12,8 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    emea_ner = Task("emea_ner", "f1", "EMEA
-    medline_ner = Task("medline_ner", "f1", "MEDLINE
+    emea_ner = Task("emea_ner", "f1", "EMEA")
+    medline_ner = Task("medline_ner", "f1", "MEDLINE")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,79 +21,78 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">🏥
+TITLE = """<h1 align="center" id="space-title">🏥 Leaderboard NLP Biomédical Français</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
-
+Ce leaderboard évalue les modèles NLP français sur des tâches de reconnaissance d'entités nommées (NER) biomédicales.
+Nous nous concentrons sur les modèles de type BERT avec des plans d'extension vers d'autres architectures.
 
-**
-- **EMEA
-- **MEDLINE
+**Tâches actuelles :**
+- **EMEA** : Reconnaissance d'entités sur textes médicaux français de l'EMEA
+- **MEDLINE** : Reconnaissance d'entités sur résumés médicaux français de MEDLINE
 
-
+Les modèles sont évalués par fine-tuning sur chaque tâche.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-##
+## Comment ça fonctionne
 
-
+Nous évaluons les modèles en les **fine-tunant** sur des tâches de NER médical français :
 
-**
-- **
-- **
-- **Scheduler
-- **
-- **Batch Size
-- **
-- **
-- **
+**Paramètres de fine-tuning :**
+- **Optimiseur** : AdamW
+- **Taux d'apprentissage** : 5e-5
+- **Scheduler** : Cosine avec redémarrages
+- **Étapes** : 2000
+- **Batch Size** : 4
+- **Accumulation de gradient** : 4 étapes
+- **Longueur max** : 512 tokens
+- **Sortie** : Couche linéaire simple
 
-**
+**Évaluation** : Utilise seqeval avec schéma IOB2 pour le **micro F1**, précision et rappel au niveau des entités.
 
-##
-
+## Reproductibilité
+Les résultats sont obtenus par fine-tuning approprié, pas par évaluation zero-shot. Chaque modèle est fine-tuné indépendamment sur chaque tâche.
 
-**Datasets:**
-
-- MEDLINE: `rntc/quaero-frenchmed-ner-medline`
+**Datasets :**
+Les datasets utilisés sont basés sur des corpus annotés français pour la reconnaissance d'entités biomédicales.
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
+## Avant de soumettre un modèle
 
-### 1)
+### 1) Assurez-vous que votre modèle est compatible avec les AutoClasses :
 ```python
 from transformers import AutoTokenizer, AutoModelForTokenClassification
-tokenizer = AutoTokenizer.from_pretrained("
-model = AutoModelForTokenClassification.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained("nom_de_votre_modèle")
+model = AutoModelForTokenClassification.from_pretrained("nom_de_votre_modèle")
 ```
 
-### 2)
--
--
--
--
-
-### 3)
--
--
-
-### 4)
--
--
--
--
-
-##
-
-1.
-2.
-3.
+### 2) Exigences du modèle :
+- Doit être un modèle fine-tuné pour la classification de tokens (pas juste un modèle de base)
+- Devrait être entraîné sur des données NER médicales françaises
+- Doit être publiquement disponible sur le Hub Hugging Face
+- Préférez le format safetensors pour un chargement plus rapide
+
+### 3) Performance attendue :
+- Les modèles de base sans fine-tuning obtiendront des scores très bas (~0.02 F1)
+- Les modèles fine-tunés devraient atteindre des scores significativement plus élevés
+
+### 4) Recommandations pour la carte du modèle :
+- Spécifiez le dataset d'entraînement utilisé
+- Incluez les détails de l'architecture du modèle
+- Ajoutez les métriques de performance si disponibles
+- Utilisez une licence ouverte
+
+## Dépannage
+Si votre modèle échoue à l'évaluation :
+1. Vérifiez qu'il se charge correctement avec AutoModelForTokenClassification
+2. Vérifiez qu'il est entraîné pour la classification de tokens
+3. Assurez-vous que le modèle est public et accessible
+"""
 
-CITATION_BUTTON_LABEL = "
+CITATION_BUTTON_LABEL = "Copiez le snippet suivant pour citer ces résultats"
 CITATION_BUTTON_TEXT = r"""
 """
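The fine-tuning recipe spelled out in `LLM_BENCHMARKS_TEXT` (AdamW, learning rate 5e-5, cosine schedule with restarts, 2000 steps, batch size 4 with 4-step gradient accumulation, 512-token inputs, entity-level micro F1 via seqeval on IOB2 tags) maps onto a standard `transformers` Trainer configuration. The sketch below only illustrates those hyperparameters, it is not the Space's actual evaluation harness; it assumes `transformers` (with `accelerate`) and `seqeval` are installed, and the output directory and tag values are placeholders.

```python
# Sketch of the documented recipe (illustrative only, not the leaderboard's harness).
from transformers import TrainingArguments
from seqeval.metrics import f1_score, precision_score, recall_score

# AdamW is the Trainer's default optimizer; the 512-token maximum length is
# applied at tokenization time rather than here.
training_args = TrainingArguments(
    output_dir="finetune-french-medical-ner",   # placeholder
    max_steps=2000,
    learning_rate=5e-5,
    lr_scheduler_type="cosine_with_restarts",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,              # effective batch size 16
    logging_steps=100,
)

# Entity-level scoring with seqeval on IOB2 tags; micro averaging is the default.
gold = [["B-DISO", "I-DISO", "O", "B-CHEM"]]    # placeholder label set
pred = [["B-DISO", "I-DISO", "O", "O"]]
print("micro F1 :", f1_score(gold, pred))
print("precision:", precision_score(gold, pred))
print("recall   :", recall_score(gold, pred))
```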
src/display/utils.py
CHANGED
@@ -23,21 +23,20 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(
+auto_eval_column_dict.append(("model_type_symbol", ColumnContent("T", "str", True, never_hidden=True)))
+auto_eval_column_dict.append(("model", ColumnContent("Model", "markdown", True, never_hidden=True)))
+# Average score
+auto_eval_column_dict.append(("average", ColumnContent("Average", "number", True)))
 #Scores
 for task in Tasks:
-    auto_eval_column_dict.append(
+    auto_eval_column_dict.append((task.name, ColumnContent(task.value.col_name, "number", True)))
 # Model information
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(("precision", ColumnContent("Precision", "str", False)))
+auto_eval_column_dict.append(("license", ColumnContent("Hub License", "str", False)))
+auto_eval_column_dict.append(("params", ColumnContent("#Params (B)", "number", False)))
+auto_eval_column_dict.append(("likes", ColumnContent("Hub ❤️", "number", False)))
+auto_eval_column_dict.append(("still_on_hub", ColumnContent("Available on the hub", "bool", False)))
+auto_eval_column_dict.append(("revision", ColumnContent("Model sha", "str", False, False)))
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -47,9 +46,7 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
 ## All the model information that we might need
@@ -61,10 +58,7 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -74,33 +68,24 @@ class ModelType(Enum):
     def from_str(type):
         if "fine-tuned" in type or "🔶" in type:
             return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
         return ModelType.Unknown
 
     @staticmethod
     def from_config(config):
        """Determine model type from configuration - for NER models, most will be fine-tuned"""
-        if hasattr(config, 'num_labels') and config.num_labels
+        if hasattr(config, 'num_labels') and config.num_labels > 2:
             return ModelType.FT  # Fine-tuned for NER
-        elif hasattr(config, 'num_labels') and config.num_labels == 2:
-            return ModelType.PT  # Base model
         return ModelType.Unknown
 
 class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
 
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")
 
+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
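For readers unfamiliar with the pattern, `make_dataclass` turns the `auto_eval_column_dict` entries into one frozen dataclass field per leaderboard column (plus one per task), and the column metadata is then read back generically when building the UI. The snippet below is a self-contained illustration of that idea using the stdlib's three-element field spec and `dataclasses.fields`; the simplified `ColumnContent` and the column names are stand-ins, not the repo's exact definitions.

```python
# Self-contained illustration of the make_dataclass pattern (simplified stand-in,
# not the leaderboard's exact ColumnContent or fields() helper).
from dataclasses import dataclass, field, fields, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                    # display name shown in the leaderboard
    type: str                    # column type ("str", "markdown", "number", "bool")
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# One (attribute_name, type, default) entry per column, mirroring auto_eval_column_dict.
column_specs = [
    ("model", ColumnContent, field(default=ColumnContent("Model", "markdown", True, never_hidden=True))),
    ("average", ColumnContent, field(default=ColumnContent("Average", "number", True))),
    ("emea_ner", ColumnContent, field(default=ColumnContent("EMEA", "number", True))),
    ("revision", ColumnContent, field(default=ColumnContent("Model sha", "str", False))),
]

AutoEvalColumnDemo = make_dataclass("AutoEvalColumnDemo", column_specs, frozen=True)

# Column metadata can then be read back generically, e.g. to build shown/hidden lists.
shown = [f.default.name for f in fields(AutoEvalColumnDemo) if f.default.displayed_by_default]
print(shown)  # ['Model', 'Average', 'EMEA']
```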
src/leaderboard/read_evals.py
CHANGED
@@ -33,19 +33,32 @@ class EvalResult:
     still_on_hub: bool = False
 
     @classmethod
-    def init_from_json_file(
+    def init_from_json_file(cls, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        config = data.get("config")
-
-        # Precision
-
-
-
-
-        org_and_model =
+        config = data.get("config", {})
+
+        # Precision - handle different field names
+        precision_value = config.get("model_dtype") or config.get("precision") or "Unknown"
+        precision = Precision.from_str(precision_value)
+
+        # Get model and org - handle different field names
+        org_and_model = config.get("model_name") or config.get("model_args") or config.get("model")
+        if org_and_model is None:
+            # Try to extract from filename as fallback
+            basename = os.path.basename(json_filepath)
+            if basename.startswith("results_"):
+                org_and_model = basename.replace("results_", "").replace(".json", "")
+
+        if org_and_model is None:
+            raise ValueError(f"Could not determine model name from {json_filepath}")
+
+        if "/" in org_and_model:
+            org_and_model = org_and_model.split("/", 1)
+        else:
+            org_and_model = [org_and_model]
 
         if len(org_and_model) == 1:
             org = None
@@ -57,8 +70,11 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
+        # Model revision - handle different field names
+        revision = config.get("model_sha") or config.get("revision") or "main"
+
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model,
+            full_model, revision, trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -79,14 +95,14 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
-        return
+        return cls(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
             precision=precision,
-            revision=
+            revision=revision,
             still_on_hub=still_on_hub,
             architecture=architecture
         )
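The reworked parser tolerates several config field spellings (`model_name`/`model_args`/`model`, `model_dtype`/`precision`, `model_sha`/`revision`) and falls back to the file name when no model field is present. A results file of the shape below (the same shape exercised by `test_fixes.py` later in this commit) is what it expects; the path and scores are illustrative.

```python
# Illustrative results file for init_from_json_file (same shape as the sample
# in test_fixes.py); the path and score values are placeholders.
import json

sample_result = {
    "config": {
        "model_name": "test/model",
        "model_dtype": "torch.float16",
        "model_sha": "abc123",
    },
    "results": {
        "emea_ner": {"f1": 0.85},
        "medline_ner": {"f1": 0.82},
    },
}

with open("/tmp/results_test_model.json", "w") as f:
    json.dump(sample_result, f)

# EvalResult.init_from_json_file("/tmp/results_test_model.json") should then yield
# org="test", model="model", revision="abc123" and float16 precision.
```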
src/submission/submit.py
CHANGED
@@ -16,11 +16,8 @@ USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
     model: str,
-    base_model: str,
     revision: str,
     precision: str,
-    weight_type: str,
-    model_type: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -36,23 +33,14 @@ def add_new_eval(
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
     # Does the model actually exist?
     if revision == "":
         revision = "main"
 
     # Is the model on the hub?
-
-
-
-        return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
+    model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+    if not model_on_hub:
+        return styled_error(f'Model "{model}" {error}')
 
     # Is the model info correctly filled?
     try:
@@ -77,13 +65,10 @@ def add_new_eval(
 
     eval_entry = {
         "model": model,
-        "base_model": base_model,
         "revision": revision,
         "precision": precision,
-        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
-        "model_type": model_type,
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
@@ -97,7 +82,7 @@ def add_new_eval(
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_Original.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
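With `base_model`, `weight_type`, and `model_type` removed, a successful submission now produces an eval request like the hypothetical example below (only the fields visible in this diff are shown; all values are illustrative), written to `{EVAL_REQUESTS_PATH}/{user_name}/{model_path}_eval_request_False_{precision}_Original.json`.

```python
# Hypothetical request entry written by the simplified add_new_eval
# (only fields visible in this hunk; all values are illustrative).
eval_entry = {
    "model": "my-org/french-medical-ner",   # placeholder model id
    "revision": "main",
    "precision": "float16",
    "status": "PENDING",
    "submitted_time": "2024-01-01T00:00:00Z",
    "likes": 12,
    "params": 0.11,
    "license": "mit",
}
# Saved as f"{EVAL_REQUESTS_PATH}/{user_name}/{model_path}_eval_request_False_float16_Original.json"
```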
test_fixes.py
ADDED
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""Test script to verify the fixes work correctly."""
+
+import sys
+import os
+import json
+from pathlib import Path
+
+# Add the src directory to the path
+sys.path.insert(0, str(Path(__file__).parent / "src"))
+
+def test_dataclass_creation():
+    """Test that the AutoEvalColumn dataclass can be created successfully."""
+    print("Testing AutoEvalColumn dataclass creation...")
+    try:
+        from src.display.utils import AutoEvalColumn, fields
+
+        # Test that we can access the fields
+        all_fields = fields(AutoEvalColumn)
+        print(f"✓ Successfully created AutoEvalColumn with {len(all_fields)} fields")
+
+        # Test that the average field exists
+        assert hasattr(AutoEvalColumn, 'average'), "Missing 'average' field"
+        print("✓ 'average' field exists")
+
+        # Test that we can access field names
+        field_names = [c.name for c in all_fields]
+        assert 'average' in field_names, "Average field not in field names"
+        print("✓ Average field accessible in field names")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_precision_from_str():
+    """Test that the Precision.from_str method works correctly."""
+    print("Testing Precision.from_str method...")
+    try:
+        from src.display.utils import Precision
+
+        # Test different precision values
+        result1 = Precision.from_str("torch.float16")
+        assert result1 == Precision.float16, f"Expected float16, got {result1}"
+        print("✓ torch.float16 correctly parsed")
+
+        result2 = Precision.from_str("float16")
+        assert result2 == Precision.float16, f"Expected float16, got {result2}"
+        print("✓ float16 correctly parsed")
+
+        result3 = Precision.from_str("torch.bfloat16")
+        assert result3 == Precision.bfloat16, f"Expected bfloat16, got {result3}"
+        print("✓ torch.bfloat16 correctly parsed")
+
+        result4 = Precision.from_str("unknown")
+        assert result4 == Precision.Unknown, f"Expected Unknown, got {result4}"
+        print("✓ Unknown precision correctly parsed")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_eval_result_parsing():
+    """Test that the EvalResult can parse JSON files correctly."""
+    print("Testing EvalResult JSON parsing...")
+    try:
+        from src.leaderboard.read_evals import EvalResult
+        from src.about import Tasks
+
+        # Create a sample result file
+        sample_result = {
+            "config": {
+                "model_name": "test/model",
+                "model_dtype": "torch.float16",
+                "model_sha": "abc123"
+            },
+            "results": {
+                "emea_ner": {"f1": 0.85},
+                "medline_ner": {"f1": 0.82}
+            }
+        }
+
+        # Write to temp file
+        temp_file = "/tmp/test_result.json"
+        with open(temp_file, 'w') as f:
+            json.dump(sample_result, f)
+
+        # Test parsing
+        result = EvalResult.init_from_json_file(temp_file)
+
+        assert result.full_model == "test/model", f"Expected test/model, got {result.full_model}"
+        assert result.org == "test", f"Expected test, got {result.org}"
+        assert result.model == "model", f"Expected model, got {result.model}"
+        assert result.revision == "abc123", f"Expected abc123, got {result.revision}"
+
+        print("✓ JSON parsing works correctly")
+
+        # Test with missing fields
+        sample_result_minimal = {
+            "config": {
+                "model": "test/model2"
+            },
+            "results": {
+                "emea_ner": {"f1": 0.75}
+            }
+        }
+
+        temp_file_minimal = "/tmp/test_result_minimal.json"
+        with open(temp_file_minimal, 'w') as f:
+            json.dump(sample_result_minimal, f)
+
+        result_minimal = EvalResult.init_from_json_file(temp_file_minimal)
+        assert result_minimal.full_model == "test/model2", f"Expected test/model2, got {result_minimal.full_model}"
+        print("✓ Minimal JSON parsing works correctly")
+
+        # Clean up
+        os.remove(temp_file)
+        os.remove(temp_file_minimal)
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_to_dict():
+    """Test that EvalResult.to_dict works correctly."""
+    print("Testing EvalResult.to_dict method...")
+    try:
+        from src.leaderboard.read_evals import EvalResult
+        from src.display.utils import Precision, ModelType, WeightType
+
+        # Create a test EvalResult
+        eval_result = EvalResult(
+            eval_name="test_model_float16",
+            full_model="test/model",
+            org="test",
+            model="model",
+            revision="abc123",
+            results={"emea_ner": 85.0, "medline_ner": 82.0},
+            precision=Precision.float16,
+            model_type=ModelType.FT,
+            weight_type=WeightType.Original,
+            architecture="BertForTokenClassification",
+            license="MIT",
+            likes=10,
+            num_params=110,
+            date="2023-01-01",
+            still_on_hub=True
+        )
+
+        # Test to_dict conversion
+        result_dict = eval_result.to_dict()
+
+        # Check that all required fields are present
+        assert "average" in result_dict, "Missing average field in dict"
+        assert result_dict["average"] == 83.5, f"Expected average 83.5, got {result_dict['average']}"
+
+        print("✓ to_dict method works correctly")
+        print(f" - Average: {result_dict['average']}")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def main():
+    """Run all tests."""
+    print("Running bug fix tests...\n")
+
+    tests = [
+        test_dataclass_creation,
+        test_precision_from_str,
+        test_eval_result_parsing,
+        test_to_dict,
+    ]
+
+    results = []
+    for test in tests:
+        print(f"\n{'='*50}")
+        try:
+            result = test()
+            results.append(result)
+        except Exception as e:
+            print(f"✗ Test {test.__name__} failed with exception: {e}")
+            results.append(False)
+
+    print(f"\n{'='*50}")
+    print(f"Test Results: {sum(results)}/{len(results)} tests passed")
+
+    if all(results):
+        print("🎉 All tests passed! The fixes are working correctly.")
+        return 0
+    else:
+        print("❌ Some tests failed. Please check the output above.")
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main())
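The script is self-contained: `python test_fixes.py` puts the repository's `src/` directory on `sys.path`, runs the four checks above, and exits with a non-zero status if any of them fail.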