rntc and Claude committed
Commit fefe31a · Parent(s): f127f6a

Simplify leaderboard to EMEA-sen and MEDLINE tasks only


- Update Tasks enum to only include emea_ner and medline_ner
- Simplify interface text and remove unnecessary complexity
- Update French localization
- Remove hardcoded dataset information

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
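
For reference, after this commit the task registry reduces to the two French biomedical NER tasks (EMEA and MEDLINE). A minimal sketch of the resulting enum, assembled from the `src/about.py` hunk below; the field names of the `Task` dataclass are assumed from the standard leaderboard template:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results JSON
    metric: str      # metric key in the results JSON
    col_name: str    # column name shown in the leaderboard

class Tasks(Enum):
    # Only the two French biomedical NER tasks remain after this commit
    emea_ner = Task("emea_ner", "f1", "EMEA")
    medline_ner = Task("medline_ner", "f1", "MEDLINE")
```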

app.py CHANGED
@@ -19,9 +19,7 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
@@ -72,7 +70,6 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
@@ -100,7 +97,6 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
@@ -171,48 +167,28 @@ with demo:
                 row_count=5,
             )
         with gr.Row():
-            gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+            gr.Markdown("# ✉️✨ Soumettez votre modèle ici !", elem_classes="markdown-text")

         with gr.Row():
             with gr.Column():
-                model_name_textbox = gr.Textbox(label="Model name")
-                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                model_type = gr.Dropdown(
-                    choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                    label="Model type",
-                    multiselect=False,
-                    value=None,
-                    interactive=True,
-                )
-
-            with gr.Column():
+                model_name_textbox = gr.Textbox(label="Nom du modèle")
+                revision_name_textbox = gr.Textbox(label="Révision commit", placeholder="main")
                 precision = gr.Dropdown(
                     choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                    label="Precision",
+                    label="Précision",
                     multiselect=False,
                     value="float16",
                     interactive=True,
                 )
-                weight_type = gr.Dropdown(
-                    choices=[i.value.name for i in WeightType],
-                    label="Weights type",
-                    multiselect=False,
-                    value="Original",
-                    interactive=True,
-                )
-                base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-        submit_button = gr.Button("Submit Eval")
+        submit_button = gr.Button("Soumettre l'évaluation")
         submission_result = gr.Markdown()
         submit_button.click(
             add_new_eval,
             [
                 model_name_textbox,
-                base_model_name_textbox,
                 revision_name_textbox,
                 precision,
-                weight_type,
-                model_type,
             ],
             submission_result,
         )
pyproject.toml CHANGED
@@ -1,3 +1,39 @@
+[tool.poetry]
+name = "french-medical-nlp-leaderboard"
+version = "0.1.0"
+description = "Leaderboard for French medical NLP models"
+authors = ["rntc <[email protected]>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.8"
+APScheduler = "*"
+black = "*"
+datasets = "*"
+gradio = "*"
+gradio-leaderboard = "0.0.13"
+gradio-client = "*"
+huggingface-hub = ">=0.18.0"
+matplotlib = "*"
+numpy = "*"
+pandas = "*"
+python-dateutil = "*"
+tqdm = "*"
+transformers = "*"
+tokenizers = ">=0.15.0"
+sentencepiece = "*"
+torch = ">=2.6.0"
+seqeval = ">=1.2.2"
+scikit-learn = ">=1.3.0"
+
+[tool.poetry.group.dev.dependencies]
+ruff = "*"
+isort = "*"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
 select = ["E", "F"]
src/about.py CHANGED
@@ -12,8 +12,8 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    emea_ner = Task("emea_ner", "f1", "EMEA NER")
-    medline_ner = Task("medline_ner", "f1", "MEDLINE NER")
+    emea_ner = Task("emea_ner", "f1", "EMEA")
+    medline_ner = Task("medline_ner", "f1", "MEDLINE")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,79 +21,78 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">🏥 French Medical NLP Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🏥 Leaderboard NLP Biomédical Français</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates French NLP models on biomedical Named Entity Recognition (NER) tasks.
-We focus on BERT-like models with plans to extend to other architectures.
+Ce leaderboard évalue les modèles NLP français sur des tâches de reconnaissance d'entités nommées (NER) biomédicales.
+Nous nous concentrons sur les modèles de type BERT avec des plans d'extension vers d'autres architectures.

-**Current Tasks:**
-- **EMEA NER**: Named Entity Recognition on French medical texts from EMEA (European Medicines Agency)
-- **MEDLINE NER**: Named Entity Recognition on French medical abstracts from MEDLINE
+**Tâches actuelles :**
+- **EMEA** : Reconnaissance d'entités sur textes médicaux français de l'EMEA
+- **MEDLINE** : Reconnaissance d'entités sur résumés médicaux français de MEDLINE

-**Entity Types:** ANAT, CHEM, DEVI, DISO, GEOG, LIVB, OBJC, PHEN, PHYS, PROC
+Les modèles sont évalués par fine-tuning sur chaque tâche.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
+## Comment ça fonctionne

-We evaluate models by **fine-tuning** them on French medical NER tasks following the CamemBERT-bio methodology:
+Nous évaluons les modèles en les **fine-tunant** sur des tâches de NER médical français :

-**Fine-tuning Parameters:**
-- **Optimizer**: AdamW (following CamemBERT-bio paper)
-- **Learning Rate**: 5e-5 (optimal from Optuna search - unchanged)
-- **Scheduler**: Cosine with restarts (22.4% warmup ratio)
-- **Steps**: 2000 (same as paper)
-- **Batch Size**: 4 (CPU constraint)
-- **Gradient Accumulation**: 4 steps (effective batch size 16)
-- **Max Length**: 512 tokens
-- **Output**: Simple linear layer (no CRF)
+**Paramètres de fine-tuning :**
+- **Optimiseur** : AdamW
+- **Taux d'apprentissage** : 5e-5
+- **Scheduler** : Cosine avec redémarrages
+- **Étapes** : 2000
+- **Batch Size** : 4
+- **Accumulation de gradient** : 4 étapes
+- **Longueur max** : 512 tokens
+- **Sortie** : Couche linéaire simple

-**Evaluation**: Uses seqeval with IOB2 scheme for entity-level **micro F1**, precision, and recall.
+**Évaluation** : Utilise seqeval avec schéma IOB2 pour le **micro F1**, précision et rappel au niveau des entités.

-## Reproducibility
-Results are obtained through proper fine-tuning, not zero-shot evaluation. Each model is fine-tuned independently on each task.
+## Reproductibilité
+Les résultats sont obtenus par fine-tuning approprié, pas par évaluation zero-shot. Chaque modèle est fine-tuné indépendamment sur chaque tâche.

-**Datasets:**
-- EMEA: `rntc/quaero-frenchmed-ner-emea-sen`
-- MEDLINE: `rntc/quaero-frenchmed-ner-medline`
+**Datasets :**
+Les datasets utilisés sont basés sur des corpus annotés français pour la reconnaissance d'entités biomédicales.
 """

 EVALUATION_QUEUE_TEXT = """
-## Before submitting a model
+## Avant de soumettre un modèle

-### 1) Ensure your model is compatible with AutoClasses:
+### 1) Assurez-vous que votre modèle est compatible avec les AutoClasses :
 ```python
 from transformers import AutoTokenizer, AutoModelForTokenClassification
-tokenizer = AutoTokenizer.from_pretrained("your_model_name")
-model = AutoModelForTokenClassification.from_pretrained("your_model_name")
+tokenizer = AutoTokenizer.from_pretrained("nom_de_votre_modèle")
+model = AutoModelForTokenClassification.from_pretrained("nom_de_votre_modèle")
 ```

-### 2) Model requirements:
-- Must be a fine-tuned model for token classification (not just a base model)
-- Should be trained on French medical NER data
-- Must be publicly available on Hugging Face Hub
-- Prefer safetensors format for faster loading
-
-### 3) Expected performance:
-- Base models without fine-tuning will get very low scores (~0.02 F1)
-- Fine-tuned models should achieve significantly higher scores
-
-### 4) Model card recommendations:
-- Specify the training dataset used
-- Include model architecture details
-- Add performance metrics if available
-- Use an open license
-
-## Troubleshooting
-If your model fails evaluation:
-1. Check that it loads properly with AutoModelForTokenClassification
-2. Verify it's trained for token classification (not just language modeling)
-3. Ensure the model is public and accessible
+### 2) Exigences du modèle :
+- Doit être un modèle fine-tuné pour la classification de tokens (pas juste un modèle de base)
+- Devrait être entraîné sur des données NER médicales françaises
+- Doit être publiquement disponible sur le Hub Hugging Face
+- Préférez le format safetensors pour un chargement plus rapide
+
+### 3) Performance attendue :
+- Les modèles de base sans fine-tuning obtiendront des scores très bas (~0.02 F1)
+- Les modèles fine-tunés devraient atteindre des scores significativement plus élevés
+
+### 4) Recommandations pour la carte du modèle :
+- Spécifiez le dataset d'entraînement utilisé
+- Incluez les détails de l'architecture du modèle
+- Ajoutez les métriques de performance si disponibles
+- Utilisez une licence ouverte
+
+## Dépannage
+Si votre modèle échoue à l'évaluation :
+1. Vérifiez qu'il se charge correctement avec AutoModelForTokenClassification
+2. Vérifiez qu'il est entraîné pour la classification de tokens
+3. Assurez-vous que le modèle est public et accessible
 """

-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_LABEL = "Copiez le snippet suivant pour citer ces résultats"
 CITATION_BUTTON_TEXT = r"""
 """
src/display/utils.py CHANGED
@@ -23,21 +23,20 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(("model_type_symbol", ColumnContent("T", "str", True, never_hidden=True)))
+auto_eval_column_dict.append(("model", ColumnContent("Model", "markdown", True, never_hidden=True)))
+# Average score
+auto_eval_column_dict.append(("average", ColumnContent("Average", "number", True)))
 #Scores
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append((task.name, ColumnContent(task.value.col_name, "number", True)))
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(("precision", ColumnContent("Precision", "str", False)))
+auto_eval_column_dict.append(("license", ColumnContent("Hub License", "str", False)))
+auto_eval_column_dict.append(("params", ColumnContent("#Params (B)", "number", False)))
+auto_eval_column_dict.append(("likes", ColumnContent("Hub ❤️", "number", False)))
+auto_eval_column_dict.append(("still_on_hub", ColumnContent("Available on the hub", "bool", False)))
+auto_eval_column_dict.append(("revision", ColumnContent("Model sha", "str", False, False)))

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -47,9 +46,7 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)

 ## All the model information that we might need
@@ -61,10 +58,7 @@ class ModelDetails:


 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
@@ -74,33 +68,24 @@ class ModelType(Enum):
     def from_str(type):
         if "fine-tuned" in type or "🔶" in type:
             return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
         return ModelType.Unknown

     @staticmethod
     def from_config(config):
         """Determine model type from configuration - for NER models, most will be fine-tuned"""
-        if hasattr(config, 'num_labels') and config.num_labels == 21:
+        if hasattr(config, 'num_labels') and config.num_labels > 2:
             return ModelType.FT  # Fine-tuned for NER
-        elif hasattr(config, 'num_labels') and config.num_labels == 2:
-            return ModelType.PT  # Base model
         return ModelType.Unknown

 class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")

 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")

+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
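
One small behavioural change above: `Precision.from_str` is now explicitly a `@staticmethod`, so it can be called on the enum class without an instance. A quick sketch of the intended usage, mirroring the checks in `test_fixes.py` further down (run from the Space root so `src` is importable):

```python
from src.display.utils import Precision

# from_str maps dtype strings to Precision members; unrecognised values fall back to Unknown
assert Precision.from_str("torch.float16") == Precision.float16
assert Precision.from_str("float16") == Precision.float16
assert Precision.from_str("torch.bfloat16") == Precision.bfloat16
assert Precision.from_str("unknown") == Precision.Unknown
```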
src/leaderboard/read_evals.py CHANGED
@@ -33,19 +33,32 @@ class EvalResult:
     still_on_hub: bool = False

     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(cls, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)

-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
+        config = data.get("config", {})
+
+        # Precision - handle different field names
+        precision_value = config.get("model_dtype") or config.get("precision") or "Unknown"
+        precision = Precision.from_str(precision_value)
+
+        # Get model and org - handle different field names
+        org_and_model = config.get("model_name") or config.get("model_args") or config.get("model")
+        if org_and_model is None:
+            # Try to extract from filename as fallback
+            basename = os.path.basename(json_filepath)
+            if basename.startswith("results_"):
+                org_and_model = basename.replace("results_", "").replace(".json", "")
+
+        if org_and_model is None:
+            raise ValueError(f"Could not determine model name from {json_filepath}")
+
+        if "/" in org_and_model:
+            org_and_model = org_and_model.split("/", 1)
+        else:
+            org_and_model = [org_and_model]

         if len(org_and_model) == 1:
             org = None
@@ -57,8 +70,11 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

+        # Model revision - handle different field names
+        revision = config.get("model_sha") or config.get("revision") or "main"
+
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision, trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -79,14 +95,14 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

-        return self(
+        return cls(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
             precision=precision,
-            revision= config.get("model_sha", ""),
+            revision=revision,
             still_on_hub=still_on_hub,
             architecture=architecture
         )
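
For context, the parser above now tolerates several config field names and falls back to the file name when no model key is present. A sketch of a results payload in the shape used by `test_fixes.py` below (paths and values are illustrative; the actual parsing call is commented out because it also checks the Hub):

```python
import json

# Illustrative results payload; alternate accepted keys are noted in comments
payload = {
    "config": {
        "model_name": "test/model",      # also accepted: "model_args" or "model"
        "model_dtype": "torch.float16",  # also accepted: "precision"
        "model_sha": "abc123",           # also accepted: "revision"; defaults to "main"
    },
    "results": {
        "emea_ner": {"f1": 0.85},
        "medline_ner": {"f1": 0.82},
    },
}

# File name follows the "results_*" convention used by the filename fallback
with open("/tmp/results_test_model.json", "w") as f:
    json.dump(payload, f)

# from src.leaderboard.read_evals import EvalResult
# result = EvalResult.init_from_json_file("/tmp/results_test_model.json")
```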
src/submission/submit.py CHANGED
@@ -16,11 +16,8 @@ USERS_TO_SUBMISSION_DATES = None

 def add_new_eval(
     model: str,
-    base_model: str,
     revision: str,
     precision: str,
-    weight_type: str,
-    model_type: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -36,23 +33,14 @@ def add_new_eval(
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
     # Does the model actually exist?
     if revision == "":
         revision = "main"

     # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
+    model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+    if not model_on_hub:
+        return styled_error(f'Model "{model}" {error}')

     # Is the model info correctly filled?
     try:
@@ -77,13 +65,10 @@ def add_new_eval(
     eval_entry = {
         "model": model,
-        "base_model": base_model,
         "revision": revision,
         "precision": precision,
-        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
-        "model_type": model_type,
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
@@ -97,7 +82,7 @@ def add_new_eval(
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_Original.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
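
With the base model, weight type, and model type arguments removed, a submission now only needs the model id, revision, and precision. A hedged sketch of calling the simplified entry point directly (the model id is a placeholder; in the app this function is wired to the Gradio submit button and will contact the Hub and write a request file):

```python
from src.submission.submit import add_new_eval

# Hypothetical direct call with the simplified signature: model, revision, precision
message = add_new_eval(
    model="some-org/some-french-medical-ner-model",  # placeholder model id
    revision="main",
    precision="float16",
)
print(message)
```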
test_fixes.py ADDED
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""Test script to verify the fixes work correctly."""
+
+import sys
+import os
+import json
+from pathlib import Path
+
+# Add the src directory to the path
+sys.path.insert(0, str(Path(__file__).parent / "src"))
+
+def test_dataclass_creation():
+    """Test that the AutoEvalColumn dataclass can be created successfully."""
+    print("Testing AutoEvalColumn dataclass creation...")
+    try:
+        from src.display.utils import AutoEvalColumn, fields
+
+        # Test that we can access the fields
+        all_fields = fields(AutoEvalColumn)
+        print(f"✓ Successfully created AutoEvalColumn with {len(all_fields)} fields")
+
+        # Test that the average field exists
+        assert hasattr(AutoEvalColumn, 'average'), "Missing 'average' field"
+        print("✓ 'average' field exists")
+
+        # Test that we can access field names
+        field_names = [c.name for c in all_fields]
+        assert 'average' in field_names, "Average field not in field names"
+        print("✓ Average field accessible in field names")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_precision_from_str():
+    """Test that the Precision.from_str method works correctly."""
+    print("Testing Precision.from_str method...")
+    try:
+        from src.display.utils import Precision
+
+        # Test different precision values
+        result1 = Precision.from_str("torch.float16")
+        assert result1 == Precision.float16, f"Expected float16, got {result1}"
+        print("✓ torch.float16 correctly parsed")
+
+        result2 = Precision.from_str("float16")
+        assert result2 == Precision.float16, f"Expected float16, got {result2}"
+        print("✓ float16 correctly parsed")
+
+        result3 = Precision.from_str("torch.bfloat16")
+        assert result3 == Precision.bfloat16, f"Expected bfloat16, got {result3}"
+        print("✓ torch.bfloat16 correctly parsed")
+
+        result4 = Precision.from_str("unknown")
+        assert result4 == Precision.Unknown, f"Expected Unknown, got {result4}"
+        print("✓ Unknown precision correctly parsed")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_eval_result_parsing():
+    """Test that the EvalResult can parse JSON files correctly."""
+    print("Testing EvalResult JSON parsing...")
+    try:
+        from src.leaderboard.read_evals import EvalResult
+        from src.about import Tasks
+
+        # Create a sample result file
+        sample_result = {
+            "config": {
+                "model_name": "test/model",
+                "model_dtype": "torch.float16",
+                "model_sha": "abc123"
+            },
+            "results": {
+                "emea_ner": {"f1": 0.85},
+                "medline_ner": {"f1": 0.82}
+            }
+        }
+
+        # Write to temp file
+        temp_file = "/tmp/test_result.json"
+        with open(temp_file, 'w') as f:
+            json.dump(sample_result, f)
+
+        # Test parsing
+        result = EvalResult.init_from_json_file(temp_file)
+
+        assert result.full_model == "test/model", f"Expected test/model, got {result.full_model}"
+        assert result.org == "test", f"Expected test, got {result.org}"
+        assert result.model == "model", f"Expected model, got {result.model}"
+        assert result.revision == "abc123", f"Expected abc123, got {result.revision}"
+
+        print("✓ JSON parsing works correctly")
+
+        # Test with missing fields
+        sample_result_minimal = {
+            "config": {
+                "model": "test/model2"
+            },
+            "results": {
+                "emea_ner": {"f1": 0.75}
+            }
+        }
+
+        temp_file_minimal = "/tmp/test_result_minimal.json"
+        with open(temp_file_minimal, 'w') as f:
+            json.dump(sample_result_minimal, f)
+
+        result_minimal = EvalResult.init_from_json_file(temp_file_minimal)
+        assert result_minimal.full_model == "test/model2", f"Expected test/model2, got {result_minimal.full_model}"
+        print("✓ Minimal JSON parsing works correctly")
+
+        # Clean up
+        os.remove(temp_file)
+        os.remove(temp_file_minimal)
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_to_dict():
+    """Test that EvalResult.to_dict works correctly."""
+    print("Testing EvalResult.to_dict method...")
+    try:
+        from src.leaderboard.read_evals import EvalResult
+        from src.display.utils import Precision, ModelType, WeightType
+
+        # Create a test EvalResult
+        eval_result = EvalResult(
+            eval_name="test_model_float16",
+            full_model="test/model",
+            org="test",
+            model="model",
+            revision="abc123",
+            results={"emea_ner": 85.0, "medline_ner": 82.0},
+            precision=Precision.float16,
+            model_type=ModelType.FT,
+            weight_type=WeightType.Original,
+            architecture="BertForTokenClassification",
+            license="MIT",
+            likes=10,
+            num_params=110,
+            date="2023-01-01",
+            still_on_hub=True
+        )
+
+        # Test to_dict conversion
+        result_dict = eval_result.to_dict()
+
+        # Check that all required fields are present
+        assert "average" in result_dict, "Missing average field in dict"
+        assert result_dict["average"] == 83.5, f"Expected average 83.5, got {result_dict['average']}"
+
+        print("✓ to_dict method works correctly")
+        print(f"   - Average: {result_dict['average']}")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def main():
+    """Run all tests."""
+    print("Running bug fix tests...\n")
+
+    tests = [
+        test_dataclass_creation,
+        test_precision_from_str,
+        test_eval_result_parsing,
+        test_to_dict,
+    ]
+
+    results = []
+    for test in tests:
+        print(f"\n{'='*50}")
+        try:
+            result = test()
+            results.append(result)
+        except Exception as e:
+            print(f"✗ Test {test.__name__} failed with exception: {e}")
+            results.append(False)
+
+    print(f"\n{'='*50}")
+    print(f"Test Results: {sum(results)}/{len(results)} tests passed")
+
+    if all(results):
+        print("🎉 All tests passed! The fixes are working correctly.")
+        return 0
+    else:
+        print("❌ Some tests failed. Please check the output above.")
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main())