rntc and Claude committed
Commit fefe31a · Parent(s): f127f6a

Simplify leaderboard to EMEA-sen and MEDLINE tasks only


- Update Tasks enum to only include emea_ner and medline_ner
- Simplify interface text and remove unnecessary complexity
- Update French localization
- Remove hardcoded dataset information

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
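
For reference, after this commit the task registry reduces to the two French biomedical NER tasks (EMEA and MEDLINE). A minimal sketch of the resulting enum, assembled from the `src/about.py` hunk below; the field names of the `Task` dataclass are assumed from the standard leaderboard template:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results JSON
    metric: str      # metric key in the results JSON
    col_name: str    # column name shown in the leaderboard

class Tasks(Enum):
    # Only the two French biomedical NER tasks remain after this commit
    emea_ner = Task("emea_ner", "f1", "EMEA")
    medline_ner = Task("medline_ner", "f1", "MEDLINE")
```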

app.py CHANGED
@@ -19,9 +19,7 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
@@ -72,7 +70,6 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
@@ -100,7 +97,6 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
@@ -171,48 +167,28 @@ with demo:
                 row_count=5,
             )
         with gr.Row():
-            gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+            gr.Markdown("# ✉️✨ Soumettez votre modèle ici !", elem_classes="markdown-text")

         with gr.Row():
             with gr.Column():
-                model_name_textbox = gr.Textbox(label="Model name")
-                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                model_type = gr.Dropdown(
-                    choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                    label="Model type",
-                    multiselect=False,
-                    value=None,
-                    interactive=True,
-                )
-
-            with gr.Column():
+                model_name_textbox = gr.Textbox(label="Nom du modèle")
+                revision_name_textbox = gr.Textbox(label="Révision commit", placeholder="main")
                 precision = gr.Dropdown(
                     choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                    label="Precision",
+                    label="Précision",
                     multiselect=False,
                     value="float16",
                     interactive=True,
                 )
-                weight_type = gr.Dropdown(
-                    choices=[i.value.name for i in WeightType],
-                    label="Weights type",
-                    multiselect=False,
-                    value="Original",
-                    interactive=True,
-                )
-                base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-        submit_button = gr.Button("Submit Eval")
+        submit_button = gr.Button("Soumettre l'évaluation")
         submission_result = gr.Markdown()
         submit_button.click(
             add_new_eval,
             [
                 model_name_textbox,
-                base_model_name_textbox,
                 revision_name_textbox,
                 precision,
-                weight_type,
-                model_type,
             ],
             submission_result,
         )
pyproject.toml CHANGED
@@ -1,3 +1,39 @@
+[tool.poetry]
+name = "french-medical-nlp-leaderboard"
+version = "0.1.0"
+description = "Leaderboard for French medical NLP models"
+authors = ["rntc <[email protected]>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.8"
+APScheduler = "*"
+black = "*"
+datasets = "*"
+gradio = "*"
+gradio-leaderboard = "0.0.13"
+gradio-client = "*"
+huggingface-hub = ">=0.18.0"
+matplotlib = "*"
+numpy = "*"
+pandas = "*"
+python-dateutil = "*"
+tqdm = "*"
+transformers = "*"
+tokenizers = ">=0.15.0"
+sentencepiece = "*"
+torch = ">=2.6.0"
+seqeval = ">=1.2.2"
+scikit-learn = ">=1.3.0"
+
+[tool.poetry.group.dev.dependencies]
+ruff = "*"
+isort = "*"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
 select = ["E", "F"]
src/about.py CHANGED
@@ -12,8 +12,8 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    emea_ner = Task("emea_ner", "f1", "EMEA NER")
-    medline_ner = Task("medline_ner", "f1", "MEDLINE NER")
+    emea_ner = Task("emea_ner", "f1", "EMEA")
+    medline_ner = Task("medline_ner", "f1", "MEDLINE")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,79 +21,78 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">🏥 French Medical NLP Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🏥 Leaderboard NLP Biomédical Français</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates French NLP models on biomedical Named Entity Recognition (NER) tasks.
-We focus on BERT-like models with plans to extend to other architectures.
+Ce leaderboard évalue les modèles NLP français sur des tâches de reconnaissance d'entités nommées (NER) biomédicales.
+Nous nous concentrons sur les modèles de type BERT avec des plans d'extension vers d'autres architectures.

-**Current Tasks:**
-- **EMEA NER**: Named Entity Recognition on French medical texts from EMEA (European Medicines Agency)
-- **MEDLINE NER**: Named Entity Recognition on French medical abstracts from MEDLINE
+**Tâches actuelles :**
+- **EMEA** : Reconnaissance d'entités sur textes médicaux français de l'EMEA
+- **MEDLINE** : Reconnaissance d'entités sur résumés médicaux français de MEDLINE

-**Entity Types:** ANAT, CHEM, DEVI, DISO, GEOG, LIVB, OBJC, PHEN, PHYS, PROC
+Les modèles sont évalués par fine-tuning sur chaque tâche.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
+## Comment ça fonctionne

-We evaluate models by **fine-tuning** them on French medical NER tasks following the CamemBERT-bio methodology:
+Nous évaluons les modèles en les **fine-tunant** sur des tâches de NER médical français :

-**Fine-tuning Parameters:**
-- **Optimizer**: AdamW (following CamemBERT-bio paper)
-- **Learning Rate**: 5e-5 (optimal from Optuna search - unchanged)
-- **Scheduler**: Cosine with restarts (22.4% warmup ratio)
-- **Steps**: 2000 (same as paper)
-- **Batch Size**: 4 (CPU constraint)
-- **Gradient Accumulation**: 4 steps (effective batch size 16)
-- **Max Length**: 512 tokens
-- **Output**: Simple linear layer (no CRF)
+**Paramètres de fine-tuning :**
+- **Optimiseur** : AdamW
+- **Taux d'apprentissage** : 5e-5
+- **Scheduler** : Cosine avec redémarrages
+- **Étapes** : 2000
+- **Batch Size** : 4
+- **Accumulation de gradient** : 4 étapes
+- **Longueur max** : 512 tokens
+- **Sortie** : Couche linéaire simple

-**Evaluation**: Uses seqeval with IOB2 scheme for entity-level **micro F1**, precision, and recall.
+**Évaluation** : Utilise seqeval avec schéma IOB2 pour le **micro F1**, précision et rappel au niveau des entités.

-## Reproducibility
-Results are obtained through proper fine-tuning, not zero-shot evaluation. Each model is fine-tuned independently on each task.
+## Reproductibilité
+Les résultats sont obtenus par fine-tuning approprié, pas par évaluation zero-shot. Chaque modèle est fine-tuné indépendamment sur chaque tâche.

-**Datasets:**
-- EMEA: `rntc/quaero-frenchmed-ner-emea-sen`
-- MEDLINE: `rntc/quaero-frenchmed-ner-medline`
+**Datasets :**
+Les datasets utilisés sont basés sur des corpus annotés français pour la reconnaissance d'entités biomédicales.
 """

 EVALUATION_QUEUE_TEXT = """
-## Before submitting a model
+## Avant de soumettre un modèle

-### 1) Ensure your model is compatible with AutoClasses:
+### 1) Assurez-vous que votre modèle est compatible avec les AutoClasses :
 ```python
 from transformers import AutoTokenizer, AutoModelForTokenClassification
-tokenizer = AutoTokenizer.from_pretrained("your_model_name")
-model = AutoModelForTokenClassification.from_pretrained("your_model_name")
+tokenizer = AutoTokenizer.from_pretrained("nom_de_votre_modèle")
+model = AutoModelForTokenClassification.from_pretrained("nom_de_votre_modèle")
 ```

-### 2) Model requirements:
-- Must be a fine-tuned model for token classification (not just a base model)
-- Should be trained on French medical NER data
-- Must be publicly available on Hugging Face Hub
-- Prefer safetensors format for faster loading
-
-### 3) Expected performance:
-- Base models without fine-tuning will get very low scores (~0.02 F1)
-- Fine-tuned models should achieve significantly higher scores
-
-### 4) Model card recommendations:
-- Specify the training dataset used
-- Include model architecture details
-- Add performance metrics if available
-- Use an open license
-
-## Troubleshooting
-If your model fails evaluation:
-1. Check that it loads properly with AutoModelForTokenClassification
-2. Verify it's trained for token classification (not just language modeling)
-3. Ensure the model is public and accessible
+### 2) Exigences du modèle :
+- Doit être un modèle fine-tuné pour la classification de tokens (pas juste un modèle de base)
+- Devrait être entraîné sur des données NER médicales françaises
+- Doit être publiquement disponible sur le Hub Hugging Face
+- Préférez le format safetensors pour un chargement plus rapide
+
+### 3) Performance attendue :
+- Les modèles de base sans fine-tuning obtiendront des scores très bas (~0.02 F1)
+- Les modèles fine-tunés devraient atteindre des scores significativement plus élevés
+
+### 4) Recommandations pour la carte du modèle :
+- Spécifiez le dataset d'entraînement utilisé
+- Incluez les détails de l'architecture du modèle
+- Ajoutez les métriques de performance si disponibles
+- Utilisez une licence ouverte
+
+## Dépannage
+Si votre modèle échoue à l'évaluation :
+1. Vérifiez qu'il se charge correctement avec AutoModelForTokenClassification
+2. Vérifiez qu'il est entraîné pour la classification de tokens
+3. Assurez-vous que le modèle est public et accessible
 """

-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_LABEL = "Copiez le snippet suivant pour citer ces résultats"
 CITATION_BUTTON_TEXT = r"""
 """
src/display/utils.py CHANGED
@@ -23,21 +23,20 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(("model_type_symbol", ColumnContent("T", "str", True, never_hidden=True)))
+auto_eval_column_dict.append(("model", ColumnContent("Model", "markdown", True, never_hidden=True)))
+# Average score
+auto_eval_column_dict.append(("average", ColumnContent("Average", "number", True)))
 #Scores
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append((task.name, ColumnContent(task.value.col_name, "number", True)))
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(("precision", ColumnContent("Precision", "str", False)))
+auto_eval_column_dict.append(("license", ColumnContent("Hub License", "str", False)))
+auto_eval_column_dict.append(("params", ColumnContent("#Params (B)", "number", False)))
+auto_eval_column_dict.append(("likes", ColumnContent("Hub ❤️", "number", False)))
+auto_eval_column_dict.append(("still_on_hub", ColumnContent("Available on the hub", "bool", False)))
+auto_eval_column_dict.append(("revision", ColumnContent("Model sha", "str", False, False)))

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -47,9 +46,7 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)

 ## All the model information that we might need
@@ -61,10 +58,7 @@ class ModelDetails:


 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
@@ -74,33 +68,24 @@ class ModelType(Enum):
     def from_str(type):
         if "fine-tuned" in type or "🔶" in type:
             return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
         return ModelType.Unknown

     @staticmethod
     def from_config(config):
         """Determine model type from configuration - for NER models, most will be fine-tuned"""
-        if hasattr(config, 'num_labels') and config.num_labels == 21:
+        if hasattr(config, 'num_labels') and config.num_labels > 2:
             return ModelType.FT  # Fine-tuned for NER
-        elif hasattr(config, 'num_labels') and config.num_labels == 2:
-            return ModelType.PT  # Base model
         return ModelType.Unknown

 class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")

 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")

+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
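
One small behavioural change above: `Precision.from_str` is now explicitly a `@staticmethod`, so it can be called on the enum class without an instance. A quick sketch of the intended usage, mirroring the checks in `test_fixes.py` further down (run from the Space root so `src` is importable):

```python
from src.display.utils import Precision

# from_str maps dtype strings to Precision members; unrecognised values fall back to Unknown
assert Precision.from_str("torch.float16") == Precision.float16
assert Precision.from_str("float16") == Precision.float16
assert Precision.from_str("torch.bfloat16") == Precision.bfloat16
assert Precision.from_str("unknown") == Precision.Unknown
```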
src/leaderboard/read_evals.py CHANGED
@@ -33,19 +33,32 @@ class EvalResult:
     still_on_hub: bool = False

     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(cls, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)

-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
+        config = data.get("config", {})
+
+        # Precision - handle different field names
+        precision_value = config.get("model_dtype") or config.get("precision") or "Unknown"
+        precision = Precision.from_str(precision_value)
+
+        # Get model and org - handle different field names
+        org_and_model = config.get("model_name") or config.get("model_args") or config.get("model")
+        if org_and_model is None:
+            # Try to extract from filename as fallback
+            basename = os.path.basename(json_filepath)
+            if basename.startswith("results_"):
+                org_and_model = basename.replace("results_", "").replace(".json", "")
+
+        if org_and_model is None:
+            raise ValueError(f"Could not determine model name from {json_filepath}")
+
+        if "/" in org_and_model:
+            org_and_model = org_and_model.split("/", 1)
+        else:
+            org_and_model = [org_and_model]

         if len(org_and_model) == 1:
             org = None
@@ -57,8 +70,11 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

+        # Model revision - handle different field names
+        revision = config.get("model_sha") or config.get("revision") or "main"
+
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision, trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -79,14 +95,14 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

-        return self(
+        return cls(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
             precision=precision,
-            revision= config.get("model_sha", ""),
+            revision=revision,
             still_on_hub=still_on_hub,
             architecture=architecture
         )
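
For context, the parser above now tolerates several config field names and falls back to the file name when no model key is present. A sketch of a results payload in the shape used by `test_fixes.py` below (paths and values are illustrative; the actual parsing call is commented out because it also checks the Hub):

```python
import json

# Illustrative results payload; alternate accepted keys are noted in comments
payload = {
    "config": {
        "model_name": "test/model",      # also accepted: "model_args" or "model"
        "model_dtype": "torch.float16",  # also accepted: "precision"
        "model_sha": "abc123",           # also accepted: "revision"; defaults to "main"
    },
    "results": {
        "emea_ner": {"f1": 0.85},
        "medline_ner": {"f1": 0.82},
    },
}

# File name follows the "results_*" convention used by the filename fallback
with open("/tmp/results_test_model.json", "w") as f:
    json.dump(payload, f)

# from src.leaderboard.read_evals import EvalResult
# result = EvalResult.init_from_json_file("/tmp/results_test_model.json")
```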
src/submission/submit.py CHANGED
@@ -16,11 +16,8 @@ USERS_TO_SUBMISSION_DATES = None

 def add_new_eval(
     model: str,
-    base_model: str,
     revision: str,
     precision: str,
-    weight_type: str,
-    model_type: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -36,23 +33,14 @@ def add_new_eval(
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
     # Does the model actually exist?
     if revision == "":
         revision = "main"

     # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
+    model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+    if not model_on_hub:
+        return styled_error(f'Model "{model}" {error}')

     # Is the model info correctly filled?
     try:
@@ -77,13 +65,10 @@ def add_new_eval(
     eval_entry = {
         "model": model,
-        "base_model": base_model,
         "revision": revision,
         "precision": precision,
-        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
-        "model_type": model_type,
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
@@ -97,7 +82,7 @@ def add_new_eval(
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_Original.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
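
With the base model, weight type, and model type arguments removed, a submission now only needs the model id, revision, and precision. A hedged sketch of calling the simplified entry point directly (the model id is a placeholder; in the app this function is wired to the Gradio submit button and will contact the Hub and write a request file):

```python
from src.submission.submit import add_new_eval

# Hypothetical direct call with the simplified signature: model, revision, precision
message = add_new_eval(
    model="some-org/some-french-medical-ner-model",  # placeholder model id
    revision="main",
    precision="float16",
)
print(message)
```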
test_fixes.py ADDED
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""Test script to verify the fixes work correctly."""
+
+import sys
+import os
+import json
+from pathlib import Path
+
+# Add the src directory to the path
+sys.path.insert(0, str(Path(__file__).parent / "src"))
+
+def test_dataclass_creation():
+    """Test that the AutoEvalColumn dataclass can be created successfully."""
+    print("Testing AutoEvalColumn dataclass creation...")
+    try:
+        from src.display.utils import AutoEvalColumn, fields
+
+        # Test that we can access the fields
+        all_fields = fields(AutoEvalColumn)
+        print(f"✓ Successfully created AutoEvalColumn with {len(all_fields)} fields")
+
+        # Test that the average field exists
+        assert hasattr(AutoEvalColumn, 'average'), "Missing 'average' field"
+        print("✓ 'average' field exists")
+
+        # Test that we can access field names
+        field_names = [c.name for c in all_fields]
+        assert 'average' in field_names, "Average field not in field names"
+        print("✓ Average field accessible in field names")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_precision_from_str():
+    """Test that the Precision.from_str method works correctly."""
+    print("Testing Precision.from_str method...")
+    try:
+        from src.display.utils import Precision
+
+        # Test different precision values
+        result1 = Precision.from_str("torch.float16")
+        assert result1 == Precision.float16, f"Expected float16, got {result1}"
+        print("✓ torch.float16 correctly parsed")
+
+        result2 = Precision.from_str("float16")
+        assert result2 == Precision.float16, f"Expected float16, got {result2}"
+        print("✓ float16 correctly parsed")
+
+        result3 = Precision.from_str("torch.bfloat16")
+        assert result3 == Precision.bfloat16, f"Expected bfloat16, got {result3}"
+        print("✓ torch.bfloat16 correctly parsed")
+
+        result4 = Precision.from_str("unknown")
+        assert result4 == Precision.Unknown, f"Expected Unknown, got {result4}"
+        print("✓ Unknown precision correctly parsed")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_eval_result_parsing():
+    """Test that the EvalResult can parse JSON files correctly."""
+    print("Testing EvalResult JSON parsing...")
+    try:
+        from src.leaderboard.read_evals import EvalResult
+        from src.about import Tasks
+
+        # Create a sample result file
+        sample_result = {
+            "config": {
+                "model_name": "test/model",
+                "model_dtype": "torch.float16",
+                "model_sha": "abc123"
+            },
+            "results": {
+                "emea_ner": {"f1": 0.85},
+                "medline_ner": {"f1": 0.82}
+            }
+        }
+
+        # Write to temp file
+        temp_file = "/tmp/test_result.json"
+        with open(temp_file, 'w') as f:
+            json.dump(sample_result, f)
+
+        # Test parsing
+        result = EvalResult.init_from_json_file(temp_file)
+
+        assert result.full_model == "test/model", f"Expected test/model, got {result.full_model}"
+        assert result.org == "test", f"Expected test, got {result.org}"
+        assert result.model == "model", f"Expected model, got {result.model}"
+        assert result.revision == "abc123", f"Expected abc123, got {result.revision}"
+
+        print("✓ JSON parsing works correctly")
+
+        # Test with missing fields
+        sample_result_minimal = {
+            "config": {
+                "model": "test/model2"
+            },
+            "results": {
+                "emea_ner": {"f1": 0.75}
+            }
+        }
+
+        temp_file_minimal = "/tmp/test_result_minimal.json"
+        with open(temp_file_minimal, 'w') as f:
+            json.dump(sample_result_minimal, f)
+
+        result_minimal = EvalResult.init_from_json_file(temp_file_minimal)
+        assert result_minimal.full_model == "test/model2", f"Expected test/model2, got {result_minimal.full_model}"
+        print("✓ Minimal JSON parsing works correctly")
+
+        # Clean up
+        os.remove(temp_file)
+        os.remove(temp_file_minimal)
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def test_to_dict():
+    """Test that EvalResult.to_dict works correctly."""
+    print("Testing EvalResult.to_dict method...")
+    try:
+        from src.leaderboard.read_evals import EvalResult
+        from src.display.utils import Precision, ModelType, WeightType
+
+        # Create a test EvalResult
+        eval_result = EvalResult(
+            eval_name="test_model_float16",
+            full_model="test/model",
+            org="test",
+            model="model",
+            revision="abc123",
+            results={"emea_ner": 85.0, "medline_ner": 82.0},
+            precision=Precision.float16,
+            model_type=ModelType.FT,
+            weight_type=WeightType.Original,
+            architecture="BertForTokenClassification",
+            license="MIT",
+            likes=10,
+            num_params=110,
+            date="2023-01-01",
+            still_on_hub=True
+        )
+
+        # Test to_dict conversion
+        result_dict = eval_result.to_dict()
+
+        # Check that all required fields are present
+        assert "average" in result_dict, "Missing average field in dict"
+        assert result_dict["average"] == 83.5, f"Expected average 83.5, got {result_dict['average']}"
+
+        print("✓ to_dict method works correctly")
+        print(f"   - Average: {result_dict['average']}")
+
+        return True
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return False
+
+def main():
+    """Run all tests."""
+    print("Running bug fix tests...\n")
+
+    tests = [
+        test_dataclass_creation,
+        test_precision_from_str,
+        test_eval_result_parsing,
+        test_to_dict,
+    ]
+
+    results = []
+    for test in tests:
+        print(f"\n{'='*50}")
+        try:
+            result = test()
+            results.append(result)
+        except Exception as e:
+            print(f"✗ Test {test.__name__} failed with exception: {e}")
+            results.append(False)
+
+    print(f"\n{'='*50}")
+    print(f"Test Results: {sum(results)}/{len(results)} tests passed")
+
+    if all(results):
+        print("🎉 All tests passed! The fixes are working correctly.")
+        return 0
+    else:
+        print("❌ Some tests failed. Please check the output above.")
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main())