Add French medical NER leaderboard frontend
Files changed:
- requirements.txt +5 -1
- src/about.py +55 -28
- src/display/utils.py +9 -0
- src/envs.py +1 -1
- src/submission/check_validity.py +35 -2
requirements.txt
CHANGED
@@ -13,4 +13,8 @@ python-dateutil
 tqdm
 transformers
 tokenizers>=0.15.0
-sentencepiece
+sentencepiece
+
+# Additional dependencies for French medical NER
+seqeval>=1.2.2
+scikit-learn>=1.3.0
src/about.py
CHANGED
@@ -12,8 +12,8 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    emea_ner = Task("emea_ner", "f1", "EMEA NER")
+    medline_ner = Task("medline_ner", "f1", "MEDLINE NER")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,50 +21,77 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🏥 French Medical NLP Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+This leaderboard evaluates French NLP models on biomedical Named Entity Recognition (NER) tasks.
+We focus on BERT-like models, with plans to extend to other architectures.
+
+**Current Tasks:**
+- **EMEA NER**: Named Entity Recognition on French medical texts from EMEA (European Medicines Agency)
+- **MEDLINE NER**: Named Entity Recognition on French medical abstracts from MEDLINE
+
+**Entity Types:** ANAT, CHEM, DEVI, DISO, GEOG, LIVB, OBJC, PHEN, PHYS, PROC
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+We evaluate models by **fine-tuning** them on French medical NER tasks, following the CamemBERT-bio methodology:
+
+**Fine-tuning Parameters:**
+- **Optimizer**: AdamW (following the CamemBERT-bio paper)
+- **Learning Rate**: 5e-5 (optimal value from an Optuna search, kept unchanged)
+- **Scheduler**: cosine with restarts (22.4% warmup ratio)
+- **Steps**: 2000 (same as the paper)
+- **Batch Size**: 4 (CPU constraint)
+- **Gradient Accumulation**: 4 steps (effective batch size 16)
+- **Max Length**: 512 tokens
+- **Output**: simple linear layer (no CRF)
+
+**Evaluation**: uses seqeval with the IOB2 scheme to compute entity-level **micro F1**, precision, and recall.
+
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
+Results are obtained through proper fine-tuning, not zero-shot evaluation. Each model is fine-tuned independently on each task.
 
+**Datasets:**
+- EMEA: `rntc/quaero-frenchmed-ner-emea-sen`
+- MEDLINE: `rntc/quaero-frenchmed-ner-medline`
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
+## Before submitting a model
 
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
+### 1) Ensure your model is compatible with AutoClasses:
 ```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer = AutoTokenizer.from_pretrained("your_model_name")
+model = AutoModelForTokenClassification.from_pretrained("your_model_name")
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the steps of the guide above first!
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+### 2) Model requirements:
+- Must be a fine-tuned model for token classification (not just a base model)
+- Should be trained on French medical NER data
+- Must be publicly available on the Hugging Face Hub
+- Prefer the safetensors format for faster loading
+
+### 3) Expected performance:
+- Base models without fine-tuning will get very low scores (~0.02 F1)
+- Fine-tuned models should achieve significantly higher scores
+
+### 4) Model card recommendations:
+- Specify the training dataset used
+- Include model architecture details
+- Add performance metrics if available
+- Use an open license
+
+## Troubleshooting
+If your model fails evaluation:
+1. Check that it loads properly with AutoModelForTokenClassification
+2. Verify it is trained for token classification (not just language modeling)
+3. Ensure the model is public and accessible
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
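
The recipe in LLM_BENCHMARKS_TEXT maps directly onto transformers' Trainer configuration. A minimal sketch, assuming `camembert-base` as the encoder and the 21-label QUAERO tag set; the actual training script is not part of this commit:

```python
# Sketch of the documented fine-tuning recipe (assumptions: camembert-base
# encoder, 21 IOB2 labels; the real training script is not in this commit).
from transformers import AutoModelForTokenClassification, TrainingArguments

model = AutoModelForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=21,  # 10 entity types x (B-, I-) + the O tag
)

args = TrainingArguments(
    output_dir="ner-finetune",
    max_steps=2000,                      # "Steps: 2000 (same as the paper)"
    learning_rate=5e-5,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.224,                  # 22.4% warmup ratio
    per_device_train_batch_size=4,       # CPU constraint
    gradient_accumulation_steps=4,       # effective batch size 16
    # transformers' default optimizer is AdamW, matching the recipe above.
)
```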
src/display/utils.py
CHANGED
@@ -83,6 +83,15 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+    @staticmethod
+    def from_config(config):
+        """Determine model type from configuration - for NER models, most will be fine-tuned"""
+        if hasattr(config, 'num_labels') and config.num_labels == 21:
+            return ModelType.FT  # Fine-tuned for NER
+        elif hasattr(config, 'num_labels') and config.num_labels == 2:
+            return ModelType.PT  # Base model
+        return ModelType.Unknown
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
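
How the new helper is meant to be called (a sketch under the assumption that the submission flow passes an AutoConfig; the wiring itself is not part of this diff, and the repo id is a placeholder):

```python
# Sketch: classifying a submission by its config (assumed usage, not in this diff).
from transformers import AutoConfig

from src.display.utils import ModelType

# "your_org/your-model" is a placeholder repo id.
config = AutoConfig.from_pretrained("your_org/your-model")
model_type = ModelType.from_config(config)
print(model_type)  # ModelType.FT for 21-label NER heads, ModelType.PT for 2-label configs
```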
src/envs.py
CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "rntc" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
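
For context, OWNER fans out into several Hub repos. The names below follow the demo-leaderboard template this Space is built on (an assumption, since only the OWNER line of envs.py appears in this diff):

```python
# Assumed template wiring: OWNER drives the Space and its two companion datasets.
OWNER = "rntc"
REPO_ID = f"{OWNER}/leaderboard"   # the Space itself
QUEUE_REPO = f"{OWNER}/requests"   # dataset of pending submissions
RESULTS_REPO = f"{OWNER}/results"  # dataset of evaluation results
```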
src/submission/check_validity.py
CHANGED
@@ -7,7 +7,7 @@ from datetime import datetime, timedelta, timezone
 import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
+from transformers import AutoConfig, AutoModelForTokenClassification
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 def check_model_card(repo_id: str) -> tuple[bool, str]:
@@ -35,6 +35,27 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+
+        # Check if the model can be loaded for token classification
+        try:
+            model = AutoModelForTokenClassification.from_pretrained(
+                model_name,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                token=token
+            )
+
+            # Check it is suitable for our NER task (2 labels for a base model, 21 when fine-tuned)
+            if hasattr(model.config, 'num_labels') and model.config.num_labels not in [2, 21]:
+                return (
+                    False,
+                    f"has {model.config.num_labels} labels, but French medical NER requires models with 2 (base) or 21 (fine-tuned) labels",
+                    None
+                )
+
+        except Exception as e:
+            return (False, f"cannot be loaded for token classification: {e}", None)
+
         if test_tokenizer:
             try:
                 tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
@@ -72,7 +93,19 @@ def get_model_size(model_info: ModelInfo, precision: str):
 
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
+    arch = model_info.config.get("architectures", ["Unknown"])
+    if isinstance(arch, list) and len(arch) > 0:
+        arch_name = arch[0]
+        # Map common architectures to user-friendly names
+        if "Camembert" in arch_name:
+            return "CamemBERT"
+        elif "Bert" in arch_name:
+            return "BERT"
+        elif "Roberta" in arch_name:
+            return "RoBERTa"
+        else:
+            return arch_name.replace("ForTokenClassification", "")
+    return "Unknown"
 
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
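
A sketch of what the tightened check returns for a submission (the repo id is hypothetical; the three-element return shape follows the code above):

```python
# Illustrative call to the stricter validity check (hypothetical model id).
from src.submission.check_validity import is_model_on_hub

ok, error, config = is_model_on_hub(
    model_name="your_org/french-medical-ner",
    revision="main",
    test_tokenizer=True,
)
if not ok:
    # e.g. "has 9 labels, but French medical NER requires models with
    # 2 (base) or 21 (fine-tuned) labels"
    print(f"Rejected: model {error}")
```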