ycy committed on
Commit
2d1d89a
·
1 Parent(s): 621cb73
Files changed (1)
  1. src/about.py +176 -30
src/about.py CHANGED
@@ -12,61 +12,207 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
20
 
21
 
22
 
23
- # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
- # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
 
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
  from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
72
  """
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("medmcqa", "acc,none", "MedMCQA")
16
+ task1 = Task("medqa_4options", "acc,none", "MedQA")
17
+ task2 = Task("mmlu_anatomy", "acc,none", "MMLU Anatomy")
18
+ task3 = Task("mmlu_clinical_knowledge", "acc,none", "MMLU Clinical Knowledge")
19
+ task4 = Task("mmlu_college_biology", "acc,none", "MMLU College Biology")
20
+ task5 = Task("mmlu_college_medicine", "acc,none", "MMLU College Medicine")
21
+ task6 = Task("mmlu_medical_genetics", "acc,none", "MMLU Medical Genetics")
22
+ task7 = Task("mmlu_professional_medicine", "acc,none", "MMLU Professional Medicine")
23
+ task8 = Task("pubmedqa", "acc,none", "PubMedQA")
24
+
25
+
26
+
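The `task_key` / `metric_key` pairs above mirror how the Eleuther AI harness names scores in its results files. Below is a minimal illustrative sketch of reading one such score, assuming the common `{"results": {task_key: {metric_key: value}}}` layout; the file path and helper name are hypothetical and not part of this codebase.

```python
# Illustrative sketch: pull one benchmark score from an lm-eval-harness results JSON,
# assuming the {"results": {task_key: {metric_key: value}}} layout referenced by the
# "acc,none" metric keys above. The path and helper name are hypothetical.
import json

def read_score(results_path: str, task_key: str, metric_key: str) -> float:
    """Return the metric value for one task from a harness results file."""
    with open(results_path) as f:
        data = json.load(f)
    return data["results"][task_key][metric_key]

# e.g. the MedMCQA accuracy defined by task0 above (path is a placeholder):
# acc = read_score("results/your-model/results.json", "medmcqa", "acc,none")
```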
27
 
28
  NUM_FEWSHOT = 0 # Change with your few shot
29
  # ---------------------------------------------------
30
 
31
 
32
 
33
+ TITLE = """
34
+
35
+
36
+ <div style="text-align: center; margin-bottom: 20px;">
37
+ <img src="https://raw.githubusercontent.com/monk1337/MultiMedQA/main/assets/logs.png" alt="Open Medical-LLM Leaderboard logo" style="display: block; margin: auto; height: 160px;">
38
+ </div>
39
+ <h1 align="center" style="color: #1a237e; font-size: 40px;">Open <span style="color: #990001;">Medical-LLM</span> Leaderboard</h1>
40
+
41
+
42
+ """
43
 
44
  # What does your leaderboard evaluate?
45
  INTRODUCTION_TEXT = """
46
+ 🩺 The Open Medical LLM Leaderboard aims to track, rank and evaluate the performance of large language models (LLMs) on medical question answering tasks. It evaluates LLMs across a diverse array of medical datasets, including MedQA (USMLE), PubMedQA, MedMCQA, and subsets of MMLU related to medicine and biology. The leaderboard offers a comprehensive assessment of each model's medical knowledge and question answering capabilities.
47
+
48
+
49
+ The datasets cover various aspects of medicine such as general medical knowledge, clinical knowledge, anatomy, genetics, and more. They contain multiple-choice and open-ended questions that require medical reasoning and understanding. More details on the datasets can be found in the "LLM Benchmarks Details" section below.
50
+
51
+
52
+ The main evaluation metric used is Accuracy (ACC). Submit a model for automated evaluation on the "Submit" page. If you have comments or suggestions on additional medical datasets to include, please reach out to us in our discussion forum.
53
+
54
+ Evaluation Purpose: The primary role of this leaderboard is to assess and compare the performance of the models. It does not facilitate the distribution, deployment, or clinical use of these models.
55
+ The models on this leaderboard are not approved for clinical use and are intended for research purposes only. Please refer to the "Advisory Notice" section on the "About" page.
56
+
57
+ The backend of the Open Medical LLM Leaderboard uses the Eleuther AI Language Model Evaluation Harness. More technical details can be found in the "About" page.
58
+
59
+ The <a href="https://arxiv.org/abs/2303.13375">GPT-4</a> and <a href="https://arxiv.org/abs/2305.09617">Med-PaLM-2</a> results are taken from their official papers. Since Med-PaLM-2 does not report zero-shot accuracy, we use the 5-shot accuracy from its paper for comparison; all other results presented are in the zero-shot setting. Gemini results are taken from the recent Clinical-NLP <a href="https://arxiv.org/abs/2402.07023">(NAACL 24) paper</a>.
60
+
61
+ Model Availability Requirement: To maintain the integrity of the leaderboard, only models that are actively accessible will be included. Submissions must be available either via an API or a public Hugging Face repository to allow validation of the reported results. If a model's repository is empty or its API is inaccessible, the submission will be removed from the leaderboard, as the primary goal is to ensure that models listed here remain accessible for evaluation and comparison.
62
  """
63
 
64
  LLM_BENCHMARKS_TEXT = f"""
65
+ <h2 style="color: #2c3e50;">Why a Leaderboard?</h2>
66
+
67
+ Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is crucial as they are increasingly applied in healthcare and biomedical settings. The Open Medical LLM Leaderboard provides a platform to assess the latest LLMs on a variety of medical question answering tasks, helping to identify the strengths and gaps in the medical understanding of current models.
68
+
69
+ <h2 style="color: #2c3e50;">How it works</h2>
70
+
71
+ 📈 We evaluate the models on 9 medical Q&A datasets using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank">Eleuther AI Language Model Evaluation Harness</a>, a unified framework to test language models on different tasks.
72
+
73
+ <h2 style="color: #2c3e50;">Advisory Notice</h2>
74
+
75
+
76
+
77
+ The Open Medical-LLM Leaderboard showcases medical models intended solely for research and development purposes. It is important to be aware of the following:
78
+
79
+ Evaluation Purpose: The primary role of this leaderboard is to assess and compare the performance of the models. It does not facilitate the distribution, deployment, or clinical use of these models.
80
+
81
+ Regulatory Status: The models listed on this leaderboard have not been approved or registered by any regulatory authorities, including the US FDA, the European Medicines Agency (EMA), Health Canada, or the Therapeutic Goods Administration (TGA) in Australia. They are not listed in the US FDA Database for approved AI in healthcare or the EUDAMED database.
82
 
83
+ Disclaimer: These models are not intended for direct patient care, clinical decision support, or any other professional medical purposes. Their use should be limited to research, development, and exploratory applications by qualified individuals who understand their limitations and the regulatory requirements.
84
 
85
+ Risk Warning: The outputs of these models may contain inaccuracies, biases, or misalignments that could pose risks if relied upon for medical decision-making. The models' performance has not been rigorously evaluated in randomized controlled trials or real-world healthcare environments.
86
+
87
+ Research Tool Only: The models on this leaderboard are intended solely as research tools to assist healthcare professionals and should never be considered a replacement for the professional judgment and expertise of a qualified medical doctor.
88
+
89
+ Further Validation Needed: Proper adaptation and validation of these models for specific medical use cases would require significant additional work, including:
90
+
91
+ 1) Thorough testing and evaluation in relevant clinical scenarios.
92
+ 2) Alignment with evidence-based guidelines and best practices.
93
+ 3) Mitigation of potential biases and failure modes.
94
+ 4) Integration with human oversight and interpretation.
95
+ 5) Compliance with regulatory and ethical standards.
96
+
97
+ Always consult a qualified healthcare provider for personal medical needs.
98
+
99
+
100
+ <h2 style="color: #2c3e50;">About Open Life Science AI</h2>
101
+ An Open Life Science Project to Benchmark and Track AI Progress, Share Models and Datasets in the Life Science Field.
102
+ <a href="https://openlifescience.ai/" target="_blank"> More info </a>
103
+ <h2 style="color: #2c3e50;">Datasets</h2>
104
+
105
+ <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <ul style="list-style-type: none; padding: 0;"> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.13081" target="_blank" style="color: #3498db;">MedQA (USMLE)</a></h3> <p>1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/1909.06146" target="_blank" style="color: #3498db;">PubMedQA</a></h3> <p>500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank" style="color: #3498db;">MedMCQA</a></h3> <p>4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Clinical knowledge</a></h3> <p>265 multiple choice questions on clinical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Medical genetics</a></h3> <p>100 MCQs on medical genetics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Anatomy</a></h3> <p>135 anatomy MCQs</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Professional medicine</a></h3> <p>272 MCQs on professional medicine</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College biology</a></h3> <p>144 MCQs on college-level biology</p> </li> <li> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College medicine</a></h3> <p>173 college medicine MCQs</p> </li> </ul> </div>
106
+
107
+
108
+ <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the πŸ“„ emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=&lt;model&gt;,revision=&lt;revision&gt;,parallelize=True" --tasks=&lt;dataset&gt; --num_fewshot=&lt;n_shots&gt; --batch_size=1 --output_path=&lt;output_dir&gt;</pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟒 Pre-trained model</li> <li>πŸ”Ά Fine-tuned model</li> <li>? Unknown model type</li> <li>β­• Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
109
+ """
110
+
111
+ LLM_BENCHMARKS_DETAILS = f"""
112
+ Datasets
113
+ <a href="https://arxiv.org/abs/2009.13081" target="_blank">MedQA (USMLE)</a> - 1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge
114
+ <a href="https://arxiv.org/abs/1909.06146" target="_blank">PubMedQA</a> - 500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research
115
+ <a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank">MedMCQA</a> - 4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics
116
+ <a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Clinical knowledge</a> - 265 multiple choice questions on clinical knowledge
117
+ <a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Medical genetics</a> - 100 MCQs on medical genetics
118
+ <a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Anatomy</a> - 135 anatomy MCQs
119
+ <a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Professional medicine</a> - 272 MCQs on professional medicine
120
+ <a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-College biology</a> - 144 MCQs on college-level biology
121
+ <a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-College medicine</a> - 173 college medicine MCQs
122
+ Metric
123
+ Accuracy (ACC) is used as the main evaluation metric across all datasets
124
+ Details and logs
125
+ Detailed results are available in the results directory: https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard/tree/main/results
126
+ Inputs/outputs for each model can be found on the details page, accessible by clicking the 📄 emoji next to the model name
127
+ Reproducibility
128
+ To reproduce the results, you can run this evaluation script: python eval_medical_llm.py.
129
+ To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:
130
+ python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True"
131
+ --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir>
132
+ Note: some datasets may require additional setup; refer to the Evaluation Harness documentation. Adjust the batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.
133
+ Icons
134
+ 🟢 Pre-trained model
135
+ 🔶 Fine-tuned model
136
+ ? Unknown model type
137
+ ⭕ Instruction-tuned
138
+ 🟦 RL-tuned
139
+ Missing icons indicate the model info is not yet added; feel free to open an issue to include it!
140
+ """
141
+
142
+ Advisory_Notice = """The Open Medical-LLM Leaderboard showcases medical models intended solely for research and development purposes. It is important to be aware of the following:
143
+
144
+ Regulatory Status: The models listed on this leaderboard have not been approved or registered by any regulatory authorities, including the US FDA, the European Medicines Agency (EMA), Health Canada, or the Therapeutic Goods Administration (TGA) in Australia. They are not listed in the US FDA Database for approved AI in healthcare or the EUDAMED database. As such, they are not compliant with regulations such as US FDA 21 CFR 820 and EU MDR 2017/745.
145
+
146
+ Disclaimer: These models are not intended for direct patient care, clinical decision support, or any other professional medical purposes. Their use should be limited to research, development, and exploratory applications by qualified individuals who understand their limitations and the regulatory requirements.
147
+
148
+ Risk Warning: The outputs of these models may contain inaccuracies, biases, or misalignments that could pose risks if relied upon for medical decision-making. The models' performance has not been rigorously evaluated in randomized controlled trials or real-world healthcare environments.
149
+
150
+ Research Tool Only: The models on this leaderboard are intended solely as research tools to assist healthcare professionals and should never be considered a replacement for the professional judgment and expertise of a qualified medical doctor.
151
+
152
+ Further Validation Needed: Proper adaptation and validation of these models for specific medical use cases would require significant additional work, including:
153
+
154
+ 1) Thorough testing and evaluation in relevant clinical scenarios.
155
+ 2) Alignment with evidence-based guidelines and best practices.
156
+ 3) Mitigation of potential biases and failure modes.
157
+ 4) Integration with human oversight and interpretation.
158
+ 5) Compliance with regulatory and ethical standards.
159
+
160
+ For any legal inquiries or concerns, please contact the authors of the MedPaLM papers directly. Always consult a qualified healthcare provider for personal medical needs."""
161
+
162
+ FAQ_TEXT = """
163
+ FAQ
164
+ 1) Submitting a model
165
+ XXX
166
+ 2) Model results
167
+ XXX
168
+ 3) Editing a submission
169
+ XXX
170
  """
171
 
172
  EVALUATION_QUEUE_TEXT = """
173
+ Evaluation Queue for the Open Medical LLM Leaderboard
174
+ Models added here will be automatically evaluated.
175
+
176
+ Before submitting a model
177
+ 1) Verify loading with AutoClasses:
178
+
179
 
180
  from transformers import AutoConfig, AutoModel, AutoTokenizer
181
+ config = AutoConfig.from_pretrained("model-name", revision=revision)
182
+ model = AutoModel.from_pretrained("model-name", revision=revision)
183
+
184
+ tokenizer = AutoTokenizer.from_pretrained("model-name", revision=revision)
185
+ Debug any loading errors before submission. Make sure the model is public.
186
 
187
+ Note: Models that require use_remote_code=True are not yet supported.
188
 
189
+ 2) Convert weights to safetensors
190
+ This allows faster loading and enables showing model parameters in the Extended Viewer.
191
 
192
+ 3) Select correct precision
193
+ Incorrect precision (e.g. loading bf16 as fp16) can cause NaN errors for some models.
194
 
195
+ Debugging failing models
196
+ For models in FAILED status, first ensure the above checks are done.
197
 
198
+ Then test running the Eleuther AI Harness locally using the command in the "Reproducibility" section, specifying all arguments. Add --limit to evaluate on fewer examples per task.
199
  """
200
 
201
+ CITATION_BUTTON_LABEL = "Copy the citation snippet"
202
  CITATION_BUTTON_TEXT = r"""
203
+ @misc{openlifescienceai/open_medical_llm_leaderboard,
204
+ author = {Ankit Pal and Pasquale Minervini and Andreas Geert Motzfeldt and Beatrice Alex},
205
+ title = {openlifescienceai/open_medical_llm_leaderboard},
206
+ year = {2024},
207
+ publisher = {Hugging Face},
208
+ howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard}"
209
+ }
210
+ @misc{singhal2022large,
211
+ title={Large Language Models Encode Clinical Knowledge},
212
+ author={Karan Singhal et al.},
213
+ year={2022},
214
+ eprint={2212.13138},
215
+ archivePrefix={arXiv},
216
+ primaryClass={cs.CL}
217
+ }
218
  """