kevinxie06 committed on
Commit
fb59c30
·
verified ·
1 Parent(s): e5d8b9c

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +921 -0
  2. config.py +37 -0
  3. docs.md +57 -0
  4. requirements.txt +3 -0
  5. task_information.json +698 -0
app.py ADDED
@@ -0,0 +1,921 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns
3
+ import config
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import json
7
+
8
+ import warnings
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Literal
10
+ import pandas as pd
11
+ from pandas.io.formats.style import Styler
12
+
13
+ import semantic_version
14
+ from dataclasses import dataclass, field
15
+
16
+ from gradio.components import Component
17
+ from gradio.data_classes import GradioModel
18
+ from gradio.events import Events
19
+
20
@dataclass
class SelectColumns:
    """Settings bundle for the leaderboard's column-selection control.

    Every field has a default, so ``SelectColumns()`` is valid. The two list
    fields use ``default_factory`` so each instance owns its own list.
    """

    # Column names selected initially (presumably; confirm against the component).
    default_selection: Optional[list[str]] = field(default_factory=list)
    # Column names the user is not allowed to deselect (inferred from the name).
    cant_deselect: Optional[list[str]] = field(default_factory=list)
    # Whether column selection is offered at all (inferred from the name).
    allow: bool = True
    # Optional caption for the control.
    label: Optional[str] = None
    # Whether the caption is rendered.
    show_label: bool = True
    # Optional helper text.
    info: Optional[str] = None
28
+
29
@dataclass
class ColumnFilter:
    """Description of one filter widget bound to a single leaderboard column.

    Only ``column`` is required; every other field defaults to ``None`` (or
    ``True`` for ``show_label``). Field names and their order are part of the
    constructor interface and are therefore unchanged.
    """

    # Name of the column the filter applies to.
    column: str
    # Widget kind. Annotated Optional because the default is None.
    type: Optional[Literal["slider", "dropdown", "checkboxgroup", "boolean"]] = None
    # Initial value of the widget (shape depends on the widget kind).
    default: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    # Selectable values of the widget (shape depends on the widget kind).
    choices: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    # Optional caption for the widget.
    label: Optional[str] = None
    # Optional helper text.
    info: Optional[str] = None
    # Whether the caption is rendered.
    show_label: bool = True
    # Numeric bounds; presumably only meaningful for the slider kind (confirm).
    min: Optional[Union[int, float]] = None
    max: Optional[Union[int, float]] = None
40
+
41
class DataframeData(GradioModel):
    """Table payload built by ``postprocess``: column headers plus row data.

    The field annotations are kept verbatim because the base model uses them
    to validate the payload.
    """

    # Column names, in display order.
    headers: List[str]
    # One inner sequence per row.
    data: Union[List[List[Any]], List[Tuple[Any, ...]]]
    # Optional extra information; only filled in for Styler inputs.
    metadata: Optional[Dict[str, Optional[List[Any]]]] = None
45
+
46
+
47
# Directory containing this script. Every data file below is resolved against
# it (the README tab locates docs.md the same way), so loading no longer
# depends on the process's current working directory.
abs_path = Path(__file__).parent

_LEADERBOARD_DIR = abs_path / "leaderboards"
_ZERO_SHOT_PATH = _LEADERBOARD_DIR / "Zero-Shot_leaderboard_data.json"
_FIVE_SHOT_PATH = _LEADERBOARD_DIR / "Few-Shot_leaderboard_data.json"
_COT_PATH = _LEADERBOARD_DIR / "CoT_leaderboard_data.json"


def _load_json(path):
    """Return the parsed JSON document stored at *path* (read as UTF-8)."""
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def _new_filter_state():
    """Return a filter dictionary with nothing selected and fresh lists."""
    return {
        "Language": [],
        "Task Type": [],
        "Clinical Context": [],
        "Data Access": [],
        "Applications": [],
        "Clinical Stage": [],
    }


# Leaderboard tables for the three prompting strategies.
zero_shot_df = pd.read_json(_ZERO_SHOT_PATH, precise_float=True)
five_shot_df = pd.read_json(_FIVE_SHOT_PATH, precise_float=True)
cot_df = pd.read_json(_COT_PATH, precise_float=True)

# Snapshots of the unfiltered averages. The filter handlers overwrite the
# "Average Performance" column and restore it from these when every filter
# is cleared, so the snapshots are copied to stay independent of the frames.
original_zero_shot_avg_perf = zero_shot_df["Average Performance"].copy()
original_five_shot_avg_perf = five_shot_df["Average Performance"].copy()
original_cot_avg_perf = cot_df["Average Performance"].copy()

# Per-task attributes used to decide which task columns a filter keeps.
task_information_json = _load_json(abs_path / "task_information.json")

# Currently selected filter values, one independent dictionary per tab.
# NOTE(review): these are module-level, so the selection is presumably shared
# by everyone using the same process. Confirm this is acceptable.
cot_currently_selected_filters = _new_filter_state()
five_shot_currently_selected_filters = _new_filter_state()
zero_shot_currently_selected_filters = _new_filter_state()

# Raw JSON form of the same leaderboard files, indexed as json[column][row_key]
# by the average-performance functions.
five_shot_leaderboard_json = _load_json(_FIVE_SHOT_PATH)
CoT_leaderboard_json = _load_json(_COT_PATH)
zero_shot_leaderboard_json = _load_json(_ZERO_SHOT_PATH)
99
+
100
# Task names that may be shown as leaderboard columns. A task listed in the
# task-information file is only displayed when its name is also in this set.
valid_tasks = {
    'NUBES',
    'NorSynthClinical-NER',
    'MEDIQA 2023-sum-A',
    'Medication extraction',
    'IMCS-V2-DAC',
    'Cantemist-Coding',
    'IFMIR-NER',
    'EHRQA-QA',
    'Ex4CDS',
    'MedDG',
    'MTS-Temporal',
    'CHIP-MDCFNPC',
    'n2c2 2014-Diabetes',
    'MIMIC-III Outcome.LoS',
    'n2c2 2014-Hypertension',
    'RuCCoN',
    'CARES-ICD10 Chapter',
    'RuDReC-NER',
    'MIMIC-IV DiReCT.Dis',
    'n2c2 2014-Medication',
    'iCorpus',
    'Brateca-Hospitalization',
    'n2c2 2010-Assertion',
    'NorSynthClinical-PHI',
    'IFMIR - NER&factuality',
    'JP-STS',
    'NorSynthClinical-RE',
    'n2c2 2010-Concept',
    'BARR2',
    'IMCS-V2-NER',
    'IMCS-V2-MRG',
    'cMedQA',
    'MedSTS',
    'BRONCO150-NER&Status',
    'n2c2 2018-ADE&medication',
    'CLISTER',
    'ClinicalNotes-UPMC',
    'PPTS',
    'CLIP',
    'IMCS-V2-SR',
    'EHRQA-Sub department',
    'BrainMRI-AIS',
    'Brateca-Mortality',
    'meddocan',
    'CHIP-CDEE',
    'CAS-evidence',
    'MEDIQA 2019-RQE',
    'Cantemis-Norm',
    'MEDIQA 2023-sum-B',
    'CHIP-CTC',
    'C-EMRS',
    'CARES ICD10 Block',
    'Cantemis-NER',
    'CLINpt-NER',
    'MEDIQA 2023-chat-A',
    'n2c2 2014-De-identification',
    'n2c2 2014-Hyperlipidemia',
    'EHRQA-Primary department',
    'ADE-Drug dosage',
    'IFMIR-Incident type',
    'MIMIC-III Outcome.Mortality',
    'n2c2 2006-De-identification',
    'CAS-label',
    'MIMIC-IV CDM',
    'CodiEsp-ICD-10-CM',
    'n2c2 2010-Relation',
    'CARES-ICD10 Subblock',
    'MIE',
    'HealthCareMagic-100k',
    'ADE-Identification',
    'MIMIC-IV DiReCT.PDD',
    'ADE-Extraction',
    'DialMed',
    'GOUT-CC-Consensus',
    'GraSSCo PHI',
    'RuMedNLI',
    'RuMedDaNet',
    'CBLUE-CDN',
    'icliniq-10k',
    'CARDIO-DE',
    'CARES-Area',
    'DiSMed-NER',
    'CodiEsp-ICD-10-PCS',
    'MedNLI',
    'MTS',
    'MIMIC-IV BHC',
    'n2c2 2014-CAD',
}
118
+
119
# Number of leaderboard rows. The "Model" column of the JSON export is keyed
# by stringified row index, so the count is the largest index plus one. Taking
# the maximum (instead of the last key in insertion order) does not depend on
# key ordering, and an empty export yields 0 instead of raising IndexError.
n_models = max((int(row_key) for row_key in zero_shot_leaderboard_json["Model"]), default=-1) + 1
120
+
121
def _attribute_matches(value, selected):
    """Return True when one task attribute satisfies one filter's selections."""
    if not selected:
        # An empty (or absent) filter places no constraint on this attribute.
        return True

    if isinstance(value, str) and "," in value:
        # Multi-category attribute: a single matching category is enough.
        return any(category in selected for category in value.split(", "))

    # Brazilian Portuguese tasks are also listed under plain "Portuguese".
    if value == 'Portuguese\n(Brazilian)' and 'Portuguese' in selected:
        return True

    return value in selected


def get_filtered_columns(filter_selections, task_information=None, allowed_tasks=None):
    """
    Return the names of the tasks that satisfy every active filter.

    A task is kept only when ALL of its attributes pass their filter. The
    previous implementation let a later multi-category match (or the
    Brazilian-Portuguese special case) override an earlier failed attribute,
    so tasks that violated one filter could still be shown.

    Input:
        filter_selections: dictionary mapping an attribute name to the list of
            selected values. An empty list, or an attribute with no entry in
            the dictionary, means "no restriction".
        task_information: optional mapping of task name -> {attribute: value}.
            Defaults to the module-level task_information_json.
        allowed_tasks: optional collection of task names that may be
            displayed. Defaults to the module-level valid_tasks.

    Output:
        Returns a list of all valid tasks to display (by task name), in the
        iteration order of task_information.
    """
    if task_information is None:
        task_information = task_information_json
    if allowed_tasks is None:
        allowed_tasks = valid_tasks

    valid_columns = []
    for task, task_info in task_information.items():
        if task not in allowed_tasks:
            continue

        # .get() tolerates task attributes that have no corresponding filter.
        passes_every_filter = all(
            _attribute_matches(value, filter_selections.get(attribute))
            for attribute, value in task_info.items()
        )
        if passes_every_filter:
            valid_columns.append(task)

    return valid_columns
180
+
181
def isEmpty(currently_selected_filters):
    """
    Report whether no filter has anything selected.

    Returns True when every value in the dictionary is falsy (for example an
    empty list), and False as soon as one value is truthy.
    """
    return not any(currently_selected_filters.values())
193
+
194
+
195
+ ####################################################################################################
196
+ ####### CoT Filters
197
+ ####################################################################################################
198
+
199
+
200
# Columns always shown ahead of the task columns that survive filtering.
_COT_FIXED_COLUMNS = [
    'T', 'Model', 'Model: Domain', 'Model: Accessibility',
    'Model: Size Range', 'Size (B)', 'Average Performance',
]


def _cot_apply_filter(filter_name, selection):
    """Record one CoT filter selection and return the table to display.

    Stores *selection* under *filter_name* in the module-level CoT filter
    dictionary. With no filter active, the original averages are restored and
    the full table is returned. Otherwise "Average Performance" is recomputed
    over the matching task columns and only the fixed columns plus those task
    columns are returned.
    """
    # Update the global store for the currently selected filters
    cot_currently_selected_filters[filter_name] = selection

    if isEmpty(cot_currently_selected_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    filtered_cols = get_filtered_columns(cot_currently_selected_filters)
    updated_performance = cot_update_average_performance(filtered_cols)

    # The averages are keyed by stringified row number; convert the keys to
    # integers so they line up with the DataFrame index.
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}
    cot_df["Average Performance"] = cot_df.index.map(updated_performance_int)

    return cot_df[_COT_FIXED_COLUMNS + filtered_cols]


def cot_filter_language(language_choice):
    """Apply a new "Language" selection to the CoT leaderboard.

    Unlike before, the result includes 'Model: Size Range', matching every
    other filter handler.
    """
    return _cot_apply_filter("Language", language_choice)


def cot_filter_task_type(task_type_choice):
    """Apply a new "Task Type" selection to the CoT leaderboard."""
    return _cot_apply_filter("Task Type", task_type_choice)


def cot_filter_clinical_context(clinical_context_choice):
    """Apply a new "Clinical Context" selection to the CoT leaderboard."""
    return _cot_apply_filter("Clinical Context", clinical_context_choice)


def cot_filter_applications(applications_choice):
    """Apply a new "Applications" selection to the CoT leaderboard."""
    return _cot_apply_filter("Applications", applications_choice)


def cot_filter_stage_options(stage_choice):
    """Apply a new "Clinical Stage" selection to the CoT leaderboard."""
    return _cot_apply_filter("Clinical Stage", stage_choice)


def cot_filter_data_access(data_access_choice):
    """Apply a new "Data Access" selection to the CoT leaderboard."""
    return _cot_apply_filter("Data Access", data_access_choice)


def cot_update_average_performance(selected_columns, leaderboard=None, model_count=None):
    """
    Average each model's CoT scores over the selected task columns.

    Input:
        selected_columns: task names to average over.
        leaderboard: optional mapping indexed as leaderboard[task][row_key],
            where row_key is the stringified row number. Defaults to the
            module-level CoT_leaderboard_json.
        model_count: optional number of rows to process. Defaults to the
            module-level n_models.

    Output:
        Dictionary mapping each stringified row number to the mean score,
        rounded to two decimals. With no selected columns every mean is 0.0.
    """
    if leaderboard is None:
        leaderboard = CoT_leaderboard_json
    if model_count is None:
        model_count = n_models

    tasks = list(selected_columns)
    # Avoid dividing by zero when nothing is selected; the total is 0 then.
    divisor = len(tasks) or 1

    updated_average_performance = {}
    for i in range(model_count):
        row_key = str(i)
        total = sum(float(leaderboard[task][row_key]) for task in tasks)
        updated_average_performance[row_key] = float(round(total / divisor, 2))

    return updated_average_performance
343
+
344
+
345
+ ####################################################################################################
346
+ ####### Few Shot Filters
347
+ ####################################################################################################
348
+
349
# Columns always shown ahead of the task columns that survive filtering.
_FIVE_SHOT_FIXED_COLUMNS = [
    'T', 'Model', 'Model: Domain', 'Model: Accessibility',
    'Model: Size Range', 'Size (B)', 'Average Performance',
]


def _five_shot_apply_filter(filter_name, selection):
    """Record one Few-Shot filter selection and return the table to display.

    Stores *selection* under *filter_name* in the module-level Few-Shot filter
    dictionary. With no filter active, the original averages are restored and
    the full table is returned. Otherwise "Average Performance" is recomputed
    over the matching task columns and only the fixed columns plus those task
    columns are returned.
    """
    # Update the global store for the currently selected filters
    five_shot_currently_selected_filters[filter_name] = selection

    if isEmpty(five_shot_currently_selected_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    filtered_cols = get_filtered_columns(five_shot_currently_selected_filters)
    updated_performance = five_shot_update_average_performance(filtered_cols)

    # The averages are keyed by stringified row number; convert the keys to
    # integers so they line up with the DataFrame index.
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}
    five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int)

    return five_shot_df[_FIVE_SHOT_FIXED_COLUMNS + filtered_cols]


def five_shot_filter_language(language_choice):
    """Apply a new "Language" selection to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Language", language_choice)


def five_shot_filter_task_type(task_type_choice):
    """Apply a new "Task Type" selection to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Task Type", task_type_choice)


def five_shot_filter_clinical_context(clinical_context_choice):
    """Apply a new "Clinical Context" selection to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Clinical Context", clinical_context_choice)


def five_shot_filter_applications(applications_choice):
    """Apply a new "Applications" selection to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Applications", applications_choice)


def five_shot_filter_stage_options(stage_choice):
    """Apply a new "Clinical Stage" selection to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Clinical Stage", stage_choice)


def five_shot_filter_data_access(data_access_choice):
    """Apply a new "Data Access" selection to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Data Access", data_access_choice)


def five_shot_update_average_performance(selected_columns, leaderboard=None, model_count=None):
    """
    Average each model's Few-Shot scores over the selected task columns.

    Input:
        selected_columns: task names to average over.
        leaderboard: optional mapping indexed as leaderboard[task][row_key],
            where row_key is the stringified row number. Defaults to the
            module-level five_shot_leaderboard_json.
        model_count: optional number of rows to process. Defaults to the
            module-level n_models.

    Output:
        Dictionary mapping each stringified row number to the mean score,
        rounded to two decimals. With no selected columns every mean is 0.0.
    """
    if leaderboard is None:
        leaderboard = five_shot_leaderboard_json
    if model_count is None:
        model_count = n_models

    tasks = list(selected_columns)
    # Avoid dividing by zero when nothing is selected; the total is 0 then.
    divisor = len(tasks) or 1

    updated_average_performance = {}
    for i in range(model_count):
        row_key = str(i)
        total = sum(float(leaderboard[task][row_key]) for task in tasks)
        updated_average_performance[row_key] = float(round(total / divisor, 2))

    return updated_average_performance
493
+
494
+
495
+ ####################################################################################################
496
+ ###### Zero Shot Filters
497
+ ####################################################################################################
498
+
499
+
500
# Columns always shown ahead of the task columns that survive filtering.
_ZERO_SHOT_FIXED_COLUMNS = [
    'T', 'Model', 'Model: Domain', 'Model: Accessibility',
    'Model: Size Range', 'Size (B)', 'Average Performance',
]


def _zero_shot_apply_filter(filter_name, selection):
    """Record one Zero-Shot filter selection and return the table to display.

    Stores *selection* under *filter_name* in the module-level Zero-Shot
    filter dictionary. With no filter active, the original averages are
    restored and the full table is returned. Otherwise "Average Performance"
    is recomputed over the matching task columns and only the fixed columns
    plus those task columns are returned.
    """
    # Update the global store for the currently selected filters
    zero_shot_currently_selected_filters[filter_name] = selection

    if isEmpty(zero_shot_currently_selected_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters)
    updated_performance = zero_shot_update_average_performance(filtered_cols)

    # The averages are keyed by stringified row number; convert the keys to
    # integers so they line up with the DataFrame index.
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int)

    return zero_shot_df[_ZERO_SHOT_FIXED_COLUMNS + filtered_cols]


def zero_shot_filter_language(language_choice):
    """Apply a new "Language" selection to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Language", language_choice)


def zero_shot_filter_task_type(task_type_choice):
    """Apply a new "Task Type" selection to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Task Type", task_type_choice)


def zero_shot_filter_clinical_context(clinical_context_choice):
    """Apply a new "Clinical Context" selection to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Clinical Context", clinical_context_choice)


def zero_shot_filter_applications(applications_choice):
    """Apply a new "Applications" selection to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Applications", applications_choice)


def zero_shot_filter_stage_options(stage_choice):
    """Apply a new "Clinical Stage" selection to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Clinical Stage", stage_choice)


def zero_shot_filter_data_access(data_access_choice):
    """Apply a new "Data Access" selection to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Data Access", data_access_choice)


def zero_shot_update_average_performance(selected_columns, leaderboard=None, model_count=None):
    """
    Average each model's Zero-Shot scores over the selected task columns.

    Input:
        selected_columns: task names to average over.
        leaderboard: optional mapping indexed as leaderboard[task][row_key],
            where row_key is the stringified row number. Defaults to the
            module-level zero_shot_leaderboard_json.
        model_count: optional number of rows to process. Defaults to the
            module-level n_models.

    Output:
        Dictionary mapping each stringified row number to the mean score,
        rounded to two decimals. With no selected columns every mean is 0.0.
    """
    if leaderboard is None:
        leaderboard = zero_shot_leaderboard_json
    if model_count is None:
        model_count = n_models

    tasks = list(selected_columns)
    # Avoid dividing by zero when nothing is selected; the total is 0 then.
    divisor = len(tasks) or 1

    updated_average_performance = {}
    for i in range(model_count):
        row_key = str(i)
        total = sum(float(leaderboard[task][row_key]) for task in tasks)
        updated_average_performance[row_key] = float(round(total / divisor, 2))

    return updated_average_performance
643
+
644
+
645
def postprocess(self, value: Union[pd.DataFrame, Styler, str, None]) -> DataframeData:
    """Convert a leaderboard value into the payload sent to the frontend.

    DataFrames (including those read from a CSV path) are sorted by
    "Average Performance" in descending order when that column exists, so the
    best model is listed first.

    The ``None`` and ``str`` cases are handled before any DataFrame attribute
    is touched. Previously ``value.columns`` was read first, so ``None`` and
    path strings raised AttributeError, and a Styler containing the column
    failed on ``sort_values``.

    Parameters:
        value: a DataFrame, a path to a CSV file, a pandas Styler, or None.

    Returns:
        A DataframeData with headers and row data. Metadata is only filled in
        for Styler inputs. An empty table is sent as ``[[]]``.

    Raises:
        ValueError: for a Styler on pandas older than 1.5.0, or for a value
            of any other type.
    """
    if value is None:
        return self.postprocess(pd.DataFrame({"column 1": []}))

    if isinstance(value, str):
        value = pd.read_csv(value)  # type: ignore

    if isinstance(value, pd.DataFrame):
        if "Average Performance" in value.columns:
            # Sort the DataFrame by the "Average Performance" column, best first
            value = value.sort_values(by="Average Performance", ascending=False)
        if len(value) == 0:
            return DataframeData(
                headers=list(value.columns),  # type: ignore
                data=[[]],  # type: ignore
            )
        return DataframeData(
            headers=list(value.columns),  # type: ignore
            data=value.to_dict(orient="split")["data"],  # type: ignore
        )

    if isinstance(value, Styler):
        if semantic_version.Version(pd.__version__) < semantic_version.Version(
            "1.5.0"
        ):
            raise ValueError(
                "Styler objects are only supported in pandas version 1.5.0 or higher. Please try: `pip install --upgrade pandas` to use this feature."
            )
        if self.interactive:
            warnings.warn(
                "Cannot display Styler object in interactive mode. Will display as a regular pandas dataframe instead."
            )
        df: pd.DataFrame = value.data  # type: ignore
        # NOTE(review): outside a class body `self.__extract_metadata` is not
        # name-mangled, so this looks up the literal attribute name. If the
        # component only defines it as a private (mangled) method, this call
        # presumably raises AttributeError. Confirm against the component.
        metadata = self.__extract_metadata(value)  # type: ignore
        # Styler data is not re-sorted here, so rows stay aligned with the metadata.
        if len(df) == 0:
            return DataframeData(
                headers=list(df.columns),
                data=[[]],
                metadata=metadata,
            )
        return DataframeData(
            headers=list(df.columns),
            data=df.to_dict(orient="split")["data"],  # type: ignore
            metadata=metadata,
        )

    raise ValueError(
        f"Cannot process value of type {type(value).__name__} as a leaderboard table"
    )


# Models are sorted in order of decreasing average performance (best performance at the top!)
Leaderboard.postprocess = postprocess
696
+
697
+
698
+ ####################################################################################################
699
+ ###### Leaderboard
700
+ ####################################################################################################
701
+
702
# Choices offered by the task-level filter checkboxes. They are shared by all
# three leaderboard tabs.
# NOTE: the spelling of every entry is kept exactly as-is (including
# 'Procudure information' and 'Concept standarization') because the same
# strings appear as values in task_information.json; presumably the filter
# callbacks compare against those values - verify before "fixing" a typo here.
all_languages = ['English', 'Spanish',
                 'Chinese', 'Norwegian',
                 'Russian', 'Portuguese',
                 'German', 'Japanese', 'French']

all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
                  'Normalization and Coding', 'Natural Language Inference', 'Summarization',
                  'Event Extraction', 'Semantic Similarity']

all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology',
                         'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology',
                         'General', 'Endocrinology', 'Pediatrics', 'Pharmacology',
                         'Gastroenterology', 'Psychology']

all_applications = ['Procudure information', 'Concept standarization',
                    'Specialist recommendation', 'Negation identification',
                    'Clinical trial matching', 'Consultation summarization',
                    'Semantic relation', 'Post-discharge patient management',
                    'De-identification', 'Billing & Coding', 'Phenotyping',
                    'Data organization', 'Temporal & Causality relation',
                    'Summarization', 'Screen & Consultation', 'Diagnosis',
                    'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
                    'Medication information']

all_stages = ['Treatment and Intervention', 'Triage and Referral',
              'Initial Assessment', 'Discharge and Administration',
              'Research', 'Diagnosis and Prognosis']

all_data_access = ['Open Access', 'Regulated']


def _build_leaderboard_tab(
    df,
    *,
    filter_language,
    filter_task_type,
    filter_clinical_context,
    filter_applications,
    filter_data_access,
    filter_stage_options,
):
    """
    Create one leaderboard table plus its six task-filter checkbox groups.

    Must be called inside the ``gr.Tab`` context that should host the
    components. Each ``filter_*`` callback receives the list of checked
    values of its checkbox group and its return value is sent to the
    leaderboard component.

    Args:
        df: Table shown in the leaderboard.
        filter_language: Callback for the "Language" checkboxes.
        filter_task_type: Callback for the "Task Type" checkboxes.
        filter_clinical_context: Callback for the "Clinical Context" checkboxes.
        filter_applications: Callback for the "Clinical Application" checkboxes.
        filter_data_access: Callback for the "Data Access" checkboxes.
        filter_stage_options: Callback for the "Clinical Stage" checkboxes.

    Returns:
        The created ``Leaderboard`` component.
    """
    leaderboard = Leaderboard(
        value=df,
        select_columns=None,
        search_columns=SearchColumns(primary_column="Model", secondary_columns="",
                                     placeholder="Search by Model Name",
                                     label="Model Search"),
        hide_columns=["Model: Size Range", "Model: Accessibility"],
        filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
        datatype=config.TYPES,
    )

    # Components are created in the order they should appear on the page.
    language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
    task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
    cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
    application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
    stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
    da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")

    language_options.change(fn=filter_language, inputs=language_options, outputs=leaderboard)
    task_type_options.change(fn=filter_task_type, inputs=task_type_options, outputs=leaderboard)
    cc_options.change(fn=filter_clinical_context, inputs=cc_options, outputs=leaderboard)
    application_options.change(fn=filter_applications, inputs=application_options, outputs=leaderboard)
    da_options.change(fn=filter_data_access, inputs=da_options, outputs=leaderboard)
    stage_options.change(fn=filter_stage_options, inputs=stage_options, outputs=leaderboard)

    return leaderboard


with gr.Blocks() as app:
    gr.Markdown("# BRIDGE (Benchmarking Large Language Models in Multilingual Real-world Clinical Text Understanding)")

    with gr.Tabs():
        with gr.Tab("README"):
            # docs.md contains non-ASCII characters (emoji headings), so the
            # encoding is fixed instead of depending on the platform locale.
            gr.Markdown((Path(__file__).parent / "docs.md").read_text(encoding="utf-8"))

        with gr.Tab("Zero-Shot"):
            leaderboard = _build_leaderboard_tab(
                zero_shot_df,
                filter_language=zero_shot_filter_language,
                filter_task_type=zero_shot_filter_task_type,
                filter_clinical_context=zero_shot_filter_clinical_context,
                filter_applications=zero_shot_filter_applications,
                filter_data_access=zero_shot_filter_data_access,
                filter_stage_options=zero_shot_filter_stage_options,
            )

        with gr.Tab("Few-Shot"):
            leaderboard = _build_leaderboard_tab(
                five_shot_df,
                filter_language=five_shot_filter_language,
                filter_task_type=five_shot_filter_task_type,
                filter_clinical_context=five_shot_filter_clinical_context,
                filter_applications=five_shot_filter_applications,
                filter_data_access=five_shot_filter_data_access,
                filter_stage_options=five_shot_filter_stage_options,
            )

        with gr.Tab("CoT"):
            leaderboard = _build_leaderboard_tab(
                cot_df,
                filter_language=cot_filter_language,
                filter_task_type=cot_filter_task_type,
                filter_clinical_context=cot_filter_clinical_context,
                filter_applications=cot_filter_applications,
                filter_data_access=cot_filter_data_access,
                filter_stage_options=cot_filter_stage_options,
            )


if __name__ == "__main__":
    app.launch()
921
+
config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Column datatypes handed to the Leaderboard component as ``datatype``,
# one entry per column, in column order.
# NOTE(review): 22 entries are listed; confirm this matches the number and
# order of columns of the frames actually displayed.
TYPES = (
    ["str", "markdown"]
    + ["number"] * 7
    + ["str"] * 4
    + ["bool", "str"]
    + ["number"] * 2
    + ["bool", "str", "bool", "bool", "str"]
)
27
+
28
# Labelled numeric buckets. Every bucket is a half-open interval
# (lower, upper] - the lower bound is excluded, the upper bound included -
# and consecutive buckets share a boundary, so they tile (-1, 10000].
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in (
        ("?", -1, 0),
        ("~1.5", 0, 2),
        ("~3", 2, 4),
        ("~7", 4, 9),
        ("~13", 9, 20),
        ("~35", 20, 45),
        ("~60", 45, 70),
        ("70+", 70, 10000),
    )
}
docs.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/67a040fb6934f9aa1c866f99/lsCIUxFkADB-Wf9cteeB4.png)
3
+
4
+ ![image/webp](https://cdn-uploads.huggingface.co/production/uploads/67a040fb6934f9aa1c866f99/E-WF4uJB0GzplioJkWh5v.webp)
5
+
6
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/67a040fb6934f9aa1c866f99/xQqbGXh0y6zIV78Cw6Vpq.png)
7
+
8
+
9
+
10
+ ## 📜 Background
11
+ Recent advances in **Large Language Models (LLMs)** have demonstrated transformative potential in improving healthcare delivery and clinical research. By combining extensive pretraining with supervised instruction tuning across diverse tasks, LLMs excel in natural language understanding, generation, and reasoning. These capabilities allow LLMs to serve as versatile, general-purpose medical assistants.
12
+
13
+ Despite this promise, concerns remain around the **reliability and clinical validity** of LLM-generated outputs. Real-world contexts often involve unstructured, multilingual text from **electronic health records (EHRs)**, and require support for tasks like phenotype identification and event extraction that remain underexplored. Moreover, the scarcity of **multilingual benchmarks** further limits the global applicability of LLMs in medicine.
14
+
15
+ To address these challenges, we introduce the ***largest multilingual clinical benchmark*** to date, **BRIDGE (Benchmarking Large Language Models in Multilingual Real-world Clinical Text Understanding)**, evaluating 52 LLMs on:
16
+
17
+ - **87 clinical tasks**
18
+ - **9 languages**
19
+ - **1M+ clinical samples**
20
+
21
+ ## 🌍 Key Features
22
+
23
+ Our benchmark spans a wide range of document types and clinical tasks, including classification, event extraction, and generation. It further supports three inference strategies: **zero-shot**, **few-shot**, and **chain-of-thought (CoT)** prompting. We evaluated **52 LLMs**, including general-purpose, open-source, proprietary, and medical-domain models.
24
+
25
+
26
+ - **Multilingual Data**: Clinical tasks in **9 languages** for global relevance.
27
+ - **Diverse Clinical Documents**: Notes, summaries, radiology reports, and more.
28
+ - **Multiple NLP Tasks**: Classification, extraction, QA, summarization, etc.
29
+ - **Evaluation Modes**:
30
+   - **Zero-shot**
31
+   - **Few-shot**
32
+   - **Chain-of-Thought (CoT)** reasoning
33
+
34
+
35
+ ## 🏆 BRIDGE Leaderboard
36
+
37
+ To support ongoing evaluation, we introduce our **BRIDGE Leaderboard**, which provides:
38
+
39
+ - Easy visualizations
40
+ - Side-by-side comparisons
41
+ - Continuous tracking of LLM performance across tasks, languages, and evaluation strategies
42
+
43
+ This leaderboard empowers researchers and clinicians to make informed decisions and track model progress over time.
44
+
45
+
46
+ ## 📚 Citation
47
+
48
+ If you use this benchmark in your research or development, please cite:
49
+
50
+ ```bibtex
51
+ @article{BRIDGE2025,
52
+ title = {PAPER TITLE},
53
+ author = {Your Name and Contributors},
54
+ year = {2025},
55
+ journal = {Your Journal or Conference},
56
+ }
+ ```
57
+
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==5.18.0
2
+ gradio_leaderboard==0.0.13
3
+ pandas==2.2.3
task_information.json ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ADE-Identification": {
3
+ "Language": "English",
4
+ "Task Type": "Text Classification",
5
+ "Clinical Context": "Pharmacology",
6
+ "Data Access": "Open Access",
7
+ "Applications": "ADE & Incidents",
8
+ "Clinical Stage": "Treatment and Intervention"
9
+ },
10
+ "ADE-Extraction": {
11
+ "Language": "English",
12
+ "Task Type": "Event Extraction",
13
+ "Clinical Context": "Pharmacology",
14
+ "Data Access": "Open Access",
15
+ "Applications": "ADE & Incidents",
16
+ "Clinical Stage": "Treatment and Intervention"
17
+ },
18
+ "ADE-Drug dosage": {
19
+ "Language": "English",
20
+ "Task Type": "Event Extraction",
21
+ "Clinical Context": "Pharmacology",
22
+ "Data Access": "Open Access",
23
+ "Applications": "Medication information",
24
+ "Clinical Stage": "Treatment and Intervention"
25
+ },
26
+ "BARR2": {
27
+ "Language": "Spanish",
28
+ "Task Type": "Event Extraction",
29
+ "Clinical Context": "General",
30
+ "Data Access": "Open Access",
31
+ "Applications": "Concept standarization",
32
+ "Clinical Stage": "Research"
33
+ },
34
+ "BrainMRI-AIS": {
35
+ "Language": "English",
36
+ "Task Type": "Text Classification",
37
+ "Clinical Context": "Neurology, Radiology",
38
+ "Data Access": "Open Access",
39
+ "Applications": "Diagnosis",
40
+ "Clinical Stage": "Diagnosis and Prognosis"
41
+ },
42
+ "Brateca-Hospitalization": {
43
+ "Language": "Portuguese\n(Brazilian)",
44
+ "Task Type": "Text Classification",
45
+ "Clinical Context": "General",
46
+ "Data Access": "Regulated",
47
+ "Applications": "Prognosis",
48
+ "Clinical Stage": "Diagnosis and Prognosis"
49
+ },
50
+ "Brateca-Mortality": {
51
+ "Language": "Portuguese\n(Brazilian)",
52
+ "Task Type": "Text Classification",
53
+ "Clinical Context": "General",
54
+ "Data Access": "Regulated",
55
+ "Applications": "Prognosis",
56
+ "Clinical Stage": "Diagnosis and Prognosis"
57
+ },
58
+ "Cantemist-Coding": {
59
+ "Language": "Spanish",
60
+ "Task Type": "Normalization and Coding",
61
+ "Clinical Context": "Oncology",
62
+ "Data Access": "Open Access",
63
+ "Applications": "Billing & Coding",
64
+ "Clinical Stage": "Discharge and Administration"
65
+ },
66
+ "Cantemis-NER": {
67
+ "Language": "Spanish",
68
+ "Task Type": "Named Entity Recognition",
69
+ "Clinical Context": "Oncology",
70
+ "Data Access": "Open Access",
71
+ "Applications": "Billing & Coding",
72
+ "Clinical Stage": "Discharge and Administration"
73
+ },
74
+ "Cantemis-Norm": {
75
+ "Language": "Spanish",
76
+ "Task Type": "Normalization and Coding",
77
+ "Clinical Context": "Oncology",
78
+ "Data Access": "Open Access",
79
+ "Applications": "Billing & Coding",
80
+ "Clinical Stage": "Discharge and Administration"
81
+ },
82
+ "CARES-Area": {
83
+ "Language": "Spanish",
84
+ "Task Type": "Text Classification",
85
+ "Clinical Context": "Radiology",
86
+ "Data Access": "Open Access",
87
+ "Applications": "Billing & Coding",
88
+ "Clinical Stage": "Discharge and Administration"
89
+ },
90
+ "CARES ICD10 Block": {
91
+ "Language": "Spanish",
92
+ "Task Type": "Normalization and Coding",
93
+ "Clinical Context": "Radiology",
94
+ "Data Access": "Open Access",
95
+ "Applications": "Billing & Coding",
96
+ "Clinical Stage": "Discharge and Administration"
97
+ },
98
+ "CARES-ICD10 Chapter": {
99
+ "Language": "Spanish",
100
+ "Task Type": "Normalization and Coding",
101
+ "Clinical Context": "Radiology",
102
+ "Data Access": "Open Access",
103
+ "Applications": "Billing & Coding",
104
+ "Clinical Stage": "Discharge and Administration"
105
+ },
106
+ "CARES-ICD10 Subblock": {
107
+ "Language": "Spanish",
108
+ "Task Type": "Normalization and Coding",
109
+ "Clinical Context": "Radiology",
110
+ "Data Access": "Open Access",
111
+ "Applications": "Billing & Coding",
112
+ "Clinical Stage": "Discharge and Administration"
113
+ },
114
+ "CHIP-CDEE": {
115
+ "Language": "Chinese",
116
+ "Task Type": "Event Extraction",
117
+ "Clinical Context": "General",
118
+ "Data Access": "Open Access",
119
+ "Applications": "Temporal & Causality relation",
120
+ "Clinical Stage": "Initial Assessment"
121
+ },
122
+ "C-EMRS": {
123
+ "Language": "Chinese",
124
+ "Task Type": "Text Classification",
125
+ "Clinical Context": "Radiology, Endocrinology, Pulmonology, Cardiology, Gastroenterology",
126
+ "Data Access": "Open Access",
127
+ "Applications": "Diagnosis",
128
+ "Clinical Stage": "Diagnosis and Prognosis"
129
+ },
130
+ "CodiEsp-ICD-10-CM": {
131
+ "Language": "Spanish",
132
+ "Task Type": "Normalization and Coding",
133
+ "Clinical Context": "General",
134
+ "Data Access": "Open Access",
135
+ "Applications": "Billing & Coding",
136
+ "Clinical Stage": "Discharge and Administration"
137
+ },
138
+ "CodiEsp-ICD-10-PCS": {
139
+ "Language": "Spanish",
140
+ "Task Type": "Normalization and Coding",
141
+ "Clinical Context": "General",
142
+ "Data Access": "Open Access",
143
+ "Applications": "Billing & Coding",
144
+ "Clinical Stage": "Discharge and Administration"
145
+ },
146
+ "ClinicalNotes-UPMC": {
147
+ "Language": "English",
148
+ "Task Type": "Text Classification",
149
+ "Clinical Context": "General",
150
+ "Data Access": "Open Access",
151
+ "Applications": "Negation identification",
152
+ "Clinical Stage": "Research"
153
+ },
154
+ "PPTS": {
155
+ "Language": "Spanish",
156
+ "Task Type": "Text Classification",
157
+ "Clinical Context": "Pulmonology",
158
+ "Data Access": "Open Access",
159
+ "Applications": "Diagnosis",
160
+ "Clinical Stage": "Diagnosis and Prognosis"
161
+ },
162
+ "CLINpt-NER": {
163
+ "Language": "Portuguese",
164
+ "Task Type": "Named Entity Recognition",
165
+ "Clinical Context": "Neurology",
166
+ "Data Access": "Open Access",
167
+ "Applications": "Procudure information",
168
+ "Clinical Stage": "Treatment and Intervention"
169
+ },
170
+ "CLIP": {
171
+ "Language": "English",
172
+ "Task Type": "Text Classification",
173
+ "Clinical Context": "Critical Care",
174
+ "Data Access": "Regulated",
175
+ "Applications": "Post-discharge patient management",
176
+ "Clinical Stage": "Discharge and Administration"
177
+ },
178
+ "cMedQA": {
179
+ "Language": "Chinese",
180
+ "Task Type": "Question Answering",
181
+ "Clinical Context": "General",
182
+ "Data Access": "Open Access",
183
+ "Applications": "Screen & Consultation",
184
+ "Clinical Stage": "Triage and Referral"
185
+ },
186
+ "DialMed": {
187
+ "Language": "Chinese",
188
+ "Task Type": "Text Classification",
189
+ "Clinical Context": "Pulmonology, Gastroenterology, Dermatology, Pharmacology",
190
+ "Data Access": "Open Access",
191
+ "Applications": "Medication information",
192
+ "Clinical Stage": "Treatment and Intervention"
193
+ },
194
+ "DiSMed-NER": {
195
+ "Language": "Spanish",
196
+ "Task Type": "Named Entity Recognition",
197
+ "Clinical Context": "Radiology",
198
+ "Data Access": "Regulated",
199
+ "Applications": "De-identification",
200
+ "Clinical Stage": "Research"
201
+ },
202
+ "MIE": {
203
+ "Language": "Chinese",
204
+ "Task Type": "Event Extraction",
205
+ "Clinical Context": "Cardiology",
206
+ "Data Access": "Open Access",
207
+ "Applications": "Phenotyping",
208
+ "Clinical Stage": "Initial Assessment"
209
+ },
210
+ "EHRQA-Primary department": {
211
+ "Language": "Chinese",
212
+ "Task Type": "Text Classification",
213
+ "Clinical Context": "General",
214
+ "Data Access": "Regulated",
215
+ "Applications": "Specialist recommendation",
216
+ "Clinical Stage": "Triage and Referral"
217
+ },
218
+ "EHRQA-QA": {
219
+ "Language": "Chinese",
220
+ "Task Type": "Question Answering",
221
+ "Clinical Context": "General",
222
+ "Data Access": "Regulated",
223
+ "Applications": "Screen & Consultation",
224
+ "Clinical Stage": "Triage and Referral"
225
+ },
226
+ "EHRQA-Sub department": {
227
+ "Language": "Chinese",
228
+ "Task Type": "Text Classification",
229
+ "Clinical Context": "General",
230
+ "Data Access": "Regulated",
231
+ "Applications": "Specialist recommendation",
232
+ "Clinical Stage": "Triage and Referral"
233
+ },
234
+ "Ex4CDS": {
235
+ "Language": "German",
236
+ "Task Type": "Named Entity Recognition",
237
+ "Clinical Context": "Nephrology",
238
+ "Data Access": "Open Access",
239
+ "Applications": "Procudure information",
240
+ "Clinical Stage": "Treatment and Intervention"
241
+ },
242
+ "GOUT-CC-Consensus": {
243
+ "Language": "English",
244
+ "Task Type": "Text Classification",
245
+ "Clinical Context": "Endocrinology",
246
+ "Data Access": "Regulated",
247
+ "Applications": "Diagnosis",
248
+ "Clinical Stage": "Diagnosis and Prognosis"
249
+ },
250
+ "n2c2 2006-De-identification": {
251
+ "Language": "English",
252
+ "Task Type": "Named Entity Recognition",
253
+ "Clinical Context": "Pulmonology",
254
+ "Data Access": "Regulated",
255
+ "Applications": "De-identification",
256
+ "Clinical Stage": "Research"
257
+ },
258
+ "Medication extraction": {
259
+ "Language": "English",
260
+ "Task Type": "Event Extraction",
261
+ "Clinical Context": "Pharmacology",
262
+ "Data Access": "Regulated",
263
+ "Applications": "Medication information",
264
+ "Clinical Stage": "Treatment and Intervention"
265
+ },
266
+ "n2c2 2010-Concept": {
267
+ "Language": "English",
268
+ "Task Type": "Named Entity Recognition",
269
+ "Clinical Context": "Critical Care",
270
+ "Data Access": "Regulated",
271
+ "Applications": "Procudure information",
272
+ "Clinical Stage": "Treatment and Intervention"
273
+ },
274
+ "n2c2 2010-Assertion": {
275
+ "Language": "English",
276
+ "Task Type": "Named Entity Recognition",
277
+ "Clinical Context": "Critical Care",
278
+ "Data Access": "Regulated",
279
+ "Applications": "Post-discharge patient management",
280
+ "Clinical Stage": "Discharge and Administration"
281
+ },
282
+ "n2c2 2010-Relation": {
283
+ "Language": "English",
284
+ "Task Type": "Event Extraction",
285
+ "Clinical Context": "Critical Care",
286
+ "Data Access": "Regulated",
287
+ "Applications": "Procudure information",
288
+ "Clinical Stage": "Treatment and Intervention"
289
+ },
290
+ "n2c2 2014-De-identification": {
291
+ "Language": "English",
292
+ "Task Type": "Named Entity Recognition",
293
+ "Clinical Context": "Endocrinology",
294
+ "Data Access": "Regulated",
295
+ "Applications": "De-identification",
296
+ "Clinical Stage": "Research"
297
+ },
298
+ "IMCS-V2-NER": {
299
+ "Language": "Chinese",
300
+ "Task Type": "Named Entity Recognition",
301
+ "Clinical Context": "Pediatrics",
302
+ "Data Access": "Open Access",
303
+ "Applications": "Phenotyping",
304
+ "Clinical Stage": "Initial Assessment"
305
+ },
306
+ "JP-STS": {
307
+ "Language": "Japanese",
308
+ "Task Type": "Semantic Similarity",
309
+ "Clinical Context": "General",
310
+ "Data Access": "Open Access",
311
+ "Applications": "Semantic relation",
312
+ "Clinical Stage": "Research"
313
+ },
314
+ "meddocan": {
315
+ "Language": "Spanish",
316
+ "Task Type": "Named Entity Recognition",
317
+ "Clinical Context": "General",
318
+ "Data Access": "Open Access",
319
+ "Applications": "De-identification",
320
+ "Clinical Stage": "Research"
321
+ },
322
+ "MEDIQA 2019-RQE": {
323
+ "Language": "English",
324
+ "Task Type": "Natural Language Inference",
325
+ "Clinical Context": "General",
326
+ "Data Access": "Open Access",
327
+ "Applications": "Screen & Consultation",
328
+ "Clinical Stage": "Triage and Referral"
329
+ },
330
+ "MedNLI": {
331
+ "Language": "English",
332
+ "Task Type": "Natural Language Inference",
333
+ "Clinical Context": "Critical Care",
334
+ "Data Access": "Regulated",
335
+ "Applications": "Semantic relation",
336
+ "Clinical Stage": "Research"
337
+ },
338
+ "MedSTS": {
339
+ "Language": "English",
340
+ "Task Type": "Semantic Similarity",
341
+ "Clinical Context": "General",
342
+ "Data Access": "Regulated",
343
+ "Applications": "Semantic relation",
344
+ "Clinical Stage": "Research"
345
+ },
346
+ "MTS": {
347
+ "Language": "English",
348
+ "Task Type": "Text Classification",
349
+ "Clinical Context": "General",
350
+ "Data Access": "Open Access",
351
+ "Applications": "Data organization",
352
+ "Clinical Stage": "Research"
353
+ },
354
+ "MTS-Temporal": {
355
+ "Language": "English",
356
+ "Task Type": "Named Entity Recognition",
357
+ "Clinical Context": "Pediatrics, Psychology",
358
+ "Data Access": "Open Access",
359
+ "Applications": "Temporal & Causality relation",
360
+ "Clinical Stage": "Initial Assessment"
361
+ },
362
+ "n2c2 2018-ADE&medication": {
363
+ "Language": "English",
364
+ "Task Type": "Event Extraction",
365
+ "Clinical Context": "Pharmacology",
366
+ "Data Access": "Regulated",
367
+ "Applications": "ADE & Incidents",
368
+ "Clinical Stage": "Treatment and Intervention"
369
+ },
370
+ "NorSynthClinical-NER": {
371
+ "Language": "Norwegian",
372
+ "Task Type": "Named Entity Recognition",
373
+ "Clinical Context": "Cardiology",
374
+ "Data Access": "Open Access",
375
+ "Applications": "Temporal & Causality relation",
376
+ "Clinical Stage": "Initial Assessment"
377
+ },
378
+ "NorSynthClinical-RE": {
379
+ "Language": "Norwegian",
380
+ "Task Type": "Event Extraction",
381
+ "Clinical Context": "Cardiology",
382
+ "Data Access": "Open Access",
383
+ "Applications": "Temporal & Causality relation",
384
+ "Clinical Stage": "Initial Assessment"
385
+ },
386
+ "NUBES": {
387
+ "Language": "Spanish",
388
+ "Task Type": "Event Extraction",
389
+ "Clinical Context": "General",
390
+ "Data Access": "Open Access",
391
+ "Applications": "Negation identification",
392
+ "Clinical Stage": "Research"
393
+ },
394
+ "MEDIQA 2023-chat-A": {
395
+ "Language": "English",
396
+ "Task Type": "Summarization",
397
+ "Clinical Context": "General",
398
+ "Data Access": "Open Access",
399
+ "Applications": "Consultation summarization",
400
+ "Clinical Stage": "Initial Assessment"
401
+ },
402
+ "MEDIQA 2023-sum-A": {
403
+ "Language": "English",
404
+ "Task Type": "Text Classification",
405
+ "Clinical Context": "General",
406
+ "Data Access": "Open Access",
407
+ "Applications": "Data organization",
408
+ "Clinical Stage": "Research"
409
+ },
410
+ "MEDIQA 2023-sum-B": {
411
+ "Language": "English",
412
+ "Task Type": "Summarization",
413
+ "Clinical Context": "General",
414
+ "Data Access": "Open Access",
415
+ "Applications": "Consultation summarization",
416
+ "Clinical Stage": "Initial Assessment"
417
+ },
418
+ "RuMedDaNet": {
419
+ "Language": "Russian",
420
+ "Task Type": "Natural Language Inference",
421
+ "Clinical Context": "General",
422
+ "Data Access": "Open Access",
423
+ "Applications": "Screen & Consultation",
424
+ "Clinical Stage": "Triage and Referral"
425
+ },
426
+ "CBLUE-CDN": {
427
+ "Language": "Chinese",
428
+ "Task Type": "Normalization and Coding",
429
+ "Clinical Context": "General",
430
+ "Data Access": "Open Access",
431
+ "Applications": "Billing & Coding",
432
+ "Clinical Stage": "Discharge and Administration"
433
+ },
434
+ "CHIP-CTC": {
435
+ "Language": "Chinese",
436
+ "Task Type": "Text Classification",
437
+ "Clinical Context": "General",
438
+ "Data Access": "Open Access",
439
+ "Applications": "Clinical trial matching",
440
+ "Clinical Stage": "Research"
441
+ },
442
+ "CHIP-MDCFNPC": {
443
+ "Language": "Chinese",
444
+ "Task Type": "Event Extraction",
445
+ "Clinical Context": "General",
446
+ "Data Access": "Open Access",
447
+ "Applications": "Phenotyping",
448
+ "Clinical Stage": "Initial Assessment"
449
+ },
450
+ "MedDG": {
451
+ "Language": "Chinese",
452
+ "Task Type": "Question Answering",
453
+ "Clinical Context": "Gastroenterology",
454
+ "Data Access": "Open Access",
455
+ "Applications": "Screen & Consultation",
456
+ "Clinical Stage": "Triage and Referral"
457
+ },
458
+ "IMCS-V2-SR": {
459
+ "Language": "Chinese",
460
+ "Task Type": "Event Extraction",
461
+ "Clinical Context": "Pediatrics",
462
+ "Data Access": "Open Access",
463
+ "Applications": "Phenotyping",
464
+ "Clinical Stage": "Initial Assessment"
465
+ },
466
+ "IMCS-V2-MRG": {
467
+ "Language": "Chinese",
468
+ "Task Type": "Summarization",
469
+ "Clinical Context": "Pediatrics",
470
+ "Data Access": "Open Access",
471
+ "Applications": "Consultation summarization",
472
+ "Clinical Stage": "Initial Assessment"
473
+ },
474
+ "IMCS-V2-DAC": {
475
+ "Language": "Chinese",
476
+ "Task Type": "Text Classification",
477
+ "Clinical Context": "Pediatrics",
478
+ "Data Access": "Open Access",
479
+ "Applications": "Screen & Consultation",
480
+ "Clinical Stage": "Triage and Referral"
481
+ },
482
+ "n2c2 2014-Diabetes": {
483
+ "Language": "English",
484
+ "Task Type": "Event Extraction",
485
+ "Clinical Context": "Cardiology, Endocrinology",
486
+ "Data Access": "Regulated",
487
+ "Applications": "Risk factor extraction",
488
+ "Clinical Stage": "Initial Assessment"
489
+ },
490
+ "n2c2 2014-CAD": {
491
+ "Language": "English",
492
+ "Task Type": "Event Extraction",
493
+ "Clinical Context": "Cardiology, Endocrinology",
494
+ "Data Access": "Regulated",
495
+ "Applications": "Risk factor extraction",
496
+ "Clinical Stage": "Initial Assessment"
497
+ },
498
+ "n2c2 2014-Hyperlipidemia": {
499
+ "Language": "English",
500
+ "Task Type": "Event Extraction",
501
+ "Clinical Context": "Cardiology, Endocrinology",
502
+ "Data Access": "Regulated",
503
+ "Applications": "Risk factor extraction",
504
+ "Clinical Stage": "Initial Assessment"
505
+ },
506
+ "n2c2 2014-Hypertension": {
507
+ "Language": "English",
508
+ "Task Type": "Event Extraction",
509
+ "Clinical Context": "Cardiology, Endocrinology",
510
+ "Data Access": "Regulated",
511
+ "Applications": "Risk factor extraction",
512
+ "Clinical Stage": "Initial Assessment"
513
+ },
514
+ "n2c2 2014-Medication": {
515
+ "Language": "English",
516
+ "Task Type": "Event Extraction",
517
+ "Clinical Context": "Cardiology, Endocrinology",
518
+ "Data Access": "Regulated",
519
+ "Applications": "Medication information",
520
+ "Clinical Stage": "Treatment and Intervention"
521
+ },
522
+ "CAS-label": {
523
+ "Language": "French",
524
+ "Task Type": "Event Extraction",
525
+ "Clinical Context": "General",
526
+ "Data Access": "Regulated",
527
+ "Applications": "Post-discharge patient management",
528
+ "Clinical Stage": "Discharge and Administration"
529
+ },
530
+ "CAS-evidence": {
531
+ "Language": "French",
532
+ "Task Type": "Summarization",
533
+ "Clinical Context": "General",
534
+ "Data Access": "Regulated",
535
+ "Applications": "Summarization",
536
+ "Clinical Stage": "Discharge and Administration"
537
+ },
538
+ "RuMedNLI": {
539
+ "Language": "Russian",
540
+ "Task Type": "Natural Language Inference",
541
+ "Clinical Context": "Critical Care",
542
+ "Data Access": "Open Access",
543
+ "Applications": "Semantic relation",
544
+ "Clinical Stage": "Research"
545
+ },
546
+ "RuDReC-NER": {
547
+ "Language": "Russian",
548
+ "Task Type": "Named Entity Recognition",
549
+ "Clinical Context": "Pharmacology",
550
+ "Data Access": "Open Access",
551
+ "Applications": "ADE & Incidents",
552
+ "Clinical Stage": "Treatment and Intervention"
553
+ },
554
+ "NorSynthClinical-PHI": {
555
+ "Language": "Norwegian",
556
+ "Task Type": "Named Entity Recognition",
557
+ "Clinical Context": "Cardiology",
558
+ "Data Access": "Open Access",
559
+ "Applications": "De-identification",
560
+ "Clinical Stage": "Research"
561
+ },
562
+ "RuCCoN": {
563
+ "Language": "Russian",
564
+ "Task Type": "Named Entity Recognition",
565
+ "Clinical Context": "Pulmonology",
566
+ "Data Access": "Open Access",
567
+ "Applications": "Procudure information",
568
+ "Clinical Stage": "Treatment and Intervention"
569
+ },
570
+ "CLISTER": {
571
+ "Language": "French",
572
+ "Task Type": "Semantic Similarity",
573
+ "Clinical Context": "General",
574
+ "Data Access": "Open Access",
575
+ "Applications": "Semantic relation",
576
+ "Clinical Stage": "Research"
577
+ },
578
+ "BRONCO150-NER&Status": {
579
+ "Language": "German",
580
+ "Task Type": "Event Extraction",
581
+ "Clinical Context": "Oncology",
582
+ "Data Access": "Regulated",
583
+ "Applications": "Procudure information",
584
+ "Clinical Stage": "Treatment and Intervention"
585
+ },
586
+ "CARDIO-DE": {
587
+ "Language": "German",
588
+ "Task Type": "Named Entity Recognition",
589
+ "Clinical Context": "Cardiology",
590
+ "Data Access": "Regulated",
591
+ "Applications": "Medication information",
592
+ "Clinical Stage": "Treatment and Intervention"
593
+ },
594
+ "GraSSCo PHI": {
595
+ "Language": "German",
596
+ "Task Type": "Named Entity Recognition",
597
+ "Clinical Context": "General",
598
+ "Data Access": "Open Access",
599
+ "Applications": "De-identification",
600
+ "Clinical Stage": "Research"
601
+ },
602
+ "IFMIR-Incident type": {
603
+ "Language": "Japanese",
604
+ "Task Type": "Text Classification",
605
+ "Clinical Context": "Pharmacology",
606
+ "Data Access": "Open Access",
607
+ "Applications": "ADE & Incidents",
608
+ "Clinical Stage": "Treatment and Intervention"
609
+ },
610
+ "IFMIR-NER": {
611
+ "Language": "Japanese",
612
+ "Task Type": "Named Entity Recognition",
613
+ "Clinical Context": "Pharmacology",
614
+ "Data Access": "Open Access",
615
+ "Applications": "ADE & Incidents",
616
+ "Clinical Stage": "Treatment and Intervention"
617
+ },
618
+ "IFMIR - NER&factuality": {
619
+ "Language": "Japanese",
620
+ "Task Type": "Event Extraction",
621
+ "Clinical Context": "Pharmacology",
622
+ "Data Access": "Open Access",
623
+ "Applications": "ADE & Incidents",
624
+ "Clinical Stage": "Treatment and Intervention"
625
+ },
626
+ "iCorpus": {
627
+ "Language": "Japanese",
628
+ "Task Type": "Named Entity Recognition",
629
+ "Clinical Context": "General",
630
+ "Data Access": "Open Access",
631
+ "Applications": "Procudure information",
632
+ "Clinical Stage": "Treatment and Intervention"
633
+ },
634
+ "icliniq-10k": {
635
+ "Language": "English",
636
+ "Task Type": "Question Answering",
637
+ "Clinical Context": "General",
638
+ "Data Access": "Open Access",
639
+ "Applications": "Screen & Consultation",
640
+ "Clinical Stage": "Triage and Referral"
641
+ },
642
+ "HealthCareMagic-100k": {
643
+ "Language": "English",
644
+ "Task Type": "Question Answering",
645
+ "Clinical Context": "General",
646
+ "Data Access": "Open Access",
647
+ "Applications": "Screen & Consultation",
648
+ "Clinical Stage": "Triage and Referral"
649
+ },
650
+ "MIMIC-IV CDM": {
651
+ "Language": "English",
652
+ "Task Type": "Text Classification",
653
+ "Clinical Context": "Gastroenterology",
654
+ "Data Access": "Regulated",
655
+ "Applications": "Diagnosis",
656
+ "Clinical Stage": "Diagnosis and Prognosis"
657
+ },
658
+ "MIMIC-III Outcome.LoS": {
659
+ "Language": "English",
660
+ "Task Type": "Text Classification",
661
+ "Clinical Context": "Critical Care",
662
+ "Data Access": "Regulated",
663
+ "Applications": "Prognosis",
664
+ "Clinical Stage": "Diagnosis and Prognosis"
665
+ },
666
+ "MIMIC-III Outcome.Mortality": {
667
+ "Language": "English",
668
+ "Task Type": "Text Classification",
669
+ "Clinical Context": "Critical Care",
670
+ "Data Access": "Regulated",
671
+ "Applications": "Prognosis",
672
+ "Clinical Stage": "Diagnosis and Prognosis"
673
+ },
674
+ "MIMIC-IV BHC": {
675
+ "Language": "English",
676
+ "Task Type": "Summarization",
677
+ "Clinical Context": "Critical Care",
678
+ "Data Access": "Regulated",
679
+ "Applications": "Summarization",
680
+ "Clinical Stage": "Discharge and Administration"
681
+ },
682
+ "MIMIC-IV DiReCT.Dis": {
683
+ "Language": "English",
684
+ "Task Type": "Text Classification",
685
+ "Clinical Context": "Cardiology, Gastroenterology, Neurology, Pulmonology, Endocrinology",
686
+ "Data Access": "Regulated",
687
+ "Applications": "Diagnosis",
688
+ "Clinical Stage": "Diagnosis and Prognosis"
689
+ },
690
+ "MIMIC-IV DiReCT.PDD": {
691
+ "Language": "English",
692
+ "Task Type": "Text Classification",
693
+ "Clinical Context": "Cardiology, Gastroenterology, Neurology, Pulmonology, Endocrinology",
694
+ "Data Access": "Regulated",
695
+ "Applications": "Diagnosis",
696
+ "Clinical Stage": "Diagnosis and Prognosis"
697
+ }
698
+ }