kevinxie06 committed on
Commit
2c0cfba
·
verified ·
1 Parent(s): 38047f0

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -923
app.py DELETED
@@ -1,923 +0,0 @@
1
- import gradio as gr
2
- from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns
3
- import config
4
- from pathlib import Path
5
- import pandas as pd
6
- import json
7
-
8
- import warnings
9
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Literal
10
- import pandas as pd
11
- from pandas.io.formats.style import Styler
12
-
13
- import semantic_version
14
- from dataclasses import dataclass, field
15
-
16
- from gradio.components import Component
17
- from gradio.data_classes import GradioModel
18
- from gradio.events import Events
19
-
20
@dataclass
class SelectColumns:
    """Settings bundle for a leaderboard column-selection control.

    NOTE(review): a ``SelectColumns`` name is also imported from
    ``gradio_leaderboard`` at the top of this file; this definition rebinds
    that name for the rest of the module -- confirm that is intended.
    """

    # Each instance gets its own fresh list (default_factory), never a shared one.
    default_selection: Optional[list[str]] = field(default_factory=list)
    cant_deselect: Optional[list[str]] = field(default_factory=list)
    # Presumably toggles whether column selection is offered at all -- TODO confirm.
    allow: bool = True
    # Presumably display text for the control -- TODO confirm against the component.
    label: Optional[str] = None
    show_label: bool = True
    info: Optional[str] = None
28
-
29
@dataclass
class ColumnFilter:
    """Description of a filter widget attached to one leaderboard column.

    Only ``column`` is required; every other field defaults to ``None``
    (or ``True`` for ``show_label``).
    """

    # Name of the column the filter applies to.
    column: str
    # Widget kind. The default is None, so the annotation must be Optional:
    # a bare Literal[...] does not admit None.
    type: Optional[Literal["slider", "dropdown", "checkboxgroup", "boolean"]] = None
    default: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    choices: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    label: Optional[str] = None
    info: Optional[str] = None
    show_label: bool = True
    # Presumably numeric bounds used by the "slider" kind -- TODO confirm.
    min: Optional[Union[int, float]] = None
    max: Optional[Union[int, float]] = None
40
-
41
class DataframeData(GradioModel):
    """Table payload returned by ``postprocess`` in this file."""

    # Column names, filled from ``list(df.columns)``.
    headers: List[str]
    # Row values, filled from ``df.to_dict(orient="split")["data"]``
    # (or ``[[]]`` for an empty table).
    data: Union[List[List[Any]], List[Tuple[Any, ...]]]
    # Only set on the pandas Styler path; left as None otherwise.
    metadata: Optional[Dict[str, Optional[List[Any]]]] = None
45
-
46
-
47
# Directory that contains this file. Data files are resolved against it so
# they load regardless of the process's current working directory.
abs_path = Path(__file__).parent

# Load the leaderboard tables, one per prompting strategy.
zero_shot_df = pd.read_json(abs_path / "leaderboards" / "Zero-Shot_leaderboard_data.json", precise_float=True)
five_shot_df = pd.read_json(abs_path / "leaderboards" / "Few-Shot_leaderboard_data.json", precise_float=True)
cot_df = pd.read_json(abs_path / "leaderboards" / "CoT_leaderboard_data.json", precise_float=True)

# Original Average Performances. Copies are taken because the filter
# callbacks overwrite the "Average Performance" column of these frames and
# later restore it from these snapshots.
original_zero_shot_avg_perf = zero_shot_df["Average Performance"].copy()
original_five_shot_avg_perf = five_shot_df["Average Performance"].copy()
original_cot_avg_perf = cot_df["Average Performance"].copy()

# Load the task information json data (task name -> attribute -> value).
with open(abs_path / "task_information.json", 'r', encoding="utf-8") as file:
    task_information_json = json.load(file)
62
-
63
# Task attributes that can be filtered on. Each leaderboard tab keeps its own
# selection state, keyed by these names; an empty list means "no filter".
_FILTER_KEYS = (
    "Language",
    "Task Type",
    "Clinical Context",
    "Data Access",
    "Applications",
    "Clinical Stage",
)


def _new_filter_state():
    """Return a fresh, independent filter-state dict with nothing selected."""
    return {key: [] for key in _FILTER_KEYS}


cot_currently_selected_filters = _new_filter_state()

five_shot_currently_selected_filters = _new_filter_state()

zero_shot_currently_selected_filters = _new_filter_state()
89
-
90
# Raw JSON copies of the three leaderboards. They are indexed elsewhere in
# this file as json[column_name][row_id_as_string].
with open(abs_path / "leaderboards" / "Few-Shot_leaderboard_data.json", 'r', encoding="utf-8") as file:
    five_shot_leaderboard_json = json.load(file)

with open(abs_path / "leaderboards" / "CoT_leaderboard_data.json", 'r', encoding="utf-8") as file:
    CoT_leaderboard_json = json.load(file)

with open(abs_path / "leaderboards" / "Zero-Shot_leaderboard_data.json", 'r', encoding="utf-8") as file:
    zero_shot_leaderboard_json = json.load(file)
99
-
100
# Whitelist of task names that may be shown as leaderboard columns.
# get_filtered_columns() drops any task that is not listed here. The names
# are data keys and must be kept exactly as spelled.
valid_tasks = {
    'NUBES', 'NorSynthClinical-NER', 'MEDIQA 2023-sum-A', 'Medication extraction',
    'IMCS-V2-DAC', 'Cantemist-Coding', 'IFMIR-NER', 'EHRQA-QA', 'Ex4CDS', 'MedDG',
    'MTS-Temporal', 'CHIP-MDCFNPC', 'n2c2 2014-Diabetes', 'MIMIC-III Outcome.LoS',
    'n2c2 2014-Hypertension', 'RuCCoN', 'CARES-ICD10 Chapter', 'RuDReC-NER', 'MIMIC-IV DiReCT.Dis',
    'n2c2 2014-Medication', 'iCorpus', 'Brateca-Hospitalization', 'n2c2 2010-Assertion',
    'NorSynthClinical-PHI', 'IFMIR - NER&factuality', 'JP-STS', 'NorSynthClinical-RE',
    'n2c2 2010-Concept', 'BARR2', 'IMCS-V2-NER', 'IMCS-V2-MRG', 'cMedQA', 'MedSTS',
    'BRONCO150-NER&Status', 'n2c2 2018-ADE&medication', 'CLISTER', 'ClinicalNotes-UPMC',
    'PPTS', 'CLIP', 'IMCS-V2-SR', 'EHRQA-Sub department', 'BrainMRI-AIS', 'Brateca-Mortality',
    'meddocan', 'CHIP-CDEE', 'CAS-evidence', 'MEDIQA 2019-RQE', 'Cantemis-Norm', 'MEDIQA 2023-sum-B',
    'CHIP-CTC', 'C-EMRS', 'CARES ICD10 Block', 'Cantemis-NER', 'CLINpt-NER', 'MEDIQA 2023-chat-A',
    'n2c2 2014-De-identification', 'n2c2 2014-Hyperlipidemia', 'EHRQA-Primary department',
    'ADE-Drug dosage', 'IFMIR-Incident type', 'MIMIC-III Outcome.Mortality', 'n2c2 2006-De-identification',
    'CAS-label', 'MIMIC-IV CDM', 'CodiEsp-ICD-10-CM', 'n2c2 2010-Relation', 'CARES-ICD10 Subblock',
    'MIE', 'HealthCareMagic-100k', 'ADE-Identification', 'MIMIC-IV DiReCT.PDD', 'ADE-Extraction',
    'DialMed', 'GOUT-CC-Consensus', 'GraSSCo PHI', 'RuMedNLI', 'RuMedDaNet', 'CBLUE-CDN', 'icliniq-10k',
    'CARDIO-DE', 'CARES-Area', 'DiSMed-NER', 'CodiEsp-ICD-10-PCS', 'MedNLI', 'MTS', 'MIMIC-IV BHC',
    'n2c2 2014-CAD',
}
118
-
119
# Number of model rows: row ids are the string keys of the "Model" column, so
# the count is the largest id plus one. max() is used instead of "the last
# key" so the result does not depend on key order; an empty table gives 0.
n_models = max((int(model_id) for model_id in zero_shot_leaderboard_json["Model"]), default=-1) + 1
120
-
121
def get_filtered_columns(filter_selections):
    """Return the names of the tasks that satisfy every active filter.

    Input:
        filter_selections: dictionary mapping a task attribute name (for
            example "Language") to the list of values selected for it. An
            empty list, or a missing attribute, means that attribute is not
            filtered on.

    Output:
        Returns a list of all valid tasks to display (by task name), in
        ``task_information_json`` order. A task is kept only if it is in
        ``valid_tasks`` and matches ALL non-empty filters.
    """

    def _attribute_matches(value, selected):
        # A comma-separated value lists several categories; matching any one
        # of them is enough for this attribute.
        if "," in value:
            return any(category.strip() in selected for category in value.split(","))
        # Brazilian Portuguese is stored with a line break in its label but
        # must be found through the plain "Portuguese" option.
        if value == 'Portuguese\n(Brazilian)' and 'Portuguese' in selected:
            return True
        return value in selected

    valid_columns = []
    for task, task_info in task_information_json.items():
        if task not in valid_tasks:
            continue

        # Every filtered attribute must match. A match on one attribute must
        # never cancel a mismatch already found on another one.
        matches_all = all(
            _attribute_matches(value, filter_selections[attribute])
            for attribute, value in task_info.items()
            if filter_selections.get(attribute)
        )
        if matches_all:
            valid_columns.append(task)

    return valid_columns
180
-
181
def isEmpty(currently_selected_filters):
    """
    Checks if there are no selected filters.

    Input:
        currently_selected_filters: dictionary mapping a filter name to its
            selected values.

    Output:
        True when every value in the dictionary is empty (falsy), including
        when the dictionary itself is empty; False otherwise.
    """
    return not any(currently_selected_filters.values())
193
-
194
-
195
- ####################################################################################################
196
- ####### CoT Filters
197
- ####################################################################################################
198
-
199
-
200
# Metadata columns always shown ahead of the task columns in a filtered view.
_COT_BASE_COLUMNS = ['T', 'Model', 'Model: Domain', 'Model: Accessibility',
                     'Model: Size Range', 'Size (B)', 'Average Performance']


def _cot_apply_filter(filter_name, choice):
    """Store one CoT filter selection and rebuild the CoT leaderboard view.

    Input:
        filter_name: key of ``cot_currently_selected_filters`` to update.
        choice: list of values selected for that filter.

    Output:
        The full ``cot_df`` (with its original averages restored) when no
        filter is active; otherwise ``cot_df`` restricted to the metadata
        columns plus the matching task columns, with "Average Performance"
        recomputed over those task columns only.
    """
    # Update the Global store for the currently selected filters
    cot_currently_selected_filters[filter_name] = choice

    if isEmpty(cot_currently_selected_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    filtered_cols = get_filtered_columns(cot_currently_selected_filters)
    updated_performance = cot_update_average_performance(filtered_cols)

    # Convert dictionary keys to integers to match the DataFrame index
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}

    # Map the values to the 'Average Performance' column based on index
    cot_df["Average Performance"] = cot_df.index.map(updated_performance_int)

    return cot_df[_COT_BASE_COLUMNS + filtered_cols]


def cot_filter_language(language_choice):
    """Apply the "Language" task filter to the CoT leaderboard.

    Returns the same column set as the other CoT filters, including
    'Model: Size Range'.
    """
    return _cot_apply_filter("Language", language_choice)


def cot_filter_task_type(task_type_choice):
    """Apply the "Task Type" task filter to the CoT leaderboard."""
    return _cot_apply_filter("Task Type", task_type_choice)


def cot_filter_clinical_context(clinical_context_choice):
    """Apply the "Clinical Context" task filter to the CoT leaderboard."""
    return _cot_apply_filter("Clinical Context", clinical_context_choice)


def cot_filter_applications(applications_choice):
    """Apply the "Applications" task filter to the CoT leaderboard."""
    return _cot_apply_filter("Applications", applications_choice)


def cot_filter_stage_options(stage_choice):
    """Apply the "Clinical Stage" task filter to the CoT leaderboard."""
    return _cot_apply_filter("Clinical Stage", stage_choice)


def cot_filter_data_access(data_access_choice):
    """Apply the "Data Access" task filter to the CoT leaderboard."""
    return _cot_apply_filter("Data Access", data_access_choice)
319
-
320
def cot_update_average_performance(selected_columns):
    """
    Recompute each model's CoT average over the given task columns only.

    Input:
        selected_columns: task names to average over.

    Output:
        Dictionary mapping each row id as a string ("0" .. str(n_models - 1))
        to the mean of that row's scores over ``selected_columns``, rounded
        to 2 decimals. With no selected columns every average is 0.0.
    """
    averages = {}

    for model_idx in range(n_models):
        row_key = str(model_idx)
        scores = [float(CoT_leaderboard_json[task][row_key]) for task in selected_columns]

        # With nothing selected, divide by 1 so the result is 0 instead of
        # a division by zero.
        divisor = len(scores) or 1

        averages[row_key] = float(round(sum(scores) / divisor, 2))

    return averages
343
-
344
-
345
- ####################################################################################################
346
- ####### Few Shot Filters
347
- ####################################################################################################
348
-
349
# Metadata columns always shown ahead of the task columns in a filtered view.
_FIVE_SHOT_BASE_COLUMNS = ['T', 'Model', 'Model: Domain', 'Model: Accessibility',
                           'Model: Size Range', 'Size (B)', 'Average Performance']


def _five_shot_apply_filter(filter_name, choice):
    """Store one Few-Shot filter selection and rebuild the Few-Shot view.

    Input:
        filter_name: key of ``five_shot_currently_selected_filters`` to update.
        choice: list of values selected for that filter.

    Output:
        The full ``five_shot_df`` (with its original averages restored) when
        no filter is active; otherwise ``five_shot_df`` restricted to the
        metadata columns plus the matching task columns, with
        "Average Performance" recomputed over those task columns only.
    """
    # Update the Global store for the currently selected filters
    five_shot_currently_selected_filters[filter_name] = choice

    if isEmpty(five_shot_currently_selected_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    filtered_cols = get_filtered_columns(five_shot_currently_selected_filters)
    updated_performance = five_shot_update_average_performance(filtered_cols)

    # Convert dictionary keys to integers to match the DataFrame index
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}

    # Map the values to the 'Average Performance' column based on index
    five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int)

    return five_shot_df[_FIVE_SHOT_BASE_COLUMNS + filtered_cols]


def five_shot_filter_language(language_choice):
    """Apply the "Language" task filter to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Language", language_choice)


def five_shot_filter_task_type(task_type_choice):
    """Apply the "Task Type" task filter to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Task Type", task_type_choice)


def five_shot_filter_clinical_context(clinical_context_choice):
    """Apply the "Clinical Context" task filter to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Clinical Context", clinical_context_choice)


def five_shot_filter_applications(applications_choice):
    """Apply the "Applications" task filter to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Applications", applications_choice)


def five_shot_filter_stage_options(stage_choice):
    """Apply the "Clinical Stage" task filter to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Clinical Stage", stage_choice)


def five_shot_filter_data_access(data_access_choice):
    """Apply the "Data Access" task filter to the Few-Shot leaderboard."""
    return _five_shot_apply_filter("Data Access", data_access_choice)
468
-
469
-
470
def five_shot_update_average_performance(selected_columns):
    """
    Recompute each model's Few-Shot average over the given task columns only.

    Input:
        selected_columns: task names to average over.

    Output:
        Dictionary mapping each row id as a string ("0" .. str(n_models - 1))
        to the mean of that row's scores over ``selected_columns``, rounded
        to 2 decimals. With no selected columns every average is 0.0.
    """
    averages = {}

    for model_idx in range(n_models):
        row_key = str(model_idx)
        scores = [float(five_shot_leaderboard_json[task][row_key]) for task in selected_columns]

        # With nothing selected, divide by 1 so the result is 0 instead of
        # a division by zero.
        divisor = len(scores) or 1

        averages[row_key] = float(round(sum(scores) / divisor, 2))

    return averages
493
-
494
-
495
- ####################################################################################################
496
- ###### Zero Shot Filters
497
- ####################################################################################################
498
-
499
-
500
# Metadata columns always shown ahead of the task columns in a filtered view.
_ZERO_SHOT_BASE_COLUMNS = ['T', 'Model', 'Model: Domain', 'Model: Accessibility',
                           'Model: Size Range', 'Size (B)', 'Average Performance']


def _zero_shot_apply_filter(filter_name, choice):
    """Store one Zero-Shot filter selection and rebuild the Zero-Shot view.

    Input:
        filter_name: key of ``zero_shot_currently_selected_filters`` to update.
        choice: list of values selected for that filter.

    Output:
        The full ``zero_shot_df`` (with its original averages restored) when
        no filter is active; otherwise ``zero_shot_df`` restricted to the
        metadata columns plus the matching task columns, with
        "Average Performance" recomputed over those task columns only.
    """
    # Update the Global store for the currently selected filters
    zero_shot_currently_selected_filters[filter_name] = choice

    if isEmpty(zero_shot_currently_selected_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters)
    updated_performance = zero_shot_update_average_performance(filtered_cols)

    # Convert dictionary keys to integers to match the DataFrame index
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}

    # Map the values to the 'Average Performance' column based on index
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int)

    return zero_shot_df[_ZERO_SHOT_BASE_COLUMNS + filtered_cols]


def zero_shot_filter_language(language_choice):
    """Apply the "Language" task filter to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Language", language_choice)


def zero_shot_filter_task_type(task_type_choice):
    """Apply the "Task Type" task filter to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Task Type", task_type_choice)


def zero_shot_filter_clinical_context(clinical_context_choice):
    """Apply the "Clinical Context" task filter to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Clinical Context", clinical_context_choice)


def zero_shot_filter_applications(applications_choice):
    """Apply the "Applications" task filter to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Applications", applications_choice)


def zero_shot_filter_stage_options(stage_choice):
    """Apply the "Clinical Stage" task filter to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Clinical Stage", stage_choice)


def zero_shot_filter_data_access(data_access_choice):
    """Apply the "Data Access" task filter to the Zero-Shot leaderboard."""
    return _zero_shot_apply_filter("Data Access", data_access_choice)
619
-
620
def zero_shot_update_average_performance(selected_columns):
    """
    Recompute each model's Zero-Shot average over the given task columns only.

    Input:
        selected_columns: task names to average over.

    Output:
        Dictionary mapping each row id as a string ("0" .. str(n_models - 1))
        to the mean of that row's scores over ``selected_columns``, rounded
        to 2 decimals. With no selected columns every average is 0.0.
    """
    averages = {}

    for model_idx in range(n_models):
        row_key = str(model_idx)
        scores = [float(zero_shot_leaderboard_json[task][row_key]) for task in selected_columns]

        # With nothing selected, divide by 1 so the result is 0 instead of
        # a division by zero.
        divisor = len(scores) or 1

        averages[row_key] = float(round(sum(scores) / divisor, 2))

    return averages
643
-
644
-
645
def postprocess(self, value: pd.DataFrame) -> DataframeData:
    """Convert a leaderboard value into the ``DataframeData`` payload.

    Installed below as ``Leaderboard.postprocess``. DataFrame input (or a CSV
    path, which is read into one) that has an "Average Performance" column is
    sorted by that column in descending order before being serialised.

    Args:
        value: ``None``, a CSV path string, a ``pd.DataFrame`` or a pandas
            ``Styler``.

    Returns:
        ``DataframeData`` with the column names as headers and the rows as
        data; ``data`` is ``[[]]`` for an empty table. ``metadata`` is only
        filled on the ``Styler`` path.

    Raises:
        ValueError: for a ``Styler`` on pandas older than 1.5.0, or for a
            value of an unsupported type.
    """
    # None and str must be handled before anything touches ``value.columns``.
    if value is None:
        return self.postprocess(pd.DataFrame({"column 1": []}))

    if isinstance(value, str):
        value = pd.read_csv(value)  # type: ignore

    if isinstance(value, pd.DataFrame):
        # Ensure that the "Average Performance" column exists before sorting.
        if "Average Performance" in value.columns:
            value = value.sort_values(by="Average Performance", ascending=False)
        if len(value) == 0:
            return DataframeData(
                headers=list(value.columns),  # type: ignore
                data=[[]],  # type: ignore
            )
        return DataframeData(
            headers=list(value.columns),  # type: ignore
            data=value.to_dict(orient="split")["data"],  # type: ignore
        )

    if isinstance(value, Styler):
        if semantic_version.Version(pd.__version__) < semantic_version.Version(
            "1.5.0"
        ):
            raise ValueError(
                "Styler objects are only supported in pandas version 1.5.0 or higher. Please try: `pip install --upgrade pandas` to use this feature."
            )
        if self.interactive:
            warnings.warn(
                "Cannot display Styler object in interactive mode. Will display as a regular pandas dataframe instead."
            )
        df: pd.DataFrame = value.data  # type: ignore
        # NOTE(review): this function is defined outside a class body, so
        # ``self.__extract_metadata`` is NOT name-mangled and looks up the
        # literal attribute "__extract_metadata" -- confirm the component
        # exposes it under that name.
        if len(df) == 0:
            return DataframeData(
                headers=list(df.columns),
                data=[[]],
                metadata=self.__extract_metadata(value),  # type: ignore
            )
        return DataframeData(
            headers=list(df.columns),
            data=df.to_dict(orient="split")["data"],  # type: ignore
            metadata=self.__extract_metadata(value),  # type: ignore
        )

    # Fail loudly instead of silently returning None for unsupported input.
    raise ValueError(
        f"Cannot process value of type {type(value).__name__} as a leaderboard table."
    )


# Models are sorted in order of decreasing average performance (best performance at the top!)
Leaderboard.postprocess = postprocess
696
-
697
-
698
- ####################################################################################################
699
- ###### Leaderboard
700
- ####################################################################################################
701
-
702
- with gr.Blocks() as app:
703
- gr.Markdown("# BRIDGE (Benchmarking Large Language Models for Understanding Real-world Clinical Practice Text)")
704
-
705
- with gr.Tabs():
706
- with gr.Tab("README"):
707
- # gr.Markdown((Path(__file__).parent / "docs.md").read_text())
708
- html_content = (Path(__file__).parent / "docs.md").read_text()
709
- gr.HTML(html_content)
710
-
711
- with gr.Tab("Zero-Shot"):
712
- leaderboard = Leaderboard(
713
- value=zero_shot_df,
714
- select_columns = None,
715
- search_columns=SearchColumns(primary_column = "Model", secondary_columns = "",
716
- placeholder="Search by Model Name",
717
- label="Model Search"),
718
- hide_columns=["Model: Size Range", "Model: Accessibility"],
719
- filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
720
- datatype=config.TYPES,
721
- )
722
-
723
- # Language Filter
724
- all_languages = ['English', 'Spanish',
725
- 'Chinese', 'Norwegian',
726
- 'Russian', 'Portuguese',
727
- 'German', 'Japanese', 'French']
728
-
729
- language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
730
-
731
- # Task Type Filter
732
- all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
733
- 'Normalization and Coding', 'Natural Language Inference', 'Summarization',
734
- 'Event Extraction', 'Semantic Similarity']
735
-
736
-
737
- task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
738
-
739
- all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology',
740
- 'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology',
741
- 'General', 'Endocrinology', 'Pediatrics', 'Pharmacology',
742
- 'Gastroenterology', 'Psychology']
743
-
744
- cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
745
-
746
- # Applications Filter
747
- all_applications = ['Procudure information', 'Concept standarization',
748
- 'Specialist recommendation', 'Negation identification',
749
- 'Clinical trial matching', 'Consultation summarization',
750
- 'Semantic relation', 'Post-discharge patient management',
751
- 'De-identification', 'Billing & Coding', 'Phenotyping',
752
- 'Data organization', 'Temporal & Causality relation',
753
- 'Summarization', 'Screen & Consultation', 'Diagnosis',
754
- 'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
755
- 'Medication information']
756
-
757
-
758
- application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
759
-
760
- # Clinical Stage Filter
761
- all_stages = ['Treatment and Intervention', 'Triage and Referral',
762
- 'Initial Assessment', 'Discharge and Administration',
763
- 'Research', 'Diagnosis and Prognosis']
764
-
765
- stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
766
-
767
- # Data Access Filter
768
- all_data_access = ['Open Access', 'Regulated']
769
-
770
- da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")
771
-
772
-
773
- language_options.change(fn=zero_shot_filter_language, inputs=language_options, outputs=leaderboard)
774
- task_type_options.change(fn=zero_shot_filter_task_type, inputs=task_type_options, outputs=leaderboard)
775
- cc_options.change(fn=zero_shot_filter_clinical_context, inputs=cc_options, outputs=leaderboard)
776
- application_options.change(fn=zero_shot_filter_applications, inputs=application_options, outputs=leaderboard)
777
- da_options.change(fn=zero_shot_filter_data_access, inputs=da_options, outputs=leaderboard)
778
- stage_options.change(fn=zero_shot_filter_stage_options, inputs=stage_options, outputs=leaderboard)
779
-
780
-
781
- with gr.Tab("Few-Shot"):
782
- leaderboard = Leaderboard(
783
- value=five_shot_df,
784
- select_columns = None,
785
- search_columns=SearchColumns(primary_column = "Model", secondary_columns = "",
786
- placeholder="Search by Model Name",
787
- label="Model Search"),
788
- hide_columns=["Model: Size Range", "Model: Accessibility"],
789
- filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
790
- datatype=config.TYPES,
791
- )
792
-
793
- # Language Filter
794
- all_languages = ['English', 'Spanish',
795
- 'Chinese', 'Norwegian',
796
- 'Russian', 'Portuguese',
797
- 'German', 'Japanese', 'French']
798
-
799
- language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
800
-
801
- # Task Type Filter
802
- all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
803
- 'Normalization and Coding', 'Natural Language Inference', 'Summarization',
804
- 'Event Extraction', 'Semantic Similarity']
805
-
806
- task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
807
-
808
-
809
- # Clinical Context Filter
810
- all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology',
811
- 'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology',
812
- 'General', 'Endocrinology', 'Pediatrics', 'Pharmacology',
813
- 'Gastroenterology', 'Psychology']
814
-
815
- cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
816
-
817
- # Applications Filter
818
- all_applications = ['Procudure information', 'Concept standarization',
819
- 'Specialist recommendation', 'Negation identification',
820
- 'Clinical trial matching', 'Consultation summarization',
821
- 'Semantic relation', 'Post-discharge patient management',
822
- 'De-identification', 'Billing & Coding', 'Phenotyping',
823
- 'Data organization', 'Temporal & Causality relation',
824
- 'Summarization', 'Screen & Consultation', 'Diagnosis',
825
- 'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
826
- 'Medication information']
827
-
828
- application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
829
-
830
- # Clinical Stage Filter
831
- all_stages = ['Treatment and Intervention', 'Triage and Referral',
832
- 'Initial Assessment', 'Discharge and Administration',
833
- 'Research', 'Diagnosis and Prognosis']
834
-
835
- stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
836
-
837
- # Data Access Filter
838
- all_data_access = ['Open Access', 'Regulated']
839
-
840
- da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")
841
-
842
- language_options.change(fn=five_shot_filter_language, inputs=language_options, outputs=leaderboard)
843
- task_type_options.change(fn=five_shot_filter_task_type, inputs=task_type_options, outputs=leaderboard)
844
- cc_options.change(fn=five_shot_filter_clinical_context, inputs=cc_options, outputs=leaderboard)
845
- application_options.change(fn=five_shot_filter_applications, inputs=application_options, outputs=leaderboard)
846
- da_options.change(fn=five_shot_filter_data_access, inputs=da_options, outputs=leaderboard)
847
- stage_options.change(fn=five_shot_filter_stage_options, inputs=stage_options, outputs=leaderboard)
848
-
849
-
850
- with gr.Tab("CoT"):
851
- leaderboard = Leaderboard(
852
- value=cot_df,
853
- select_columns = None,
854
- search_columns=SearchColumns(primary_column = "Model", secondary_columns = "",
855
- placeholder="Search by Model Name",
856
- label="Model Search"),
857
- hide_columns=["Model: Size Range", "Model: Accessibility"],
858
- filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
859
- datatype=config.TYPES,
860
- )
861
-
862
- # Language Filter
863
- all_languages = ['English', 'Spanish',
864
- 'Chinese', 'Norwegian',
865
- 'Russian', 'Portuguese',
866
- 'German', 'Japanese', 'French']
867
-
868
- language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
869
-
870
- # Task Type Filter
871
- all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
872
- 'Normalization and Coding', 'Natural Language Inference', 'Summarization',
873
- 'Event Extraction', 'Semantic Similarity']
874
-
875
- task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
876
-
877
- # Clinical Context Filter
878
- all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology',
879
- 'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology',
880
- 'General', 'Endocrinology', 'Pediatrics', 'Pharmacology',
881
- 'Gastroenterology', 'Psychology']
882
-
883
- cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
884
-
885
- # Applications Filter
886
- all_applications = ['Procudure information', 'Concept standarization',
887
- 'Specialist recommendation', 'Negation identification',
888
- 'Clinical trial matching', 'Consultation summarization',
889
- 'Semantic relation', 'Post-discharge patient management',
890
- 'De-identification', 'Billing & Coding', 'Phenotyping',
891
- 'Data organization', 'Temporal & Causality relation',
892
- 'Summarization', 'Screen & Consultation', 'Diagnosis',
893
- 'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
894
- 'Medication information']
895
-
896
- application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
897
-
898
- # Clinical Stage Filter
899
- all_stages = ['Treatment and Intervention', 'Triage and Referral',
900
- 'Initial Assessment', 'Discharge and Administration',
901
- 'Research', 'Diagnosis and Prognosis']
902
-
903
- stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
904
-
905
-
906
- # Data Access Filter
907
- all_data_access = ['Open Access', 'Regulated']
908
-
909
- da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")
910
-
911
-
912
- language_options.change(fn=cot_filter_language, inputs=language_options, outputs=leaderboard)
913
- task_type_options.change(fn=cot_filter_task_type, inputs=task_type_options, outputs=leaderboard)
914
- cc_options.change(fn=cot_filter_clinical_context, inputs=cc_options, outputs=leaderboard)
915
- application_options.change(fn=cot_filter_applications, inputs=application_options, outputs=leaderboard)
916
- da_options.change(fn=cot_filter_data_access, inputs=da_options, outputs=leaderboard)
917
-
918
- stage_options.change(fn=cot_filter_stage_options, inputs=stage_options, outputs=leaderboard)
919
-
920
-
921
- if __name__ == "__main__":
922
- app.launch()
923
-