|
import gradio as gr |
|
from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns |
|
import config |
|
from pathlib import Path |
|
import pandas as pd |
|
import json |
|
|
|
import warnings |
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Literal |
|
import pandas as pd |
|
from pandas.io.formats.style import Styler |
|
|
|
import semantic_version |
|
from dataclasses import dataclass, field |
|
|
|
from gradio.components import Component |
|
from gradio.data_classes import GradioModel |
|
from gradio.events import Events |
|
|
|
@dataclass
class SelectColumns:
    """Options bundle describing a column-selection control.

    Every field has a default, so ``SelectColumns()`` is valid. The two list
    fields use ``default_factory`` so each instance owns its own list.
    """

    # Columns selected initially.
    default_selection: Optional[list[str]] = field(default_factory=list)
    # Columns that must stay selected.
    cant_deselect: Optional[list[str]] = field(default_factory=list)
    # Whether column selection is offered at all.
    allow: bool = True
    # Caption of the control and whether it is rendered.
    label: Optional[str] = None
    show_label: bool = True
    # Additional descriptive text for the control.
    info: Optional[str] = None
|
|
|
@dataclass
class ColumnFilter:
    """Options bundle describing a filter control bound to one column.

    Only ``column`` is required; every other field defaults to ``None`` (or
    ``True`` for ``show_label``).
    """

    # Name of the column the filter applies to.
    column: str
    # Widget kind. The default is None, so the annotation is Optional.
    type: Optional[Literal["slider", "dropdown", "checkboxgroup", "boolean"]] = None
    # Initial value of the widget.
    default: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    # Values the widget offers.
    choices: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    # Caption, help text and caption visibility.
    label: Optional[str] = None
    info: Optional[str] = None
    show_label: bool = True
    # Numeric bounds (presumably only meaningful for the "slider" type -- confirm).
    min: Optional[Union[int, float]] = None
    max: Optional[Union[int, float]] = None
|
|
|
class DataframeData(GradioModel):
    """Serialized table payload built by ``postprocess`` in this module."""

    # Column names, in display order.
    headers: List[str]
    # Row-major cell values; each row is a list or a tuple.
    data: Union[List[List[Any]], List[Tuple[Any, ...]]]
    # Optional extra per-table information; omitted for plain DataFrames.
    metadata: Optional[Dict[str, Optional[List[Any]]]] = None
|
|
|
|
|
# Directory containing this file. Data files are resolved against it (the
# README tab does the same for docs.md) so the app does not depend on the
# current working directory.
abs_path = Path(__file__).parent

# One DataFrame per prompting strategy; these are what the Leaderboard
# components display and what the filter callbacks modify.
zero_shot_df = pd.read_json(abs_path / "leaderboards" / "Zero-Shot_leaderboard_data.json", precise_float=True)
five_shot_df = pd.read_json(abs_path / "leaderboards" / "Few-Shot_leaderboard_data.json", precise_float=True)
cot_df = pd.read_json(abs_path / "leaderboards" / "CoT_leaderboard_data.json", precise_float=True)

# Snapshots of the unfiltered averages. The filter callbacks overwrite the
# "Average Performance" column of the frames above and restore it from these
# once every filter is cleared, so they are copied rather than referenced.
original_zero_shot_avg_perf = zero_shot_df["Average Performance"].copy()
original_five_shot_avg_perf = five_shot_df["Average Performance"].copy()
original_cot_avg_perf = cot_df["Average Performance"].copy()

# Per-task metadata, keyed by task name and then by attribute name.
with open(abs_path / "task_information.json", "r", encoding="utf-8") as file:
    task_information_json = json.load(file)
|
|
|
# Filter categories tracked for each leaderboard tab, in storage order.
_FILTER_CATEGORIES = (
    "Language",
    "Task Type",
    "Clinical Context",
    "Data Access",
    "Applications",
    "Clinical Stage",
)

# Current selection per category for each tab; an empty list means "no filter".
# The *_filter_* callbacks replace individual entries. These dictionaries are
# module-level, so every caller of those callbacks shares them.
cot_currently_selected_filters = {category: [] for category in _FILTER_CATEGORIES}

five_shot_currently_selected_filters = {category: [] for category in _FILTER_CATEGORIES}

zero_shot_currently_selected_filters = {category: [] for category in _FILTER_CATEGORIES}
|
|
|
|
|
# Raw JSON copies of the three leaderboard files. The
# *_update_average_performance functions index them as
# json[task_name][str(model_index)]. Paths are anchored at this file's
# directory (abs_path) and decoded explicitly as UTF-8.
with open(abs_path / "leaderboards" / "Few-Shot_leaderboard_data.json", "r", encoding="utf-8") as file:
    five_shot_leaderboard_json = json.load(file)

with open(abs_path / "leaderboards" / "CoT_leaderboard_data.json", "r", encoding="utf-8") as file:
    CoT_leaderboard_json = json.load(file)

with open(abs_path / "leaderboards" / "Zero-Shot_leaderboard_data.json", "r", encoding="utf-8") as file:
    zero_shot_leaderboard_json = json.load(file)
|
|
|
# Task names that may appear as leaderboard columns. get_filtered_columns only
# returns tasks that are members of this set.
valid_tasks = {
    "NUBES", "NorSynthClinical-NER", "MEDIQA 2023-sum-A", "Medication extraction",
    "IMCS-V2-DAC", "Cantemist-Coding", "IFMIR-NER", "EHRQA-QA", "Ex4CDS", "MedDG",
    "MTS-Temporal", "CHIP-MDCFNPC", "n2c2 2014-Diabetes", "MIMIC-III Outcome.LoS",
    "n2c2 2014-Hypertension", "RuCCoN", "CARES-ICD10 Chapter", "RuDReC-NER", "MIMIC-IV DiReCT.Dis",
    "n2c2 2014-Medication", "iCorpus", "Brateca-Hospitalization", "n2c2 2010-Assertion",
    "NorSynthClinical-PHI", "IFMIR - NER&factuality", "JP-STS", "NorSynthClinical-RE",
    "n2c2 2010-Concept", "BARR2", "IMCS-V2-NER", "IMCS-V2-MRG", "cMedQA", "MedSTS",
    "BRONCO150-NER&Status", "n2c2 2018-ADE&medication", "CLISTER", "ClinicalNotes-UPMC",
    "PPTS", "CLIP", "IMCS-V2-SR", "EHRQA-Sub department", "BrainMRI-AIS", "Brateca-Mortality",
    "meddocan", "CHIP-CDEE", "CAS-evidence", "MEDIQA 2019-RQE", "Cantemis-Norm", "MEDIQA 2023-sum-B",
    "CHIP-CTC", "C-EMRS", "CARES ICD10 Block", "Cantemis-NER", "CLINpt-NER", "MEDIQA 2023-chat-A",
    "n2c2 2014-De-identification", "n2c2 2014-Hyperlipidemia", "EHRQA-Primary department",
    "ADE-Drug dosage", "IFMIR-Incident type", "MIMIC-III Outcome.Mortality", "n2c2 2006-De-identification",
    "CAS-label", "MIMIC-IV CDM", "CodiEsp-ICD-10-CM", "n2c2 2010-Relation", "CARES-ICD10 Subblock",
    "MIE", "HealthCareMagic-100k", "ADE-Identification", "MIMIC-IV DiReCT.PDD", "ADE-Extraction",
    "DialMed", "GOUT-CC-Consensus", "GraSSCo PHI", "RuMedNLI", "RuMedDaNet", "CBLUE-CDN", "icliniq-10k",
    "CARDIO-DE", "CARES-Area", "DiSMed-NER", "CodiEsp-ICD-10-PCS", "MedNLI", "MTS", "MIMIC-IV BHC",
    "n2c2 2014-CAD",
}
|
|
|
# Number of model rows the *_update_average_performance functions iterate over
# (as range(n_models)). It is one more than the largest integer row key of the
# zero-shot "Model" mapping; taking the maximum avoids relying on the largest
# key happening to be the last one in the JSON object.
# NOTE(review): the same count is used for the Few-Shot and CoT data, which
# assumes all three files list the same rows -- confirm.
n_models = max(int(row_key) for row_key in zero_shot_leaderboard_json["Model"]) + 1
|
|
|
# Task-metadata values that should also match a differently spelled filter choice.
_CATEGORY_ALIASES = {'Portuguese\n(Brazilian)': 'Portuguese'}


def _task_matches_filters(task_info, filter_selections):
    """Return True when ``task_info`` satisfies every filter that has a selection."""
    for attribute, value in task_info.items():
        selected = filter_selections.get(attribute)
        if not selected:
            # No selection (or no such filter) for this attribute: no constraint.
            continue

        # A value may list several categories separated by commas; one match
        # among them is enough for this attribute.
        categories = (part.strip() for part in value.split(","))
        if not any(
            category in selected or _CATEGORY_ALIASES.get(category, category) in selected
            for category in categories
        ):
            return False

    return True


def get_filtered_columns(filter_selections, task_information=None, allowed_tasks=None):
    """
    Return the names of the tasks that match the selected filters.

    A task is kept when, for every attribute that has at least one selected
    value, one of the task's categories for that attribute is selected.
    Attributes with an empty selection, and attributes that have no entry in
    ``filter_selections``, do not restrict the result. The metadata value
    'Portuguese\\n(Brazilian)' also matches the choice 'Portuguese'.

    Earlier versions let a later attribute overwrite the verdict of an earlier
    one and stopped checking after a Portuguese match; every active filter is
    now required to match.

    Input:
        filter_selections: dictionary mapping attribute name to the list of
            selected values for that attribute
        task_information: optional mapping of task name to its attribute
            dictionary; defaults to the module-level ``task_information_json``.
            Attribute values are assumed to be strings.
        allowed_tasks: optional collection of task names that may be returned;
            defaults to the module-level ``valid_tasks``

    Output:
        Returns a list of all valid tasks to display (by task name), in the
        iteration order of ``task_information``
    """
    if task_information is None:
        task_information = task_information_json
    if allowed_tasks is None:
        allowed_tasks = valid_tasks

    return [
        task
        for task, task_info in task_information.items()
        if task in allowed_tasks and _task_matches_filters(task_info, filter_selections)
    ]
|
|
|
def isEmpty(currently_selected_filters):
    """
    Checks if there are no selected filters.

    Returns True when every value of ``currently_selected_filters`` is falsy
    (for example an empty list), including when the dictionary has no entries;
    returns False as soon as one category holds a selection.
    """
    return not any(currently_selected_filters.values())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cot_filter_language(language_choice):
    """Apply the CoT tab's "Language" filter.

    Saves ``language_choice`` in ``cot_currently_selected_filters`` and returns
    the frame to show: all of ``cot_df`` (averages reset to
    ``original_cot_avg_perf``) when no filter is active, otherwise the model
    columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``cot_df`` is overwritten either way.
    """
    cot_currently_selected_filters["Language"] = language_choice

    if isEmpty(cot_currently_selected_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    filtered_cols = get_filtered_columns(cot_currently_selected_filters)

    updated_performance = cot_update_average_performance(filtered_cols)

    # The averages are keyed by str(row index); the frame's index is looked up
    # with integer keys.
    updated_performance_int = {int(k): v for k, v in updated_performance.items()}

    cot_df["Average Performance"] = cot_df.index.map(updated_performance_int)

    # 'Model: Size Range' was missing here although every other filter
    # callback returns it and the Leaderboard is configured to filter on it.
    return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols]
|
|
|
def cot_filter_task_type(task_type_choice):
    """Apply the CoT tab's "Task Type" filter.

    Saves ``task_type_choice`` in ``cot_currently_selected_filters`` and
    returns the frame to show: all of ``cot_df`` (averages reset to
    ``original_cot_avg_perf``) when no filter is active, otherwise the model
    columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``cot_df`` is overwritten either way.
    """
    active_filters = cot_currently_selected_filters
    active_filters["Task Type"] = task_type_choice

    if isEmpty(active_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in cot_update_average_performance(task_columns).items()
    }
    cot_df["Average Performance"] = cot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return cot_df[model_columns + task_columns]
|
|
|
def cot_filter_clinical_context(clinical_context_choice):
    """Apply the CoT tab's "Clinical Context" filter.

    Saves ``clinical_context_choice`` in ``cot_currently_selected_filters``
    and returns the frame to show: all of ``cot_df`` (averages reset to
    ``original_cot_avg_perf``) when no filter is active, otherwise the model
    columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``cot_df`` is overwritten either way.
    """
    active_filters = cot_currently_selected_filters
    active_filters["Clinical Context"] = clinical_context_choice

    if isEmpty(active_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in cot_update_average_performance(task_columns).items()
    }
    cot_df["Average Performance"] = cot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return cot_df[model_columns + task_columns]
|
|
|
def cot_filter_applications(applications_choice):
    """Apply the CoT tab's "Applications" filter.

    Saves ``applications_choice`` in ``cot_currently_selected_filters`` and
    returns the frame to show: all of ``cot_df`` (averages reset to
    ``original_cot_avg_perf``) when no filter is active, otherwise the model
    columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``cot_df`` is overwritten either way.
    """
    active_filters = cot_currently_selected_filters
    active_filters["Applications"] = applications_choice

    if isEmpty(active_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in cot_update_average_performance(task_columns).items()
    }
    cot_df["Average Performance"] = cot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return cot_df[model_columns + task_columns]
|
|
|
def cot_filter_stage_options(stage_choice):
    """Apply the CoT tab's "Clinical Stage" filter.

    Saves ``stage_choice`` in ``cot_currently_selected_filters`` and returns
    the frame to show: all of ``cot_df`` (averages reset to
    ``original_cot_avg_perf``) when no filter is active, otherwise the model
    columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``cot_df`` is overwritten either way.
    """
    active_filters = cot_currently_selected_filters
    active_filters["Clinical Stage"] = stage_choice

    if isEmpty(active_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in cot_update_average_performance(task_columns).items()
    }
    cot_df["Average Performance"] = cot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return cot_df[model_columns + task_columns]
|
|
|
def cot_filter_data_access(data_access_choice):
    """Apply the CoT tab's "Data Access" filter.

    Saves ``data_access_choice`` in ``cot_currently_selected_filters`` and
    returns the frame to show: all of ``cot_df`` (averages reset to
    ``original_cot_avg_perf``) when no filter is active, otherwise the model
    columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``cot_df`` is overwritten either way.
    """
    active_filters = cot_currently_selected_filters
    active_filters["Data Access"] = data_access_choice

    if isEmpty(active_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in cot_update_average_performance(task_columns).items()
    }
    cot_df["Average Performance"] = cot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return cot_df[model_columns + task_columns]
|
|
|
def cot_update_average_performance(selected_columns):
    """
    Compute each model's mean CoT score over ``selected_columns``.

    For every row index in ``range(n_models)`` the scores
    ``CoT_leaderboard_json[task][str(index)]`` of the selected tasks are
    converted to float, summed and divided by the number of tasks; the result
    is rounded to two decimals. With no selected tasks the average is 0.0.

    Returns a dictionary mapping ``str(index)`` to that average. The
    leaderboard itself is not modified here.
    """
    averages = {}

    for model_index in range(n_models):
        row_key = str(model_index)

        score_total = 0
        task_count = 0
        for task_name in selected_columns:
            task_count += 1
            score_total += float(CoT_leaderboard_json[task_name][row_key])

        # Divide by one when nothing is selected so the average is simply 0.
        divisor = task_count if task_count != 0 else 1
        averages[row_key] = float(round(score_total / divisor, 2))

    return averages
|
|
|
|
|
|
|
|
|
|
|
|
|
def five_shot_filter_language(language_choice):
    """Apply the Few-Shot tab's "Language" filter.

    Saves ``language_choice`` in ``five_shot_currently_selected_filters`` and
    returns the frame to show: all of ``five_shot_df`` (averages reset to
    ``original_five_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``five_shot_df`` is overwritten either way.
    """
    active_filters = five_shot_currently_selected_filters
    active_filters["Language"] = language_choice

    if isEmpty(active_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in five_shot_update_average_performance(task_columns).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return five_shot_df[model_columns + task_columns]
|
|
|
def five_shot_filter_task_type(task_type_choice):
    """Apply the Few-Shot tab's "Task Type" filter.

    Saves ``task_type_choice`` in ``five_shot_currently_selected_filters`` and
    returns the frame to show: all of ``five_shot_df`` (averages reset to
    ``original_five_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``five_shot_df`` is overwritten either way.
    """
    active_filters = five_shot_currently_selected_filters
    active_filters["Task Type"] = task_type_choice

    if isEmpty(active_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in five_shot_update_average_performance(task_columns).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return five_shot_df[model_columns + task_columns]
|
|
|
def five_shot_filter_clinical_context(clinical_context_choice):
    """Apply the Few-Shot tab's "Clinical Context" filter.

    Saves ``clinical_context_choice`` in
    ``five_shot_currently_selected_filters`` and returns the frame to show:
    all of ``five_shot_df`` (averages reset to ``original_five_shot_avg_perf``)
    when no filter is active, otherwise the model columns plus the task
    columns from ``get_filtered_columns`` with "Average Performance"
    recomputed over them. The average column of the module-level
    ``five_shot_df`` is overwritten either way.
    """
    active_filters = five_shot_currently_selected_filters
    active_filters["Clinical Context"] = clinical_context_choice

    if isEmpty(active_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in five_shot_update_average_performance(task_columns).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return five_shot_df[model_columns + task_columns]
|
|
|
def five_shot_filter_applications(applications_choice):
    """Apply the Few-Shot tab's "Applications" filter.

    Saves ``applications_choice`` in ``five_shot_currently_selected_filters``
    and returns the frame to show: all of ``five_shot_df`` (averages reset to
    ``original_five_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``five_shot_df`` is overwritten either way.
    """
    active_filters = five_shot_currently_selected_filters
    active_filters["Applications"] = applications_choice

    if isEmpty(active_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in five_shot_update_average_performance(task_columns).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return five_shot_df[model_columns + task_columns]
|
|
|
def five_shot_filter_stage_options(stage_choice):
    """Apply the Few-Shot tab's "Clinical Stage" filter.

    Saves ``stage_choice`` in ``five_shot_currently_selected_filters`` and
    returns the frame to show: all of ``five_shot_df`` (averages reset to
    ``original_five_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``five_shot_df`` is overwritten either way.
    """
    active_filters = five_shot_currently_selected_filters
    active_filters["Clinical Stage"] = stage_choice

    if isEmpty(active_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in five_shot_update_average_performance(task_columns).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return five_shot_df[model_columns + task_columns]
|
|
|
def five_shot_filter_data_access(data_access_choice):
    """Apply the Few-Shot tab's "Data Access" filter.

    Saves ``data_access_choice`` in ``five_shot_currently_selected_filters``
    and returns the frame to show: all of ``five_shot_df`` (averages reset to
    ``original_five_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``five_shot_df`` is overwritten either way.
    """
    active_filters = five_shot_currently_selected_filters
    active_filters["Data Access"] = data_access_choice

    if isEmpty(active_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in five_shot_update_average_performance(task_columns).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return five_shot_df[model_columns + task_columns]
|
|
|
|
|
def five_shot_update_average_performance(selected_columns):
    """
    Compute each model's mean Few-Shot score over ``selected_columns``.

    For every row index in ``range(n_models)`` the scores
    ``five_shot_leaderboard_json[task][str(index)]`` of the selected tasks are
    converted to float, summed and divided by the number of tasks; the result
    is rounded to two decimals. With no selected tasks the average is 0.0.

    Returns a dictionary mapping ``str(index)`` to that average. The
    leaderboard itself is not modified here.
    """
    averages = {}

    for model_index in range(n_models):
        row_key = str(model_index)

        score_total = 0
        task_count = 0
        for task_name in selected_columns:
            task_count += 1
            score_total += float(five_shot_leaderboard_json[task_name][row_key])

        # Divide by one when nothing is selected so the average is simply 0.
        divisor = task_count if task_count != 0 else 1
        averages[row_key] = float(round(score_total / divisor, 2))

    return averages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def zero_shot_filter_language(language_choice):
    """Apply the Zero-Shot tab's "Language" filter.

    Saves ``language_choice`` in ``zero_shot_currently_selected_filters`` and
    returns the frame to show: all of ``zero_shot_df`` (averages reset to
    ``original_zero_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``zero_shot_df`` is overwritten either way.
    """
    active_filters = zero_shot_currently_selected_filters
    active_filters["Language"] = language_choice

    if isEmpty(active_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in zero_shot_update_average_performance(task_columns).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return zero_shot_df[model_columns + task_columns]
|
|
|
def zero_shot_filter_task_type(task_type_choice):
    """Apply the Zero-Shot tab's "Task Type" filter.

    Saves ``task_type_choice`` in ``zero_shot_currently_selected_filters`` and
    returns the frame to show: all of ``zero_shot_df`` (averages reset to
    ``original_zero_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``zero_shot_df`` is overwritten either way.
    """
    active_filters = zero_shot_currently_selected_filters
    active_filters["Task Type"] = task_type_choice

    if isEmpty(active_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in zero_shot_update_average_performance(task_columns).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return zero_shot_df[model_columns + task_columns]
|
|
|
def zero_shot_filter_clinical_context(clinical_context_choice):
    """Apply the Zero-Shot tab's "Clinical Context" filter.

    Saves ``clinical_context_choice`` in
    ``zero_shot_currently_selected_filters`` and returns the frame to show:
    all of ``zero_shot_df`` (averages reset to ``original_zero_shot_avg_perf``)
    when no filter is active, otherwise the model columns plus the task
    columns from ``get_filtered_columns`` with "Average Performance"
    recomputed over them. The average column of the module-level
    ``zero_shot_df`` is overwritten either way.
    """
    active_filters = zero_shot_currently_selected_filters
    active_filters["Clinical Context"] = clinical_context_choice

    if isEmpty(active_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in zero_shot_update_average_performance(task_columns).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return zero_shot_df[model_columns + task_columns]
|
|
|
def zero_shot_filter_applications(applications_choice):
    """Apply the Zero-Shot tab's "Applications" filter.

    Saves ``applications_choice`` in ``zero_shot_currently_selected_filters``
    and returns the frame to show: all of ``zero_shot_df`` (averages reset to
    ``original_zero_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``zero_shot_df`` is overwritten either way.
    """
    active_filters = zero_shot_currently_selected_filters
    active_filters["Applications"] = applications_choice

    if isEmpty(active_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in zero_shot_update_average_performance(task_columns).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return zero_shot_df[model_columns + task_columns]
|
|
|
def zero_shot_filter_stage_options(stage_choice):
    """Apply the Zero-Shot tab's "Clinical Stage" filter.

    Saves ``stage_choice`` in ``zero_shot_currently_selected_filters`` and
    returns the frame to show: all of ``zero_shot_df`` (averages reset to
    ``original_zero_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``zero_shot_df`` is overwritten either way.
    """
    active_filters = zero_shot_currently_selected_filters
    active_filters["Clinical Stage"] = stage_choice

    if isEmpty(active_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in zero_shot_update_average_performance(task_columns).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return zero_shot_df[model_columns + task_columns]
|
|
|
def zero_shot_filter_data_access(data_access_choice):
    """Apply the Zero-Shot tab's "Data Access" filter.

    Saves ``data_access_choice`` in ``zero_shot_currently_selected_filters``
    and returns the frame to show: all of ``zero_shot_df`` (averages reset to
    ``original_zero_shot_avg_perf``) when no filter is active, otherwise the
    model columns plus the task columns from ``get_filtered_columns`` with
    "Average Performance" recomputed over them. The average column of the
    module-level ``zero_shot_df`` is overwritten either way.
    """
    active_filters = zero_shot_currently_selected_filters
    active_filters["Data Access"] = data_access_choice

    if isEmpty(active_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df

    task_columns = get_filtered_columns(active_filters)
    # Averages come back keyed by str(row index); re-key them with ints.
    averages_by_row = {
        int(row_label): average
        for row_label, average in zero_shot_update_average_performance(task_columns).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages_by_row)

    model_columns = [
        'T', 'Model', 'Model: Domain', 'Model: Accessibility',
        'Model: Size Range', 'Size (B)', 'Average Performance',
    ]
    return zero_shot_df[model_columns + task_columns]
|
|
|
def zero_shot_update_average_performance(selected_columns):
    """
    Compute each model's mean Zero-Shot score over ``selected_columns``.

    For every row index in ``range(n_models)`` the scores
    ``zero_shot_leaderboard_json[task][str(index)]`` of the selected tasks are
    converted to float, summed and divided by the number of tasks; the result
    is rounded to two decimals. With no selected tasks the average is 0.0.

    Returns a dictionary mapping ``str(index)`` to that average. The
    leaderboard itself is not modified here.
    """
    averages = {}

    for model_index in range(n_models):
        row_key = str(model_index)

        score_total = 0
        task_count = 0
        for task_name in selected_columns:
            task_count += 1
            score_total += float(zero_shot_leaderboard_json[task_name][row_key])

        # Divide by one when nothing is selected so the average is simply 0.
        divisor = task_count if task_count != 0 else 1
        averages[row_key] = float(round(score_total / divisor, 2))

    return averages
|
|
|
|
|
def postprocess(self, value: Optional[Union[pd.DataFrame, Styler, str]]) -> DataframeData:
    """Convert ``value`` into a ``DataframeData`` payload, sorted by average.

    Intended to be installed as ``Leaderboard.postprocess``. DataFrames that
    have an "Average Performance" column are sorted by it in descending order
    before being serialized.

    The ``None`` and ``str`` cases are handled before any attribute of
    ``value`` is read; previously ``value.columns`` was accessed first, which
    raised for those inputs.

    Args:
        value: ``None`` (serialized as an empty one-column table), a path
            readable by ``pandas.read_csv``, a DataFrame, or a pandas Styler.

    Returns:
        The headers and row data (plus metadata for Styler input).

    Raises:
        ValueError: if a Styler is passed with pandas older than 1.5.0, or if
            ``value`` is of an unsupported type.
    """
    if value is None:
        return self.postprocess(pd.DataFrame({"column 1": []}))

    if isinstance(value, str):
        value = pd.read_csv(value)

    if isinstance(value, pd.DataFrame):
        if "Average Performance" in value.columns:
            # Best-scoring models first.
            value = value.sort_values(by="Average Performance", ascending=False)
            return DataframeData(
                headers=list(value.columns),
                data=value.to_dict(orient="split")["data"],
            )

        if len(value) == 0:
            return DataframeData(
                headers=list(value.columns),
                data=[[]],
            )
        return DataframeData(
            headers=list(value.columns),
            data=value.to_dict(orient="split")["data"],
        )

    if isinstance(value, Styler):
        if semantic_version.Version(pd.__version__) < semantic_version.Version(
            "1.5.0"
        ):
            raise ValueError(
                "Styler objects are only supported in pandas version 1.5.0 or higher. Please try: `pip install --upgrade pandas` to use this feature."
            )
        if self.interactive:
            warnings.warn(
                "Cannot display Styler object in interactive mode. Will display as a regular pandas dataframe instead."
            )
        df: pd.DataFrame = value.data
        # NOTE(review): this function is defined at module level, so Python
        # applies no private-name mangling here; the lookup below is for an
        # attribute literally named "__extract_metadata". Confirm the
        # Leaderboard instance provides it under that name.
        if len(df) == 0:
            return DataframeData(
                headers=list(df.columns),
                data=[[]],
                metadata=self.__extract_metadata(value),
            )
        return DataframeData(
            headers=list(df.columns),
            data=df.to_dict(orient="split")["data"],
            metadata=self.__extract_metadata(value),
        )

    raise ValueError(f"Cannot process value as a Dataframe: {type(value)!r}")
|
|
|
|
|
# Replace the component's serializer with the sorting-aware one defined above.
Leaderboard.postprocess = postprocess


# Task-metadata filters shown under every leaderboard, in display order:
# filter key -> (checkbox label, choices).
# The choice strings are compared against task metadata by the filter
# callbacks, so they are kept verbatim (presumably "Procudure" and
# "standarization" match the spelling in task_information.json -- confirm
# before correcting them).
_TASK_FILTER_CHOICES = {
    "Language": (
        "Filter Task: Language",
        ['English', 'Spanish',
         'Chinese', 'Norwegian',
         'Russian', 'Portuguese',
         'German', 'Japanese', 'French'],
    ),
    "Task Type": (
        "Filter Task: Task Type",
        ['Question Answering', 'Text Classification', 'Named Entity Recognition',
         'Normalization and Coding', 'Natural Language Inference', 'Summarization',
         'Event Extraction', 'Semantic Similarity'],
    ),
    "Clinical Context": (
        "Filter Task: Clinical Context",
        ['Neurology', 'Oncology', 'Radiology', 'Pulmonology',
         'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology',
         'General', 'Endocrinology', 'Pediatrics', 'Pharmacology',
         'Gastroenterology', 'Psychology'],
    ),
    "Applications": (
        "Filter Task: Clinical Application",
        ['Procudure information', 'Concept standarization',
         'Specialist recommendation', 'Negation identification',
         'Clinical trial matching', 'Consultation summarization',
         'Semantic relation', 'Post-discharge patient management',
         'De-identification', 'Billing & Coding', 'Phenotyping',
         'Data organization', 'Temporal & Causality relation',
         'Summarization', 'Screen & Consultation', 'Diagnosis',
         'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
         'Medication information'],
    ),
    "Clinical Stage": (
        "Filter Task: Clinical Stage",
        ['Treatment and Intervention', 'Triage and Referral',
         'Initial Assessment', 'Discharge and Administration',
         'Research', 'Diagnosis and Prognosis'],
    ),
    "Data Access": (
        "Filter Task: Data Access",
        ['Open Access', 'Regulated'],
    ),
}

# Order in which the change handlers are registered for each tab.
_HANDLER_WIRING_ORDER = (
    "Language",
    "Task Type",
    "Clinical Context",
    "Applications",
    "Data Access",
    "Clinical Stage",
)


def _build_leaderboard_tab(tab_title, dataframe, handlers):
    """Create one leaderboard tab with its six task filters.

    Must be called inside an open ``gr.Tabs()`` context.

    Args:
        tab_title: Text shown on the tab.
        dataframe: DataFrame displayed by the Leaderboard component.
        handlers: Mapping of filter key (see ``_TASK_FILTER_CHOICES``) to the
            callback run when that checkbox group changes; each callback
            receives the group's selection and its result updates the
            leaderboard.
    """
    with gr.Tab(tab_title):
        leaderboard = Leaderboard(
            value=dataframe,
            select_columns=None,
            search_columns=SearchColumns(
                primary_column="Model",
                secondary_columns="",
                placeholder="Search by Model Name",
                label="Model Search",
            ),
            hide_columns=["Model: Size Range", "Model: Accessibility"],
            filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
            datatype=config.TYPES,
        )

        # Each tab gets its own copy of every choices list.
        checkbox_groups = {
            filter_key: gr.CheckboxGroup(list(choices), label=label)
            for filter_key, (label, choices) in _TASK_FILTER_CHOICES.items()
        }

        for filter_key in _HANDLER_WIRING_ORDER:
            group = checkbox_groups[filter_key]
            group.change(fn=handlers[filter_key], inputs=group, outputs=leaderboard)


with gr.Blocks() as app:
    gr.Markdown("# BRIDGE (Benchmarking Large Language Models for Understanding Real-world Clinical Practice Text)")

    with gr.Tabs():
        with gr.Tab("README"):
            # docs.md sits next to this file; decode it explicitly as UTF-8.
            html_content = (Path(__file__).parent / "docs.md").read_text(encoding="utf-8")
            gr.HTML(html_content)

        _build_leaderboard_tab(
            "Zero-Shot",
            zero_shot_df,
            {
                "Language": zero_shot_filter_language,
                "Task Type": zero_shot_filter_task_type,
                "Clinical Context": zero_shot_filter_clinical_context,
                "Applications": zero_shot_filter_applications,
                "Data Access": zero_shot_filter_data_access,
                "Clinical Stage": zero_shot_filter_stage_options,
            },
        )

        _build_leaderboard_tab(
            "Few-Shot",
            five_shot_df,
            {
                "Language": five_shot_filter_language,
                "Task Type": five_shot_filter_task_type,
                "Clinical Context": five_shot_filter_clinical_context,
                "Applications": five_shot_filter_applications,
                "Data Access": five_shot_filter_data_access,
                "Clinical Stage": five_shot_filter_stage_options,
            },
        )

        _build_leaderboard_tab(
            "CoT",
            cot_df,
            {
                "Language": cot_filter_language,
                "Task Type": cot_filter_task_type,
                "Clinical Context": cot_filter_clinical_context,
                "Applications": cot_filter_applications,
                "Data Access": cot_filter_data_access,
                "Clinical Stage": cot_filter_stage_options,
            },
        )


if __name__ == "__main__":
    app.launch()
|
|
|
|