import gradio as gr from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns import config from pathlib import Path import pandas as pd import json import warnings from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Literal import pandas as pd from pandas.io.formats.style import Styler import semantic_version from dataclasses import dataclass, field from gradio.components import Component from gradio.data_classes import GradioModel from gradio.events import Events @dataclass class SelectColumns: default_selection: Optional[list[str]] = field(default_factory=list) cant_deselect: Optional[list[str]] = field(default_factory=list) allow: bool = True label: Optional[str] = None show_label: bool = True info: Optional[str] = None @dataclass class ColumnFilter: column: str type: Literal["slider", "dropdown", "checkboxgroup", "boolean"] = None default: Optional[Union[int, float, List[Tuple[str, str]]]] = None choices: Optional[Union[int, float, List[Tuple[str, str]]]] = None label: Optional[str] = None info: Optional[str] = None show_label: bool = True min: Optional[Union[int, float]] = None max: Optional[Union[int, float]] = None class DataframeData(GradioModel): headers: List[str] data: Union[List[List[Any]], List[Tuple[Any, ...]]] metadata: Optional[Dict[str, Optional[List[Any]]]] = None abs_path = Path(__file__).parent # Load the leaderboard data for zero_shot_df = pd.read_json("leaderboards/Zero-Shot_leaderboard_data.json", precise_float=True) five_shot_df = pd.read_json("leaderboards/Few-Shot_leaderboard_data.json", precise_float=True) cot_df = pd.read_json("leaderboards/CoT_leaderboard_data.json", precise_float=True) # Original Average Performances original_zero_shot_avg_perf = zero_shot_df["Average Performance"] original_five_shot_avg_perf = five_shot_df["Average Performance"] original_cot_avg_perf = cot_df["Average Performance"] # Load the task information json data with open("task_information.json", 'r') as file: task_information_json = json.load(file) cot_currently_selected_filters = { "Language": [], "Task Type": [], "Clinical Context": [], "Data Access": [], "Applications": [], "Clinical Stage": [] } five_shot_currently_selected_filters = { "Language": [], "Task Type": [], "Clinical Context": [], "Data Access": [], "Applications": [], "Clinical Stage": [] } zero_shot_currently_selected_filters = { "Language": [], "Task Type": [], "Clinical Context": [], "Data Access": [], "Applications": [], "Clinical Stage": [] } # with open("/Users/kevinxie/Desktop/Clinical NLP/Clinical-Text-Leaderboard/leaderboard_data.json", 'r') as file: with open("leaderboards/Few-Shot_leaderboard_data.json", 'r') as file: five_shot_leaderboard_json = json.load(file) with open("leaderboards/CoT_leaderboard_data.json", 'r') as file: CoT_leaderboard_json = json.load(file) with open("leaderboards/Zero-Shot_leaderboard_data.json", 'r') as file: zero_shot_leaderboard_json = json.load(file) valid_tasks = {'NUBES', 'NorSynthClinical-NER', 'MEDIQA 2023-sum-A', 'Medication extraction', 'IMCS-V2-DAC', 'Cantemist-Coding', 'IFMIR-NER', 'EHRQA-QA', 'Ex4CDS', 'MedDG', 'MTS-Temporal', 'CHIP-MDCFNPC', 'n2c2 2014-Diabetes', 'MIMIC-III Outcome.LoS', 'n2c2 2014-Hypertension', 'RuCCoN', 'CARES-ICD10 Chapter', 'RuDReC-NER', 'MIMIC-IV DiReCT.Dis', 'n2c2 2014-Medication', 'iCorpus', 'Brateca-Hospitalization', 'n2c2 2010-Assertion', 'NorSynthClinical-PHI', 'IFMIR - NER&factuality', 'JP-STS', 'NorSynthClinical-RE', 'n2c2 2010-Concept', 'BARR2', 'IMCS-V2-NER', 'IMCS-V2-MRG', 'cMedQA', 'MedSTS', 'BRONCO150-NER&Status', 'n2c2 2018-ADE&medication', 'CLISTER', 'ClinicalNotes-UPMC', 'PPTS', 'CLIP', 'IMCS-V2-SR', 'EHRQA-Sub department', 'BrainMRI-AIS', 'Brateca-Mortality', 'meddocan', 'CHIP-CDEE', 'CAS-evidence', 'MEDIQA 2019-RQE', 'Cantemis-Norm', 'MEDIQA 2023-sum-B', 'CHIP-CTC', 'C-EMRS', 'CARES ICD10 Block', 'Cantemis-NER', 'CLINpt-NER', 'MEDIQA 2023-chat-A', 'n2c2 2014-De-identification', 'n2c2 2014-Hyperlipidemia', 'EHRQA-Primary department', 'ADE-Drug dosage', 'IFMIR-Incident type', 'MIMIC-III Outcome.Mortality', 'n2c2 2006-De-identification', 'CAS-label', 'MIMIC-IV CDM', 'CodiEsp-ICD-10-CM', 'n2c2 2010-Relation', 'CARES-ICD10 Subblock', 'MIE', 'HealthCareMagic-100k', 'ADE-Identification', 'MIMIC-IV DiReCT.PDD', 'ADE-Extraction', 'DialMed', 'GOUT-CC-Consensus', 'GraSSCo PHI', 'RuMedNLI', 'RuMedDaNet', 'CBLUE-CDN', 'icliniq-10k', 'CARDIO-DE', 'CARES-Area', 'DiSMed-NER', 'CodiEsp-ICD-10-PCS', 'MedNLI', 'MTS', 'MIMIC-IV BHC', 'n2c2 2014-CAD'} n_models = int(list(zero_shot_leaderboard_json["Model"].keys())[-1]) + 1 def get_filtered_columns(filter_selections): """ Given an array of selected filters, this function will return a list of all the columns that match the criteria. Input: filter_selections: dictionary of all task type filter selections Output: Returns a list of all valid tasks to display (by task name) """ # Need to add a flag to this filter so that it only displays those that match all attributes valid_columns = [] for task in task_information_json: task_info = task_information_json[task] # Flag to keep track of whether this task is valid isValid = True # Iterate through each attribute of the task for attribute in task_info: # If the filter is empty if not filter_selections[attribute]: continue value = task_info[attribute] # print(filter_selections[attribute]) # Handle edge case for multiple categories if "," in value: all_categories = value.split(", ") flag = False for category in all_categories: if category in filter_selections[attribute]: flag = True break if flag: # one category matches isValid = True else: # none of the categories matched isValid = False # Handle Brazilian Edge Case elif (value == 'Portuguese\n(Brazilian)') and ('Portuguese' in filter_selections[attribute]): isValid = True break elif value not in filter_selections[attribute]: # if filter_selections[attribute] not in task_info[attribute]: isValid = False # break if task in valid_tasks and isValid: valid_columns.append(task) return valid_columns def isEmpty(currently_selected_filters): """ Checks if there are no selected filters """ flag = True for key, value in currently_selected_filters.items(): if not value: continue else: return False return True #################################################################################################### ####### CoT Filters #################################################################################################### def cot_filter_language(language_choice): # Update the Global store for the currently selected filters cot_currently_selected_filters["Language"] = language_choice if isEmpty(cot_currently_selected_filters): cot_df["Average Performance"] = original_cot_avg_perf return cot_df filtered_cols = get_filtered_columns(cot_currently_selected_filters) updated_performance = cot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index cot_df["Average Performance"] = cot_df.index.map(updated_performance_int) return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Size (B)', 'Average Performance'] + filtered_cols] def cot_filter_task_type(task_type_choice): # Update the Global store for the currently selected filters cot_currently_selected_filters["Task Type"] = task_type_choice if isEmpty(cot_currently_selected_filters): cot_df["Average Performance"] = original_cot_avg_perf return cot_df filtered_cols = get_filtered_columns(cot_currently_selected_filters) updated_performance = cot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index cot_df["Average Performance"] = cot_df.index.map(updated_performance_int) return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def cot_filter_clinical_context(clinical_context_choice): # Update the Global store for the currently selected filters cot_currently_selected_filters["Clinical Context"] = clinical_context_choice if isEmpty(cot_currently_selected_filters): cot_df["Average Performance"] = original_cot_avg_perf return cot_df filtered_cols = get_filtered_columns(cot_currently_selected_filters) updated_performance = cot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index cot_df["Average Performance"] = cot_df.index.map(updated_performance_int) return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def cot_filter_applications(applications_choice): # Update the Global store for the currently selected filters cot_currently_selected_filters["Applications"] = applications_choice if isEmpty(cot_currently_selected_filters): cot_df["Average Performance"] = original_cot_avg_perf return cot_df filtered_cols = get_filtered_columns(cot_currently_selected_filters) updated_performance = cot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index cot_df["Average Performance"] = cot_df.index.map(updated_performance_int) return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def cot_filter_stage_options(stage_choice): # Update the Global store for the currently selected filters cot_currently_selected_filters["Clinical Stage"] = stage_choice if isEmpty(cot_currently_selected_filters): cot_df["Average Performance"] = original_cot_avg_perf return cot_df filtered_cols = get_filtered_columns(cot_currently_selected_filters) updated_performance = cot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index cot_df["Average Performance"] = cot_df.index.map(updated_performance_int) return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def cot_filter_data_access(data_access_choice): # Update the Global store for the currently selected filters cot_currently_selected_filters["Data Access"] = data_access_choice if isEmpty(cot_currently_selected_filters): cot_df["Average Performance"] = original_cot_avg_perf return cot_df filtered_cols = get_filtered_columns(cot_currently_selected_filters) updated_performance = cot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index cot_df["Average Performance"] = cot_df.index.map(updated_performance_int) return cot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def cot_update_average_performance(selected_columns): """ When a user clicks filters to filter certain tasks, the average performance of the model should update. This function takes uses the updated filtered columns and calculates the average performances of only those columns. It then updates the leaderboard accordingly. """ updated_average_performance = {} for i in range(n_models): performance = 0 num_tasks = 0 for task in selected_columns: num_tasks += 1 performance += float(CoT_leaderboard_json[task][str(i)]) if num_tasks == 0: num_tasks = 1 updated_average_performance[f"{i}"] = float(round(performance / num_tasks, 2)) return updated_average_performance #################################################################################################### ####### Few Shot Filters #################################################################################################### def five_shot_filter_language(language_choice): # Update the Global store for the currently selected filters five_shot_currently_selected_filters["Language"] = language_choice if isEmpty(five_shot_currently_selected_filters): five_shot_df["Average Performance"] = original_five_shot_avg_perf return five_shot_df filtered_cols = get_filtered_columns(five_shot_currently_selected_filters) updated_performance = five_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int) return five_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def five_shot_filter_task_type(task_type_choice): # Update the Global store for the currently selected filters five_shot_currently_selected_filters["Task Type"] = task_type_choice if isEmpty(five_shot_currently_selected_filters): five_shot_df["Average Performance"] = original_five_shot_avg_perf return five_shot_df filtered_cols = get_filtered_columns(five_shot_currently_selected_filters) updated_performance = five_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int) return five_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def five_shot_filter_clinical_context(clinical_context_choice): # Update the Global store for the currently selected filters five_shot_currently_selected_filters["Clinical Context"] = clinical_context_choice if isEmpty(five_shot_currently_selected_filters): five_shot_df["Average Performance"] = original_five_shot_avg_perf return five_shot_df filtered_cols = get_filtered_columns(five_shot_currently_selected_filters) updated_performance = five_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int) return five_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def five_shot_filter_applications(applications_choice): # Update the Global store for the currently selected filters five_shot_currently_selected_filters["Applications"] = applications_choice if isEmpty(five_shot_currently_selected_filters): five_shot_df["Average Performance"] = original_five_shot_avg_perf return five_shot_df filtered_cols = get_filtered_columns(five_shot_currently_selected_filters) updated_performance = five_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int) return five_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def five_shot_filter_stage_options(stage_choice): # Update the Global store for the currently selected filters five_shot_currently_selected_filters["Clinical Stage"] = stage_choice if isEmpty(five_shot_currently_selected_filters): five_shot_df["Average Performance"] = original_five_shot_avg_perf return five_shot_df filtered_cols = get_filtered_columns(five_shot_currently_selected_filters) updated_performance = five_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int) return five_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def five_shot_filter_data_access(data_access_choice): # Update the Global store for the currently selected filters five_shot_currently_selected_filters["Data Access"] = data_access_choice if isEmpty(five_shot_currently_selected_filters): five_shot_df["Average Performance"] = original_five_shot_avg_perf return five_shot_df filtered_cols = get_filtered_columns(five_shot_currently_selected_filters) updated_performance = five_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index five_shot_df["Average Performance"] = five_shot_df.index.map(updated_performance_int) return five_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def five_shot_update_average_performance(selected_columns): """ When a user clicks filters to filter certain tasks, the average performance of the model should update. This function takes uses the updated filtered columns and calculates the average performances of only those columns. It then updates the leaderboard accordingly. """ updated_average_performance = {} for i in range(n_models): performance = 0 num_tasks = 0 for task in selected_columns: num_tasks += 1 performance += float(five_shot_leaderboard_json[task][str(i)]) if num_tasks == 0: num_tasks = 1 updated_average_performance[f"{i}"] = float(round(performance / num_tasks, 2)) return updated_average_performance #################################################################################################### ###### Zero Shot Filters #################################################################################################### def zero_shot_filter_language(language_choice): # Update the Global store for the currently selected filters zero_shot_currently_selected_filters["Language"] = language_choice if isEmpty(zero_shot_currently_selected_filters): zero_shot_df["Average Performance"] = original_zero_shot_avg_perf return zero_shot_df filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters) updated_performance = zero_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int) return zero_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def zero_shot_filter_task_type(task_type_choice): # Update the Global store for the currently selected filters zero_shot_currently_selected_filters["Task Type"] = task_type_choice if isEmpty(zero_shot_currently_selected_filters): zero_shot_df["Average Performance"] = original_zero_shot_avg_perf return zero_shot_df filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters) updated_performance = zero_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int) return zero_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def zero_shot_filter_clinical_context(clinical_context_choice): # Update the Global store for the currently selected filters zero_shot_currently_selected_filters["Clinical Context"] = clinical_context_choice if isEmpty(zero_shot_currently_selected_filters): zero_shot_df["Average Performance"] = original_zero_shot_avg_perf return zero_shot_df filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters) updated_performance = zero_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int) return zero_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def zero_shot_filter_applications(applications_choice): # Update the Global store for the currently selected filters zero_shot_currently_selected_filters["Applications"] = applications_choice if isEmpty(zero_shot_currently_selected_filters): zero_shot_df["Average Performance"] = original_zero_shot_avg_perf return zero_shot_df filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters) updated_performance = zero_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int) return zero_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def zero_shot_filter_stage_options(stage_choice): # Update the Global store for the currently selected filters zero_shot_currently_selected_filters["Clinical Stage"] = stage_choice if isEmpty(zero_shot_currently_selected_filters): zero_shot_df["Average Performance"] = original_zero_shot_avg_perf return zero_shot_df filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters) updated_performance = zero_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int) return zero_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def zero_shot_filter_data_access(data_access_choice): # Update the Global store for the currently selected filters zero_shot_currently_selected_filters["Data Access"] = data_access_choice if isEmpty(zero_shot_currently_selected_filters): zero_shot_df["Average Performance"] = original_zero_shot_avg_perf return zero_shot_df filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters) updated_performance = zero_shot_update_average_performance(filtered_cols) # Convert dictionary keys to integers to match the DataFrame index updated_performance_int = {int(k): v for k, v in updated_performance.items()} # Map the values to the 'Average Performance' column based on index zero_shot_df["Average Performance"] = zero_shot_df.index.map(updated_performance_int) return zero_shot_df[['T', 'Model', 'Model: Domain', 'Model: Accessibility', 'Model: Size Range', 'Size (B)', 'Average Performance'] + filtered_cols] def zero_shot_update_average_performance(selected_columns): """ When a user clicks filters to filter certain tasks, the average performance of the model should update. This function takes uses the updated filtered columns and calculates the average performances of only those columns. It then updates the leaderboard accordingly. """ updated_average_performance = {} for i in range(n_models): performance = 0 num_tasks = 0 for task in selected_columns: num_tasks += 1 performance += float(zero_shot_leaderboard_json[task][str(i)]) if num_tasks == 0: num_tasks = 1 updated_average_performance[f"{i}"] = float(round(performance / num_tasks, 2)) return updated_average_performance def postprocess(self, value: pd.DataFrame) -> DataframeData: # Ensure that the "Average Performance" column exists if "Average Performance" in value.columns: # Sort the DataFrame by the "average performance" column in descending order value = value.sort_values(by="Average Performance", ascending=False) return DataframeData( headers=list(value.columns), # type: ignore data=value.to_dict(orient="split")["data"], # type: ignore ) if value is None: return self.postprocess(pd.DataFrame({"column 1": []})) if isinstance(value, (str, pd.DataFrame)): if isinstance(value, str): value = pd.read_csv(value) # type: ignore if len(value) == 0: return DataframeData( headers=list(value.columns), # type: ignore data=[[]], # type: ignore ) return DataframeData( headers=list(value.columns), # type: ignore data=value.to_dict(orient="split")["data"], # type: ignore ) elif isinstance(value, Styler): if semantic_version.Version(pd.__version__) < semantic_version.Version( "1.5.0" ): raise ValueError( "Styler objects are only supported in pandas version 1.5.0 or higher. Please try: `pip install --upgrade pandas` to use this feature." ) if self.interactive: warnings.warn( "Cannot display Styler object in interactive mode. Will display as a regular pandas dataframe instead." ) df: pd.DataFrame = value.data # type: ignore if len(df) == 0: return DataframeData( headers=list(df.columns), data=[[]], metadata=self.__extract_metadata(value), # type: ignore ) return DataframeData( headers=list(df.columns), data=df.to_dict(orient="split")["data"], # type: ignore metadata=self.__extract_metadata(value), # type: ignore ) # Models are sorted in order of decreasing average performance (best performance at the top!) Leaderboard.postprocess = postprocess #################################################################################################### ###### Leaderboard #################################################################################################### with gr.Blocks() as app: gr.Markdown("# BRIDGE (Benchmarking Large Language Models for Understanding Real-world Clinical Practice Text)") with gr.Tabs(): with gr.Tab("README"): # gr.Markdown((Path(__file__).parent / "docs.md").read_text()) html_content = (Path(__file__).parent / "docs.md").read_text() gr.HTML(html_content) with gr.Tab("Zero-Shot"): leaderboard = Leaderboard( value=zero_shot_df, select_columns = None, search_columns=SearchColumns(primary_column = "Model", secondary_columns = "", placeholder="Search by Model Name", label="Model Search"), hide_columns=["Model: Size Range", "Model: Accessibility"], filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"], datatype=config.TYPES, ) # Language Filter all_languages = ['English', 'Spanish', 'Chinese', 'Norwegian', 'Russian', 'Portuguese', 'German', 'Japanese', 'French'] language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language") # Task Type Filter all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition', 'Normalization and Coding', 'Natural Language Inference', 'Summarization', 'Event Extraction', 'Semantic Similarity'] task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type") all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology', 'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology', 'General', 'Endocrinology', 'Pediatrics', 'Pharmacology', 'Gastroenterology', 'Psychology'] cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context") # Applications Filter all_applications = ['Procudure information', 'Concept standarization', 'Specialist recommendation', 'Negation identification', 'Clinical trial matching', 'Consultation summarization', 'Semantic relation', 'Post-discharge patient management', 'De-identification', 'Billing & Coding', 'Phenotyping', 'Data organization', 'Temporal & Causality relation', 'Summarization', 'Screen & Consultation', 'Diagnosis', 'ADE & Incidents', 'Risk factor extraction', 'Prognosis', 'Medication information'] application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application") # Clinical Stage Filter all_stages = ['Treatment and Intervention', 'Triage and Referral', 'Initial Assessment', 'Discharge and Administration', 'Research', 'Diagnosis and Prognosis'] stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage") # Data Access Filter all_data_access = ['Open Access', 'Regulated'] da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access") language_options.change(fn=zero_shot_filter_language, inputs=language_options, outputs=leaderboard) task_type_options.change(fn=zero_shot_filter_task_type, inputs=task_type_options, outputs=leaderboard) cc_options.change(fn=zero_shot_filter_clinical_context, inputs=cc_options, outputs=leaderboard) application_options.change(fn=zero_shot_filter_applications, inputs=application_options, outputs=leaderboard) da_options.change(fn=zero_shot_filter_data_access, inputs=da_options, outputs=leaderboard) stage_options.change(fn=zero_shot_filter_stage_options, inputs=stage_options, outputs=leaderboard) with gr.Tab("Few-Shot"): leaderboard = Leaderboard( value=five_shot_df, select_columns = None, search_columns=SearchColumns(primary_column = "Model", secondary_columns = "", placeholder="Search by Model Name", label="Model Search"), hide_columns=["Model: Size Range", "Model: Accessibility"], filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"], datatype=config.TYPES, ) # Language Filter all_languages = ['English', 'Spanish', 'Chinese', 'Norwegian', 'Russian', 'Portuguese', 'German', 'Japanese', 'French'] language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language") # Task Type Filter all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition', 'Normalization and Coding', 'Natural Language Inference', 'Summarization', 'Event Extraction', 'Semantic Similarity'] task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type") # Clinical Context Filter all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology', 'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology', 'General', 'Endocrinology', 'Pediatrics', 'Pharmacology', 'Gastroenterology', 'Psychology'] cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context") # Applications Filter all_applications = ['Procudure information', 'Concept standarization', 'Specialist recommendation', 'Negation identification', 'Clinical trial matching', 'Consultation summarization', 'Semantic relation', 'Post-discharge patient management', 'De-identification', 'Billing & Coding', 'Phenotyping', 'Data organization', 'Temporal & Causality relation', 'Summarization', 'Screen & Consultation', 'Diagnosis', 'ADE & Incidents', 'Risk factor extraction', 'Prognosis', 'Medication information'] application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application") # Clinical Stage Filter all_stages = ['Treatment and Intervention', 'Triage and Referral', 'Initial Assessment', 'Discharge and Administration', 'Research', 'Diagnosis and Prognosis'] stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage") # Data Access Filter all_data_access = ['Open Access', 'Regulated'] da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access") language_options.change(fn=five_shot_filter_language, inputs=language_options, outputs=leaderboard) task_type_options.change(fn=five_shot_filter_task_type, inputs=task_type_options, outputs=leaderboard) cc_options.change(fn=five_shot_filter_clinical_context, inputs=cc_options, outputs=leaderboard) application_options.change(fn=five_shot_filter_applications, inputs=application_options, outputs=leaderboard) da_options.change(fn=five_shot_filter_data_access, inputs=da_options, outputs=leaderboard) stage_options.change(fn=five_shot_filter_stage_options, inputs=stage_options, outputs=leaderboard) with gr.Tab("CoT"): leaderboard = Leaderboard( value=cot_df, select_columns = None, search_columns=SearchColumns(primary_column = "Model", secondary_columns = "", placeholder="Search by Model Name", label="Model Search"), hide_columns=["Model: Size Range", "Model: Accessibility"], filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"], datatype=config.TYPES, ) # Language Filter all_languages = ['English', 'Spanish', 'Chinese', 'Norwegian', 'Russian', 'Portuguese', 'German', 'Japanese', 'French'] language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language") # Task Type Filter all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition', 'Normalization and Coding', 'Natural Language Inference', 'Summarization', 'Event Extraction', 'Semantic Similarity'] task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type") # Clinical Context Filter all_clinical_contexts = ['Neurology', 'Oncology', 'Radiology', 'Pulmonology', 'Cardiology', 'Dermatology', 'Critical Care', 'Nephrology', 'General', 'Endocrinology', 'Pediatrics', 'Pharmacology', 'Gastroenterology', 'Psychology'] cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context") # Applications Filter all_applications = ['Procudure information', 'Concept standarization', 'Specialist recommendation', 'Negation identification', 'Clinical trial matching', 'Consultation summarization', 'Semantic relation', 'Post-discharge patient management', 'De-identification', 'Billing & Coding', 'Phenotyping', 'Data organization', 'Temporal & Causality relation', 'Summarization', 'Screen & Consultation', 'Diagnosis', 'ADE & Incidents', 'Risk factor extraction', 'Prognosis', 'Medication information'] application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application") # Clinical Stage Filter all_stages = ['Treatment and Intervention', 'Triage and Referral', 'Initial Assessment', 'Discharge and Administration', 'Research', 'Diagnosis and Prognosis'] stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage") # Data Access Filter all_data_access = ['Open Access', 'Regulated'] da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access") language_options.change(fn=cot_filter_language, inputs=language_options, outputs=leaderboard) task_type_options.change(fn=cot_filter_task_type, inputs=task_type_options, outputs=leaderboard) cc_options.change(fn=cot_filter_clinical_context, inputs=cc_options, outputs=leaderboard) application_options.change(fn=cot_filter_applications, inputs=application_options, outputs=leaderboard) da_options.change(fn=cot_filter_data_access, inputs=da_options, outputs=leaderboard) stage_options.change(fn=cot_filter_stage_options, inputs=stage_options, outputs=leaderboard) if __name__ == "__main__": app.launch()