Spaces:

YLab-Open
/

BRIDGE-Medical-Leaderboard

Running

App Files Files Community

kevinxie06 committed on Apr 20

Commit

fb59c30

verified ·

1 Parent(s): e5d8b9c

Upload 5 files

Browse files

Files changed (5) hide show

app.py +921 -0
config.py +37 -0
docs.md +57 -0
requirements.txt +3 -0
task_information.json +698 -0

app.py ADDED Viewed

	@@ -0,0 +1,921 @@

+import gradio as gr
+from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns
+import config
+from pathlib import Path
+import pandas as pd
+import json
+import warnings
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Literal
+import pandas as pd
+from pandas.io.formats.style import Styler
+import semantic_version
+from dataclasses import dataclass, field
+from gradio.components import Component
+from gradio.data_classes import GradioModel
+from gradio.events import Events
@dataclass
class SelectColumns:
    """Options for a leaderboard's column-selection control.

    NOTE(review): this definition rebinds the ``SelectColumns`` name that the
    file also imports from ``gradio_leaderboard``. The per-field notes below
    are inferred from the field names only; confirm against that package.
    """

    # Columns selected initially (presumably); a fresh list per instance.
    default_selection: Optional[list[str]] = field(default_factory=list)

    # Columns the user may not deselect (presumably); a fresh list per instance.
    cant_deselect: Optional[list[str]] = field(default_factory=list)

    # Presumably toggles whether column selection is offered at all.
    allow: bool = True

    # Optional caption and whether to show it.
    label: Optional[str] = None
    show_label: bool = True

    # Optional help text.
    info: Optional[str] = None
@dataclass
class ColumnFilter:
    """Description of one filter widget bound to a single leaderboard column.

    Only ``column`` is required; every other field defaults to ``None`` (or
    ``True`` for ``show_label``).

    NOTE(review): the per-field notes are inferred from the field names; this
    file never instantiates the class in the visible code, so confirm the
    semantics against ``gradio_leaderboard``.
    """

    # Name of the column the filter applies to.
    column: str

    # Widget kind. The default is None, so the annotation must admit None
    # (it was previously a bare Literal that the default violated).
    type: Optional[Literal["slider", "dropdown", "checkboxgroup", "boolean"]] = None

    # Initial value and available choices for the widget (presumably).
    default: Optional[Union[int, float, List[Tuple[str, str]]]] = None
    choices: Optional[Union[int, float, List[Tuple[str, str]]]] = None

    # Optional caption, help text, and whether the caption is shown.
    label: Optional[str] = None
    info: Optional[str] = None
    show_label: bool = True

    # Numeric bounds, presumably only meaningful for slider filters.
    min: Optional[Union[int, float]] = None
    max: Optional[Union[int, float]] = None
class DataframeData(GradioModel):
    """Tabular payload built by ``postprocess`` in this file.

    ``postprocess`` fills ``headers`` with the DataFrame's column names and
    ``data`` with its rows (``[[]]`` for an empty frame).
    """

    # Column names, in display order.
    headers: List[str]

    # Row values; each row is a list or tuple of cell values.
    data: Union[List[List[Any]], List[Tuple[Any, ...]]]

    # Optional extra information; only the Styler branch of ``postprocess``
    # sets it.
    metadata: Optional[Dict[str, Optional[List[Any]]]] = None
# Directory containing this module. Data files are resolved against it (as
# docs.md already is further down) so that loading does not depend on the
# process's current working directory.
abs_path = Path(__file__).parent
_LEADERBOARD_DIR = abs_path / "leaderboards"

# Load the leaderboard data for each prompting strategy into a DataFrame.
zero_shot_df = pd.read_json(_LEADERBOARD_DIR / "Zero-Shot_leaderboard_data.json", precise_float=True)
five_shot_df = pd.read_json(_LEADERBOARD_DIR / "Few-Shot_leaderboard_data.json", precise_float=True)
cot_df = pd.read_json(_LEADERBOARD_DIR / "CoT_leaderboard_data.json", precise_float=True)

# Original Average Performances. The filter callbacks overwrite the
# "Average Performance" column and restore it from these snapshots, so each
# snapshot is an independent copy that a later column write cannot alias.
original_zero_shot_avg_perf = zero_shot_df["Average Performance"].copy()
original_five_shot_avg_perf = five_shot_df["Average Performance"].copy()
original_cot_avg_perf = cot_df["Average Performance"].copy()

# Load the task information json data (task name -> attribute -> value).
# The encoding is explicit so the result does not vary with the host locale.
with open(abs_path / "task_information.json", "r", encoding="utf-8") as file:
    task_information_json = json.load(file)
# Filter categories offered for tasks, in the order they are stored.
_FILTER_CATEGORIES = (
    "Language",
    "Task Type",
    "Clinical Context",
    "Data Access",
    "Applications",
    "Clinical Stage",
)


def _new_filter_state():
    """Return a fresh mapping of every filter category to an empty selection."""
    return {category: [] for category in _FILTER_CATEGORIES}


# One selection store per leaderboard tab. These are module-level objects that
# the filter callbacks overwrite entry by entry.
cot_currently_selected_filters = _new_filter_state()
five_shot_currently_selected_filters = _new_filter_state()
zero_shot_currently_selected_filters = _new_filter_state()
# Raw JSON form of each leaderboard (column -> row key -> value), used when
# recomputing per-model averages. Paths are resolved against the module
# directory rather than the current working directory, and the encoding is
# explicit so loading does not vary with the host locale.
with open(abs_path / "leaderboards" / "Few-Shot_leaderboard_data.json", "r", encoding="utf-8") as file:
    five_shot_leaderboard_json = json.load(file)
with open(abs_path / "leaderboards" / "CoT_leaderboard_data.json", "r", encoding="utf-8") as file:
    CoT_leaderboard_json = json.load(file)
with open(abs_path / "leaderboards" / "Zero-Shot_leaderboard_data.json", "r", encoding="utf-8") as file:
    zero_shot_leaderboard_json = json.load(file)
# Task names eligible to be shown as leaderboard columns. The filter logic
# only ever returns tasks that are members of this set.
valid_tasks = {
    "NUBES", "NorSynthClinical-NER", "MEDIQA 2023-sum-A", "Medication extraction",
    "IMCS-V2-DAC", "Cantemist-Coding", "IFMIR-NER", "EHRQA-QA",
    "Ex4CDS", "MedDG", "MTS-Temporal", "CHIP-MDCFNPC",
    "n2c2 2014-Diabetes", "MIMIC-III Outcome.LoS", "n2c2 2014-Hypertension", "RuCCoN",
    "CARES-ICD10 Chapter", "RuDReC-NER", "MIMIC-IV DiReCT.Dis", "n2c2 2014-Medication",
    "iCorpus", "Brateca-Hospitalization", "n2c2 2010-Assertion", "NorSynthClinical-PHI",
    "IFMIR - NER&factuality", "JP-STS", "NorSynthClinical-RE", "n2c2 2010-Concept",
    "BARR2", "IMCS-V2-NER", "IMCS-V2-MRG", "cMedQA",
    "MedSTS", "BRONCO150-NER&Status", "n2c2 2018-ADE&medication", "CLISTER",
    "ClinicalNotes-UPMC", "PPTS", "CLIP", "IMCS-V2-SR",
    "EHRQA-Sub department", "BrainMRI-AIS", "Brateca-Mortality", "meddocan",
    "CHIP-CDEE", "CAS-evidence", "MEDIQA 2019-RQE", "Cantemis-Norm",
    "MEDIQA 2023-sum-B", "CHIP-CTC", "C-EMRS", "CARES ICD10 Block",
    "Cantemis-NER", "CLINpt-NER", "MEDIQA 2023-chat-A", "n2c2 2014-De-identification",
    "n2c2 2014-Hyperlipidemia", "EHRQA-Primary department", "ADE-Drug dosage", "IFMIR-Incident type",
    "MIMIC-III Outcome.Mortality", "n2c2 2006-De-identification", "CAS-label", "MIMIC-IV CDM",
    "CodiEsp-ICD-10-CM", "n2c2 2010-Relation", "CARES-ICD10 Subblock", "MIE",
    "HealthCareMagic-100k", "ADE-Identification", "MIMIC-IV DiReCT.PDD", "ADE-Extraction",
    "DialMed", "GOUT-CC-Consensus", "GraSSCo PHI", "RuMedNLI",
    "RuMedDaNet", "CBLUE-CDN", "icliniq-10k", "CARDIO-DE",
    "CARES-Area", "DiSMed-NER", "CodiEsp-ICD-10-PCS", "MedNLI",
    "MTS", "MIMIC-IV BHC", "n2c2 2014-CAD",
}
# Number of models on the leaderboard. The row keys of the zero-shot JSON are
# stringified integers (they are parsed with int() here), so the count is the
# highest key plus one. Taking the maximum avoids relying on the last key in
# the file being the largest.
n_models = max(int(key) for key in zero_shot_leaderboard_json["Model"]) + 1
def _task_categories(value):
    """Return every category label that one task-attribute value stands for."""
    # Brazilian Portuguese tasks must also match the plain "Portuguese" filter.
    if value == 'Portuguese\n(Brazilian)':
        return [value, 'Portuguese']
    # Multi-category values are stored as one ", "-separated string.
    if "," in value:
        return value.split(", ")
    return [value]


def _task_matches(task_info, filter_selections):
    """Return True when the task satisfies every filter that has a selection."""
    for attribute, value in task_info.items():
        # Attributes with no filter entry, or an empty selection, do not
        # constrain the task.
        selected = filter_selections.get(attribute)
        if not selected:
            continue
        if not any(category in selected for category in _task_categories(value)):
            return False
    return True


def get_filtered_columns(filter_selections):
    """
    Given the selected filters, return the list of all task columns that
    match the criteria.

    A task is kept only when it satisfies EVERY attribute that has at least
    one option selected; within one attribute, matching any selected option is
    enough. Previously a match on a later attribute could re-validate a task
    that had already failed an earlier one.

    Input:
        filter_selections: dictionary mapping each task attribute name to the
            options currently selected for it (empty means "no filter")
    Output:
        Returns a list of all valid tasks to display (by task name), in the
        order the tasks appear in the task information data. Only tasks in
        ``valid_tasks`` are returned.
    """
    return [
        task
        for task, task_info in task_information_json.items()
        if task in valid_tasks and _task_matches(task_info, filter_selections)
    ]
def isEmpty(currently_selected_filters):
    """
    Checks if there are no selected filters.

    Input:
        currently_selected_filters: dictionary mapping each filter category to
            the options selected for it
    Output:
        True when every value in the dictionary is empty (or the dictionary
        has no entries); False as soon as one category has a selection.
    """
    return not any(currently_selected_filters.values())
+####################################################################################################
+####### CoT Filters
+####################################################################################################
# Columns placed ahead of the task columns in every filtered CoT view. The
# language filter previously left out 'Model: Size Range', unlike the other
# five CoT filters; all six now share this one list.
_COT_BASE_COLUMNS = ['T', 'Model', 'Model: Domain', 'Model: Accessibility',
                     'Model: Size Range', 'Size (B)', 'Average Performance']


def _cot_apply_filter(category, choice):
    """
    Record one CoT filter selection and rebuild the CoT leaderboard view.

    Input:
        category: key of the CoT filter store to overwrite
        choice: the options now selected for that category
    Output:
        The full CoT DataFrame with its original averages restored when no
        filter is active; otherwise the base columns plus the matching task
        columns, with "Average Performance" recomputed over those tasks only.
    """
    # Update the Global store for the currently selected filters
    cot_currently_selected_filters[category] = choice
    if isEmpty(cot_currently_selected_filters):
        cot_df["Average Performance"] = original_cot_avg_perf
        return cot_df
    filtered_cols = get_filtered_columns(cot_currently_selected_filters)
    # The averages come back keyed by stringified row number; convert the keys
    # to integers to match the DataFrame index before mapping.
    averages = {
        int(row): average
        for row, average in cot_update_average_performance(filtered_cols).items()
    }
    cot_df["Average Performance"] = cot_df.index.map(averages)
    return cot_df[_COT_BASE_COLUMNS + filtered_cols]


def cot_filter_language(language_choice):
    """Apply the "Language" selection to the CoT leaderboard."""
    return _cot_apply_filter("Language", language_choice)


def cot_filter_task_type(task_type_choice):
    """Apply the "Task Type" selection to the CoT leaderboard."""
    return _cot_apply_filter("Task Type", task_type_choice)


def cot_filter_clinical_context(clinical_context_choice):
    """Apply the "Clinical Context" selection to the CoT leaderboard."""
    return _cot_apply_filter("Clinical Context", clinical_context_choice)


def cot_filter_applications(applications_choice):
    """Apply the "Applications" selection to the CoT leaderboard."""
    return _cot_apply_filter("Applications", applications_choice)


def cot_filter_stage_options(stage_choice):
    """Apply the "Clinical Stage" selection to the CoT leaderboard."""
    return _cot_apply_filter("Clinical Stage", stage_choice)


def cot_filter_data_access(data_access_choice):
    """Apply the "Data Access" selection to the CoT leaderboard."""
    return _cot_apply_filter("Data Access", data_access_choice)


def cot_update_average_performance(selected_columns):
    """
    When a user clicks filters to filter certain tasks, the average performance
    of the model should update. This function takes the filtered task columns
    and calculates each model's average over only those columns.

    Input:
        selected_columns: task names to average over
    Output:
        Dictionary mapping each model's row key (a stringified integer from 0
        to n_models - 1) to its average, rounded to 2 decimals. With no
        selected columns every average is 0.0.
    """
    tasks = list(selected_columns)
    # Divide by 1 when nothing is selected so an empty filter result gives 0.0
    # instead of a division by zero.
    divisor = len(tasks) or 1
    updated_average_performance = {}
    for i in range(n_models):
        row = str(i)
        total = sum(float(CoT_leaderboard_json[task][row]) for task in tasks)
        updated_average_performance[row] = float(round(total / divisor, 2))
    return updated_average_performance
+####################################################################################################
+####### Few Shot Filters
+####################################################################################################
# Columns placed ahead of the task columns in every filtered few-shot view.
_FIVE_SHOT_BASE_COLUMNS = ['T', 'Model', 'Model: Domain', 'Model: Accessibility',
                           'Model: Size Range', 'Size (B)', 'Average Performance']


def _five_shot_apply_filter(category, choice):
    """
    Record one few-shot filter selection and rebuild the few-shot view.

    The six public few-shot filter callbacks had identical bodies that
    differed only in the category key; they now all delegate here.

    Input:
        category: key of the few-shot filter store to overwrite
        choice: the options now selected for that category
    Output:
        The full few-shot DataFrame with its original averages restored when
        no filter is active; otherwise the base columns plus the matching task
        columns, with "Average Performance" recomputed over those tasks only.
    """
    # Update the Global store for the currently selected filters
    five_shot_currently_selected_filters[category] = choice
    if isEmpty(five_shot_currently_selected_filters):
        five_shot_df["Average Performance"] = original_five_shot_avg_perf
        return five_shot_df
    filtered_cols = get_filtered_columns(five_shot_currently_selected_filters)
    # The averages come back keyed by stringified row number; convert the keys
    # to integers to match the DataFrame index before mapping.
    averages = {
        int(row): average
        for row, average in five_shot_update_average_performance(filtered_cols).items()
    }
    five_shot_df["Average Performance"] = five_shot_df.index.map(averages)
    return five_shot_df[_FIVE_SHOT_BASE_COLUMNS + filtered_cols]


def five_shot_filter_language(language_choice):
    """Apply the "Language" selection to the few-shot leaderboard."""
    return _five_shot_apply_filter("Language", language_choice)


def five_shot_filter_task_type(task_type_choice):
    """Apply the "Task Type" selection to the few-shot leaderboard."""
    return _five_shot_apply_filter("Task Type", task_type_choice)


def five_shot_filter_clinical_context(clinical_context_choice):
    """Apply the "Clinical Context" selection to the few-shot leaderboard."""
    return _five_shot_apply_filter("Clinical Context", clinical_context_choice)


def five_shot_filter_applications(applications_choice):
    """Apply the "Applications" selection to the few-shot leaderboard."""
    return _five_shot_apply_filter("Applications", applications_choice)


def five_shot_filter_stage_options(stage_choice):
    """Apply the "Clinical Stage" selection to the few-shot leaderboard."""
    return _five_shot_apply_filter("Clinical Stage", stage_choice)


def five_shot_filter_data_access(data_access_choice):
    """Apply the "Data Access" selection to the few-shot leaderboard."""
    return _five_shot_apply_filter("Data Access", data_access_choice)


def five_shot_update_average_performance(selected_columns):
    """
    When a user clicks filters to filter certain tasks, the average performance
    of the model should update. This function takes the filtered task columns
    and calculates each model's average over only those columns.

    Input:
        selected_columns: task names to average over
    Output:
        Dictionary mapping each model's row key (a stringified integer from 0
        to n_models - 1) to its average, rounded to 2 decimals. With no
        selected columns every average is 0.0.
    """
    tasks = list(selected_columns)
    # Divide by 1 when nothing is selected so an empty filter result gives 0.0
    # instead of a division by zero.
    divisor = len(tasks) or 1
    updated_average_performance = {}
    for i in range(n_models):
        row = str(i)
        total = sum(float(five_shot_leaderboard_json[task][row]) for task in tasks)
        updated_average_performance[row] = float(round(total / divisor, 2))
    return updated_average_performance
+####################################################################################################
+###### Zero Shot Filters
+####################################################################################################
# Columns placed ahead of the task columns in every filtered zero-shot view.
_ZERO_SHOT_BASE_COLUMNS = ['T', 'Model', 'Model: Domain', 'Model: Accessibility',
                           'Model: Size Range', 'Size (B)', 'Average Performance']


def _zero_shot_apply_filter(category, choice):
    """
    Record one zero-shot filter selection and rebuild the zero-shot view.

    The six public zero-shot filter callbacks had identical bodies that
    differed only in the category key; they now all delegate here.

    Input:
        category: key of the zero-shot filter store to overwrite
        choice: the options now selected for that category
    Output:
        The full zero-shot DataFrame with its original averages restored when
        no filter is active; otherwise the base columns plus the matching task
        columns, with "Average Performance" recomputed over those tasks only.
    """
    # Update the Global store for the currently selected filters
    zero_shot_currently_selected_filters[category] = choice
    if isEmpty(zero_shot_currently_selected_filters):
        zero_shot_df["Average Performance"] = original_zero_shot_avg_perf
        return zero_shot_df
    filtered_cols = get_filtered_columns(zero_shot_currently_selected_filters)
    # The averages come back keyed by stringified row number; convert the keys
    # to integers to match the DataFrame index before mapping.
    averages = {
        int(row): average
        for row, average in zero_shot_update_average_performance(filtered_cols).items()
    }
    zero_shot_df["Average Performance"] = zero_shot_df.index.map(averages)
    return zero_shot_df[_ZERO_SHOT_BASE_COLUMNS + filtered_cols]


def zero_shot_filter_language(language_choice):
    """Apply the "Language" selection to the zero-shot leaderboard."""
    return _zero_shot_apply_filter("Language", language_choice)


def zero_shot_filter_task_type(task_type_choice):
    """Apply the "Task Type" selection to the zero-shot leaderboard."""
    return _zero_shot_apply_filter("Task Type", task_type_choice)


def zero_shot_filter_clinical_context(clinical_context_choice):
    """Apply the "Clinical Context" selection to the zero-shot leaderboard."""
    return _zero_shot_apply_filter("Clinical Context", clinical_context_choice)


def zero_shot_filter_applications(applications_choice):
    """Apply the "Applications" selection to the zero-shot leaderboard."""
    return _zero_shot_apply_filter("Applications", applications_choice)


def zero_shot_filter_stage_options(stage_choice):
    """Apply the "Clinical Stage" selection to the zero-shot leaderboard."""
    return _zero_shot_apply_filter("Clinical Stage", stage_choice)


def zero_shot_filter_data_access(data_access_choice):
    """Apply the "Data Access" selection to the zero-shot leaderboard."""
    return _zero_shot_apply_filter("Data Access", data_access_choice)


def zero_shot_update_average_performance(selected_columns):
    """
    When a user clicks filters to filter certain tasks, the average performance
    of the model should update. This function takes the filtered task columns
    and calculates each model's average over only those columns.

    Input:
        selected_columns: task names to average over
    Output:
        Dictionary mapping each model's row key (a stringified integer from 0
        to n_models - 1) to its average, rounded to 2 decimals. With no
        selected columns every average is 0.0.
    """
    tasks = list(selected_columns)
    # Divide by 1 when nothing is selected so an empty filter result gives 0.0
    # instead of a division by zero.
    divisor = len(tasks) or 1
    updated_average_performance = {}
    for i in range(n_models):
        row = str(i)
        total = sum(float(zero_shot_leaderboard_json[task][row]) for task in tasks)
        updated_average_performance[row] = float(round(total / divisor, 2))
    return updated_average_performance
def _find_extract_metadata(component):
    """
    Return the component's private ``__extract_metadata`` helper, or None.

    A ``__name`` attribute written inside a class body is stored under the
    mangled name ``_ClassName__name``. ``postprocess`` below is a module-level
    function, so ``self.__extract_metadata`` is NOT mangled there and the plain
    lookup cannot find such a helper; search the mangled names instead.
    NOTE(review): assumes the component class (or a base) defines the helper
    as gradio's Dataframe does; confirm against gradio_leaderboard.
    """
    for cls in type(component).__mro__:
        mangled = f"_{cls.__name__.lstrip('_')}__extract_metadata"
        helper = getattr(component, mangled, None)
        if helper is not None:
            return helper
    return None


def postprocess(self, value: pd.DataFrame) -> "Optional[DataframeData]":
    """
    Convert a leaderboard value into the payload sent to the frontend, sorting
    DataFrames by "Average Performance" (best first) when that column exists.

    Input:
        value: None, a path to a CSV file, a DataFrame, or a pandas Styler
    Output:
        A DataframeData holding the column headers and row data (``[[]]`` for
        an empty frame). Returns None for any other value type, as before.
    Raises:
        ValueError: a Styler is passed and pandas is older than 1.5.0.
    """
    # None must be handled before anything touches ``value.columns``;
    # previously the column check ran first and crashed on None and on paths.
    if value is None:
        return self.postprocess(pd.DataFrame({"column 1": []}))
    if isinstance(value, str):
        value = pd.read_csv(value)  # type: ignore
    if isinstance(value, pd.DataFrame):
        if "Average Performance" in value.columns:
            # sort_values returns a new frame; the caller's frame is untouched.
            value = value.sort_values(by="Average Performance", ascending=False)
        rows = value.to_dict(orient="split")["data"] if len(value) else [[]]
        return DataframeData(
            headers=list(value.columns),  # type: ignore
            data=rows,  # type: ignore
        )
    if isinstance(value, Styler):
        if semantic_version.Version(pd.__version__) < semantic_version.Version(
            "1.5.0"
        ):
            raise ValueError(
                "Styler objects are only supported in pandas version 1.5.0 or higher. Please try: `pip install --upgrade pandas` to use this feature."
            )
        if self.interactive:
            warnings.warn(
                "Cannot display Styler object in interactive mode. Will display as a regular pandas dataframe instead."
            )
        df = value.data  # type: ignore
        extract_metadata = _find_extract_metadata(self)
        metadata = extract_metadata(value) if extract_metadata is not None else None
        rows = df.to_dict(orient="split")["data"] if len(df) else [[]]
        return DataframeData(
            headers=list(df.columns),
            data=rows,  # type: ignore
            metadata=metadata,  # type: ignore
        )
    # Unsupported value types fall through without a payload, as before.
    return None
# Replace the Leaderboard component's postprocess hook with the function
# defined above, which sorts rows by "Average Performance" in descending
# order (best model first) before they are handed to the frontend.
Leaderboard.postprocess = postprocess
+####################################################################################################
+###### Leaderboard
+####################################################################################################
+with gr.Blocks() as app:
+    gr.Markdown("# BRIDGE (Benchmarking Large Language Models in Multilingual Real-world Clinical Text Understanding)")
+    with gr.Tabs():
+        with gr.Tab("README"):
+            gr.Markdown((Path(__file__).parent / "docs.md").read_text())
+        with gr.Tab("Zero-Shot"):
+            leaderboard = Leaderboard(
+                value=zero_shot_df,
+                select_columns = None,
+                search_columns=SearchColumns(primary_column = "Model", secondary_columns = "",
+                                     placeholder="Search by Model Name",
+                                     label="Model Search"),
+                hide_columns=["Model: Size Range", "Model: Accessibility"],
+                filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
+                datatype=config.TYPES,
+            )
+            # Language Filter
+            all_languages = ['English', 'Spanish',
+                             'Chinese', 'Norwegian',
+                             'Russian', 'Portuguese',
+                             'German', 'Japanese', 'French']
+            language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
+            # Task Type Filter
+            all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
+                              'Normalization and Coding', 'Natural Language Inference', 'Summarization',
+                              'Event Extraction', 'Semantic Similarity']
+            task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
+            all_clinical_contexts = ['Neurology',  'Oncology',  'Radiology',  'Pulmonology',
+                                     'Cardiology',  'Dermatology',  'Critical Care',  'Nephrology',
+                                     'General',  'Endocrinology',  'Pediatrics',  'Pharmacology',
+                                     'Gastroenterology',  'Psychology']
+            cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
+            # Applications Filter
+            all_applications = ['Procudure information', 'Concept standarization',
+                                'Specialist recommendation', 'Negation identification',
+                                'Clinical trial matching', 'Consultation summarization',
+                                'Semantic relation', 'Post-discharge patient management',
+                                'De-identification', 'Billing & Coding', 'Phenotyping',
+                                'Data organization', 'Temporal & Causality relation',
+                                'Summarization', 'Screen & Consultation', 'Diagnosis',
+                                'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
+                                'Medication information']
+            application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
+            # Clinical Stage Filter
+            all_stages = ['Treatment and Intervention', 'Triage and Referral',
+                          'Initial Assessment', 'Discharge and Administration',
+                          'Research', 'Diagnosis and Prognosis']
+            stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
+            # Data Access Filter
+            all_data_access = ['Open Access', 'Regulated']
+            da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")
+            language_options.change(fn=zero_shot_filter_language, inputs=language_options, outputs=leaderboard)
+            task_type_options.change(fn=zero_shot_filter_task_type, inputs=task_type_options, outputs=leaderboard)
+            cc_options.change(fn=zero_shot_filter_clinical_context, inputs=cc_options, outputs=leaderboard)
+            application_options.change(fn=zero_shot_filter_applications, inputs=application_options, outputs=leaderboard)
+            da_options.change(fn=zero_shot_filter_data_access, inputs=da_options, outputs=leaderboard)
+            stage_options.change(fn=zero_shot_filter_stage_options, inputs=stage_options, outputs=leaderboard)
+        with gr.Tab("Few-Shot"):
+            leaderboard = Leaderboard(
+                value=five_shot_df,
+                select_columns = None,
+                search_columns=SearchColumns(primary_column = "Model", secondary_columns = "",
+                                     placeholder="Search by Model Name",
+                                     label="Model Search"),
+                hide_columns=["Model: Size Range", "Model: Accessibility"],
+                filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
+                datatype=config.TYPES,
+            )
+            # Language Filter
+            all_languages = ['English', 'Spanish',
+                             'Chinese', 'Norwegian',
+                             'Russian', 'Portuguese',
+                             'German', 'Japanese', 'French']
+            language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
+            # Task Type Filter
+            all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
+                              'Normalization and Coding', 'Natural Language Inference', 'Summarization',
+                              'Event Extraction', 'Semantic Similarity']
+            task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
+            # Clinical Context Filter
+            all_clinical_contexts = ['Neurology',  'Oncology',  'Radiology',  'Pulmonology',
+                                     'Cardiology',  'Dermatology',  'Critical Care',  'Nephrology',
+                                     'General',  'Endocrinology',  'Pediatrics',  'Pharmacology',
+                                     'Gastroenterology',  'Psychology']
+            cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
+            # Applications Filter
+            all_applications = ['Procudure information', 'Concept standarization',
+                                'Specialist recommendation', 'Negation identification',
+                                'Clinical trial matching', 'Consultation summarization',
+                                'Semantic relation', 'Post-discharge patient management',
+                                'De-identification', 'Billing & Coding', 'Phenotyping',
+                                'Data organization', 'Temporal & Causality relation',
+                                'Summarization', 'Screen & Consultation', 'Diagnosis',
+                                'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
+                                'Medication information']
+            application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
+            # Clinical Stage Filter
+            all_stages = ['Treatment and Intervention', 'Triage and Referral',
+                          'Initial Assessment', 'Discharge and Administration',
+                          'Research', 'Diagnosis and Prognosis']
+            stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
+            # Data Access Filter
+            all_data_access = ['Open Access', 'Regulated']
+            da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")
+            language_options.change(fn=five_shot_filter_language, inputs=language_options, outputs=leaderboard)
+            task_type_options.change(fn=five_shot_filter_task_type, inputs=task_type_options, outputs=leaderboard)
+            cc_options.change(fn=five_shot_filter_clinical_context, inputs=cc_options, outputs=leaderboard)
+            application_options.change(fn=five_shot_filter_applications, inputs=application_options, outputs=leaderboard)
+            da_options.change(fn=five_shot_filter_data_access, inputs=da_options, outputs=leaderboard)
+            stage_options.change(fn=five_shot_filter_stage_options, inputs=stage_options, outputs=leaderboard)
+        with gr.Tab("CoT"):
+            leaderboard = Leaderboard(
+                value=cot_df,
+                select_columns = None,
+                search_columns=SearchColumns(primary_column = "Model", secondary_columns = "",
+                                     placeholder="Search by Model Name",
+                                     label="Model Search"),
+                hide_columns=["Model: Size Range", "Model: Accessibility"],
+                filter_columns=["Model: Domain", "Model: Size Range", "Model: Accessibility"],
+                datatype=config.TYPES,
+            )
+            # Language Filter
+            all_languages = ['English', 'Spanish',
+                             'Chinese', 'Norwegian',
+                             'Russian', 'Portuguese',
+                             'German', 'Japanese', 'French']
+            language_options = gr.CheckboxGroup(all_languages, label="Filter Task: Language")
+            # Task Type Filter
+            all_task_types = ['Question Answering', 'Text Classification', 'Named Entity Recognition',
+                              'Normalization and Coding', 'Natural Language Inference', 'Summarization',
+                              'Event Extraction', 'Semantic Similarity']
+            task_type_options = gr.CheckboxGroup(all_task_types, label="Filter Task: Task Type")
+            # Clinical Context Filter
+            all_clinical_contexts = ['Neurology',  'Oncology',  'Radiology',  'Pulmonology',
+                                     'Cardiology',  'Dermatology',  'Critical Care',  'Nephrology',
+                                     'General',  'Endocrinology',  'Pediatrics',  'Pharmacology',
+                                     'Gastroenterology',  'Psychology']
+            cc_options = gr.CheckboxGroup(all_clinical_contexts, label="Filter Task: Clinical Context")
+            # Applications Filter
+            all_applications = ['Procudure information', 'Concept standarization',
+                                'Specialist recommendation', 'Negation identification',
+                                'Clinical trial matching', 'Consultation summarization',
+                                'Semantic relation', 'Post-discharge patient management',
+                                'De-identification', 'Billing & Coding', 'Phenotyping',
+                                'Data organization', 'Temporal & Causality relation',
+                                'Summarization', 'Screen & Consultation', 'Diagnosis',
+                                'ADE & Incidents', 'Risk factor extraction', 'Prognosis',
+                                'Medication information']
+            application_options = gr.CheckboxGroup(all_applications, label="Filter Task: Clinical Application")
+            # Clinical Stage Filter
+            all_stages = ['Treatment and Intervention', 'Triage and Referral',
+                          'Initial Assessment', 'Discharge and Administration',
+                          'Research', 'Diagnosis and Prognosis']
+            stage_options = gr.CheckboxGroup(all_stages, label="Filter Task: Clinical Stage")
+            # Data Access Filter
+            all_data_access = ['Open Access', 'Regulated']
+            da_options = gr.CheckboxGroup(all_data_access, label="Filter Task: Data Access")
+            language_options.change(fn=cot_filter_language, inputs=language_options, outputs=leaderboard)
+            task_type_options.change(fn=cot_filter_task_type, inputs=task_type_options, outputs=leaderboard)
+            cc_options.change(fn=cot_filter_clinical_context, inputs=cc_options, outputs=leaderboard)
+            application_options.change(fn=cot_filter_applications, inputs=application_options, outputs=leaderboard)
+            da_options.change(fn=cot_filter_data_access, inputs=da_options, outputs=leaderboard)
+            stage_options.change(fn=cot_filter_stage_options, inputs=stage_options, outputs=leaderboard)
+# Launch the Gradio app only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    app.launch()

config.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import pandas as pd
+# Gradio datatype names handed to every Leaderboard in app.py via `datatype=config.TYPES`.
+# NOTE(review): presumably positional — one entry per column of the leaderboard DataFrames,
+# in column order (22 entries). Confirm against the leaderboard JSON files before adding,
+# removing or reordering columns.
+TYPES = [
+    "str",
+    "markdown",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+    "str",
+    "str",
+    "str",
+    "str",
+    "bool",
+    "str",
+    "number",
+    "number",
+    "bool",
+    "str",
+    "bool",
+    "bool",
+    "str",
+]
+# Maps a display label to a right-closed interval (lower, upper]; adjacent buckets share an
+# endpoint, so a boundary value such as 9 falls in "~7" and not in "~13".
+# NOTE(review): presumably model-size buckets in billions of parameters, with "?" covering
+# unknown sizes encoded as values in (-1, 0]. Nothing in the visible part of app.py reads this
+# mapping — confirm it is still used.
+NUMERIC_INTERVALS = {
+    "?": pd.Interval(-1, 0, closed="right"),
+    "~1.5": pd.Interval(0, 2, closed="right"),
+    "~3": pd.Interval(2, 4, closed="right"),
+    "~7": pd.Interval(4, 9, closed="right"),
+    "~13": pd.Interval(9, 20, closed="right"),
+    "~35": pd.Interval(20, 45, closed="right"),
+    "~60": pd.Interval(45, 70, closed="right"),
+    "70+": pd.Interval(70, 10000, closed="right"),
+}

docs.md ADDED Viewed

	@@ -0,0 +1,57 @@

+![image/png](https://cdn-uploads.huggingface.co/production/uploads/67a040fb6934f9aa1c866f99/lsCIUxFkADB-Wf9cteeB4.png)
+![image/webp](https://cdn-uploads.huggingface.co/production/uploads/67a040fb6934f9aa1c866f99/E-WF4uJB0GzplioJkWh5v.webp)
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/67a040fb6934f9aa1c866f99/xQqbGXh0y6zIV78Cw6Vpq.png)
+## 📜 Background
+Recent advances in **Large Language Models (LLMs)** have demonstrated transformative potential in improving healthcare delivery and clinical research. By combining extensive pretraining with supervised instruction tuning across diverse tasks, LLMs excel in natural language understanding, generation, and reasoning. These capabilities allow LLMs to serve as versatile, general-purpose medical assistants.
+Despite this promise, concerns remain around the **reliability and clinical validity** of LLM-generated outputs. Real-world contexts often involve unstructured, multilingual text from **electronic health records (EHRs)**, and require support for tasks like phenotype identification and event extraction that remain underexplored. Moreover, the scarcity of **multilingual benchmarks** further limits the global applicability of LLMs in medicine.
+To address these challenges, we introduce the ***largest multilingual clinical benchmark*** to date, **BRIDGE (Benchmarking Large Language Models in Multilingual Real-world Clinical Text Understanding)**, evaluating 52 LLMs on:
+- **87 clinical tasks**
+- **9 languages**
+- **1M+ clinical samples**
+## 🌍 Key Features
+Our benchmark spans a wide range of document types and clinical tasks, including classification, event extraction, and generation. It further supports three inference strategies: **zero-shot**, **few-shot**, and **chain-of-thought (CoT)** prompting. We evaluated **52 LLMs**, including general-purpose, open-source, proprietary, and medical-domain models.
+- **Multilingual Data**: Clinical tasks in **9 languages** for global relevance.
+- **Diverse Clinical Documents**: Notes, summaries, radiology reports, and more.
+- **Multiple NLP Tasks**: Classification, extraction, QA, summarization, etc.
+- **Evaluation Modes**:
+  - **Zero-shot**
+  - **Few-shot**
+  - **Chain-of-Thought (CoT)** reasoning
+## 🏆 BRIDGE Leaderboard
+To support ongoing evaluation, we introduce our **BRIDGE Leaderboard**, which provides:
+- Easy visualizations
+- Side-by-side comparisons
+- Continuous tracking of LLM performance across tasks, languages, and evaluation strategies
+This leaderboard empowers researchers and clinicians to make informed decisions and track model progress over time.
+## 📚 Citation
+If you use this benchmark in your research or development, please cite:
+```bibtex
+@article{BRIDGE2025,
+  title     = {PAPER TITLE},
+  author    = {Your Name and Contributors},
+  year      = {2025},
+  journal   = {Your Journal or Conference},
+}
+```

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio==5.18.0
+gradio_leaderboard==0.0.13
+pandas==2.2.3

task_information.json ADDED Viewed

	@@ -0,0 +1,698 @@

+{
+    "ADE-Identification": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "ADE-Extraction": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "ADE-Drug dosage": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "Medication information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "BARR2": {
+        "Language": "Spanish",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Concept standarization",
+        "Clinical Stage": "Research"
+    },
+    "BrainMRI-AIS": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Neurology, Radiology",
+        "Data Access": "Open Access",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "Brateca-Hospitalization": {
+        "Language": "Portuguese\n(Brazilian)",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Prognosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "Brateca-Mortality": {
+        "Language": "Portuguese\n(Brazilian)",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Prognosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "Cantemist-Coding": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "Oncology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "Cantemis-NER": {
+        "Language": "Spanish",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Oncology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "Cantemis-Norm": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "Oncology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CARES-Area": {
+        "Language": "Spanish",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Radiology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CARES ICD10 Block": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "Radiology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CARES-ICD10 Chapter": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "Radiology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CARES-ICD10 Subblock": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "Radiology",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CHIP-CDEE": {
+        "Language": "Chinese",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Temporal & Causality relation",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "C-EMRS": {
+        "Language": "Chinese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Radiology, Endocrinology, Pulmonology, Cardiology, Gastroenterology",
+        "Data Access": "Open Access",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "CodiEsp-ICD-10-CM": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CodiEsp-ICD-10-PCS": {
+        "Language": "Spanish",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "ClinicalNotes-UPMC": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Negation identification",
+        "Clinical Stage": "Research"
+    },
+    "PPTS": {
+        "Language": "Spanish",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Pulmonology",
+        "Data Access": "Open Access",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "CLINpt-NER": {
+        "Language": "Portuguese",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Neurology",
+        "Data Access": "Open Access",
+        "Applications": "Procudure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "CLIP": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Post-discharge patient management",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "cMedQA": {
+        "Language": "Chinese",
+        "Task Type": "Question Answering",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "DialMed": {
+        "Language": "Chinese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Pulmonology, Gastroenterology, Dermatology, Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "Medication information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "DiSMed-NER": {
+        "Language": "Spanish",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Radiology",
+        "Data Access": "Regulated",
+        "Applications": "De-identification",
+        "Clinical Stage": "Research"
+    },
+    "MIE": {
+        "Language": "Chinese",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology",
+        "Data Access": "Open Access",
+        "Applications": "Phenotyping",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "EHRQA-Primary department": {
+        "Language": "Chinese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Specialist recommendation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "EHRQA-QA": {
+        "Language": "Chinese",
+        "Task Type": "Question Answering",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "EHRQA-Sub department": {
+        "Language": "Chinese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Specialist recommendation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "Ex4CDS": {
+        "Language": "German",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Nephrology",
+        "Data Access": "Open Access",
+        "Applications": "Procudure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "GOUT-CC-Consensus": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "n2c2 2006-De-identification": {
+        "Language": "English",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Pulmonology",
+        "Data Access": "Regulated",
+        "Applications": "De-identification",
+        "Clinical Stage": "Research"
+    },
+    "Medication extraction": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Regulated",
+        "Applications": "Medication information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "n2c2 2010-Concept": {
+        "Language": "English",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Procudure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "n2c2 2010-Assertion": {
+        "Language": "English",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Post-discharge patient management",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "n2c2 2010-Relation": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Procudure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "n2c2 2014-De-identification": {
+        "Language": "English",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "De-identification",
+        "Clinical Stage": "Research"
+    },
+    "IMCS-V2-NER": {
+        "Language": "Chinese",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Pediatrics",
+        "Data Access": "Open Access",
+        "Applications": "Phenotyping",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "JP-STS": {
+        "Language": "Japanese",
+        "Task Type": "Semantic Similarity",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Semantic relation",
+        "Clinical Stage": "Research"
+    },
+    "meddocan": {
+        "Language": "Spanish",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "De-identification",
+        "Clinical Stage": "Research"
+    },
+    "MEDIQA 2019-RQE": {
+        "Language": "English",
+        "Task Type": "Natural Language Inference",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "MedNLI": {
+        "Language": "English",
+        "Task Type": "Natural Language Inference",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Semantic relation",
+        "Clinical Stage": "Research"
+    },
+    "MedSTS": {
+        "Language": "English",
+        "Task Type": "Semantic Similarity",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Semantic relation",
+        "Clinical Stage": "Research"
+    },
+    "MTS": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Data organization",
+        "Clinical Stage": "Research"
+    },
+    "MTS-Temporal": {
+        "Language": "English",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Pediatrics, Psychology",
+        "Data Access": "Open Access",
+        "Applications": "Temporal & Causality relation",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "n2c2 2018-ADE&medication": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Regulated",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "NorSynthClinical-NER": {
+        "Language": "Norwegian",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Cardiology",
+        "Data Access": "Open Access",
+        "Applications": "Temporal & Causality relation",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "NorSynthClinical-RE": {
+        "Language": "Norwegian",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology",
+        "Data Access": "Open Access",
+        "Applications": "Temporal & Causality relation",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "NUBES": {
+        "Language": "Spanish",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Negation identification",
+        "Clinical Stage": "Research"
+    },
+    "MEDIQA 2023-chat-A": {
+        "Language": "English",
+        "Task Type": "Summarization",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Consultation summarization",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "MEDIQA 2023-sum-A": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Data organization",
+        "Clinical Stage": "Research"
+    },
+    "MEDIQA 2023-sum-B": {
+        "Language": "English",
+        "Task Type": "Summarization",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Consultation summarization",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "RuMedDaNet": {
+        "Language": "Russian",
+        "Task Type": "Natural Language Inference",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "CBLUE-CDN": {
+        "Language": "Chinese",
+        "Task Type": "Normalization and Coding",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Billing & Coding",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CHIP-CTC": {
+        "Language": "Chinese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Clinical trial matching",
+        "Clinical Stage": "Research"
+    },
+    "CHIP-MDCFNPC": {
+        "Language": "Chinese",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Phenotyping",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "MedDG": {
+        "Language": "Chinese",
+        "Task Type": "Question Answering",
+        "Clinical Context": "Gastroenterology",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "IMCS-V2-SR": {
+        "Language": "Chinese",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Pediatrics",
+        "Data Access": "Open Access",
+        "Applications": "Phenotyping",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "IMCS-V2-MRG": {
+        "Language": "Chinese",
+        "Task Type": "Summarization",
+        "Clinical Context": "Pediatrics",
+        "Data Access": "Open Access",
+        "Applications": "Consultation summarization",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "IMCS-V2-DAC": {
+        "Language": "Chinese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Pediatrics",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "n2c2 2014-Diabetes": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Risk factor extraction",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "n2c2 2014-CAD": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Risk factor extraction",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "n2c2 2014-Hyperlipidemia": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Risk factor extraction",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "n2c2 2014-Hypertension": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Risk factor extraction",
+        "Clinical Stage": "Initial Assessment"
+    },
+    "n2c2 2014-Medication": {
+        "Language": "English",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Cardiology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Medication information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "CAS-label": {
+        "Language": "French",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Post-discharge patient management",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "CAS-evidence": {
+        "Language": "French",
+        "Task Type": "Summarization",
+        "Clinical Context": "General",
+        "Data Access": "Regulated",
+        "Applications": "Summarization",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "RuMedNLI": {
+        "Language": "Russian",
+        "Task Type": "Natural Language Inference",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Open Access",
+        "Applications": "Semantic relation",
+        "Clinical Stage": "Research"
+    },
+    "RuDReC-NER": {
+        "Language": "Russian",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "NorSynthClinical-PHI": {
+        "Language": "Norwegian",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Cardiology",
+        "Data Access": "Open Access",
+        "Applications": "De-identification",
+        "Clinical Stage": "Research"
+    },
+    "RuCCoN": {
+        "Language": "Russian",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Pulmonology",
+        "Data Access": "Open Access",
+        "Applications": "Procedure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "CLISTER": {
+        "Language": "French",
+        "Task Type": "Semantic Similarity",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Semantic relation",
+        "Clinical Stage": "Research"
+    },
+    "BRONCO150-NER&Status": {
+        "Language": "German",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Oncology",
+        "Data Access": "Regulated",
+        "Applications": "Procedure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "CARDIO-DE": {
+        "Language": "German",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Cardiology",
+        "Data Access": "Regulated",
+        "Applications": "Medication information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "GraSSCo PHI": {
+        "Language": "German",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "De-identification",
+        "Clinical Stage": "Research"
+    },
+    "IFMIR-Incident type": {
+        "Language": "Japanese",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "IFMIR-NER": {
+        "Language": "Japanese",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "IFMIR - NER&factuality": {
+        "Language": "Japanese",
+        "Task Type": "Event Extraction",
+        "Clinical Context": "Pharmacology",
+        "Data Access": "Open Access",
+        "Applications": "ADE & Incidents",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "iCorpus": {
+        "Language": "Japanese",
+        "Task Type": "Named Entity Recognition",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Procedure information",
+        "Clinical Stage": "Treatment and Intervention"
+    },
+    "icliniq-10k": {
+        "Language": "English",
+        "Task Type": "Question Answering",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "HealthCareMagic-100k": {
+        "Language": "English",
+        "Task Type": "Question Answering",
+        "Clinical Context": "General",
+        "Data Access": "Open Access",
+        "Applications": "Screen & Consultation",
+        "Clinical Stage": "Triage and Referral"
+    },
+    "MIMIC-IV CDM": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Gastroenterology",
+        "Data Access": "Regulated",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "MIMIC-III Outcome.LoS": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Prognosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "MIMIC-III Outcome.Mortality": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Prognosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "MIMIC-IV BHC": {
+        "Language": "English",
+        "Task Type": "Summarization",
+        "Clinical Context": "Critical Care",
+        "Data Access": "Regulated",
+        "Applications": "Summarization",
+        "Clinical Stage": "Discharge and Administration"
+    },
+    "MIMIC-IV DiReCT.Dis": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Cardiology, Gastroenterology, Neurology, Pulmonology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    },
+    "MIMIC-IV DiReCT.PDD": {
+        "Language": "English",
+        "Task Type": "Text Classification",
+        "Clinical Context": "Cardiology, Gastroenterology, Neurology, Pulmonology, Endocrinology",
+        "Data Access": "Regulated",
+        "Applications": "Diagnosis",
+        "Clinical Stage": "Diagnosis and Prognosis"
+    }
+}