import json
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from typing import Iterable, Dict, List, Union, Optional, Sequence

from .dataset_builder import Dataset, SentenceDataset
from .preprocessing import PreprocessingLoader

random.seed(41)


class DatasetCreator(object):
    """
    Build a NER token classification dataset.
    For training, the dataset is built from the annotated spans (e.g. from Prodigy);
    for prediction, every token is assigned a default label. This is done so that the
    same script can build a dataset to train a model and a dataset to obtain
    predictions from a model.
    The dataset is built on a sentence level, i.e. each note is split into sentences
    and the de-id task is run on a sentence level; predictions are also made on a
    sentence level.
    The dataset looks something like:
    Tokens: [[tok1, tok2, ... tok-n], [tok ...], ..., [tok ...]]
    Labels: [[lab1, lab2, ... lab-n], [lab ...], ..., [lab ...]]
    where each inner list represents a sentence - the tokens in the sentence and the
    respective label for each token. The labels depend on the notation.
    Example:
    Note: Bruce Wayne Jr is a 60yo man. He lives in Gotham
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, .], [He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O], [O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O], [O, O, O, U-LOC]]
    We can also create sentences that use the previous/next chunks as context - in that
    case the dataset would look something like this (assume we limit the size of the
    chunks to 3 tokens):
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, ., He, lives, in], [yo, man, ., He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, U-LOC]]
    NA indicates that the token is present only as context.
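
    Usage (a minimal sketch; the sentencizer/tokenizer names are placeholders - the
    valid options are whatever PreprocessingLoader supports):
        creator = DatasetCreator(sentencizer='<sentencizer>', tokenizer='<tokenizer>')
        for ner_sentence in creator.create(input_file='notes.jsonl', mode='train'):
            # each yielded dict carries 'tokens', 'labels', 'current_sent_info'
            # and 'note_sent_info'
            ...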
    """

    def __init__(
            self,
            sentencizer: str,
            tokenizer: str,
            abbreviations: Optional[Sequence[str]] = None,
            max_tokens: int = 128,
            max_prev_sentence_token: int = 32,
            max_next_sentence_token: int = 32,
            default_chunk_size: int = 32,
            ignore_label: str = 'NA'
    ) -> None:
        """
        Initialize the sentencizer and tokenizer
        Args:
            sentencizer (str): Specify which sentencizer you want to use
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split
                                                     - works only with the custom clinical tokenizer.
            max_tokens (int): The maximum number of tokens allowed in a sentence/training example,
                              truncate if it exceeds.
            max_prev_sentence_token (int): The maximum number of previous chunk tokens allowed in a
                                           sentence/training example.
            max_next_sentence_token (int): The maximum number of next chunk tokens allowed in a
                                           sentence/training example.
            default_chunk_size (int): The default number of tokens in the previous and next chunks.
            ignore_label (str): The label assigned to the previous and next chunks to distinguish
                                them from the current sentence.
        """
        self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer=sentencizer)
        self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer=tokenizer, abbreviations=abbreviations)
        # Initialize the object that will be used to get the tokens and the sentences
        self._dataset = Dataset(sentencizer=self._sentencizer, tokenizer=self._tokenizer)
        # Initialize the object that will take all the sentences in the note and return
        # a dataset where each row represents a sentence in the note. The sentence in each
        # row will also contain a previous chunk and next chunk (tokens) that will act as context
        # when training the model.
        # [ps1, ps2, ... ps-i], [cs1, cs2, ... cs-j], [ns1, ns2, ... ns-k] - as you can see the current sentence
        # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
        # provide context to the current sentence
        self._sentence_dataset = SentenceDataset(
            max_tokens=max_tokens,
            max_prev_sentence_token=max_prev_sentence_token,
            max_next_sentence_token=max_next_sentence_token,
            default_chunk_size=default_chunk_size,
            ignore_label=ignore_label
        )

    def create(
            self,
            input_file: str,
            mode: str = 'predict',
            notation: str = 'BIO',
            token_text_key: str = 'text',
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            label_key: str = 'labels',
            span_text_key: str = 'spans'
    ) -> Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]:
        """
        This function is used to get the sentences that will be part of the NER dataset.
        We check whether the note belongs to the desired dataset split. If it does,
        we fix any spans that can cause token-span alignment errors. Then we extract
        all the sentences in the note and the tokens in each sentence. Finally, we
        add some context tokens to the sentence if required. This function returns
        an iterable that iterates over each of the processed sentences.
        Args:
            input_file (str): Input jsonl file. Make sure the spans are in ascending order (based on start position)
            mode (str): Dataset being built for train or predict.
            notation (str): The NER labelling notation
            token_text_key (str): The key where the note text and token text are present in the json object
            metadata_key (str): The key where the note metadata is present in the json object
            note_id_key (str): The key where the note id is present in the json object
            label_key (str): The key where the token label will be stored in the json object
            span_text_key (str): The key where the note spans are present in the json object
        Returns:
            (Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]): Iterate through the processed
                                                                                       sentences/training examples
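        Example:
            A single input line (train mode) might look like the following - a sketch
            assuming Prodigy-style spans; the exact span fields depend on the
            annotation tool used:
            {"text": "Bruce Wayne Jr is a 60yo man. He lives in Gotham",
             "meta": {"note_id": "note_1"},
             "spans": [{"start": 0, "end": 14, "label": "Name"}]}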
        """
        # Go through the notes
        with open(input_file, 'r') as input_data:
            for line in input_data:
                note = json.loads(line)
                note_text = note[token_text_key]
                note_id = note[metadata_key][note_id_key]
                if mode == 'train':
                    note_spans = note[span_text_key]
                # No spans in predict mode
                elif mode == 'predict':
                    note_spans = None
                else:
                    raise ValueError("Invalid mode - can only be train/predict")
                # Store the list of tokens in each sentence.
                # Eventually this list will contain all the tokens in the note (split on the sentence level).
                # We store the start and end positions of each sentence in the note, which can
                # be used later to reconstruct the note from the sentences.
                # We also store the note_id for each sentence so that every sentence can be
                # mapped back to the note it belongs to.
                sent_tokens = list(self._dataset.get_tokens(
                    text=note_text,
                    spans=note_spans,
                    notation=notation
                ))
                # The following loop goes through each sentence in the note and returns
                # the current sentence plus the previous and next chunks that will be used as context.
                # The chunks get a default label (e.g. NA) to distinguish them from the current
                # sentence, so that they can be ignored when calculating the loss and updating
                # weights during training
                for ner_sent_index, ner_sentence in self._sentence_dataset.get_sentences(
                        sent_tokens=sent_tokens,
                        token_text_key=token_text_key,
                        label_key=label_key
                ):
                    # Return the processed sentence. This sentence will then be used
                    # by the model
                    current_sent_info = ner_sentence['current_sent_info']
                    note_sent_info_store = {'start': current_sent_info[0]['start'],
                                            'end': current_sent_info[-1]['end'], 'note_id': note_id}
                    ner_sentence['note_sent_info'] = note_sent_info_store
                    yield ner_sentence
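
# Example invocation (a sketch - the file names are hypothetical and the module
# path depends on how this package is installed):
#   python -m <package>.dataset_creator --input_file notes.jsonl --mode train \
#       --sentencizer <sentencizer> --tokenizer <tokenizer> \
#       --output_file ner_train.jsonl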


def main():
    cli_parser = ArgumentParser(
        description='configuration arguments provided at run time from the CLI',
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes. Spans need to be sorted in ascending order (based on start '
             'position)'
    )
    cli_parser.add_argument(
        '--notation',
        type=str,
        default='BIO',
        help='the notation we will be using for the label scheme'
    )
    cli_parser.add_argument(
        '--max_tokens',
        type=int,
        default=128,
        help='The max tokens that a given sentence (training/prediction example) in the note can have'
    )
    cli_parser.add_argument(
        '--default_chunk_size',
        type=int,
        default=32,
        help='the default chunk size (in tokens) that the previous and next chunks of a given sentence '
             '(training/prediction example) in the note can have'
    )
    cli_parser.add_argument(
        '--max_prev_sentence_token',
        type=int,
        default=32,
        help='the max chunk size (in tokens) that the previous chunks of a given sentence (training/prediction '
             'example) in the note can have'
    )
    cli_parser.add_argument(
        '--max_next_sentence_token',
        type=int,
        default=32,
        help='the max chunk size (in tokens) that the next chunks of a given sentence (training/prediction example) '
             'in the note can have'
    )
    cli_parser.add_argument(
        '--mode',
        type=str,
        choices=['train', 'predict'],
        required=True,
        help='whether we are building the dataset for training or prediction'
    )
    cli_parser.add_argument(
        '--sentencizer',
        type=str,
        required=True,
        help='the sentencizer to use for splitting notes into sentences'
    )
    cli_parser.add_argument(
        '--tokenizer',
        type=str,
        required=True,
        help='the tokenizer to use for splitting text into tokens'
    )
    cli_parser.add_argument(
        '--abbreviations',
        type=str,
        default=None,
        help='file that will be used by clinical tokenizer to handle abbreviations'
    )
    cli_parser.add_argument(
        '--ignore_label',
        type=str,
        default='NA',
        help='the label assigned to the context (previous/next chunk) tokens so they can be ignored during training'
    )
    cli_parser.add_argument(
        '--token_text_key',
        type=str,
        default='text',
        help='the key where the note text (and token text) is present in the json object'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the metadata object'
    )
    cli_parser.add_argument(
        '--label_key',
        type=str,
        default='labels',
        help='the key under which the label for each token will be stored in the json object'
    )
    cli_parser.add_argument(
        '--span_text_key',
        type=str,
        default='spans',
        help='the key where the annotated spans of the note are present in the json object'
    )
    cli_parser.add_argument(
        '--format',
        type=str,
        default='jsonl',
        choices=['jsonl', 'conll'],
        help='format to store the dataset in: jsonl or conll'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='the file where the NER dataset will be stored'
    )
    args = cli_parser.parse_args()
    dataset_creator = DatasetCreator(
        sentencizer=args.sentencizer,
        tokenizer=args.tokenizer,
        abbreviations=args.abbreviations,
        max_tokens=args.max_tokens,
        max_prev_sentence_token=args.max_prev_sentence_token,
        max_next_sentence_token=args.max_next_sentence_token,
        default_chunk_size=args.default_chunk_size,
        ignore_label=args.ignore_label)
    ner_notes = dataset_creator.create(
        input_file=args.input_file,
        mode=args.mode,
        notation=args.notation,
        token_text_key=args.token_text_key,
        metadata_key=args.metadata_key,
        note_id_key=args.note_id_key,
        label_key=args.label_key,
        span_text_key=args.span_text_key
    )
    # Store the NER dataset in the desired format
    if args.format == 'jsonl':
        # Write the dataset to the output file
        with open(args.output_file, 'w') as file:
            for ner_sentence in ner_notes:
                file.write(json.dumps(ner_sentence) + '\n')
    elif args.format == 'conll':
        with open(args.output_file, 'w') as file:
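            # Each line is written as "<token> <label> <sent_info JSON>", with a blank
            # line between sentences, e.g. (hypothetical values):
            #   Bruce B-Name {"start": 0, "end": 5, "note_id": "note_1"}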
            for ner_sentence in ner_notes:
                tokens = ner_sentence['tokens']
                labels = ner_sentence['labels']
                current_sent_info = ner_sentence['current_sent_info']
                note_id = ner_sentence['note_sent_info']['note_id']
                if len(tokens) != len(labels) or len(labels) != len(current_sent_info):
                    raise ValueError('Token, label and sentence info lists must have the same length')
                for token, label, sent_info in zip(tokens, labels, current_sent_info):
                    sent_info['note_id'] = note_id
                    data = token + ' ' + label + ' ' + json.dumps(sent_info) + '\n'
                    file.write(data)
                file.write('\n')


if __name__ == '__main__':
    main()