from typing import Union, Optional, Sequence

from .sentencizers import SpacySentencizer, NoteSentencizer
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer


class PreprocessingLoader(object):
    """Factory that maps string identifiers to sentencizer/tokenizer objects."""

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
        """
        Initialize a sentencizer based on the requested name.

        We can either use a spacy model (en_core_sci_lg, en_core_sci_sm or
        en_core_web_sm) or consider the entire note as a single sentence.

        Args:
            sentencizer (str): Specify which sentencizer you want to use

        Returns:
            Union[SpacySentencizer, NoteSentencizer]: An object of the
            requested sentencizer class

        Raises:
            ValueError: If the sentencizer name is not recognized
        """
        # All spacy-backed sentencizers only differ in the model name, so the
        # name itself is forwarded as the spacy_model argument.
        if sentencizer in ('en_core_sci_lg', 'en_core_sci_sm', 'en_core_web_sm'):
            return SpacySentencizer(spacy_model=sentencizer)
        elif sentencizer == 'note':
            return NoteSentencizer()
        else:
            raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
        tokenizer: str,
        abbreviations: Optional[Sequence[str]] = None,
    ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
        """
        Initialize a tokenizer based on the requested name.

        We can either use the default scispacy (en_core_sci_lg or
        en_core_web_sm) tokenizer, the modified clinical tokenizer (with regex
        rule), or the corenlp tokenizer.

        Args:
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations
                for which tokens will not be split - used only by the
                'clinical' tokenizer

        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
            An object of the requested tokenizer class

        Raises:
            ValueError: If the tokenizer name is not recognized
        """
        # Plain spacy tokenizers only differ in the model name.
        if tokenizer in ('en_core_sci_lg', 'en_core_web_sm', 'en'):
            return SpacyTokenizer(spacy_model=tokenizer)
        elif tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        elif tokenizer == 'clinical':
            # Abbreviations - we won't split tokens that match these (e.g 18F-FDG).
            # The original code branched on `abbreviations is None` but both
            # branches were identical; a single call is equivalent.
            return ClinicalSpacyTokenizer(
                spacy_model='en_core_sci_sm',
                abbreviations=abbreviations,
            )
        else:
            raise ValueError('Invalid tokenizer - does not exist')