|
from typing import Union, Optional, Sequence |
|
|
|
from .sentencizers import SpacySentencizer, NoteSentencizer |
|
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer |
|
|
|
|
|
class PreprocessingLoader(object):
    """Factory that maps string identifiers to sentencizer and tokenizer objects.

    Centralizes the construction of the preprocessing components so callers
    (e.g. CLI entry points) only need to pass a name string.
    """

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
        """
        Initialize the requested sentencizer.

        We can either use a spacy/sci-spacy model (en_core_sci_lg,
        en_core_sci_sm or en_core_web_sm) or consider the entire note as a
        single sentence.

        Args:
            sentencizer (str): Specify which sentencizer you want to use

        Returns:
            Union[SpacySentencizer, NoteSentencizer]: An object of the requested
            sentencizer class

        Raises:
            ValueError: If the requested sentencizer is not supported
        """
        # The branch value and the model name are identical, so pass the
        # argument straight through instead of repeating one elif per model.
        if sentencizer in ('en_core_sci_lg', 'en_core_sci_sm', 'en_core_web_sm'):
            return SpacySentencizer(spacy_model=sentencizer)
        elif sentencizer == 'note':
            return NoteSentencizer()
        else:
            raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
        tokenizer: str,
        abbreviations: Optional[Sequence[str]] = None,
    ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
        """
        Initialize the tokenizer based on the CLI arguments

        We can either use the default scipacy (en_core_sci_lg or en_core_web_sm)
        or the modified scipacy (with regex rule) tokenizer.
        It also supports the corenlp tokenizer

        Args:
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens
                                                     will not be split - used only with the
                                                     custom clinical tokenizer

        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class

        Raises:
            ValueError: If the requested tokenizer is not supported
        """
        # Model-name tokenizers: branch value equals the spacy model name.
        if tokenizer in ('en_core_sci_lg', 'en_core_web_sm', 'en'):
            return SpacyTokenizer(spacy_model=tokenizer)
        elif tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        elif tokenizer == 'clinical':
            # The original code branched on `abbreviations is None` but both
            # branches made the identical call, so the check was dead code;
            # None is simply forwarded and handled by ClinicalSpacyTokenizer.
            return ClinicalSpacyTokenizer(
                spacy_model='en_core_sci_sm',
                abbreviations=abbreviations,
            )
        else:
            raise ValueError('Invalid tokenizer - does not exist')
|
|