# Provenance (scraped from the file-viewer header; kept as comments so the file parses):
# author: Prajwal Kailas
# commit: "change scispacy versions" (5d427be)
# viewer chrome: raw / history blame / 3.07 kB
from typing import Union, Optional, Sequence
from .sentencizers import SpacySentencizer, NoteSentencizer
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer
class PreprocessingLoader(object):
    """Factory for building sentencizer and tokenizer objects from CLI-style names."""

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union['SpacySentencizer', 'NoteSentencizer']:
        """
        Initialize the requested sentencizer.
        We can either use a spacy/sci-spacy model (en_core_sci_lg, en_core_sci_sm
        or en_core_web_sm) or consider the entire note as a single sentence.
        Args:
            sentencizer (str): Specify which sentencizer you want to use
        Returns:
            Union[SpacySentencizer, NoteSentencizer]: An object of the requested
            sentencizer class
        Raises:
            ValueError: If the sentencizer name is not recognized
        """
        # The three spacy-backed options all pass the selector string straight
        # through as the model name, so they share one branch.
        if sentencizer in ('en_core_sci_lg', 'en_core_sci_sm', 'en_core_web_sm'):
            return SpacySentencizer(spacy_model=sentencizer)
        if sentencizer == 'note':
            return NoteSentencizer()
        raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
        tokenizer: str,
        abbreviations: Optional[Sequence[str]] = None,
    ) -> Union['SpacyTokenizer', 'ClinicalSpacyTokenizer', 'CoreNLPTokenizer']:
        """
        Initialize the tokenizer based on the CLI arguments
        We can either use the default scispacy/spacy models (en_core_sci_lg,
        en_core_web_sm, en), the modified scispacy (with regex rule) clinical
        tokenizer, or the corenlp tokenizer.
        Args:
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens
            will not be split - used only by the custom clinical tokenizer
        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class
        Raises:
            ValueError: If the tokenizer name is not recognized
        """
        # Plain spacy-backed tokenizers: selector string doubles as model name.
        if tokenizer in ('en_core_sci_lg', 'en_core_web_sm', 'en'):
            return SpacyTokenizer(spacy_model=tokenizer)
        if tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        if tokenizer == 'clinical':
            # Abbreviations - tokens matching these (e.g. 18F-FDG) are not split.
            # The original code branched on `abbreviations is None` but both
            # branches were identical, so the value is passed through directly.
            return ClinicalSpacyTokenizer(spacy_model='en_core_sci_sm', abbreviations=abbreviations)
        raise ValueError('Invalid tokenizer - does not exist')