File size: 3,073 Bytes
45c1511 5d427be 45c1511 5d427be 45c1511 5d427be 45c1511 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
from typing import Union, Optional, Sequence
from .sentencizers import SpacySentencizer, NoteSentencizer
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer
class PreprocessingLoader(object):
    """Factory helpers that map CLI option strings to sentencizer/tokenizer objects."""

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
        """
        Initialize the sentencizer based on the requested name.
        We can either use a spacy model (en_core_sci_lg, en_core_sci_sm or
        en_core_web_sm) or consider the entire note as a single sentence.
        Args:
            sentencizer (str): Which sentencizer to use - one of
                'en_core_sci_lg', 'en_core_sci_sm', 'en_core_web_sm', 'note'
        Returns:
            Union[SpacySentencizer, NoteSentencizer]: An object of the requested
            sentencizer class
        Raises:
            ValueError: If the sentencizer name is not recognized
        """
        # All spacy-backed options share one constructor - dispatch on the model name.
        if sentencizer in ('en_core_sci_lg', 'en_core_sci_sm', 'en_core_web_sm'):
            return SpacySentencizer(spacy_model=sentencizer)
        elif sentencizer == 'note':
            return NoteSentencizer()
        else:
            raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
        tokenizer: str,
        abbreviations: Optional[Sequence[str]] = None,
    ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
        """
        Initialize the tokenizer based on the CLI arguments.
        We can either use a default spacy tokenizer (en_core_sci_lg, en_core_web_sm, en),
        the corenlp tokenizer, or the modified clinical spacy tokenizer (with regex rule).
        Args:
            tokenizer (str): Which tokenizer to use - one of
                'en_core_sci_lg', 'en_core_web_sm', 'en', 'corenlp', 'clinical'
            abbreviations (Optional[Sequence[str]]): Abbreviations whose tokens will not
                be split (e.g. 18F-FDG) - only used by the 'clinical' tokenizer
        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object
            of the requested tokenizer class
        Raises:
            ValueError: If the tokenizer name is not recognized
        """
        # Plain spacy tokenizers share one constructor - dispatch on the model name.
        if tokenizer in ('en_core_sci_lg', 'en_core_web_sm', 'en'):
            return SpacyTokenizer(spacy_model=tokenizer)
        elif tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        elif tokenizer == 'clinical':
            # The original if/else here had byte-identical branches for
            # abbreviations is None vs not None; a single call is equivalent.
            return ClinicalSpacyTokenizer(spacy_model='en_core_sci_sm', abbreviations=abbreviations)
        else:
            raise ValueError('Invalid tokenizer - does not exist')
|