|
from typing import Union, Optional, Sequence |
|
|
|
from .sentencizers import SpacySentencizer, NoteSentencizer |
|
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer |
|
|
|
|
|
class PreprocessingLoader(object):
    """Factory that maps string identifiers to sentencizer and tokenizer objects.

    Centralizes the construction of the preprocessing components so callers
    (e.g. CLI entry points) only need to pass a name string.
    """

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
        """
        Initialize the requested sentencizer.

        We can either use a spacy/sci-spacy model (en_core_sci_lg,
        en_core_sci_sm or en_core_web_sm) or consider the entire note as a
        single sentence.

        Args:
            sentencizer (str): Specify which sentencizer you want to use

        Returns:
            Union[SpacySentencizer, NoteSentencizer]: An object of the requested
            sentencizer class

        Raises:
            ValueError: If the requested sentencizer is not supported
        """
        # The branch value and the model name are identical, so pass the
        # argument straight through instead of repeating one elif per model.
        if sentencizer in ('en_core_sci_lg', 'en_core_sci_sm', 'en_core_web_sm'):
            return SpacySentencizer(spacy_model=sentencizer)
        elif sentencizer == 'note':
            return NoteSentencizer()
        else:
            raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
        tokenizer: str,
        abbreviations: Optional[Sequence[str]] = None,
    ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
        """
        Initialize the tokenizer based on the CLI arguments

        We can either use the default scipacy (en_core_sci_lg or en_core_web_sm)
        or the modified scipacy (with regex rule) tokenizer.
        It also supports the corenlp tokenizer

        Args:
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens
                                                     will not be split - used only with the
                                                     custom clinical tokenizer

        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class

        Raises:
            ValueError: If the requested tokenizer is not supported
        """
        # Model-name tokenizers: branch value equals the spacy model name.
        if tokenizer in ('en_core_sci_lg', 'en_core_web_sm', 'en'):
            return SpacyTokenizer(spacy_model=tokenizer)
        elif tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        elif tokenizer == 'clinical':
            # The original code branched on `abbreviations is None` but both
            # branches made the identical call, so the check was dead code;
            # None is simply forwarded and handled by ClinicalSpacyTokenizer.
            return ClinicalSpacyTokenizer(
                spacy_model='en_core_sci_sm',
                abbreviations=abbreviations,
            )
        else:
            raise ValueError('Invalid tokenizer - does not exist')
|
|