# Medical-Note-Deidentification/ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py
from typing import Iterable, Dict, Union
import stanza
class MimicStanzaSentencizer(object):
    """
    Split raw text into sentences (with character offsets) using a
    stanza tokenization pipeline built from a MIMIC-trained package.
    """

    def __init__(self, package: str, use_gpu: bool = True):
        """
        Initialize the stanza pipeline used for sentence splitting.

        Args:
            package (str): Name of the mimic model package to load
            use_gpu (bool): Whether stanza should run on the GPU
                (defaults to True, matching the previous hard-coded behavior)
        """
        # Only the 'tokenize' processor is needed for sentence segmentation.
        self._nlp = stanza.Pipeline('en', package=package, processors='tokenize', use_gpu=use_gpu)

    def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
        """
        Return an iterator over the sentences found in the text.

        Args:
            text (str): The text to split

        Yields:
            (Dict[str, Union[str, int]]): A dictionary per sentence with:
                'text'       - the sentence text
                'start'      - start character offset of the sentence in `text`
                'end'        - end character offset of the sentence in `text`
                'last_token' - the text of the sentence's final token
        """
        doc = self._nlp(text)
        for sentence in doc.sentences:
            # Offsets come from the first/last tokens of the stanza sentence.
            yield {'text': sentence.text,
                   'start': sentence.tokens[0].start_char,
                   'end': sentence.tokens[-1].end_char,
                   'last_token': sentence.tokens[-1].text}