# Medical-Note-Deidentification/ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py
from typing import Iterable, Dict, Union
import stanza
class MimicStanzaSentencizer(object):
    """
    Split raw text into sentences (with character offsets) using a
    stanza tokenization pipeline built from a MIMIC-trained package.
    """

    def __init__(self, package: str, use_gpu: bool = True):
        """
        Initialize the stanza pipeline used for sentence splitting.

        Args:
            package (str): Name of the mimic model package to load
            use_gpu (bool): Whether stanza should run on the GPU
                (defaults to True, matching the previous hard-coded behavior)
        """
        # Only the 'tokenize' processor is needed for sentence segmentation.
        self._nlp = stanza.Pipeline('en', package=package, processors='tokenize', use_gpu=use_gpu)

    def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
        """
        Return an iterator over the sentences found in the text.

        Args:
            text (str): The text to split

        Yields:
            (Dict[str, Union[str, int]]): A dictionary per sentence with:
                'text'       - the sentence text
                'start'      - start character offset of the sentence in `text`
                'end'        - end character offset of the sentence in `text`
                'last_token' - the text of the sentence's final token
        """
        doc = self._nlp(text)
        for sentence in doc.sentences:
            # Offsets come from the first/last tokens of the stanza sentence.
            yield {'text': sentence.text,
                   'start': sentence.tokens[0].start_char,
                   'end': sentence.tokens[-1].end_char,
                   'last_token': sentence.tokens[-1].text}