Prajwal Kailas
dependency to run
45c1511
raw
history blame
1.32 kB
from typing import Iterable, Dict, Union
import spacy
class SpacySentencizer:
    """
    Split text into sentences using a spacy model.

    Every sentence is reported together with its start and end
    positions in the text it was taken from.
    """

    def __init__(self, spacy_model: str):
        """
        Load the spacy model that is used to split text into sentences.

        Args:
            spacy_model (str): Name of the spacy model (handed to spacy.load)
        """
        self._nlp = spacy.load(spacy_model)

    def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int, None]]]:
        """
        Iterate through the sentences in the text.

        This is a generator: the spacy model is only run on the text
        once iteration starts, not when the method is called.

        Args:
            text (str): The text
        Returns:
            (Iterable[Dict[str, Union[str, int, None]]]): Yields one dictionary per sentence with the keys
                                                          'text': the text of the sentence
                                                          'start': the start position of the sentence in the entire text
                                                          'end': the end position of the sentence in the entire text
                                                          'last_token': always None in this sentencizer
        """
        document = self._nlp(text)
        # NOTE(review): spacy documents that Doc.sents raises when the loaded
        # pipeline sets no sentence boundaries - confirm the chosen model has
        # a parser / senter / sentencizer component.
        for sentence in document.sents:
            yield {
                'text': sentence.text,
                'start': sentence.start_char,
                'end': sentence.end_char,
                # Never populated here; the key is emitted with a None value.
                'last_token': None,
            }