# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# URL: <https://www.nltk.org/>
# Author: Steven Bird <[email protected]>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
concat,
read_alignedsent_block,
)
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.translate import AlignedSent, Alignment
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, r'.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: String that separates words from their tags.
        :param word_tokenizer: Tokenizer that splits a sentence into words.
        :param sent_tokenizer: Tokenizer that splits raw text into sentences;
            by default one sentence per line.
        :param alignedsent_block_reader: Function that reads one aligned-sentence
            block at a time from a stream.
        :param encoding: The encoding used by the corpus files.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per requested fileid and
        concatenate them.  ``aligned`` and ``group_by_sent`` select the
        shape of the items the view yields (see the three public methods).
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, aligned=True, group_by_sent=True)
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        # Flags selecting the shape of each yielded item: a full
        # AlignedSent, a single sentence (list of words), or a flat
        # word list.
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one aligned-sentence block and return it in the shape
        selected by the ``aligned``/``group_by_sent`` flags."""
        tokenized = []
        for raw_block in self._alignedsent_block_reader(stream):
            for sent_str in self._sent_tokenizer.tokenize(raw_block):
                tokenized.append(self._word_tokenizer.tokenize(sent_str))
        if self._aligned:
            # kludge; we shouldn't have tokenized the alignment string
            tokenized[2] = Alignment.fromstring(" ".join(tokenized[2]))
            return [AlignedSent(*tokenized)]
        if self._group_by_sent:
            return [tokenized[0]]
        return tokenized[0]