File size: 5,005 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# URL: <https://www.nltk.org/>
# Author: Steven Bird <[email protected]>
# For license information, see LICENSE.TXT

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
    StreamBackedCorpusView,
    concat,
    read_alignedsent_block,
)
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.translate import AlignedSent, Alignment


class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored on the reader as ``_sep``; it is not otherwise
            used by this class.  Note that it is the third positional
            parameter, so it must not be confused with a file extension.
        :param word_tokenizer: Tokenizer (an object with a ``tokenize``
            method) used to split each sentence line into words.
        :param sent_tokenizer: Tokenizer used to split one aligned-sentence
            block into its lines; by default it splits on newlines.
        :param alignedsent_block_reader: Callable that takes a stream and
            returns the aligned-sentence strings of the next block.
        :param encoding: Encoding forwarded to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per selected file and
        concatenate them.

        :param fileids: The file(s) to read, as accepted by ``abspaths``.
        :param aligned: Passed to the view as its ``aligned`` flag.
        :param group_by_sent: Passed to the view as its ``group_by_sent``
            flag.
        :return: the concatenation of the per-file corpus views.
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                # ``True`` asks abspaths() for (path, encoding) pairs.
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, False, False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, False, True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, True, True)


class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        """
        :param corpus_file: The file this view reads from; forwarded to
            ``StreamBackedCorpusView``.
        :param encoding: Encoding forwarded to ``StreamBackedCorpusView``.
        :param aligned: If true, each block is returned as a single
            ``AlignedSent``; takes precedence over ``group_by_sent``.
        :param group_by_sent: If true (and ``aligned`` is false), each
            block yields its first sentence as one list of words;
            otherwise the words of that sentence are yielded flat.
        :param word_tokenizer: Object whose ``tokenize`` method splits a
            sentence line into words.
        :param sent_tokenizer: Object whose ``tokenize`` method splits an
            aligned-sentence string into its lines.
        :param alignedsent_block_reader: Callable taking a stream and
            returning the aligned-sentence strings of the next block.
        """
        # Set our own attributes before the base initializer runs, as the
        # original code did.
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read the next aligned-sentence block from ``stream``.

        :param stream: The stream handed to the block reader.
        :return: ``[AlignedSent]`` when ``aligned`` is set; otherwise the
            first line of the block, either wrapped in a list
            (``group_by_sent``) or as a flat list of words.  An empty
            list is returned when the block reader yields no lines.
        :raise IndexError: in aligned mode, if the block has fewer than
            three lines (there is no alignment line to parse).
        """
        sent_strs = [
            sent_str
            for alignedsent_str in self._alignedsent_block_reader(stream)
            for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
        ]
        if not sent_strs:
            # Nothing was read (e.g. only trailing filler before EOF):
            # report "no tokens" instead of failing on sent_strs[0].
            return []

        tokenize = self._word_tokenizer.tokenize

        if self._aligned:
            fields = [tokenize(sent_str) for sent_str in sent_strs]
            # Parse the alignment from the raw third line rather than from
            # re-joined word tokens, so a word tokenizer that does not
            # split purely on whitespace cannot mangle the "i-j" pairs.
            fields[2] = Alignment.fromstring(sent_strs[2])
            return [AlignedSent(*fields)]

        # Only the first line of the block is exposed in the non-aligned
        # modes, so that is the only line that needs tokenizing.
        first_sent = tokenize(sent_strs[0])
        if self._group_by_sent:
            return [first_sent]
        return first_sent