File size: 4,699 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Masato Hagiwara <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import sys

from nltk.corpus.reader import util
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *


class ChasenCorpusReader(CorpusReader):
    """Corpus reader for ChaSen-format (tab-separated morphological
    analysis) corpus files.

    Each accessor builds one ``ChasenCorpusView`` per file and
    concatenates them; the six public accessors differ only in the
    tagged / group-by-sentence / group-by-paragraph flags, so the
    shared construction lives in ``_view_concat``.
    """

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        # Optional predicate ``sent_splitter(word)`` marking extra
        # sentence boundaries beyond the EOS lines in the data.
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _view_concat(self, fileids, tagged, group_by_sent, group_by_para):
        """Return the concatenation of one view per requested file."""
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Return a flat list of words."""
        return self._view_concat(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return a flat list of ``(word, tag)`` tuples."""
        return self._view_concat(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return a list of sentences, each a list of words."""
        return self._view_concat(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return a list of sentences of ``(word, tag)`` tuples."""
        return self._view_concat(fileids, True, True, False)

    def paras(self, fileids=None):
        """Return a list of paragraphs, each a list of sentences."""
        return self._view_concat(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """Return a list of paragraphs of tagged sentences."""
        return self._view_concat(fileids, True, True, True)


class ChasenCorpusView(StreamBackedCorpusView):
    """A specialized corpus view for ChasenReader. Similar to
    ``TaggedCorpusView``, but this'll use fixed sets of word and
    sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # View-shaping flags: keep tags, nest by sentence, nest by paragraph.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional extra sentence-boundary predicate over (word, tag) pairs.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            sent = []
            for line in para_str.splitlines():
                at_eos = line.strip() == "EOS"
                cells = line.split("\t")
                # Surface form plus the remaining tab-joined annotation.
                token = (cells[0], "\t".join(cells[1:]))
                if not at_eos:
                    sent.append(token)
                # Flush at an EOS marker or a splitter-detected boundary.
                if at_eos or (self._sent_splitter and self._sent_splitter(token)):
                    self._flush_sent(para, sent)
                    sent = []
            # Flush any trailing tokens not followed by a boundary.
            if sent:
                self._flush_sent(para, sent)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block

    def _flush_sent(self, para, sent):
        """Strip tags if requested, then nest or splice *sent* into *para*."""
        if not self._tagged:
            sent = [word for (word, _tag) in sent]
        if self._group_by_sent:
            para.append(sent)
        else:
            para.extend(sent)


def demo():
    """Print a sample of words and tagged sentences from the JEITA corpus.

    Requires the ``jeita`` corpus to be installed via the NLTK data
    mechanism. (The previously present bare ``import nltk`` was unused —
    ``from nltk.corpus.util import ...`` already loads the package.)
    """
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    # For each sentence, show word/POS pairs; the POS is the third
    # tab-separated field of the ChaSen annotation string.
    print(
        "\nEOS\n".join(
            "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
            for sent in jeita.tagged_sents()[2170:2173]
        )
    )


def test():
    """Smoke test: the tag half of a tagged word must be a string.

    Requires the ``jeita`` corpus to be installed via the NLTK data
    mechanism.
    """
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader(
        "jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8"
    )
    first_word = reader.tagged_words()[0]
    assert isinstance(first_word[1], str)


# Run the demo and smoke test when executed as a script.
if __name__ == "__main__":
    demo()
    test()