File size: 4,608 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ewan Klein <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A reader for corpora that consist of Tweets. It is assumed that the Tweets

have been serialised into line-delimited JSON.

"""

import json
import os

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
from nltk.tokenize import TweetTokenizer


class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of
    line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    # The corpus view class used by this reader.  StreamBackedCorpusView
    # pages through files lazily via _read_tweets() instead of loading a
    # whole file into memory.
    CorpusView = StreamBackedCorpusView

    def __init__(self, root, fileids=None, word_tokenizer=None, encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.  Defaults to a
            new ``TweetTokenizer`` instance.
        :param encoding: The encoding of the corpus files (default ``utf8``).
        :raises ValueError: If any non-zipped corpus file is empty.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  Files
        # inside zip archives are exempt: os.path.getsize cannot stat them.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        # Build the default tokenizer lazily rather than as a default
        # argument value, so no instance is created at import time and
        # distinct readers never silently share one tokenizer object.
        if word_tokenizer is None:
            word_tokenizer = TweetTokenizer()
        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        # One lazy view per file; concat presents them as a single list.
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        tweets = []
        for jsono in self.docs(fileids):
            # Some records (e.g. delete notices) carry no "text" key;
            # skip them rather than fail.
            text = jsono.get("text")
            if text is None:
                continue
            if isinstance(text, bytes):
                # Legacy safeguard: json.loads yields str on Python 3, so
                # this branch is not expected to fire in practice.
                text = text.decode(self.encoding)
            tweets.append(text)
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            as a list of words, screenanames, hashtags, URLs and punctuation symbols.

        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def _read_tweets(self, stream):
        """
        Read a block of up to 10 Tweets from ``stream``.

        Assumes that each line in ``stream`` is a JSON-serialised object.
        The fixed block size lets ``StreamBackedCorpusView`` consume large
        files incrementally.
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweet = json.loads(line)
            tweets.append(tweet)
        return tweets