Spaces:
Sleeping
Sleeping
File size: 4,608 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ewan Klein <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""
import json
import os
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
from nltk.tokenize import TweetTokenizer
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids='.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(self, root, fileids=None, word_tokenizer=None, encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.  If ``None``
            (the default), a fresh :class:`TweetTokenizer` is created for this
            reader.  (A ``TweetTokenizer()`` default *argument* would be built
            once at class-definition time and shared by every reader instance.)
        :param encoding: The encoding used by the corpus files (default utf8).
        :raises ValueError: If any non-zipped corpus file is empty.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  Files inside
        # zip archives are skipped: os.path.getsize cannot be applied to them.
        for path in self.abspaths(self._fileids):
            if not isinstance(path, ZipFilePathPointer) and os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        self._word_tokenizer = (
            word_tokenizer if word_tokenizer is not None else TweetTokenizer()
        )

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        Tweets lacking a ``"text"`` key (e.g. deletion notices) are skipped.

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono["text"]
                if isinstance(text, bytes):
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            as a list of words, screenanames, hashtags, URLs and punctuation symbols.
        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.

        Reads at most 10 Tweets per call; the ``StreamBackedCorpusView``
        calls this repeatedly to page through the file lazily.

        :return: up to 10 Tweets deserialised from the stream.
        :rtype: list(dict)
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweets.append(json.loads(line))
        return tweets
|