Spaces:
Sleeping
Sleeping
File size: 4,547 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# Natural Language Toolkit: Switchboard Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag, str2tuple
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        """
        Build a turn from its tokens and identifying metadata.

        :param words: iterable of the tokens of the utterance; either
            plain strings or ``(word, tag)`` tuples.
        :param speaker: identifier of the speaker of this turn.
        :param id: utterance id.  It is passed through ``int()``, so a
            numeric string is accepted as well as an integer.
        :raise ValueError: if ``id`` cannot be converted by ``int()``.
        """
        super().__init__(words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        """
        Return ``<speaker.id: 'text'>``, where ``text`` is the tokens
        joined by single spaces (``word/tag`` for tagged tokens).
        """
        if not self:
            text = ""
        elif isinstance(self[0], tuple):
            # Only the first element is inspected to decide whether the
            # turn is tagged; all elements are then treated as pairs.
            text = " ".join(f"{word}/{tag}" for word, tag in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
class SwitchboardCorpusReader(CorpusReader):
    """
    Corpus reader that exposes the corpus' ``"tagged"`` file as words,
    turns or discourses, each either plain or part-of-speech tagged.

    Every non-blank line of the file is expected to look like
    ``<speaker>.<id>: <token> <token> ...`` where each token is
    ``word/tag`` (see ``_UTTERANCE_RE`` and ``_SEP``).  Each line is
    turned into a ``SwitchboardTurn``.
    """

    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.
    _FILES = ["tagged"]

    # Captures (speaker, utterance id, text) from one utterance line.
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")

    # Separator between a word and its tag inside a token.
    _SEP = "/"

    def __init__(self, root, tagset=None):
        """
        :param root: root directory of the corpus, forwarded to
            ``CorpusReader``.
        :param tagset: name of the tagset the corpus is annotated with;
            used as the source tagset when a caller requests mapping
            to a different tagset.
        """
        super().__init__(root, self._FILES)
        self._tagset = tagset

    def words(self):
        """Return a corpus view over all words, without tags."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """
        Return a corpus view over all ``(word, tag)`` pairs.

        :param tagset: if given and different from the reader's own
            tagset, each tag is converted with ``map_tag``.
        """

        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """Return a corpus view over all turns, as untagged ``SwitchboardTurn`` objects."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """
        Return a corpus view over all turns, as ``SwitchboardTurn``
        objects whose elements are ``(word, tag)`` pairs.

        :param tagset: if given and different from the reader's own
            tagset, each tag is converted with ``map_tag``.
        """

        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """Return a corpus view over discourses, each a list of untagged turns."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """
        Return a corpus view over discourses, each a list of tagged turns.

        :param tagset: if given and different from the reader's own
            tagset, each tag is converted with ``map_tag``.  Any falsy
            value (including the former default ``False``) disables
            the mapping.
        """

        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _read_discourse(self, stream, include_tag, tagset=None):
        """
        Read one blank-line-delimited block from ``stream`` and parse
        every non-blank line of it into a ``SwitchboardTurn``.

        :return: the list of turns found in the block.
        """
        return [
            self._parse_utterance(line, include_tag=include_tag, tagset=tagset)
            for block in read_blankline_block(stream)
            for line in block.split("\n")
            if line.strip()
        ]

    def _discourses_block_reader(self, stream):
        """Block reader yielding untagged discourses."""
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [self._read_discourse(stream, include_tag=False)]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        """Block reader yielding tagged discourses."""
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [self._read_discourse(stream, include_tag=True, tagset=tagset)]

    def _turns_block_reader(self, stream):
        """Block reader yielding the untagged turns of one discourse."""
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        """Block reader yielding the tagged turns of one discourse."""
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        """Block reader yielding the words of one discourse as a flat list."""
        # Flatten in a single pass; ``sum(turns, [])`` would copy the
        # accumulated list once per turn.
        turns = self._discourses_block_reader(stream)[0]
        return [word for turn in turns for word in turn]

    def _tagged_words_block_reader(self, stream, tagset=None):
        """Block reader yielding the ``(word, tag)`` pairs of one discourse as a flat list."""
        turns = self._tagged_discourses_block_reader(stream, tagset)[0]
        return [pair for turn in turns for pair in turn]

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """
        Parse one ``<speaker>.<id>: <text>`` line into a ``SwitchboardTurn``.

        :param utterance: the line to parse.
        :param include_tag: if false, the tags are dropped and the turn
            holds bare words; otherwise it holds ``(word, tag)`` pairs.
        :param tagset: target tagset.  Tags are mapped with ``map_tag``
            only when this is truthy and differs from the tagset given
            to the constructor.
        :raise ValueError: if the line does not match ``_UTTERANCE_RE``.
        """
        match = self._UTTERANCE_RE.match(utterance)
        if match is None:
            raise ValueError(f"Bad utterance {utterance!r}")
        speaker, utterance_id, text = match.groups()

        tokens = [str2tuple(token, self._SEP) for token in text.split()]
        if not include_tag:
            tokens = [word for (word, _tag) in tokens]
        elif tagset and tagset != self._tagset:
            tokens = [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in tokens
            ]
        return SwitchboardTurn(tokens, speaker, utterance_id)
|