# Natural Language Toolkit: British National Corpus (BNC) Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
class BNCCorpusReader(XMLCorpusReader):
r"""Corpus reader for the XML version of the British National Corpus.
For access to the complete XML data structure, use the ``xml()``
method. For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
You can obtain the full version of the BNC corpus at
https://www.ota.ox.ac.uk/desc/2554
    If you extracted the archive to a directory called ``BNC``, then you can
instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
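
    A minimal usage sketch (illustrative only; it assumes the corpus really
    is extracted under ``BNC/Texts/`` as above)::

        bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
        bnc.words()                 # plain word tokens
        bnc.tagged_words(c5=True)   # (word, C5 tag) tuples
        bnc.sents()                 # sentences as lists of tokens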
"""
def __init__(self, root, fileids, lazy=True):
XMLCorpusReader.__init__(self, root, fileids)
self._lazy = lazy
def words(self, fileids=None, strip_space=True, stem=False):
"""
:return: the given file(s) as a list of words
and punctuation symbols.
:rtype: list(str)
:param strip_space: If true, then strip trailing spaces from
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
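
        Illustrative call, assuming a reader ``bnc`` built as in the class
        docstring::

            bnc.words(stem=True)           # head-word (lemma) forms
            bnc.words(strip_space=False)   # keep trailing spaces on tokens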
"""
return self._views(fileids, False, None, strip_space, stem)
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
``(word,tag)``.
:rtype: list(tuple(str,str))
:param c5: If true, then the tags used will be the more detailed
c5 tags. Otherwise, the simplified tags will be used.
:param strip_space: If true, then strip trailing spaces from
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
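
        Illustrative call, assuming a reader ``bnc`` built as in the class
        docstring::

            bnc.tagged_words()          # tuples like (word, simplified_tag)
            bnc.tagged_words(c5=True)   # tuples like (word, c5_tag)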
"""
tag = "c5" if c5 else "pos"
return self._views(fileids, False, tag, strip_space, stem)
def sents(self, fileids=None, strip_space=True, stem=False):
"""
:return: the given file(s) as a list of
sentences or utterances, each encoded as a list of word
strings.
:rtype: list(list(str))
:param strip_space: If true, then strip trailing spaces from
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
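
        Illustrative call, assuming a reader ``bnc`` built as in the class
        docstring::

            for sentence in bnc.sents():
                print(' '.join(sentence))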
"""
return self._views(fileids, True, None, strip_space, stem)
def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str)))
:param c5: If true, then the tags used will be the more detailed
c5 tags. Otherwise, the simplified tags will be used.
:param strip_space: If true, then strip trailing spaces from
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
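
        Illustrative call, assuming a reader ``bnc`` built as in the class
        docstring::

            for sentence in bnc.tagged_sents(c5=True):
                for word, tag in sentence:
                    print(word, tag)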
"""
tag = "c5" if c5 else "pos"
return self._views(
fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
)
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
f = BNCWordView if self._lazy else self._words
return concat(
[
f(fileid, sent, tag, strip_space, stem)
for fileid in self.abspaths(fileids)
]
)
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
"""
Helper used to implement the view methods -- returns a list of
words or a list of sentences, optionally tagged.
:param fileid: The name of the underlying file.
:param bracket_sent: If true, include sentence bracketing.
:param tag: The name of the tagset to use, or None for no tags.
:param strip_space: If true, strip spaces from word tokens.
:param stem: If true, then substitute stems for words.
"""
result = []
xmldoc = ElementTree.parse(fileid).getroot()
for xmlsent in xmldoc.findall(".//s"):
sent = []
for xmlword in _all_xmlwords_in(xmlsent):
word = xmlword.text
if not word:
word = "" # fixes issue 337?
if strip_space or stem:
word = word.strip()
if stem:
word = xmlword.get("hw", word)
if tag == "c5":
word = (word, xmlword.get("c5"))
elif tag == "pos":
word = (word, xmlword.get("pos", xmlword.get("c5")))
sent.append(word)
if bracket_sent:
result.append(BNCSentence(xmlsent.attrib["n"], sent))
else:
result.extend(sent)
assert None not in result
return result
def _all_xmlwords_in(elt, result=None):
if result is None:
result = []
for child in elt:
if child.tag in ("c", "w"):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result
class BNCSentence(list):
"""
A list of words, augmented by an attribute ``num`` used to record
the sentence identifier (the ``n`` attribute from the XML).
"""
def __init__(self, num, items):
self.num = num
list.__init__(self, items)
class BNCWordView(XMLCorpusView):
"""
    A stream-backed corpus view specialized for use with the BNC corpus.
"""
tags_to_ignore = {
"pb",
"gap",
"vocal",
"event",
"unclear",
"shift",
"pause",
"align",
}
"""These tags are ignored. For their description refer to the
technical documentation, for example,
http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
"""
def __init__(self, fileid, sent, tag, strip_space, stem):
"""
:param fileid: The name of the underlying file.
:param sent: If true, include sentence bracketing.
:param tag: The name of the tagset to use, or None for no tags.
:param strip_space: If true, strip spaces from word tokens.
:param stem: If true, then substitute stems for words.
"""
if sent:
tagspec = ".*/s"
else:
tagspec = ".*/s/(.*/)?(c|w)"
self._sent = sent
self._tag = tag
self._strip_space = strip_space
self._stem = stem
        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor of the document.
        self.resps = None  #: Statement(s) of responsibility for the document.
XMLCorpusView.__init__(self, fileid, tagspec)
        # Read the TEI header up front so document metadata is available.
self._open()
self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
self.close()
# Reset tag context.
self._tag_context = {0: ()}
def handle_header(self, elt, context):
        # Extract document metadata from the TEI header.
titles = elt.findall("titleStmt/title")
if titles:
self.title = "\n".join(title.text.strip() for title in titles)
authors = elt.findall("titleStmt/author")
if authors:
self.author = "\n".join(author.text.strip() for author in authors)
editors = elt.findall("titleStmt/editor")
if editors:
self.editor = "\n".join(editor.text.strip() for editor in editors)
resps = elt.findall("titleStmt/respStmt")
if resps:
self.resps = "\n\n".join(
"\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
)
def handle_elt(self, elt, context):
if self._sent:
return self.handle_sent(elt)
else:
return self.handle_word(elt)
def handle_word(self, elt):
word = elt.text
if not word:
word = "" # fixes issue 337?
if self._strip_space or self._stem:
word = word.strip()
if self._stem:
word = elt.get("hw", word)
if self._tag == "c5":
word = (word, elt.get("c5"))
elif self._tag == "pos":
word = (word, elt.get("pos", elt.get("c5")))
return word
def handle_sent(self, elt):
sent = []
for child in elt:
if child.tag in ("mw", "hi", "corr", "trunc"):
sent += [self.handle_word(w) for w in child]
elif child.tag in ("w", "c"):
sent.append(self.handle_word(child))
elif child.tag not in self.tags_to_ignore:
raise ValueError("Unexpected element %s" % child.tag)
return BNCSentence(elt.attrib["n"], sent)