Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: BNC Corpus Reader
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Author: Edward Loper <[email protected]> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
"""Corpus reader for the XML version of the British National Corpus.""" | |
from nltk.corpus.reader.util import concat | |
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView | |
class BNCCorpusReader(XMLCorpusReader):
    r"""Reader for the British National Corpus in its XML distribution.

    The raw XML data structure is reachable through the ``xml()`` method.
    Plain and tagged token lists, flat or bracketed by sentence, are
    produced by ``words()``, ``sents()``, ``tagged_words()``, and
    ``tagged_sents()``.

    The full version of the BNC corpus can be obtained at
    https://www.ota.ox.ac.uk/desc/2554

    Assuming the archive was unpacked into a directory named `BNC`, the
    reader is created like this::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: Forwarded unchanged to ``XMLCorpusReader``.
        :param fileids: Forwarded unchanged to ``XMLCorpusReader``.
        :param lazy: If true, the access methods build ``BNCWordView``
            objects; otherwise each file is parsed into ordinary lists.
        """
        super().__init__(root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the words and punctuation symbols of the given file(s).
        :rtype: list(str)

        :param strip_space: If true, trailing spaces are removed from
            word tokens; if false, tokens keep their spaces.
        :param stem: If true, word stems replace the word strings.
        """
        return self._views(
            fileids, sent=False, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the words and punctuation symbols of the given file(s),
            each paired with its tag as a ``(word,tag)`` tuple.
        :rtype: list(tuple(str,str))

        :param c5: If true, the more detailed c5 tags are used; if false,
            the simplified tags are used.
        :param strip_space: If true, trailing spaces are removed from
            word tokens; if false, tokens keep their spaces.
        :param stem: If true, word stems replace the word strings.
        """
        tagset = "pos"
        if c5:
            tagset = "c5"
        return self._views(
            fileids, sent=False, tag=tagset, strip_space=strip_space, stem=stem
        )

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the sentences or utterances of the given file(s), each
            one a list of word strings.
        :rtype: list(list(str))

        :param strip_space: If true, trailing spaces are removed from
            word tokens; if false, tokens keep their spaces.
        :param stem: If true, word stems replace the word strings.
        """
        return self._views(
            fileids, sent=True, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the sentences of the given file(s), each one a list of
            ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, the more detailed c5 tags are used; if false,
            the simplified tags are used.
        :param strip_space: If true, trailing spaces are removed from
            word tokens; if false, tokens keep their spaces.
        :param stem: If true, word stems replace the word strings.
        """
        tagset = "pos"
        if c5:
            tagset = "c5"
        return self._views(fileids, True, tagset, strip_space, stem)

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """Build one BNCWordView (lazy mode) or one word/sentence list per file and concatenate them."""
        if self._lazy:
            build = BNCWordView
        else:
            build = self._words

        pieces = []
        for path in self.abspaths(fileids):
            pieces.append(build(path, sent, tag, strip_space, stem))
        return concat(pieces)

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Eager counterpart of the corpus view: parse one file completely
        and return its words, or its sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, return one ``BNCSentence`` per ``s``
            element; otherwise return a single flat list of tokens.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        collected = []
        root = ElementTree.parse(fileid).getroot()
        # Stemming implies stripping, exactly as in the lazy view.
        trim = strip_space or stem

        for s_elt in root.findall(".//s"):
            tokens = []
            for w_elt in _all_xmlwords_in(s_elt):
                # Elements without text yield an empty string (issue 337).
                token = w_elt.text or ""
                if trim:
                    token = token.strip()
                if stem:
                    # Fall back to the surface form when "hw" is absent.
                    token = w_elt.get("hw", token)

                if tag == "c5":
                    token = (token, w_elt.get("c5"))
                elif tag == "pos":
                    # Fall back to the c5 tag when "pos" is absent.
                    token = (token, w_elt.get("pos", w_elt.get("c5")))
                tokens.append(token)

            if bracket_sent:
                collected.append(BNCSentence(s_elt.attrib["n"], tokens))
            else:
                collected.extend(tokens)

        assert None not in collected
        return collected
def _all_xmlwords_in(elt, result=None):
    """
    Gather the ``w`` and ``c`` elements found below ``elt``, in document
    order.  A matching element is collected as-is and not searched any
    further; every other element is searched through its children.

    :param elt: The element whose descendants are searched.
    :param result: Optional list to append to; a new list is created
        when it is None.
    :return: The list that received the matching elements.
    """
    found = [] if result is None else result

    # One iterator per open ancestor: when a non-word element is met its
    # children are walked first, then the parent's iterator resumes.
    pending = [iter(elt)]
    while pending:
        for node in pending[-1]:
            if node.tag in ("c", "w"):
                found.append(node)
            else:
                pending.append(iter(node))
                break
        else:
            # Innermost iterator exhausted; return to its parent.
            pending.pop()
    return found
class BNCSentence(list):
    """
    A word list that additionally carries the sentence identifier (the
    ``n`` attribute from the XML) in its ``num`` attribute.
    """

    def __init__(self, num, items):
        """
        :param num: The sentence identifier to store as ``num``.
        :param items: The words used to fill the list.
        """
        super().__init__(items)
        self.num = num
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Depending on the ``sent`` flag given at construction, the view yields
    either single word tokens or ``BNCSentence`` lists.  Metadata read from
    the file's ``teiHeader`` is stored in ``title``, ``author``, ``editor``
    and ``resps``.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  The stream is closed even when reading
        # the header fails, so a malformed file does not leak the handle.
        self._open()
        try:
            self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    @staticmethod
    def _joined_text(elements):
        """Join the stripped text of ``elements`` with newlines; an element without text contributes an empty line."""
        # ``Element.text`` is None for an empty element such as <title/>.
        return "\n".join((element.text or "").strip() for element in elements)

    def handle_header(self, elt, context):
        """
        Fill ``title``, ``author``, ``editor`` and ``resps`` from the
        ``titleStmt`` children of the header element.  An attribute keeps
        its previous value when no matching element is found.

        :param elt: The header element.
        :param context: Unused.
        """
        # Set up some metadata!
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = self._joined_text(titles)

        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = self._joined_text(authors)

        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = self._joined_text(editors)

        resps = elt.findall("titleStmt/respStmt")
        if resps:
            # One paragraph per respStmt, one line per child element.
            self.resps = "\n\n".join(self._joined_text(resp) for resp in resps)

    def handle_elt(self, elt, context):
        """
        Convert a matched element into a sentence (sentence mode) or a
        single word token (word mode).

        :param elt: The matched element.
        :param context: Unused.
        """
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Turn one element into a token.

        :param elt: The word or punctuation element.
        :return: The word string, or a ``(word, tag)`` tuple when a tagset
            was requested.
        """
        # Elements without text yield an empty string (issue 337).
        word = elt.text or ""
        # Stemming implies stripping.
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Fall back to the surface form when "hw" is absent.
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Fall back to the c5 tag when "pos" is absent.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        """
        Turn one ``s`` element into a ``BNCSentence``.

        :param elt: The sentence element; its ``n`` attribute becomes the
            sentence number.
        :raises ValueError: If a direct child is neither a word, a known
            container, nor listed in ``tags_to_ignore``.
        """
        sent = []
        for child in elt:
            if child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag in ("mw", "hi", "corr", "trunc"):
                # Keep only the w/c elements inside a container, at any
                # depth.  Treating every direct child as a word produced
                # empty tokens for ignorable or nested container elements,
                # unlike the word-level tagspec and the non-lazy reader.
                sent.extend(self.handle_word(w) for w in _all_xmlwords_in(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        return BNCSentence(elt.attrib["n"], sent)