Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /corpus /reader /bnc.py

sunnychenxiwang

update nltk

d916065 over 1 year ago

raw

history blame

9.72 kB

	# Natural Language Toolkit: BNC Corpus Reader
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Edward Loper <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""Corpus reader for the XML version of the British National Corpus."""

	from nltk.corpus.reader.util import concat
	from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView


	class BNCCorpusReader(XMLCorpusReader):
	    r"""Reader for the XML edition of the British National Corpus (BNC).

	    The raw XML tree is available through ``xml()``.  Flat token lists and
	    sentence lists, with or without tags, come from ``words()``,
	    ``sents()``, ``tagged_words()`` and ``tagged_sents()``.

	    The full BNC distribution can be obtained from
	    https://www.ota.ox.ac.uk/desc/2554

	    Assuming the archive was unpacked into a directory named `BNC`, a
	    reader is created like this::

	        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w/\w\.xml')

	    """

	    def __init__(self, root, fileids, lazy=True):
	        """
	        :param root: Corpus root, forwarded to ``XMLCorpusReader``.
	        :param fileids: File identifiers, forwarded to ``XMLCorpusReader``.
	        :param lazy: If true, each file is exposed through a
	            ``BNCWordView``; otherwise each file is parsed in full by
	            ``_words()``.
	        """
	        super().__init__(root, fileids)
	        self._lazy = lazy

	    def words(self, fileids=None, strip_space=True, stem=False):
	        """
	        :return: the given file(s) as a list of words
	            and punctuation symbols.
	        :rtype: list(str)

	        :param strip_space: If true, trailing spaces are removed from
	            word tokens; if false, the spaces are kept.
	        :param stem: If true, word stems are returned in place of the
	            word strings.
	        """
	        return self._views(
	            fileids, sent=False, tag=None, strip_space=strip_space, stem=stem
	        )

	    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
	        """
	        :return: the given file(s) as a list of tagged
	            words and punctuation symbols, encoded as tuples
	            ``(word,tag)``.
	        :rtype: list(tuple(str,str))

	        :param c5: If true, the more detailed c5 tags are used;
	            otherwise the simplified tags are used.
	        :param strip_space: If true, trailing spaces are removed from
	            word tokens; if false, the spaces are kept.
	        :param stem: If true, word stems are returned in place of the
	            word strings.
	        """
	        tagset = "pos"
	        if c5:
	            tagset = "c5"
	        return self._views(
	            fileids, sent=False, tag=tagset, strip_space=strip_space, stem=stem
	        )

	    def sents(self, fileids=None, strip_space=True, stem=False):
	        """
	        :return: the given file(s) as a list of
	            sentences or utterances, each encoded as a list of word
	            strings.
	        :rtype: list(list(str))

	        :param strip_space: If true, trailing spaces are removed from
	            word tokens; if false, the spaces are kept.
	        :param stem: If true, word stems are returned in place of the
	            word strings.
	        """
	        return self._views(
	            fileids, sent=True, tag=None, strip_space=strip_space, stem=stem
	        )

	    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
	        """
	        :return: the given file(s) as a list of
	            sentences, each encoded as a list of ``(word,tag)`` tuples.
	        :rtype: list(list(tuple(str,str)))

	        :param c5: If true, the more detailed c5 tags are used;
	            otherwise the simplified tags are used.
	        :param strip_space: If true, trailing spaces are removed from
	            word tokens; if false, the spaces are kept.
	        :param stem: If true, word stems are returned in place of the
	            word strings.
	        """
	        tagset = "pos"
	        if c5:
	            tagset = "c5"
	        return self._views(
	            fileids, sent=True, tag=tagset, strip_space=strip_space, stem=stem
	        )

	    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
	        """Build one view (lazy) or one token list (eager) per file and concatenate them."""
	        # Lazy readers hand out stream-backed views; eager readers parse
	        # every file up front.
	        if self._lazy:
	            build = BNCWordView
	        else:
	            build = self._words

	        pieces = []
	        for path in self.abspaths(fileids):
	            pieces.append(build(path, sent, tag, strip_space, stem))
	        return concat(pieces)

	    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
	        """
	        Eagerly parse one file and return its words, or its sentences,
	        optionally paired with tags.

	        :param fileid: The name of the underlying file.
	        :param bracket_sent: If true, return ``BNCSentence`` objects
	            instead of one flat token list.
	        :param tag: The name of the tagset to use (``"c5"`` or ``"pos"``),
	            or None for no tags.
	        :param strip_space: If true, strip spaces from word tokens.
	        :param stem: If true, substitute the ``hw`` attribute for the word
	            text when the attribute is present.
	        """
	        root = ElementTree.parse(fileid).getroot()
	        # Stemmed output is always stripped, whatever strip_space says.
	        trim = strip_space or stem
	        collected = []

	        for s_elt in root.findall(".//s"):
	            tokens = []
	            for w_elt in _all_xmlwords_in(s_elt):
	                # Elements without text yield an empty token (issue 337).
	                token = w_elt.text or ""
	                if trim:
	                    token = token.strip()
	                if stem:
	                    token = w_elt.get("hw", token)
	                if tag == "c5":
	                    token = (token, w_elt.get("c5"))
	                elif tag == "pos":
	                    # Fall back to the c5 tag when no pos attribute exists.
	                    token = (token, w_elt.get("pos", w_elt.get("c5")))
	                tokens.append(token)

	            if bracket_sent:
	                collected.append(BNCSentence(s_elt.attrib["n"], tokens))
	            else:
	                collected.extend(tokens)

	        assert None not in collected
	        return collected


	def _all_xmlwords_in(elt, result=None):
	    """
	    Collect, in document order, every ``c`` and ``w`` element below ``elt``.

	    Elements tagged ``c`` or ``w`` are collected but never descended into;
	    every other element is searched through its children.

	    :param elt: The element whose descendants are searched.
	    :param result: Optional list to append to; a new list is created
	        when omitted.
	    :return: The list holding the collected elements (``result`` itself
	        when one was supplied).
	    """
	    collected = [] if result is None else result

	    # Depth-first walk driven by a stack of child iterators, which keeps
	    # the same left-to-right order as a recursive descent.
	    pending = [iter(elt)]
	    while pending:
	        node = next(pending[-1], None)
	        if node is None:
	            pending.pop()
	        elif node.tag in ("c", "w"):
	            collected.append(node)
	        else:
	            pending.append(iter(node))
	    return collected


	class BNCSentence(list):
	    """
	    A plain list of words that additionally carries ``num``, the
	    sentence identifier taken from the ``n`` attribute in the XML.
	    """

	    def __init__(self, num, items):
	        """
	        :param num: The sentence identifier to store as ``self.num``.
	        :param items: Iterable of words used to populate the list.
	        """
	        super().__init__(items)
	        self.num = num


	class BNCWordView(XMLCorpusView):
	    """
	    A stream backed corpus view specialized for use with the BNC corpus.

	    Depending on the ``sent`` flag the view yields either individual word
	    tokens or ``BNCSentence`` objects.  On construction the ``teiHeader``
	    is read once to populate ``title``, ``author``, ``editor`` and
	    ``resps``.
	    """

	    #: These tags are ignored. For their description refer to the
	    #: technical documentation, for example,
	    #: http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
	    tags_to_ignore = {
	        "pb",
	        "gap",
	        "vocal",
	        "event",
	        "unclear",
	        "shift",
	        "pause",
	        "align",
	    }

	    def __init__(self, fileid, sent, tag, strip_space, stem):
	        """
	        :param fileid: The name of the underlying file.
	        :param sent: If true, include sentence bracketing.
	        :param tag: The name of the tagset to use, or None for no tags.
	        :param strip_space: If true, strip spaces from word tokens.
	        :param stem: If true, then substitute stems for words.
	        """
	        # NOTE(review): tagspec is presumably matched by XMLCorpusView as a
	        # regular expression against the path of enclosing tags -- confirm.
	        if sent:
	            tagspec = ".*/s"
	        else:
	            # ``c``/``w`` elements directly under a sentence, or nested in
	            # wrapper elements inside it.  Raw string with a real ``|``
	            # alternation: the escaped pipe would match a literal "|".
	            tagspec = r".*/s/(.*/)?(c|w)"
	        self._sent = sent
	        self._tag = tag
	        self._strip_space = strip_space
	        self._stem = stem

	        self.title = None  #: Title of the document.
	        self.author = None  #: Author of the document.
	        self.editor = None  #: Editor
	        self.resps = None  #: Statement of responsibility

	        XMLCorpusView.__init__(self, fileid, tagspec)

	        # Read the header once for metadata; always release the stream,
	        # even when parsing the header fails.
	        self._open()
	        try:
	            self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
	        finally:
	            self.close()

	        # Reset tag context.
	        self._tag_context = {0: ()}

	    @staticmethod
	    def _stripped_text(elt):
	        """Return the stripped text of ``elt``, or "" when it has no text."""
	        return (elt.text or "").strip()

	    def handle_header(self, elt, context):
	        """
	        Fill in the metadata attributes from a ``teiHeader`` element.

	        :param elt: The header element; ``titleStmt/...`` children are
	            looked up relative to it.
	        :param context: Unused here.
	        """
	        text_of = self._stripped_text

	        titles = elt.findall("titleStmt/title")
	        if titles:
	            self.title = "\n".join(text_of(title) for title in titles)

	        authors = elt.findall("titleStmt/author")
	        if authors:
	            self.author = "\n".join(text_of(author) for author in authors)

	        editors = elt.findall("titleStmt/editor")
	        if editors:
	            self.editor = "\n".join(text_of(editor) for editor in editors)

	        resps = elt.findall("titleStmt/respStmt")
	        if resps:
	            # One paragraph per respStmt, one line per child element.
	            self.resps = "\n\n".join(
	                "\n".join(text_of(resp_elt) for resp_elt in resp)
	                for resp in resps
	            )

	    def handle_elt(self, elt, context):
	        """Dispatch an element to sentence or word handling per ``sent``."""
	        if self._sent:
	            return self.handle_sent(elt)
	        return self.handle_word(elt)

	    def handle_word(self, elt):
	        """
	        Convert a ``w``/``c`` element into a token.

	        :return: The word string, or a ``(word, tag)`` tuple when a tagset
	            was requested.
	        """
	        # Elements without text yield an empty token (issue 337).
	        word = elt.text or ""
	        if self._strip_space or self._stem:
	            word = word.strip()
	        if self._stem:
	            word = elt.get("hw", word)
	        if self._tag == "c5":
	            word = (word, elt.get("c5"))
	        elif self._tag == "pos":
	            # Fall back to the c5 tag when no pos attribute exists.
	            word = (word, elt.get("pos", elt.get("c5")))
	        return word

	    def handle_sent(self, elt):
	        """
	        Convert an ``s`` element into a ``BNCSentence``.

	        :raises ValueError: If a child is neither a word, a known wrapper,
	            nor listed in ``tags_to_ignore``.
	        :raises KeyError: If the sentence element has no ``n`` attribute.
	        """
	        sent = []
	        for child in elt:
	            if child.tag in ("mw", "hi", "corr", "trunc"):
	                # Wrapper elements: take their direct children as words.
	                sent.extend(self.handle_word(w) for w in child)
	            elif child.tag in ("w", "c"):
	                sent.append(self.handle_word(child))
	            elif child.tag not in self.tags_to_ignore:
	                raise ValueError("Unexpected element %s" % child.tag)
	        return BNCSentence(elt.attrib["n"], sent)