Spaces:

anasampa2
/

parser

Runtime error

App Files Files Community

parser / udpipe2 /ud-2.6 /conllu_to_text.pl

anasampa2

Upload 151 files

ee0ec3d verified over 1 year ago

raw

history blame

8.68 kB

	#!/usr/bin/env perl
	# Extracts raw text from CoNLL-U file. Uses newdoc and newpar tags when available.
	# Copyright © 2017 Dan Zeman <[email protected]>
	# License: GNU GPL

	use utf8;
	use open ':utf8';
	binmode(STDIN, ':utf8');
	binmode(STDOUT, ':utf8');
	binmode(STDERR, ':utf8');
	use Getopt::Long;

	# Language code 'zh' or 'ja' (alone or followed by an underscore and any
	# suffix) will trigger Chinese-like text formatting, i.e. lines may be broken
	# between any two characters rather than only at spaces.
	my $language = 'en';
	GetOptions
	(
	    'language=s' => \$language
	);
	my $chinese = $language =~ m/^(zh|ja)(_|$)/;
	# Line breaks are inserted after at most this many characters.
	my $line_width = 80;

	my $text = ''; # from the text attribute of the sentence
	my $ftext = ''; # from the word forms of the tokens
	my $newpar = 0; # a "# newpar" comment has been seen since the last flush
	my $newdoc = 0; # a "# newdoc" comment has been seen since the last flush
	my $buffer = ''; # text waiting to be printed (less than one full line)
	my $start = 1; # only true until we write the first sentence of the input stream
	my $mwtlast; # ID of the last word covered by the current multi-word token, if any

	# Moves the text collected in $ftext to the output buffer, prints all complete
	# lines and resets the per-sentence state. If the argument is true, the rest
	# of the buffer is printed too, followed by an empty line (paragraph break in
	# the middle of a sentence).
	my $flush_text = sub
	{
	    my $paragraph_break = shift;
	    # Empty line between documents and paragraphs. (There may have been
	    # a paragraph break before the first part of this sentence as well!)
	    $buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
	    $buffer .= $ftext;
	    # Line breaks at word boundaries after at most $line_width characters.
	    $buffer = print_lines_from_buffer($buffer, $line_width, $chinese);
	    if($paragraph_break)
	    {
	        print("$buffer\n\n");
	        $buffer = '';
	    }
	    $start = 0;
	    $newdoc = 0;
	    $newpar = 0;
	    $text = '';
	    $ftext = '';
	};

	# Appends the word form from one token line (a regular token or the line
	# that introduces a multi-word token) to $ftext.
	my $add_token = sub
	{
	    my @f = split(/\t/, shift);
	    # Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.)
	    # The first token of the new paragraph has "NewPar=Yes" in the MISC column.
	    # Multi-word tokens have this in the token-introducing line.
	    $flush_text->(1) if($f[9] =~ m/NewPar=Yes/i);
	    $ftext .= $f[1];
	    $ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/);
	};

	while(my $line = <>)
	{
	    if($line =~ m/^\#\stext\s=\s*(.+)/)
	    {
	        $text = $1;
	    }
	    elsif($line =~ m/^\#\s*newpar(\s|$)/i)
	    {
	        $newpar = 1;
	    }
	    elsif($line =~ m/^\#\s*newdoc(\s|$)/i)
	    {
	        $newdoc = 1;
	    }
	    elsif($line =~ m/^\d+-(\d+)\t/)
	    {
	        # Multi-word token: use its surface form and remember where it ends
	        # so that the lines of its member words can be skipped.
	        $mwtlast = $1;
	        $add_token->($line);
	    }
	    elsif($line =~ m/^(\d+)\t/ && !(defined($mwtlast) && $1<=$mwtlast))
	    {
	        # Regular token that is not part of the last multi-word token.
	        $mwtlast = undef;
	        $add_token->($line);
	    }
	    elsif($line =~ m/^\s*$/)
	    {
	        # In a valid CoNLL-U file, $text should be equal to $ftext except for the
	        # space after the last token. However, if there have been intra-sentential
	        # paragraph breaks, $ftext contains only the part after the last such
	        # break, and $text is empty. Hence we currently use $ftext everywhere
	        # and ignore $text, even though we note it when seeing the text attribute.
	        $flush_text->(0);
	        $mwtlast = undef;
	    }
	}
	# If the input did not end with an empty line, the text of the last sentence
	# has not been moved to the buffer yet. Do it now so that it is not lost.
	$flush_text->(0) if($ftext ne '');
	# There may be unflushed buffer contents after the last sentence, less than
	# $line_width characters (otherwise we would have already dealt with it), so
	# just flush it.
	if($buffer ne '')
	{
	    print("$buffer\n");
	}



	#------------------------------------------------------------------------------
	# Checks whether we have to print an extra line to separate paragraphs. Does it
	# if necessary. Returns the updated buffer.
	#
	# Parameters (positional):
	#   $start .... true if nothing has been written from the input stream yet; no
	#               separator is printed in that case
	#   $newdoc ... true if a new document starts here
	#   $newpar ... true if a new paragraph starts here
	#   $buffer ... text that has not been printed yet
	# Returns the buffer; it is empty if its contents had to be printed before the
	# separating empty line, otherwise it is returned unchanged.
	#------------------------------------------------------------------------------
	sub print_new_paragraph_if_needed
	{
	    my ($start, $newdoc, $newpar, $buffer) = @_;
	    if(!$start && ($newdoc || $newpar))
	    {
	        # The pending text belongs to the previous paragraph. It must be
	        # written out before the empty line that separates the paragraphs.
	        if($buffer ne '')
	        {
	            print("$buffer\n");
	            $buffer = '';
	        }
	        print("\n");
	    }
	    return $buffer;
	}



	#------------------------------------------------------------------------------
	# Writes every complete output line that can be cut off from the beginning of
	# the buffer and hands back what is left over.
	#
	# Parameters (positional):
	#   $buffer .... the text collected so far
	#   $limit ..... maximum number of characters on one line, not counting the
	#                line break, which also replaces the space where the line is
	#                cut. A word longer than the limit is printed on a line of
	#                its own.
	#   $chinese ... if true, the work is delegated to
	#                print_chinese_lines_from_buffer() because breaking at spaces
	#                is not suitable for Chinese and Japanese.
	#------------------------------------------------------------------------------
	sub print_lines_from_buffer
	{
	    my ($buffer, $limit, $chinese) = @_;
	    return print_chinese_lines_from_buffer($buffer, $limit) if($chinese);
	    # There may be more than one new line waiting in the buffer.
	    while(length($buffer) >= $limit)
	    {
	        # Find the place to cut: the last whitespace character among the
	        # positions 0..$limit, or, if there is none, the first whitespace
	        # character found further to the right.
	        my $cut;
	        my $nchars = length($buffer);
	        for my $pos (0..$nchars-1)
	        {
	            last if($pos > $limit && defined($cut));
	            $cut = $pos if(substr($buffer, $pos, 1) =~ m/\s/);
	        }
	        if(defined($cut) && $cut > 0)
	        {
	            # Print what precedes the whitespace character and drop the
	            # whitespace character itself.
	            print(substr($buffer, 0, $cut), "\n");
	            $buffer = substr($buffer, $cut+1);
	        }
	        else
	        {
	            # No usable place to cut: the whole buffer goes to one line.
	            print($buffer, "\n");
	            $buffer = '';
	        }
	    }
	    return $buffer;
	}



	#------------------------------------------------------------------------------
	# Prints as many complete lines of text as there are in the buffer. Returns the
	# remaining contents of the buffer. Assumes that there are no spaces between
	# words and lines can be broken between any two characters, as is the custom in
	# Chinese and Japanese.
	#
	# Parameters (positional):
	#   $buffer ... the text collected so far
	#   $limit .... maximum number of characters allowed on one line, not counting
	#               the line break character(s). Exception: a sequence of Latin
	#               letters and digits is never split, so if such a sequence
	#               does not fit, the line will be longer than the limit.
	#------------------------------------------------------------------------------
	sub print_chinese_lines_from_buffer
	{
	    my ($buffer, $limit) = @_;
	    # We cannot simply print the first $limit characters from the buffer,
	    # followed by a line break. There could be embedded Latin words or
	    # numbers and we do not want to insert a line break in the middle of
	    # a foreign word.
	    my @cbuffer = split(//, $buffer);
	    # The test for a non-empty buffer prevents an endless loop if the limit
	    # is not positive.
	    while(@cbuffer && scalar(@cbuffer) >= $limit)
	    {
	        # Number of characters to print on the current line.
	        my $nprint = 0;
	        for(my $i = 0; $i <= $#cbuffer; $i++)
	        {
	            # A break after the character at index $i gives a line of $i+1
	            # characters. Hence we must stop as soon as $i reaches the limit,
	            # unless we have not found any place to break yet.
	            last if($i >= $limit && $nprint > 0);
	            # The line can be broken after the current character unless both
	            # it and the next character belong to a Latin word or a number.
	            unless($i < $#cbuffer && $cbuffer[$i] =~ m/[\p{Latin}0-9]/ && $cbuffer[$i+1] =~ m/[\p{Latin}0-9]/)
	            {
	                $nprint = $i+1;
	            }
	        }
	        print(join('', splice(@cbuffer, 0, $nprint)), "\n");
	    }
	    return join('', @cbuffer);
	}