Spaces:

LangTech-MT
/

document-translator

Sleeping

Fixed issues when dealing with hyperlinks (for now we keep the text and formatting but not the link), also improved format handling and sped things up a bit by avoiding loading fastalign with empty paragraphs

595da73 4 months ago

raw

history blame

14.7 kB

	import os
	import string

	from docx import Document
	from docx.text.hyperlink import Hyperlink
	from docx.text.run import Run
	import nltk

	nltk.download('punkt')
	nltk.download('punkt_tab')

	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.tokenize.treebank import TreebankWordDetokenizer

	from subprocess import Popen, PIPE

	from itertools import groupby
	import fileinput


	# Class to align original and translated sentences
	# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
	class Aligner():
	    """Word aligner that shells out to the ``fast_align`` and ``atools`` binaries.

	    Both binaries are invoked as ``./fast_align`` and ``./atools``, so they are
	    resolved relative to the current working directory.
	    """

	    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
	        """Build the shell commands for one language pair.

	        Args:
	            config_folder: folder containing ``<src>-<tgt>.params``/``.err`` and
	                ``<tgt>-<src>.params``/``.err``.
	            source_lang: source language code used in those file names.
	            target_lang: target language code used in those file names.
	            temp_folder: folder where ``forward.align`` and ``reverse.align``
	                are written.
	        """
	        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
	        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

	        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
	        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

	        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
	        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

	        # callables taking the input file path and returning the shell command line
	        self.forward_command = lambda \
	            x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
	        self.reverse_command = lambda \
	            x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'

	        self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'

	    def __simplify_alignment_file(self, file):
	        """Rewrite ``file`` in place so each line holds only the alignment pairs.

	        Lines with at least three ``|||``-separated fields are reduced to the
	        third field; any other line is kept (stripped) so the number of lines
	        never changes. The previous content is saved next to it as ``.bak``.
	        """
	        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
	            for line in f:
	                fields = line.split('|||')
	                # while inplace is active, print() writes into the file
	                print(fields[2].strip() if len(fields) > 2 else line.strip())

	    def __read_err(self, err):
	        """Read the tension (T) and length ratio (m) from a fast_align ``.err`` log.

	        Returns:
	            ``(T, m)`` as strings; an empty string for a value that is not found.
	        """
	        (T, m) = ('', '')
	        with open(err) as err_file:
	            for line in err_file:
	                # expected target length = source length * N
	                if 'expected target length' in line:
	                    m = line.split()[-1]
	                # final tension: N
	                elif 'final tension' in line:
	                    T = line.split()[-1]
	        return T, m

	    @staticmethod
	    def _run_shell(command):
	        """Run ``command`` through the shell and fail loudly on a non-zero exit."""
	        process = Popen(command, shell=True)
	        if process.wait() != 0:
	            raise RuntimeError(f"command failed with exit code {process.returncode}: {command}")

	    @staticmethod
	    def _parse_alignments(alignments_str):
	        """Parse ``i-j`` pairs, one sentence per line, into lists of int tuples.

	        A blank line yields an empty list so sentence indices stay in step.
	        """
	        alignments = []
	        for line in alignments_str.splitlines():
	            pairs = []
	            for pair in line.split():
	                i, j = pair.split("-")
	                pairs.append((int(i), int(j)))
	            alignments.append(pairs)
	        return alignments

	    def align(self, file):
	        """Align the sentence pairs stored in ``file``.

	        Args:
	            file: path of a text file with one ``source ||| target`` pair per line.

	        Returns:
	            One list per line of ``file`` with ``(source_idx, target_idx)`` tuples.

	        Raises:
	            RuntimeError: if ``fast_align`` or ``atools`` exits with an error.
	        """
	        # generate forward alignment
	        self._run_shell(self.forward_command(file))
	        # generate reverse alignment
	        self._run_shell(self.reverse_command(file))

	        # for some reason the output file contains more information than needed, remove it
	        self.__simplify_alignment_file(self.forward_alignment_file_path)
	        self.__simplify_alignment_file(self.reverse_alignment_file_path)

	        # generate symmetrical alignment; communicate() drains the pipes while
	        # waiting, so a large output cannot block the child process
	        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
	        stdout, stderr = process.communicate()
	        if process.returncode != 0:
	            raise RuntimeError(
	                f"atools failed with exit code {process.returncode}: "
	                f"{stderr.decode('utf-8', errors='replace')}")

	        # get final alignments and format them
	        return self._parse_alignments(stdout.decode('utf-8'))


	def _run_to_dict(run):
	    """Copy the text and the formatting attributes we track out of a docx run."""
	    return {
	        'text': run.text,
	        'bold': run.bold,
	        'italic': run.italic,
	        'underline': run.underline,
	        'font_name': run.font.name,
	        'font_size': run.font.size,
	        'font_color': run.font.color.rgb
	    }


	# Function to extract paragraphs with their runs
	def extract_paragraphs_with_runs(doc):
	    """Return, for every paragraph of ``doc``, the list of its runs as dicts.

	    Each dict has the keys ``text``, ``bold``, ``italic``, ``underline``,
	    ``font_name``, ``font_size`` and ``font_color``. Hyperlinks contribute
	    one dict per run they contain, so their text and formatting are kept
	    while the link target itself is dropped. Inner content that is neither
	    a run nor a hyperlink is ignored.
	    """
	    paragraphs_with_runs = []
	    for paragraph in doc.paragraphs:
	        runs = []
	        for item in paragraph.iter_inner_content():
	            if isinstance(item, Run):
	                runs.append(_run_to_dict(item))
	            elif isinstance(item, Hyperlink):
	                # keep every run of the hyperlink, not just the first one;
	                # a hyperlink without runs simply contributes nothing
	                runs.extend(_run_to_dict(inner_run) for inner_run in item.runs)

	        paragraphs_with_runs.append(runs)
	    return paragraphs_with_runs


	def tokenize_paragraph_with_runs2(runs_in_paragraph):
	    """Split a paragraph into sentences of styled tokens by matching runs to tokens.

	    The run texts are joined with spaces, sentence- and word-tokenized, and
	    each sentence is then matched against the runs from the first run on.
	    A run equal to the expected token yields one styled token; a longer run
	    that starts with the expected token is word-tokenized and each of its
	    tokens that equals the next expected token yields one styled token.

	    NOTE(review): nothing in this module appears to call this function;
	    ``tokenize_paragraph_with_runs`` looks like its replacement - confirm
	    before relying on it.

	    Args:
	        runs_in_paragraph: list of run dicts with at least a ``text`` key.

	    Returns:
	        A list of sentences, each a list of run-dict copies whose ``text``
	        is a single token.

	    Raises:
	        ValueError: if a run is longer than the expected token but does not
	            start with it.
	    """
	    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
	    sentences = sent_tokenize(text_paragraph)
	    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

	    tokenized_sentences_with_style = []
	    for tokenized_sentence in tokenized_sentences:
	        tokenized_sentence_with_style = []
	        token_idx = 0
	        for run in runs_in_paragraph:
	            # every token of this sentence has been matched already
	            if token_idx >= len(tokenized_sentence):
	                break

	            text_in_run = run["text"].strip()
	            expected_token = tokenized_sentence[token_idx]

	            if text_in_run == expected_token:
	                new_run = run.copy()
	                new_run["text"] = text_in_run
	                tokenized_sentence_with_style.append(new_run)
	                token_idx += 1
	            elif len(text_in_run) > len(expected_token):
	                if not text_in_run.startswith(expected_token):
	                    raise ValueError(
	                        f"run text {text_in_run!r} does not start with expected token {expected_token!r}")
	                for token in word_tokenize(text_in_run):
	                    # stop consuming once the sentence has no tokens left
	                    if token_idx < len(tokenized_sentence) and token == tokenized_sentence[token_idx]:
	                        new_run = run.copy()
	                        new_run["text"] = token
	                        tokenized_sentence_with_style.append(new_run)
	                        token_idx += 1
	        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
	    return tokenized_sentences_with_style


	def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
	    """Tokenize a paragraph into sentences of tokens that keep their run's style.

	    The paragraph text is rebuilt with ``detokenizer`` and split into
	    sentences and words. Separately, every run is word-tokenized on its own
	    so each token carries a copy of its run's formatting. The two token
	    streams are then walked in parallel; a word that spans several runs
	    contributes one styled entry per run fragment.

	    Args:
	        runs_in_paragraph: list of run dicts with at least a ``text`` key.
	        detokenizer: object with a ``detokenize(list_of_str)`` method.

	    Returns:
	        A list of sentences, each a list of run-dict copies whose ``text``
	        is a single token (or a fragment of one).

	    Raises:
	        ValueError: if the per-run tokens cannot be lined up with the
	            paragraph-level tokens.
	    """
	    text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
	    sentences = sent_tokenize(text_paragraph)
	    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

	    tokens_with_style = []
	    for run in runs_in_paragraph:
	        for token in word_tokenize(run["text"]):
	            styled_token = run.copy()
	            styled_token["text"] = token
	            tokens_with_style.append(styled_token)

	    token_index = 0
	    tokenized_sentences_with_style = []
	    for sentence in tokenized_sentences:
	        sentence_with_style = []
	        for word in sentence:
	            # this token might be split into several runs: consume run tokens
	            # until the whole word has been covered
	            word_left = word
	            while word_left:
	                if token_index >= len(tokens_with_style):
	                    raise ValueError(f"ran out of run tokens while matching {word!r}")
	                styled_token = tokens_with_style[token_index]
	                if not word_left.startswith(styled_token["text"]):
	                    raise ValueError(
	                        f"token {word!r} does not match run token {styled_token['text']!r}")
	                sentence_with_style.append(styled_token)
	                word_left = word_left.removeprefix(styled_token["text"])
	                token_index += 1
	        tokenized_sentences_with_style.append(sentence_with_style)
	    return tokenized_sentences_with_style


	def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
	    """Transfer the run formatting of a paragraph onto its translation.

	    Args:
	        original_runs_in_paragraph: list of run dicts of the source paragraph.
	        translated_paragraph: translated text of the paragraph.
	        aligner: object with an ``align(path)`` method returning, per line,
	            a list of ``(source_idx, target_idx)`` tuples.
	        temp_folder: existing folder for temporary files; every file in it
	            is deleted first.
	        detokenizer: object with a ``detokenize(list_of_str)`` method.

	    Returns:
	        A list of sentences, each a list of run dicts whose ``text`` is one
	        translated token and whose formatting comes from the aligned source
	        token. Empty when either side has no text.
	    """
	    # clean temp folder
	    for f in os.listdir(temp_folder):
	        os.remove(os.path.join(temp_folder, f))

	    # nothing to align: skip the tokenizers and the external aligner entirely
	    has_original_text = any(run["text"].strip() for run in original_runs_in_paragraph)
	    if not has_original_text or not translated_paragraph or not translated_paragraph.strip():
	        return []

	    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")

	    # tokenize the original text by sentence and words while keeping the style
	    original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
	    # tokenize the translated text by sentence and word
	    translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]

	    if not original_tokenized_sentences_with_style or not translated_tokenized_sentences:
	        return []

	    # if both sides were not split into the same number of sentences, pairing
	    # them one-to-one would misalign them and drop the surplus; align the
	    # paragraph as a single unit instead
	    if len(original_tokenized_sentences_with_style) != len(translated_tokenized_sentences):
	        original_tokenized_sentences_with_style = [
	            [token for sentence in original_tokenized_sentences_with_style for token in sentence]]
	        translated_tokenized_sentences = [
	            [token for sentence in translated_tokenized_sentences for token in sentence]]

	    # write the file that fastalign will use
	    with open(temp_file_path, "w", encoding="utf-8") as out_file:
	        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
	            source_side = " ".join(item["text"] for item in original)
	            target_side = " ".join(translated)
	            out_file.write(f"{source_side} ||| {target_side}\n")

	    alignments = aligner.align(temp_file_path)

	    # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
	    translated_sentences_with_style = []
	    for sentence_alignments, original_tokens, translated_tokens in zip(
	            alignments, original_tokenized_sentences_with_style, translated_tokenized_sentences):

	        # reverse the order of the alignments and build a dict with it
	        source_by_target = {target: source for source, target in sentence_alignments}

	        translated_sentence_with_style = []
	        for token_idx, translated_token in enumerate(translated_tokens):
	            original_idx = source_by_target.get(token_idx)
	            if original_idx is not None and 0 <= original_idx < len(original_tokens):
	                # fastalign has found a token aligned with the translated one
	                style_source = original_tokens[original_idx]
	            elif translated_sentence_with_style:
	                # WARNING this is a test
	                # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
	                style_source = translated_sentence_with_style[-1]
	            else:
	                # unaligned first token: there is no previous word yet, so
	                # borrow the style of the first original token
	                style_source = original_tokens[0]

	            new_entry = style_source.copy()
	            new_entry["text"] = translated_token
	            translated_sentence_with_style.append(new_entry)

	        translated_sentences_with_style.append(translated_sentence_with_style)

	    return translated_sentences_with_style


	# TODO
	def translate_paragraph(paragraph_text):
	    """Placeholder for the machine-translation step.

	    Translation is not implemented yet, so ``paragraph_text`` is ignored
	    and an empty string is returned.
	    """
	    translated_paragraph = ""
	    return translated_paragraph


	# merge neighbouring tokens that share the same formatting into one run
	def group_by_style(values, detokenizer):
	    """Collapse consecutive equally-styled tokens into run dicts.

	    Tokens are grouped while bold, italic, underline, font name, font size
	    and font colour all stay the same. The texts of a group are joined with
	    ``detokenizer.detokenize``. Every group but the first gets a leading
	    space unless its text starts with one of ``, ; : . )``.
	    """
	    style_fields = ('bold', 'italic', 'underline', 'font_name', 'font_size', 'font_color')
	    no_space_before = (",", ";", ":", ".", ")")

	    def style_of(token):
	        return tuple(token[field] for field in style_fields)

	    merged = []
	    for style, members in groupby(values, key=style_of):
	        chunk = detokenizer.detokenize([member['text'] for member in members])

	        needs_space = bool(merged) and not chunk.startswith(no_space_before)
	        entry = {"text": " " + chunk if needs_space else chunk}
	        entry.update(zip(style_fields, style))
	        merged.append(entry)
	    return merged


	def preprocess_runs(runs_in_paragraph):
	    """Normalise, merge and sentence-split the runs of one paragraph.

	    For every run: ``None`` values of keys that do not start with ``font``
	    become ``False``; the run is merged into the previous one when all
	    formatting attributes match; and if the resulting last run holds more
	    than one sentence it is split into one run per sentence.

	    The input dicts are left untouched; the returned list holds new dicts.

	    Args:
	        runs_in_paragraph: list of run dicts with the keys ``text``,
	            ``bold``, ``italic``, ``underline``, ``font_name``,
	            ``font_size`` and ``font_color``.

	    Returns:
	        The new list of run dicts.
	    """
	    style_keys = ("bold", "italic", "underline", "font_name", "font_size", "font_color")
	    new_runs = []

	    for original_run in runs_in_paragraph:

	        # sometimes the parameters are False and sometimes they are None, set them all to False
	        # (done on a copy so the caller's dicts are never modified)
	        run = {key: (False if value is None and not key.startswith("font") else value)
	               for key, value in original_run.items()}

	        # if the previous run has the same format as the current run, we merge the two runs together
	        if new_runs and all(new_runs[-1][key] == run[key] for key in style_keys):
	            new_runs[-1]["text"] += run["text"]
	        else:
	            new_runs.append(run)

	        # we want to split runs that contain more than one sentence to avoid problems later when aligning styles
	        sentences = sent_tokenize(new_runs[-1]["text"])
	        if len(sentences) > 1:
	            template = new_runs.pop()
	            for sentence in sentences:
	                sentence_run = template.copy()
	                sentence_run["text"] = sentence
	                new_runs.append(sentence_run)

	    return new_runs


	if __name__ == "__main__":
	    # Translate a .docx file paragraph by paragraph and write a new document
	    # in which the translated tokens carry the formatting of the original
	    # tokens they are aligned with.
	    input_file = 'data/test3.docx'
	    output_file = 'data/translated_output.docx'
	    source_lang = 'ca'
	    target_lang = 'en'
	    config_folder = "fast_align_config"
	    temp_folder = "tmp"

	    aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)

	    os.makedirs(temp_folder, exist_ok=True)

	    # load original file, extract the paragraphs with their runs (which include style and formatting)
	    doc = Document(input_file)
	    paragraphs_with_runs = extract_paragraphs_with_runs(doc)

	    detokenizer = TreebankWordDetokenizer()

	    # translate each paragraph
	    translated_paragraphs = []
	    for paragraph in paragraphs_with_runs:
	        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
	        translated_paragraphs.append(translate_paragraph(paragraph_text))

	    out_doc = Document()

	    for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
	                                                                                    translated_paragraphs,
	                                                                                    doc.paragraphs):
	        # in case there are empty paragraphs (no runs at all, or only runs
	        # without text): copy the paragraph over once and skip the aligner
	        if not any(run["text"].strip() for run in original_runs_in_paragraph):
	            out_doc.add_paragraph(style=original_paragraph.style)
	            continue

	        original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)

	        paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
	                                                   temp_folder, detokenizer)

	        para = out_doc.add_paragraph(style=original_paragraph.style)

	        # flatten the paragraph, we don't need it to split into sentences anymore
	        paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]

	        # merge tokens into runs and detokenize
	        paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)

	        for item in paragraph_with_runs:
	            run = para.add_run(item["text"])
	            # Preserve original run formatting
	            run.bold = item['bold']
	            run.italic = item['italic']
	            run.underline = item['underline']
	            run.font.name = item['font_name']
	            run.font.size = item['font_size']
	            run.font.color.rgb = item['font_color']

	    out_doc.save(output_file)