import os
import string
from docx import Document
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from subprocess import Popen, PIPE
from itertools import groupby
import fileinput


# Class to align original and translated sentences,
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        self.forward_command = lambda x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
        self.reverse_command = lambda x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
        self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'

    def __simplify_alignment_file(self, file):
        # keep only the alignment column of fast_align's output
        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
            for line in f:
                print(line.split('|||')[2].strip())

    def __read_err(self, err):
        # recover the -T (tension) and -m (mean length ratio) values from fast_align's training log
        (T, m) = ('', '')
        with open(err) as f:
            for line in f:
                # expected target length = source length * N
                if 'expected target length' in line:
                    m = line.split()[-1]
                # final tension: N
                elif 'final tension' in line:
                    T = line.split()[-1]
        return T, m

    def align(self, file):
        # generate forward alignment
        process = Popen(self.forward_command(file), shell=True)
        process.wait()
        # generate reverse alignment
        process = Popen(self.reverse_command(file), shell=True)
        process.wait()

        # the output files contain more information than needed, keep only the alignments
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment; communicate() waits for atools and captures its stdout
        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        alignments_str = process.communicate()[0].decode('utf-8')

        # format the final alignments as lists of (source_index, target_index) tuples, one list per line
        alignments = []
        for line in alignments_str.splitlines():
            alignments.append([(int(i), int(j)) for i, j in
                               [pair.split("-") for pair in line.strip("\n").split(" ")]])
        return alignments
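
# Illustrative usage sketch (comments only, not executed): align() expects a plain-text file with
# one sentence pair per line in fast_align's "source tokens ||| target tokens" format and returns,
# for each line, a list of (source_index, target_index) tuples. The class assumes the fast_align
# and atools binaries sit in the working directory and that pre-trained {src}-{tgt}.params and
# {src}-{tgt}.err files exist in the config folder. The paths below are examples only.
#
#   aligner = Aligner("fast_align_config", "ca", "en", "tmp")
#   alignments = aligner.align("tmp/tokenized_sentences.txt")
#   # alignments[0] is the word alignment of the first sentence pair, e.g. [(0, 0), (1, 2), ...]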


# Function to extract paragraphs with their runs
def extract_paragraphs_with_runs(doc):
    paragraphs_with_runs = []
    for paragraph in doc.paragraphs:
        runs = []
        for item in paragraph.iter_inner_content():
            if isinstance(item, Run):
                runs.append({
                    'text': item.text,
                    'bold': item.bold,
                    'italic': item.italic,
                    'underline': item.underline,
                    'font_name': item.font.name,
                    'font_size': item.font.size,
                    'font_color': item.font.color.rgb
                })
            elif isinstance(item, Hyperlink):
                runs.append({
                    'text': item.runs[0].text,
                    'bold': item.runs[0].bold,
                    'italic': item.runs[0].italic,
                    'underline': item.runs[0].underline,
                    'font_name': item.runs[0].font.name,
                    'font_size': item.runs[0].font.size,
                    'font_color': item.runs[0].font.color.rgb
                })
        paragraphs_with_runs.append(runs)
    return paragraphs_with_runs


def tokenize_paragraph_with_runs2(runs_in_paragraph):
    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    tokenized_sentences_with_style = []
    for tokenized_sentence in tokenized_sentences:
        tokenized_sentence_with_style = []
        token_idx = 0
        for run in runs_in_paragraph:
            text_in_run = run["text"].strip()
            if text_in_run == tokenized_sentence[token_idx]:
                new_run = run.copy()
                new_run["text"] = text_in_run
                tokenized_sentence_with_style.append(new_run)
                token_idx += 1
                if token_idx >= len(tokenized_sentence):
                    break
            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
                if text_in_run.startswith(tokenized_sentence[token_idx]):
                    for token in word_tokenize(text_in_run):
                        if token == tokenized_sentence[token_idx]:
                            new_run = run.copy()
                            new_run["text"] = token
                            tokenized_sentence_with_style.append(new_run)
                            token_idx += 1
                else:
                    raise ValueError("run text does not match the expected token")
        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
    return tokenized_sentences_with_style


def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
    text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    # one style entry per token, in paragraph order
    tokens_with_style = []
    for run in runs_in_paragraph:
        tokens = word_tokenize(run["text"])
        for token in tokens:
            tokens_with_style.append(run.copy())
            tokens_with_style[-1]["text"] = token

    token_index = 0
    tokenized_sentences_with_style = []
    for sentence in tokenized_sentences:
        sentence_with_style = []
        for word in sentence:
            if word == tokens_with_style[token_index]["text"]:
                sentence_with_style.append(tokens_with_style[token_index])
                token_index += 1
            else:
                if word.startswith(tokens_with_style[token_index]["text"]):
                    # this token might be split into several runs
                    word_left = word
                    while word_left:
                        sentence_with_style.append(tokens_with_style[token_index])
                        word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                        token_index += 1
                else:
                    raise ValueError("could not match the tokenized word to a run token")
        tokenized_sentences_with_style.append(sentence_with_style)
    return tokenized_sentences_with_style


def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
    # clean temp folder
    for f in os.listdir(temp_folder):
        os.remove(os.path.join(temp_folder, f))
    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")

    # tokenize the original text by sentence and word while keeping the style
    original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
    # tokenize the translated text by sentence and word
    translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]

    # write the parallel file that fast_align will use, one sentence pair per line: "source ||| target"
    with open(temp_file_path, "w") as out_file:
        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
            original_text = " ".join(item["text"] for item in original)
            translated_text = " ".join(translated)
            out_file.write(f"{original_text} ||| {translated_text}\n")

    alignments = aligner.align(temp_file_path)

    # using the alignments generated by fast_align, copy the style of each original token to the translated one
    translated_sentences_with_style = []
    for sentence_idx, sentence_alignments in enumerate(alignments):
        # reverse the order of the alignments and build a dict with it
        sentence_alignments = {target: source for source, target in sentence_alignments}

        translated_sentence_with_style = []
        for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
            if token_idx in sentence_alignments.keys():
                # fast_align has found an original token aligned with the translated one: copy its style
                original_idx = sentence_alignments[token_idx]
                new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)
            else:
                # WARNING this is a test
                # fast_align did not align this token to any original word, so copy the style of the previous word
                new_entry = translated_sentence_with_style[-1].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)
        translated_sentences_with_style.append(translated_sentence_with_style)

    return translated_sentences_with_style


# TODO: plug in an actual machine translation backend
def translate_paragraph(paragraph_text):
    translated_paragraph = ""
    return translated_paragraph
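

# Hypothetical sketch only: translate_paragraph() above is still a TODO. One possible way to fill
# it in is a call to an external MT service; the endpoint URL and the "src"/"tgt" JSON fields
# below are illustrative assumptions, not the API of any particular translation server.
def translate_paragraph_via_http_example(paragraph_text, url="http://localhost:8000/translate"):
    import requests  # imported locally because this helper is only an illustration

    response = requests.post(url, json={"src": paragraph_text}, timeout=60)
    response.raise_for_status()
    return response.json()["tgt"]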


# group contiguous tokens that share the same style (bold, italic, underline, font name/size/color)
def group_by_style(values, detokenizer):
    groups = []
    for key, group in groupby(values, key=lambda x: (
            x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
        text = detokenizer.detokenize([item['text'] for item in group])
        # separate groups with a space unless the new group starts with punctuation
        if groups and not text.startswith((",", ";", ":", ".", ")")):
            text = " " + text
        groups.append({"text": text,
                       "bold": key[0],
                       "italic": key[1],
                       "underline": key[2],
                       "font_name": key[3],
                       "font_size": key[4],
                       "font_color": key[5]})
    return groups
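
# Illustrative example (invented values): the tokens
#   [{"text": "Hello", "bold": True, ...}, {"text": "world", "bold": True, ...}, {"text": ".", "bold": False, ...}]
# collapse into two groups, "Hello world" (bold) and "." (not bold); the "." group gets no leading
# space because it starts with punctuation.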


def preprocess_runs(runs_in_paragraph):
    new_runs = []
    for run in runs_in_paragraph:
        # sometimes the style parameters are False and sometimes they are None, set them all to False
        for key, value in run.items():
            if value is None and not key.startswith("font"):
                run[key] = False

        if not new_runs:
            new_runs.append(run)
        else:
            # if the previous run has the same format as the current run, merge the two runs together
            if (new_runs[-1]["bold"] == run["bold"]
                    and new_runs[-1]["font_color"] == run["font_color"]
                    and new_runs[-1]["font_name"] == run["font_name"]
                    and new_runs[-1]["font_size"] == run["font_size"]
                    and new_runs[-1]["italic"] == run["italic"]
                    and new_runs[-1]["underline"] == run["underline"]):
                new_runs[-1]["text"] += run["text"]
            else:
                new_runs.append(run)

        # split runs that contain more than one sentence to avoid problems later when aligning styles
        sentences = sent_tokenize(new_runs[-1]["text"])
        if len(sentences) > 1:
            new_runs[-1]["text"] = sentences[0]
            for sentence in sentences[1:]:
                new_run = new_runs[-1].copy()
                new_run["text"] = sentence
                new_runs.append(new_run)
    return new_runs


if __name__ == "__main__":
    input_file = 'data/test3.docx'
    output_file = 'data/translated_output.docx'
    source_lang = 'ca'
    target_lang = 'en'
    config_folder = "fast_align_config"
    temp_folder = "tmp"

    aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
    os.makedirs(temp_folder, exist_ok=True)

    # load the original file and extract its paragraphs with their runs (which include style and formatting)
    doc = Document(input_file)
    paragraphs_with_runs = extract_paragraphs_with_runs(doc)

    detokenizer = TreebankWordDetokenizer()

    # translate each paragraph
    translated_paragraphs = []
    for paragraph in paragraphs_with_runs:
        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
        translated_paragraphs.append(translate_paragraph(paragraph_text))

    out_doc = Document()
    for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(
            paragraphs_with_runs, translated_paragraphs, doc.paragraphs):
        # in case there are empty paragraphs, keep them and skip the alignment step
        if len(original_runs_in_paragraph) == 1 and not original_runs_in_paragraph[0]["text"]:
            out_doc.add_paragraph(style=original_paragraph.style)
            continue

        original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
        paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph,
                                                   aligner, temp_folder, detokenizer)

        para = out_doc.add_paragraph(style=original_paragraph.style)

        # flatten the paragraph, we don't need it split into sentences anymore
        paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]

        # merge tokens into runs and detokenize
        paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)

        for item in paragraph_with_runs:
            run = para.add_run(item["text"])
            # preserve the original run formatting
            run.bold = item['bold']
            run.italic = item['italic']
            run.underline = item['underline']
            run.font.name = item['font_name']
            run.font.size = item['font_size']
            run.font.color.rgb = item['font_color']

    out_doc.save(output_file)