mjuvilla committed
Commit 978cbf1
0 Parent(s):

First commit. For now the translation has not been integrated, but reading a .docx and writing its translation while keeping the formatting and style should work.

Files changed (3)
  1. main.py +318 -0
  2. readme.md +18 -0
  3. requirements.txt +2 -0
main.py ADDED
@@ -0,0 +1,318 @@
+ import os
+
+ from docx import Document
+ import nltk
+
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+ from subprocess import Popen, PIPE
+
+ from itertools import groupby
+ import fileinput
+
+
+ # Class to align original and translated sentences,
+ # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
+ class Aligner:
+     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
+         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
+
+         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
+         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
+
+         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
+         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
+
+         self.forward_command = lambda \
+             x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
+         self.reverse_command = lambda \
+             x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+
+         self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+
+     def __simplify_alignment_file(self, file):
+         # keep only the alignment pairs (the third '|||'-separated field) of each line
+         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
+             for line in f:
+                 print(line.split('|||')[2].strip())
+
+     def __read_err(self, err):
+         # recover the tension (T) and mean length ratio (m) that fast_align logged during training
+         (T, m) = ('', '')
+         with open(err) as f:
+             for line in f:
+                 # expected target length = source length * N
+                 if 'expected target length' in line:
+                     m = line.split()[-1]
+                 # final tension: N
+                 elif 'final tension' in line:
+                     T = line.split()[-1]
+         return T, m
+
+     def align(self, file):
+         # generate the forward alignment
+         process = Popen(self.forward_command(file), shell=True)
+         process.wait()
+         # generate the reverse alignment
+         process = Popen(self.reverse_command(file), shell=True)
+         process.wait()
+
+         # the output files contain more information than needed, keep only the alignment pairs
+         self.__simplify_alignment_file(self.forward_alignment_file_path)
+         self.__simplify_alignment_file(self.reverse_alignment_file_path)
+
+         # generate the symmetrical alignment; communicate() waits for the process and,
+         # unlike wait(), cannot deadlock when stdout is a pipe
+         process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+         alignments_str = process.communicate()[0].decode('utf-8')
+
+         # parse the final alignments into lists of (source, target) index pairs
+         alignments = []
+         for line in alignments_str.splitlines():
+             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
+
+         return alignments
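+
+ # For reference, each line of the file passed to align() holds one sentence pair in the form
+ #     source tokens ||| target tokens
+ # and the symmetrized output is one line of index pairs per sentence, e.g. "0-0 1-2 2-1",
+ # meaning source token 0 aligns with target token 0, source token 1 with target token 2, etc.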
+
+
+ # Extract the paragraphs of a document together with their runs (and each run's style)
+ def extract_paragraphs_with_runs(doc):
+     paragraphs_with_runs = []
+     for para in doc.paragraphs:
+         runs = []
+         for run in para.runs:
+             runs.append({
+                 'text': run.text,
+                 'bold': run.bold,
+                 'italic': run.italic,
+                 'underline': run.underline,
+                 'font_name': run.font.name,
+                 'font_size': run.font.size,
+                 'font_color': run.font.color.rgb
+             })
+         paragraphs_with_runs.append(runs)
+     return paragraphs_with_runs
+
+
+ # Earlier variant of tokenize_paragraph_with_runs; currently unused
+ def tokenize_paragraph_with_runs2(runs_in_paragraph):
+     text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
+     sentences = sent_tokenize(text_paragraph)
+     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+     tokenized_sentences_with_style = []
+     for tokenized_sentence in tokenized_sentences:
+         tokenized_sentence_with_style = []
+         token_idx = 0
+         for run in runs_in_paragraph:
+             text_in_run = run["text"].strip()
+
+             if text_in_run == tokenized_sentence[token_idx]:
+                 # the run is exactly one token
+                 new_run = run.copy()
+                 new_run["text"] = text_in_run
+                 tokenized_sentence_with_style.append(new_run)
+                 token_idx += 1
+                 if token_idx >= len(tokenized_sentence):
+                     break
+             elif len(text_in_run) > len(tokenized_sentence[token_idx]):
+                 # the run spans several tokens
+                 if text_in_run.startswith(tokenized_sentence[token_idx]):
+                     for token in word_tokenize(text_in_run):
+                         if token == tokenized_sentence[token_idx]:
+                             new_run = run.copy()
+                             new_run["text"] = token
+                             tokenized_sentence_with_style.append(new_run)
+                             token_idx += 1
+                 else:
+                     raise ValueError(f"could not match run text {text_in_run!r} against the tokenized sentence")
+         tokenized_sentences_with_style.append(tokenized_sentence_with_style)
+     return tokenized_sentences_with_style
+
+
+ # Tokenize a paragraph into sentences and words while keeping each token's style
+ def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
+     text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
+     sentences = sent_tokenize(text_paragraph)
+     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+     # flatten the runs into one styled token per word
+     tokens_with_style = []
+     for run in runs_in_paragraph:
+         tokens = word_tokenize(run["text"])
+         for token in tokens:
+             tokens_with_style.append(run.copy())
+             tokens_with_style[-1]["text"] = token
+
+     # match each sentence token against the styled tokens
+     token_index = 0
+     tokenized_sentences_with_style = []
+     for sentence in tokenized_sentences:
+         sentence_with_style = []
+         for word in sentence:
+             if word == tokens_with_style[token_index]["text"]:
+                 sentence_with_style.append(tokens_with_style[token_index])
+                 token_index += 1
+             elif word.startswith(tokens_with_style[token_index]["text"]):
+                 # this token might be split into several runs
+                 word_left = word
+
+                 while word_left:
+                     sentence_with_style.append(tokens_with_style[token_index])
+                     word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
+                     token_index += 1
+             else:
+                 raise ValueError(f"could not match token {word!r} against the paragraph's runs")
+         tokenized_sentences_with_style.append(sentence_with_style)
+     return tokenized_sentences_with_style
+
+
+ # Copy the style of the original tokens onto the translated tokens using fast_align word alignments
+ def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
+     # clean the temp folder
+     for f in os.listdir(temp_folder):
+         os.remove(os.path.join(temp_folder, f))
+
+     temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
+
+     # tokenize the original text into sentences and words while keeping the style
+     original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
+     # tokenize the translated text into sentences and words
+     translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]
+
+     # write the file that fast_align will use
+     with open(temp_file_path, "w") as out_file:
+         for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+             out_file.write(f'{" ".join(item["text"] for item in original)} ||| {" ".join(translated)}\n')
+
+     alignments = aligner.align(temp_file_path)
+
+     # using the alignments generated by fast_align, copy the style of each original token to the translated one
+     translated_sentences_with_style = []
+     for sentence_idx, sentence_alignments in enumerate(alignments):
+
+         # invert the alignments and build a target -> source dict from them
+         sentence_alignments = {target: source for source, target in sentence_alignments}
+
+         translated_sentence_with_style = []
+         for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
+             if token_idx in sentence_alignments:
+                 # fast_align found an original token aligned with this translated one: copy its style
+                 original_idx = sentence_alignments[token_idx]
+                 new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+             else:
+                 # WARNING this is a test
+                 # fast_align doesn't know which original word this token comes from, so copy the
+                 # style of the previous translated word (or of the first original token when the
+                 # very first word of the sentence is unaligned)
+                 if translated_sentence_with_style:
+                     new_entry = translated_sentence_with_style[-1].copy()
+                 else:
+                     new_entry = original_tokenized_sentences_with_style[sentence_idx][0].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+
+         translated_sentences_with_style.append(translated_sentence_with_style)
+
+     return translated_sentences_with_style
+
+
+ # TODO: hook up the actual translation model; for now this is a placeholder
+ def translate_paragraph(paragraph_text):
+     translated_paragraph = ""
+     return translated_paragraph
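+
+ # A minimal sketch of what translate_paragraph could look like once a model is wired in.
+ # This is an assumption, not part of this commit: it uses the Hugging Face transformers
+ # translation pipeline with a hypothetical checkpoint name; the actual BSC/aina models
+ # may require a different runtime or a remote endpoint.
+ #
+ # from transformers import pipeline
+ #
+ # translator = pipeline("translation", model="your-org/your-ca-en-model")  # hypothetical id
+ #
+ # def translate_paragraph(paragraph_text):
+ #     if not paragraph_text.strip():
+ #         return ""
+ #     return translator(paragraph_text)[0]["translation_text"]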
+
+
+ # Group contiguous tokens that share the same style attributes into a single run
+ def group_by_style(values, detokenizer):
+     groups = []
+     for key, group in groupby(values, key=lambda x: (
+             x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
+         text = detokenizer.detokenize([item['text'] for item in group])
+
+         groups.append({"text": text,
+                        "bold": key[0],
+                        "italic": key[1],
+                        "underline": key[2],
+                        "font_name": key[3],
+                        "font_size": key[4],
+                        "font_color": key[5]})
+     return groups
+
+
+ def preprocess_runs(runs_in_paragraph):
+     new_runs = []
+
+     for run in runs_in_paragraph:
+         if not new_runs:
+             new_runs.append(run)
+         # if the previous run has the same format as the current run, merge the two runs together
+         elif (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["italic"] == run["italic"]
+               and new_runs[-1]["underline"] == run["underline"] and new_runs[-1]["font_name"] == run["font_name"]
+               and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["font_color"] == run["font_color"]):
+             new_runs[-1]["text"] += run["text"]
+         else:
+             new_runs.append(run)
+
+         # split runs that contain more than one sentence, to avoid problems later when aligning styles
+         sentences = sent_tokenize(new_runs[-1]["text"])
+         if len(sentences) > 1:
+             new_runs[-1]["text"] = sentences[0]
+             for sentence in sentences[1:]:
+                 new_run = new_runs[-1].copy()
+                 new_run["text"] = sentence
+                 new_runs.append(new_run)
+
+     return new_runs
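+
+ # For example, two adjacent italic runs "Hello " and "world. Bye." are first merged into one
+ # run "Hello world. Bye." and then split back into the runs "Hello world." and "Bye.",
+ # so that no run crosses a sentence boundary.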
+
+
+ if __name__ == "__main__":
+     input_file = 'data/test2.docx'
+     output_file = 'data/translated_output.docx'
+     source_lang = 'ca'
+     target_lang = 'en'
+     config_folder = "fast_align_config"
+     temp_folder = "tmp"
+
+     os.makedirs(temp_folder, exist_ok=True)
+
+     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+
+     # load the original file and extract its paragraphs with their runs (which carry style and formatting)
+     doc = Document(input_file)
+     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
+
+     detokenizer = TreebankWordDetokenizer()
+
+     # translate each paragraph
+     translated_paragraphs = []
+     for paragraph in paragraphs_with_runs:
+         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
+         translated_paragraphs.append(translate_paragraph(paragraph_text))
+
+     out_doc = Document()
+
+     for original_runs_in_paragraph, translated_paragraph in zip(paragraphs_with_runs, translated_paragraphs):
+         # sometimes we get empty paragraphs, presumably just docx shenanigans; skip them
+         if not original_runs_in_paragraph:
+             continue
+
+         original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
+
+         paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
+                                                    temp_folder, detokenizer)
+
+         para = out_doc.add_paragraph()
+
+         # flatten the paragraph, we don't need it split into sentences anymore
+         paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
+
+         # merge tokens back into runs and detokenize
+         paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
+
+         for item in paragraph_with_runs:
+             run = para.add_run(item["text"] + " ")
+             # preserve the original run formatting
+             run.bold = item['bold']
+             run.italic = item['italic']
+             run.underline = item['underline']
+             run.font.name = item['font_name']
+             run.font.size = item['font_size']
+             run.font.color.rgb = item['font_color']
+
+     out_doc.save(output_file)
readme.md ADDED
@@ -0,0 +1,18 @@
+ # document_translator
+
+ Project to translate files (for now .docx) using BSC's models while keeping the formatting and style of the original file.
+
+ ## Requirements
+
+ ### Python 3.12
+
+ ### fast_align
+
+ Clone https://github.com/clab/fast_align, run the compilation commands indicated in that project's readme, and place the resulting fast_align and atools binaries (.exe if on Windows) in this project's root.
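+
+ For reference, at the time of writing the upstream readme builds with cmake, roughly:
+
+     mkdir build && cd build
+     cmake ..
+     make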
11
+
+ ### fast_align fine-tuning files
+
+ I took the four files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from https://huggingface.co/projecte-aina/aina-translator-ca-en/tree/main. Maybe we could automate the download of these files. For now, place them in config_folder (defined in main.py).
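+
+ With the defaults in main.py, the expected layout is:
+
+     fast_align_config/ca-en.params
+     fast_align_config/ca-en.err
+     fast_align_config/en-ca.params
+     fast_align_config/en-ca.err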
15
+
+ ### Python requirements
+
+     pip install -r requirements.txt
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ nltk~=3.9.1
+ python-docx~=1.1.2