mjuvilla committed on
Commit 8f1143c · unverified · 2 Parent(s): 580106a 209a51e

Merge pull request #3 from langtech-bsc/any-doc

Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.12-slim
+
+ WORKDIR /app
+
+ COPY fast_align_config ./fast_align_config
+ COPY src ./src
+ COPY okapi-apps_gtk2-linux-x86_64_1.47.0 ./okapi-apps_gtk2-linux-x86_64_1.47.0
+ COPY gradio_app.py .
+ COPY requirements.txt .
+
+ COPY fast_align .
+ COPY atools .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+ RUN python -m spacy download xx_ent_wiki_sm
+
+ RUN apt-get update && \
+     apt-get install -y libgomp1 && \
+     apt-get install -y openjdk-17-jre-headless
+
+ CMD ["python", "gradio_app.py"]
gradio_app.py CHANGED
@@ -1,39 +1,47 @@
  import gradio as gr
- from pathlib import Path
- import requests
- import json
- from translate_docx import translate_document, translate, Aligner
- from nltk.tokenize.treebank import TreebankWordDetokenizer
+ from src.translate_any_doc import translate_document
+ from src.salamandraTA7b_translator import SalamandraTA7bTranslator
+ from src.aligner import Aligner
+ import os
 
-
- ip='10.192.31.127'
  config_folder = 'fast_align_config'
- source_lang = 'en'
- target_lang = 'ca'
  temp_folder = 'tmp'
- aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
- detokenizer = TreebankWordDetokenizer()
+ hf_token = os.getenv('HF_TOKEN')
+
+ translator = SalamandraTA7bTranslator(hf_token)
+
+
+ def upload_file(filepath, source_lang, target_lang):
+     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+     translated_file_name = translate_document(filepath, source_lang, target_lang, translator, aligner)
+     return [gr.UploadButton(visible=False),
+             gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
 
 
- def upload_file(filepath):
-     translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
-     return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
+ def before_processing():
+     return [
+         gr.UploadButton("Processing...", interactive=False),
+         gr.DownloadButton(visible=False)  # Keep download hidden until processing finishes
+     ]
+
 
  def download_file():
      return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
 
 
  with gr.Blocks() as demo:
-
      with gr.Tab("Text"):
-         gr.Interface(fn=translate, inputs=["text","text","text"], outputs="text")
-     with gr.Tab("Docx documents"):
+         gr.Interface(fn=translator.translate, inputs=["text", "text", "text"], outputs="text")
+     with gr.Tab("Documents"):
+         with gr.Row():
+             dropdown1 = gr.Dropdown(label="Source language", choices=["en", "ca"], value=None, interactive=True)
+             dropdown2 = gr.Dropdown(label="Target language", choices=["en", "ca"], value=None, interactive=True)
          gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
          with gr.Row():
              u = gr.UploadButton("Upload a file", file_count="single")
              d = gr.DownloadButton("Download the file", visible=False)
 
-     u.upload(upload_file, u, [u, d])
+     u.upload(fn=before_processing, inputs=None, outputs=[u, d]).then(upload_file, [u, dropdown1, dropdown2], [u, d])
      d.click(download_file, None, [u, d])
  if __name__ == "__main__":
-     demo.launch()
+     demo.launch(server_name="0.0.0.0", server_port=7860)
readme.md CHANGED
@@ -1,6 +1,6 @@
  # document_translator
 
- Project to translate files (for now .docx) using BSC's models while keeping the formatting and style of the original file.
+ Project to translate files using BSC's models while keeping the formatting and style of the original file.
 
  ## Requirements
  ### python 3.12
@@ -16,3 +16,17 @@ I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from ht
  ### python requirements
 
  pip install -r requirements.txt
+
+ ### mtuoc_aina_translator
+
+ To use this class you also need a running MTUOC translation server with the proper translation models. There is no
+ need to run fastalign on that side, since this project already runs it.
+
+ ### salamandrata7b_translator
+
+ Class that translates through Hugging Face's SalamandraTA-7B demo Space (it needs an HF token).
+
+ ## Docker
+
+     sudo docker build -t document-translator .
+     docker run -p 7860:7860 -e HF_TOKEN=your_token_here --rm -it document-translator
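For reference, a minimal sketch of how the pieces described in this readme fit together outside the Gradio app. This assumes the repo root as the working directory, a valid HF_TOKEN in the environment, the fast_align/atools binaries, the fast_align_config files and tikal in place, and the xx_ent_wiki_sm spaCy model installed; `example.docx` is a hypothetical input file:

```python
import os

from src.aligner import Aligner
from src.salamandraTA7b_translator import SalamandraTA7bTranslator
from src.translate_any_doc import translate_document

# translator client backed by the SalamandraTA-7B demo Space
translator = SalamandraTA7bTranslator(os.getenv("HF_TOKEN"))
# word aligner configured for English -> Catalan
aligner = Aligner("fast_align_config", "en", "ca", "tmp")

# returns the path of the translated document, with the original formatting preserved
translated_path = translate_document("example.docx", "en", "ca", translator, aligner)
print(translated_path)
```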
requirements.txt CHANGED
@@ -1,7 +1,8 @@
- nltk~=3.9.1
- python-docx~=1.1.2
- torch~=2.6.0
- transformers~=4.51.2
  iso-639~=0.4.5
  protobuf~=6.30.2
- sentencepiece~=0.2.0
+ requests~=2.32.3
+ tqdm~=4.67.1
+ gradio~=5.25.1
+ gradio_client~=1.8.0
+ setuptools~=80.0.0
+ spacy~=3.8.6
src/aligner.py ADDED
@@ -0,0 +1,82 @@
+ import fileinput
+ import os
+ import platform
+ from subprocess import Popen, PIPE
+
+ # Class to align original and translated sentences
+ # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
+ class Aligner():
+     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
+         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
+
+         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
+         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
+
+         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
+         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
+
+         if platform.system().lower() == "windows":
+             fastalign_bin = "fast_align.exe"
+             atools_bin = "atools.exe"
+         else:
+             fastalign_bin = "./fast_align"
+             atools_bin = "./atools"
+
+         self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")
+
+         self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
+                                 forward_params_path]
+         self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
+                                 reverse_params_path, "r"]
+
+         self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
+                                   self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
+
+     def __simplify_alignment_file(self, file):
+         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
+             for line in f:
+                 print(line.split('|||')[2].strip())
+
+     def __read_err(self, err):
+         (T, m) = ('', '')
+         for line in open(err):
+             # expected target length = source length * N
+             if 'expected target length' in line:
+                 m = line.split()[-1]
+             # final tension: N
+             elif 'final tension' in line:
+                 T = line.split()[-1]
+         return T, m
+
+     def align(self, original_sentences, translated_sentences):
+         # create temporary file which fastalign will use
+         with open(self.temp_file_path, "w") as temp_file:
+             for original, translated in zip(original_sentences, translated_sentences):
+                 temp_file.write(f"{original} ||| {translated}\n")
+
+         # generate forward alignment
+         with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
+             fw_process = Popen(self.forward_command, stdout=f_out)
+             # generate reverse alignment
+             r_process = Popen(self.reverse_command, stdout=r_out)
+
+             # wait for both to finish
+             fw_process.wait()
+             r_process.wait()
+
+         # for some reason the output file contains more information than needed, remove it
+         self.__simplify_alignment_file(self.forward_alignment_file_path)
+         self.__simplify_alignment_file(self.reverse_alignment_file_path)
+
+         # generate symmetrical alignment
+         process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+         process.wait()
+
+         # get final alignments and format them
+         alignments_str = process.communicate()[0].decode('utf-8')
+         alignments = []
+         for line in alignments_str.splitlines():
+             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
+
+         return alignments
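A minimal sketch of how the class above is meant to be used, assuming the fast_align/atools binaries, the en-ca/ca-en .params and .err files in fast_align_config, and an existing tmp folder; the exact alignment pairs depend on the trained parameters:

```python
from src.aligner import Aligner

aligner = Aligner("fast_align_config", "en", "ca", "tmp")
alignments = aligner.align(
    ["this is a test ."],      # original sentences, already tokenized and joined with spaces
    ["això és una prova ."],   # translated sentences, tokenized the same way
)
# one list of (source_index, target_index) pairs per sentence,
# e.g. [[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]]
print(alignments)
```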
src/mtuoc_aina_translator.py ADDED
@@ -0,0 +1,19 @@
+ import requests
+ import json
+
+
+ class MTUOCAinaTranslator:
+     def __init__(self, ip: str, port: str):
+         self.ip = ip
+         self.port = port
+
+     def translate(self, text, source_lang=None, target_lang=None):
+         myobj = {
+             'id': '1',
+             'src': text,
+         }
+         url = f'http://{self.ip}:{self.port}/translate'
+         # url = 'http://' + self.ip + ':' + self.port + '/translate'
+         x = requests.post(url, json=myobj)
+         json_response = json.loads(x.text)
+         return json_response['tgt']
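As the readme notes, this client only works against a running MTUOC translation server; a hedged usage sketch with a hypothetical host and port (the server's loaded model fixes the language pair, so the language arguments are ignored):

```python
from src.mtuoc_aina_translator import MTUOCAinaTranslator

# host and port of a running MTUOC server (hypothetical values)
translator = MTUOCAinaTranslator("192.168.20.216", "8000")
print(translator.translate("Hello world"))
```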
src/salamandraTA7b_translator.py ADDED
@@ -0,0 +1,24 @@
+ from gradio_client import Client
+ from iso639 import languages
+
+
+ class SalamandraTA7bTranslator:
+     def __init__(self, hf_token):
+         self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
+
+     def translate(self, text, source_lang, target_lang):
+         if not text:
+             return ""
+
+         # the languages are given as ISO 639-1 codes, so convert them to full names
+         lang1 = languages.get(alpha2=source_lang).name
+         lang2 = languages.get(alpha2=target_lang).name
+         result = self.client.predict(
+             task="Translation",
+             source=lang1,
+             target=lang2,
+             input_text=text,
+             mt_text=None,
+             api_name="/generate_output"
+         )
+         return result[0]
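A minimal usage sketch (assumes a Hugging Face token with access to the BSC-LT/SalamandraTA-7B-Demo Space; source and target languages are ISO 639-1 codes, as in the Gradio app):

```python
import os

from src.salamandraTA7b_translator import SalamandraTA7bTranslator

translator = SalamandraTA7bTranslator(os.getenv("HF_TOKEN"))
print(translator.translate("Good morning", "en", "ca"))
```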
src/translate_any_doc.py ADDED
@@ -0,0 +1,471 @@
+ import shutil
+ import string
+ import time
+ import os
+ from itertools import groupby
+ from subprocess import Popen, PIPE
+ import re
+
+ from src.aligner import Aligner
+
+ import glob
+ import spacy
+ from spacy.tokens import Doc
+
+ import tqdm
+
+ # Load multilingual model to use as sentence tokenizer
+ spacy_nlp = spacy.load("xx_ent_wiki_sm")
+ # Add the rule-based sentencizer
+ if "sentencizer" not in spacy_nlp.pipe_names:
+     spacy_nlp.add_pipe("sentencizer")
+
+
+ def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
+                       original_xliff_file_path: str) -> str:
+     """
+     Given a document, this function generates an xliff file and then a plain text file with the text contents
+     while keeping style and formatting using tags like <g id=1> </g>
+
+     Parameters:
+     input_file: Path to document to process
+     source_lang: Source language of the document
+     target_lang: Target language of the document
+     tikal_folder: Folder where tikal.sh is located
+     original_xliff_file_path: Path to the xliff file to generate, which will be used later
+
+     Returns:
+     string: Path to plain text file
+     """
+
+     tikal_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-x", input_file, "-nocopy", "-sl", source_lang,
+                            "-tl", target_lang]
+     Popen(tikal_xliff_command).wait()
+
+     tikal_moses_command = [os.path.join(tikal_folder, "tikal.sh"), "-xm", original_xliff_file_path, "-sl", source_lang,
+                            "-tl", target_lang]
+     Popen(tikal_moses_command).wait()
+
+     return os.path.join(original_xliff_file_path + f".{source_lang}")
+
+
+ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
+     """
+     Given some text that may or may not contain some chunks tagged with something like <g id=1> </g>, extract each
+     of the runs of text and convert them into dictionaries to keep this information
+
+     Parameters:
+     paragraph: Text to process
+     paragraph_index: Index of the paragraph in the file
+
+     Returns:
+     list[dict]: Where each element is a run with its text, the ids of the tags that wrap it (empty if untagged) and its paragraph_index
+     """
+
+     tag_stack = []
+     runs = []
+     pos = 0
+
+     # Match any tag: <tag id="123"/>, </tag>, or <tag id="123">
+     tag_pattern = re.compile(r'<(/?)(\w+)(?:\s+id="(\d+)")?\s*(/?)>')
+
+     for match in tag_pattern.finditer(paragraph):
+         start, end = match.span()
+         is_closing = match.group(1) == "/"
+         tag_name = match.group(2)
+         tag_id = match.group(3)
+         is_self_closing = match.group(4) == "/"
+
+         # Text before this tag
+         if start > pos:
+             text = paragraph[pos:start]
+             if text:
+                 runs.append({
+                     "text": text,
+                     "id": tag_stack.copy(),
+                     "paragraph_index": paragraph_index
+                 })
+
+         if is_closing:
+             # Closing tag </tag>
+             expected_prefix = f"{tag_name}_"
+             if tag_stack and tag_stack[-1].startswith(expected_prefix):
+                 tag_stack.pop()
+             else:
+                 raise ValueError(f"Mismatched closing tag </{tag_name}>")
+         elif is_self_closing:
+             # Self-closing tag like <x id="1"/>
+             if tag_id is None:
+                 raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
+             runs.append({
+                 "text": "",
+                 "id": [f"{tag_name}_{tag_id}"],
+                 "paragraph_index": paragraph_index
+             })
+         else:
+             # Opening tag <tag id="...">
+             if tag_id is None:
+                 raise ValueError(f"Opening tag <{tag_name}> missing id")
+             tag_stack.append(f"{tag_name}_{tag_id}")
+
+         pos = end
+
+     # Final trailing text
+     if pos < len(paragraph):
+         text = paragraph[pos:]
+         if text:
+             runs.append({
+                 "text": text,
+                 "id": tag_stack.copy(),
+                 "paragraph_index": paragraph_index
+             })
+
+     return runs
+
+
+ def tokenize_text(text, tokenizer):
+     # To avoid the tokenizer destroying URLs
+     def preserve_urls(text):
+         url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
+         # Find URLs using regex and replace them with a placeholder
+         urls = re.findall(url_pattern, text)
+         for idx, url in enumerate(urls):
+             placeholder = f"URL{idx}"
+             text = text.replace(url, placeholder)
+
+         return text, urls
+
+     # Replace URLs with placeholders
+     text, urls = preserve_urls(text)
+
+     # Tokenize using the provided tokenizer
+     tokens = tokenizer.tokenize(text)
+
+     # Revert placeholders back to original URLs
+     for idx, url in enumerate(urls):
+         placeholder = f"URL{idx}"
+         tokens = [token.replace(placeholder, url) for token in tokens]
+
+     return tokens
+
+
+ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str, str]]], list[list[bool]]]:
+     """
+     Given a list of runs, tokenize them by sentence and token while keeping the style of each token according
+     to its original run
+
+     Parameters:
+     runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
+
+     Returns:
+     list[list[dict]], list[list[bool]]: Tokenized sentences where each token carries the style of its original run, and whether each token is followed by a space
+     """
+
+     # it's a bit of a mess but first we get the tokenized sentences
+     # join runs and send through spacy to split into clean tokens
+     doc_from_runs = spacy_nlp("".join([run["text"] for run in runs]).strip())
+
+     # extract sentences and tokenize each into words
+     tokenized_sentences = [[token.text.strip() for token in sent if token.text.strip()] for sent in doc_from_runs.sents]
+     tokenized_sentences_spaces = [[token.whitespace_ != '' for token in sent if token.text.strip()] for sent in
+                                   doc_from_runs.sents]
+
+     flat_tokens = [token for sentence in tokenized_sentences for token in sentence]
+     flat_spaces = [token for sentence in tokenized_sentences_spaces for token in sentence]
+
+     flat_tokens_with_style = []
+     flat_spaces_with_style = []
+     token_idx = 0
+     for run in runs:
+         run["text"] = run["text"].strip()
+         while run["text"]:
+             if run["text"].startswith(flat_tokens[token_idx]):
+                 run["text"] = run["text"][len(flat_tokens[token_idx]):]
+                 if flat_spaces[token_idx]:
+                     run["text"] = run["text"].lstrip()
+                 item = run.copy()
+                 item["text"] = flat_tokens[token_idx]
+                 flat_tokens_with_style.append(item)
+                 flat_spaces_with_style.append(flat_spaces[token_idx])
+                 token_idx += 1
+             elif flat_tokens[token_idx].startswith(run["text"]):
+                 subtoken = flat_tokens[token_idx][:len(run["text"])]
+                 item = run.copy()
+                 item["text"] = subtoken
+                 flat_tokens_with_style.append(item)
+                 flat_spaces_with_style.append(False)
+                 flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
+                 run["text"] = run["text"][len(subtoken):]
+
+     # reconstruct the sentences
+     token_idx = 0
+     tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
+     for sentence, sentence_spaces in zip(tokenized_sentences, tokenized_sentences_spaces):
+         sentence_with_style, sentence_spaces_with_style = [], []
+         for token in sentence:
+             if token == flat_tokens_with_style[token_idx]["text"]:
+                 sentence_with_style.append(flat_tokens_with_style[token_idx])
+                 sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                 token_idx += 1
+             elif token.startswith(flat_tokens_with_style[token_idx]["text"]):
+                 while token:
+                     token = token[len(flat_tokens_with_style[token_idx]["text"]):]
+                     sentence_with_style.append(flat_tokens_with_style[token_idx])
+                     sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                     token_idx += 1
+             else:
+                 print(token)
+                 print(sentence)
+                 print(token_idx)
+                 print(flat_tokens_with_style)
+                 raise Exception("Something unexpected happened")
+         tokenized_sentences_with_style.append(sentence_with_style)
+         tokenized_sentences_spaces_with_style.append(sentence_spaces_with_style)
+
+     return tokenized_sentences_with_style, tokenized_sentences_spaces_with_style
+
+
+ def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[str, str]]],
+                         translated_sentences: list[str], aligner, temp_folder: str):
+     """
+     Given some original sentences with style and formatting and their translation without formatting, try to match
+     the translated text formatting with the original. Since we only want to run fastalign once, we temporarily
+     forget about paragraphs and work only with sentences, so the output is a list of sentences with information
+     about which paragraph each sentence came from
+
+     Parameters:
+     original_tokenized_sentences_with_style: Original text split into sentences with style information
+     translated_sentences: Translated text, split into sentences
+     aligner: Object of the aligner class, uses fastalign
+     temp_folder: Path to folder where to put all the intermediate files
+
+     Returns:
+     list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated
+     original token
+     """
+     # clean temp folder
+     for f in glob.glob(os.path.join(temp_folder, "*align*")):
+         os.remove(f)
+
+     # tokenize the translated text by sentence and word
+     translated_tokenized_sentences = []
+     # keep spacing information to detokenize properly later
+     translated_tokenized_sentences_spaces = []
+     for sentence in translated_sentences:
+         tokens = spacy_nlp(sentence)
+         translated_tokenized_sentences_spaces.append([token.whitespace_ != '' for token in tokens])
+         translated_tokenized_sentences.append([token.text for token in tokens])
+
+     assert len(translated_tokenized_sentences) == len(
+         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
+
+     original_sentences = []
+     translated_sentences = []
+     for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+         original_sentences.append(' '.join(item['text'] for item in original))
+         translated_sentences.append(' '.join(translated))
+
+     alignments = aligner.align(original_sentences, translated_sentences)
+
+     # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
+     translated_sentences_with_style = []
+     for sentence_idx, sentence_alignments in enumerate(alignments):
+
+         # reverse the order of the alignments and build a dict with it
+         sentence_alignments = {target: source for source, target in sentence_alignments}
+
+         translated_sentence_with_style: list[dict[str, str]] = []
+         for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
+             # fastalign has found a token aligned with the translated one
+             if token_idx in sentence_alignments.keys():
+                 # get the aligned token
+                 original_idx = sentence_alignments[token_idx]
+                 new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+             else:
+                 # WARNING this is a test
+                 # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
+                 new_entry = translated_sentence_with_style[-1].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+
+         translated_sentences_with_style.append(translated_sentence_with_style)
+
+     return translated_sentences_with_style, translated_tokenized_sentences_spaces
+
+
+ def group_by_style(tokens: list[dict[str, str]], spaces: list[bool]) -> list[dict[str, str]]:
+     """
+     To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
+     reconstruct the runs.
+
+     Parameters:
+     tokens: Tokens with style information
+
+     Returns:
+     list[dict]: A list of translated runs with format and style
+     """
+     groups = []
+     zipped = zip(tokens, spaces)
+     for key, group in groupby(zipped, key=lambda x: (x[0]["id"], x[0]["paragraph_index"])):
+         group = list(group)
+         tokens = [item[0]['text'] for item in group]
+         spaces = [item[1] for item in group]
+
+         text = Doc(spacy_nlp.vocab, words=tokens, spaces=spaces).text
+
+         groups.append({"text": text,
+                        "id": key[0],
+                        "paragraph_index": key[1]})
+     return groups
+
+
+ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str]]], out_file_path: str):
+     """
+     Generate a plain text file restoring the original tag structure like <g id=1> </g>
+
+     Parameters:
+     paragraphs_with_style: Dictionary where each key is the paragraph_index and its contents are a list of runs
+     out_file_path: Path to the file where the plain text will be saved
+     """
+     with open(out_file_path, "w") as out_file:
+
+         def close_tags(ids):
+             tag = ""
+             for gid in ids:
+                 tag_type, tag_id = gid.split("_")
+                 tag += f'</{tag_type}>'
+             return tag
+
+         def open_tags(ids):
+             tag = ""
+             for gid in ids:
+                 tag_type, tag_id = gid.split("_")
+                 tag += f'<{tag_type} id="{tag_id}">'
+             return tag
+
+         for key, paragraph in paragraphs_with_style.items():
+             for run in paragraph:
+                 ids = list(run["id"]) if run["id"] else []
+
+                 if ids:
+                     output = open_tags(ids) + run["text"] + close_tags(ids)
+                     out_file.write(output)
+
+                 else:
+                     out_file.write("".join(run["text"]))
+
+             out_file.write("\n")
+
+
+ def translate_document(input_file: str, source_lang: str, target_lang: str,
+                        translator,
+                        aligner: Aligner,
+                        temp_folder: str = "tmp",
+                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> str:
+     input_filename = input_file.split("/")[-1]
+     os.makedirs(temp_folder, exist_ok=True)
+
+     # copy the original file to the temp folder to avoid common issues with tikal
+     temp_input_file = os.path.join(temp_folder, input_filename)
+     shutil.copy(input_file, temp_input_file)
+
+     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
+     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
+
+     # get paragraphs with runs
+     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
+                             enumerate(open(plain_text_file).readlines())]
+
+     # translate using plaintext file
+     original_tokenized_sentences_with_style = []
+     original_spacing = []
+     for run in paragraphs_with_runs:
+         tokens, spaces = tokenize_with_runs(run)
+         original_tokenized_sentences_with_style += tokens
+         original_spacing += spaces
+
+     translated_sentences = []
+     for sentence, spacing in tqdm.tqdm(zip(original_tokenized_sentences_with_style, original_spacing),
+                                        desc="Translating paragraphs...",
+                                        total=len(original_tokenized_sentences_with_style)):
+         text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
+
+         while True:
+             try:
+                 translated_sentences.append(translator.translate(text, source_lang, target_lang))
+                 break
+             except Exception:  # retry until the translation request succeeds (e.g. after transient API errors)
+                 continue
+
+     # time to align the translation with the original
+     print("Generating alignments...")
+     start_time = time.time()
+     translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
+         original_tokenized_sentences_with_style,
+         translated_sentences, aligner,
+         temp_folder)
+     print(f"Finished alignments in {time.time() - start_time} seconds")
+
+     # since we tokenized these sentences independently, the spacing information does not contain spaces after punctuation
+     # at the end of the sentence (there's no space at the end of a sentence that ends with ".", unless there's a sentence
+     # right after)
+     for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
+         if sentence[-1]["text"] in string.punctuation:
+             sentence_spaces[-1] = True
+
+     # flatten the sentences into a list of tokens
+     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+     tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
+
+     # group the tokens by style/run
+     translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
+
+     # group the runs by original paragraph
+     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
+                                         range(len(paragraphs_with_runs))}
+
+     for item in translated_runs_with_style:
+         # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+         # didn't know where paragraphs started and ended
+         if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
+             first_item_in_paragraph = item.copy()
+             first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
+             translated_paragraphs_with_style[item['paragraph_index']] = []
+             translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
+         else:
+             translated_paragraphs_with_style[item['paragraph_index']].append(item)
+
+     # save to new plain text file
+     translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
+     runs_to_plain_text(translated_paragraphs_with_style, translated_moses_file)
+
+     # put the translations into the xlf
+     tikal_moses_to_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl",
+                                     source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
+                                     "-noalttrans", "-to", original_xliff_file]
+     Popen(tikal_moses_to_xliff_command).wait()
+
+     # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
+     # them. This may happen if a word in the original language has been split into more than one word with other
+     # words in between, or if fastalign made an error
+     text = open(original_xliff_file).read()
+     result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
+     open(original_xliff_file, "w").write(result)
+
+     # merge the translated xliff back into the original document format
+     tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
+     final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
+     stdout, stderr = final_process.communicate()
+     final_process.wait()
+
+     # get the path to the output file
+     output = stdout.decode('utf-8')
+     translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
+
+     print(f"Saved file in {translated_file_path}")
+     return translated_file_path
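To make the intermediate run structure concrete, a small sketch of what `get_runs_from_paragraph` above produces for one tagged line of the Moses-style plain text (assumes the module can be imported from the repo root and that the xx_ent_wiki_sm spaCy model is installed, since it is loaded at import time):

```python
from src.translate_any_doc import get_runs_from_paragraph

runs = get_runs_from_paragraph('Hello <g id="1">world</g>!', paragraph_index=0)
# [{'text': 'Hello ', 'id': [], 'paragraph_index': 0},
#  {'text': 'world', 'id': ['g_1'], 'paragraph_index': 0},
#  {'text': '!', 'id': [], 'paragraph_index': 0}]
print(runs)
```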
translate_docx.py → src/translate_docx.py RENAMED
@@ -8,17 +8,13 @@ from docx import Document
  from docx.text.hyperlink import Hyperlink
  from docx.text.run import Run
  import nltk
- import platform
 
  nltk.download('punkt')
  nltk.download('punkt_tab')
 
  from nltk.tokenize import sent_tokenize, word_tokenize
 
- from subprocess import Popen, PIPE
-
  from itertools import groupby
- import fileinput
 
  ip = "192.168.20.216"
  port = "8000"
@@ -36,85 +32,6 @@ def translate(text, ip, port):
  return json_response['tgt']
 
 
- # Class to align original and translated sentences
- # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
- class Aligner():
-     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
-         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
-         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
-
-         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
-         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
-
-         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
-         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
-
-         if platform.system().lower() == "windows":
-             fastalign_bin = "fast_align.exe"
-             atools_bin = "atools.exe"
-         else:
-             fastalign_bin = "./fast_align"
-             atools_bin = "./atools"
-
-         self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
-
-         self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
-                                 forward_params_path]
-         self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
-                                 reverse_params_path, "r"]
-
-         self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
-                                   self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
-
-     def __simplify_alignment_file(self, file):
-         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
-             for line in f:
-                 print(line.split('|||')[2].strip())
-
-     def __read_err(self, err):
-         (T, m) = ('', '')
-         for line in open(err):
-             # expected target length = source length * N
-             if 'expected target length' in line:
-                 m = line.split()[-1]
-             # final tension: N
-             elif 'final tension' in line:
-                 T = line.split()[-1]
-         return T, m
-
-     def align(self, original_sentences, translated_sentences):
-         # create temporary file which fastalign will use
-         with open(self.temp_file_path, "w") as temp_file:
-             for original, translated in zip(original_sentences, translated_sentences):
-                 temp_file.write(f"{original} ||| {translated}\n")
-
-         # generate forward alignment
-         with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
-             fw_process = Popen(self.forward_command, stdout=f_out)
-             # generate reverse alignment
-             r_process = Popen(self.reverse_command, stdout=r_out)
-
-             # wait for both to finish
-             fw_process.wait()
-             r_process.wait()
-
-         # for some reason the output file contains more information than needed, remove it
-         self.__simplify_alignment_file(self.forward_alignment_file_path)
-         self.__simplify_alignment_file(self.reverse_alignment_file_path)
-
-         # generate symmetrical alignment
-         process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
-         process.wait()
-
-         # get final alignments and format them
-         alignments_str = process.communicate()[0].decode('utf-8')
-         alignments = []
-         for line in alignments_str.splitlines():
-             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
-
-         return alignments
-
-
-
  # Function to extract paragraphs with their runs
  def extract_paragraphs_with_runs(doc):
      paragraphs_with_runs = []
@@ -200,6 +117,10 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
      translated_tokenized_sentences = [word_tokenize(sentence) for
                                        translated_paragraph in translated_paragraphs for sentence in
                                        sent_tokenize(translated_paragraph)]
+
+     assert len(translated_tokenized_sentences) == len(
+         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
+
      original_sentences = []
      translated_sentences = []
      for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):