mjuvilla committed on
Commit 8f1143c · unverified · 2 Parent(s): 580106a 209a51e

Merge pull request #3 from langtech-bsc/any-doc

Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.12-slim
+
+ WORKDIR /app
+
+ COPY fast_align_config ./fast_align_config
+ COPY src ./src
+ COPY okapi-apps_gtk2-linux-x86_64_1.47.0 ./okapi-apps_gtk2-linux-x86_64_1.47.0
+ COPY gradio_app.py .
+ COPY requirements.txt .
+
+ COPY fast_align .
+ COPY atools .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+ RUN python -m spacy download xx_ent_wiki_sm
+
+ RUN apt-get update && \
+     apt-get install -y libgomp1 && \
+     apt-get install -y openjdk-17-jre-headless
+
+ CMD ["python", "gradio_app.py"]
gradio_app.py CHANGED
@@ -1,39 +1,47 @@
  import gradio as gr
- from pathlib import Path
- import requests
- import json
- from translate_docx import translate_document, translate, Aligner
- from nltk.tokenize.treebank import TreebankWordDetokenizer
+ from src.translate_any_doc import translate_document
+ from src.salamandraTA7b_translator import SalamandraTA7bTranslator
+ from src.aligner import Aligner
+ import os
 
-
- ip='10.192.31.127'
  config_folder = 'fast_align_config'
- source_lang = 'en'
- target_lang = 'ca'
  temp_folder = 'tmp'
- aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
- detokenizer = TreebankWordDetokenizer()
+ hf_token = os.getenv('HF_TOKEN')
+
+ translator = SalamandraTA7bTranslator(hf_token)
+
+
+ def upload_file(filepath, source_lang, target_lang):
+     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+     translated_file_name = translate_document(filepath, source_lang, target_lang, translator, aligner)
+     return [gr.UploadButton(visible=False),
+             gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
 
 
- def upload_file(filepath):
-     translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
-     return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
+ def before_processing():
+     return [
+         gr.UploadButton("Processing...", interactive=False),
+         gr.DownloadButton(visible=False)  # Keep download hidden until processing finishes
+     ]
+
 
  def download_file():
      return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
 
 
  with gr.Blocks() as demo:
-
      with gr.Tab("Text"):
-         gr.Interface(fn=translate, inputs=["text","text","text"], outputs="text")
-     with gr.Tab("Docx documents"):
+         gr.Interface(fn=translator.translate, inputs=["text", "text", "text"], outputs="text")
+     with gr.Tab("Documents"):
+         with gr.Row():
+             dropdown1 = gr.Dropdown(label="Source language", choices=["en", "ca"], value=None, interactive=True)
+             dropdown2 = gr.Dropdown(label="Target language", choices=["en", "ca"], value=None, interactive=True)
          gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
          with gr.Row():
              u = gr.UploadButton("Upload a file", file_count="single")
              d = gr.DownloadButton("Download the file", visible=False)
 
-     u.upload(upload_file, u, [u, d])
+     u.upload(fn=before_processing, inputs=None, outputs=[u, d]).then(upload_file, [u, dropdown1, dropdown2], [u, d])
      d.click(download_file, None, [u, d])
  if __name__ == "__main__":
-     demo.launch()
+     demo.launch(server_name="0.0.0.0", server_port=7860)
readme.md CHANGED
@@ -1,6 +1,6 @@
  # document_translator
 
- Project to translate files (for now .docx) using BSC's models while keeping the formatting and style of the original file.
+ Project to translate files using BSC's models while keeping the formatting and style of the original file.
 
  ## Requirements
  ### python 3.12
@@ -16,3 +16,17 @@ I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from ht
  ### python requirements
 
  pip install -r requirements.txt
+
+ ### mtuoc_aina_translator
+
+ To use this class you also need a running MTUOC translation server with the proper translation models. There is no
+ need to run fastalign on that side, since this project already runs it.
+
+ ### salamandrata7b_translator
+
+ Class that translates through Hugging Face's SalamandraTA-7B demo Space (it needs an HF token).
+
+ ## Docker
+
+     sudo docker build -t document-translator .
+     docker run -p 7860:7860 -e HF_TOKEN=your_token_here --rm -it document-translator
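For reference, a minimal sketch of how the pieces described in this readme fit together outside the Gradio app. This assumes the repo root as the working directory, a valid HF_TOKEN in the environment, the fast_align/atools binaries, the fast_align_config files and tikal in place, and the xx_ent_wiki_sm spaCy model installed; `example.docx` is a hypothetical input file:

```python
import os

from src.aligner import Aligner
from src.salamandraTA7b_translator import SalamandraTA7bTranslator
from src.translate_any_doc import translate_document

# translator client backed by the SalamandraTA-7B demo Space
translator = SalamandraTA7bTranslator(os.getenv("HF_TOKEN"))
# word aligner configured for English -> Catalan
aligner = Aligner("fast_align_config", "en", "ca", "tmp")

# returns the path of the translated document, with the original formatting preserved
translated_path = translate_document("example.docx", "en", "ca", translator, aligner)
print(translated_path)
```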
requirements.txt CHANGED
@@ -1,7 +1,8 @@
- nltk~=3.9.1
- python-docx~=1.1.2
- torch~=2.6.0
- transformers~=4.51.2
  iso-639~=0.4.5
  protobuf~=6.30.2
- sentencepiece~=0.2.0
+ requests~=2.32.3
+ tqdm~=4.67.1
+ gradio~=5.25.1
+ gradio_client~=1.8.0
+ setuptools~=80.0.0
+ spacy~=3.8.6
src/aligner.py ADDED
@@ -0,0 +1,82 @@
+ import fileinput
+ import os
+ import platform
+ from subprocess import Popen, PIPE
+
+ # Class to align original and translated sentences
+ # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
+ class Aligner():
+     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
+         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
+
+         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
+         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
+
+         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
+         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
+
+         if platform.system().lower() == "windows":
+             fastalign_bin = "fast_align.exe"
+             atools_bin = "atools.exe"
+         else:
+             fastalign_bin = "./fast_align"
+             atools_bin = "./atools"
+
+         self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")
+
+         self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
+                                 forward_params_path]
+         self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
+                                 reverse_params_path, "r"]
+
+         self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
+                                   self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
+
+     def __simplify_alignment_file(self, file):
+         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
+             for line in f:
+                 print(line.split('|||')[2].strip())
+
+     def __read_err(self, err):
+         (T, m) = ('', '')
+         for line in open(err):
+             # expected target length = source length * N
+             if 'expected target length' in line:
+                 m = line.split()[-1]
+             # final tension: N
+             elif 'final tension' in line:
+                 T = line.split()[-1]
+         return T, m
+
+     def align(self, original_sentences, translated_sentences):
+         # create temporary file which fastalign will use
+         with open(self.temp_file_path, "w") as temp_file:
+             for original, translated in zip(original_sentences, translated_sentences):
+                 temp_file.write(f"{original} ||| {translated}\n")
+
+         # generate forward alignment
+         with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
+             fw_process = Popen(self.forward_command, stdout=f_out)
+             # generate reverse alignment
+             r_process = Popen(self.reverse_command, stdout=r_out)
+
+             # wait for both to finish
+             fw_process.wait()
+             r_process.wait()
+
+         # for some reason the output file contains more information than needed, remove it
+         self.__simplify_alignment_file(self.forward_alignment_file_path)
+         self.__simplify_alignment_file(self.reverse_alignment_file_path)
+
+         # generate symmetrical alignment
+         process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+         process.wait()
+
+         # get final alignments and format them
+         alignments_str = process.communicate()[0].decode('utf-8')
+         alignments = []
+         for line in alignments_str.splitlines():
+             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
+
+         return alignments
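A minimal sketch of how the class above is meant to be used, assuming the fast_align/atools binaries, the en-ca/ca-en .params and .err files in fast_align_config, and an existing tmp folder; the exact alignment pairs depend on the trained parameters:

```python
from src.aligner import Aligner

aligner = Aligner("fast_align_config", "en", "ca", "tmp")
alignments = aligner.align(
    ["this is a test ."],      # original sentences, already tokenized and joined with spaces
    ["això és una prova ."],   # translated sentences, tokenized the same way
)
# one list of (source_index, target_index) pairs per sentence,
# e.g. [[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]]
print(alignments)
```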
src/mtuoc_aina_translator.py ADDED
@@ -0,0 +1,19 @@
+ import requests
+ import json
+
+
+ class MTUOCAinaTranslator:
+     def __init__(self, ip: str, port: str):
+         self.ip = ip
+         self.port = port
+
+     def translate(self, text, source_lang=None, target_lang=None):
+         myobj = {
+             'id': '1',
+             'src': text,
+         }
+         url = f'http://{self.ip}:{self.port}/translate'
+         # url = 'http://' + self.ip + ':' + self.port + '/translate'
+         x = requests.post(url, json=myobj)
+         json_response = json.loads(x.text)
+         return json_response['tgt']
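As the readme notes, this client only works against a running MTUOC translation server; a hedged usage sketch with a hypothetical host and port (the server's loaded model fixes the language pair, so the language arguments are ignored):

```python
from src.mtuoc_aina_translator import MTUOCAinaTranslator

# host and port of a running MTUOC server (hypothetical values)
translator = MTUOCAinaTranslator("192.168.20.216", "8000")
print(translator.translate("Hello world"))
```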
src/salamandraTA7b_translator.py ADDED
@@ -0,0 +1,24 @@
+ from gradio_client import Client
+ from iso639 import languages
+
+
+ class SalamandraTA7bTranslator:
+     def __init__(self, hf_token):
+         self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
+
+     def translate(self, text, source_lang, target_lang):
+         if not text:
+             return ""
+
+         # the languages are given as ISO 639-1 codes, so convert them to full names
+         lang1 = languages.get(alpha2=source_lang).name
+         lang2 = languages.get(alpha2=target_lang).name
+         result = self.client.predict(
+             task="Translation",
+             source=lang1,
+             target=lang2,
+             input_text=text,
+             mt_text=None,
+             api_name="/generate_output"
+         )
+         return result[0]
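A minimal usage sketch (assumes a Hugging Face token with access to the BSC-LT/SalamandraTA-7B-Demo Space; source and target languages are ISO 639-1 codes, as in the Gradio app):

```python
import os

from src.salamandraTA7b_translator import SalamandraTA7bTranslator

translator = SalamandraTA7bTranslator(os.getenv("HF_TOKEN"))
print(translator.translate("Good morning", "en", "ca"))
```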
src/translate_any_doc.py ADDED
@@ -0,0 +1,471 @@
+ import shutil
+ import string
+ import time
+ import os
+ from itertools import groupby
+ from subprocess import Popen, PIPE
+ import re
+
+ from src.aligner import Aligner
+
+ import glob
+ import spacy
+ from spacy.tokens import Doc
+
+ import tqdm
+
+ # Load multilingual model to use as sentence tokenizer
+ spacy_nlp = spacy.load("xx_ent_wiki_sm")
+ # Add the rule-based sentencizer
+ if "sentencizer" not in spacy_nlp.pipe_names:
+     spacy_nlp.add_pipe("sentencizer")
+
+
+ def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
+                       original_xliff_file_path: str) -> str:
+     """
+     Given a document, this function generates an xliff file and then a plain text file with the text contents
+     while keeping style and formatting using tags like <g id=1> </g>
+
+     Parameters:
+     input_file: Path to document to process
+     source_lang: Source language of the document
+     target_lang: Target language of the document
+     tikal_folder: Folder where tikal.sh is located
+     original_xliff_file_path: Path to the xliff file to generate, which will be used later
+
+     Returns:
+     string: Path to plain text file
+     """
+
+     tikal_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-x", input_file, "-nocopy", "-sl", source_lang,
+                            "-tl", target_lang]
+     Popen(tikal_xliff_command).wait()
+
+     tikal_moses_command = [os.path.join(tikal_folder, "tikal.sh"), "-xm", original_xliff_file_path, "-sl", source_lang,
+                            "-tl", target_lang]
+     Popen(tikal_moses_command).wait()
+
+     return os.path.join(original_xliff_file_path + f".{source_lang}")
+
+
+ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
+     """
+     Given some text that may or may not contain some chunks tagged with something like <g id=1> </g>, extract each
+     of the runs of text and convert them into dictionaries to keep this information
+
+     Parameters:
+     paragraph: Text to process
+     paragraph_index: Index of the paragraph in the file
+
+     Returns:
+     list[dict]: Where each element is a run with its text, the ids of the tags that wrap it (empty if untagged) and its paragraph_index
+     """
+
+     tag_stack = []
+     runs = []
+     pos = 0
+
+     # Match any tag: <tag id="123"/>, </tag>, or <tag id="123">
+     tag_pattern = re.compile(r'<(/?)(\w+)(?:\s+id="(\d+)")?\s*(/?)>')
+
+     for match in tag_pattern.finditer(paragraph):
+         start, end = match.span()
+         is_closing = match.group(1) == "/"
+         tag_name = match.group(2)
+         tag_id = match.group(3)
+         is_self_closing = match.group(4) == "/"
+
+         # Text before this tag
+         if start > pos:
+             text = paragraph[pos:start]
+             if text:
+                 runs.append({
+                     "text": text,
+                     "id": tag_stack.copy(),
+                     "paragraph_index": paragraph_index
+                 })
+
+         if is_closing:
+             # Closing tag </tag>
+             expected_prefix = f"{tag_name}_"
+             if tag_stack and tag_stack[-1].startswith(expected_prefix):
+                 tag_stack.pop()
+             else:
+                 raise ValueError(f"Mismatched closing tag </{tag_name}>")
+         elif is_self_closing:
+             # Self-closing tag like <x id="1"/>
+             if tag_id is None:
+                 raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
+             runs.append({
+                 "text": "",
+                 "id": [f"{tag_name}_{tag_id}"],
+                 "paragraph_index": paragraph_index
+             })
+         else:
+             # Opening tag <tag id="...">
+             if tag_id is None:
+                 raise ValueError(f"Opening tag <{tag_name}> missing id")
+             tag_stack.append(f"{tag_name}_{tag_id}")
+
+         pos = end
+
+     # Final trailing text
+     if pos < len(paragraph):
+         text = paragraph[pos:]
+         if text:
+             runs.append({
+                 "text": text,
+                 "id": tag_stack.copy(),
+                 "paragraph_index": paragraph_index
+             })
+
+     return runs
+
+
+ def tokenize_text(text, tokenizer):
+     # To avoid the tokenizer destroying URLs
+     def preserve_urls(text):
+         url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
+         # Find URLs using regex and replace them with a placeholder
+         urls = re.findall(url_pattern, text)
+         for idx, url in enumerate(urls):
+             placeholder = f"URL{idx}"
+             text = text.replace(url, placeholder)
+
+         return text, urls
+
+     # Replace URLs with placeholders
+     text, urls = preserve_urls(text)
+
+     # Tokenize using the provided tokenizer
+     tokens = tokenizer.tokenize(text)
+
+     # Revert placeholders back to original URLs
+     for idx, url in enumerate(urls):
+         placeholder = f"URL{idx}"
+         tokens = [token.replace(placeholder, url) for token in tokens]
+
+     return tokens
+
+
+ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str, str]]], list[list[bool]]]:
+     """
+     Given a list of runs, tokenize them by sentence and token while keeping the style of each token according
+     to its original run
+
+     Parameters:
+     runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
+
+     Returns:
+     list[list[dict]], list[list[bool]]: Tokenized sentences where each token carries the style of its original run, and whether each token is followed by a space
+     """
+
+     # it's a bit of a mess but first we get the tokenized sentences
+     # join runs and send through spacy to split into clean tokens
+     doc_from_runs = spacy_nlp("".join([run["text"] for run in runs]).strip())
+
+     # extract sentences and tokenize each into words
+     tokenized_sentences = [[token.text.strip() for token in sent if token.text.strip()] for sent in doc_from_runs.sents]
+     tokenized_sentences_spaces = [[token.whitespace_ != '' for token in sent if token.text.strip()] for sent in
+                                   doc_from_runs.sents]
+
+     flat_tokens = [token for sentence in tokenized_sentences for token in sentence]
+     flat_spaces = [token for sentence in tokenized_sentences_spaces for token in sentence]
+
+     flat_tokens_with_style = []
+     flat_spaces_with_style = []
+     token_idx = 0
+     for run in runs:
+         run["text"] = run["text"].strip()
+         while run["text"]:
+             if run["text"].startswith(flat_tokens[token_idx]):
+                 run["text"] = run["text"][len(flat_tokens[token_idx]):]
+                 if flat_spaces[token_idx]:
+                     run["text"] = run["text"].lstrip()
+                 item = run.copy()
+                 item["text"] = flat_tokens[token_idx]
+                 flat_tokens_with_style.append(item)
+                 flat_spaces_with_style.append(flat_spaces[token_idx])
+                 token_idx += 1
+             elif flat_tokens[token_idx].startswith(run["text"]):
+                 subtoken = flat_tokens[token_idx][:len(run["text"])]
+                 item = run.copy()
+                 item["text"] = subtoken
+                 flat_tokens_with_style.append(item)
+                 flat_spaces_with_style.append(False)
+                 flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
+                 run["text"] = run["text"][len(subtoken):]
+
+     # reconstruct the sentences
+     token_idx = 0
+     tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
+     for sentence, sentence_spaces in zip(tokenized_sentences, tokenized_sentences_spaces):
+         sentence_with_style, sentence_spaces_with_style = [], []
+         for token in sentence:
+             if token == flat_tokens_with_style[token_idx]["text"]:
+                 sentence_with_style.append(flat_tokens_with_style[token_idx])
+                 sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                 token_idx += 1
+             elif token.startswith(flat_tokens_with_style[token_idx]["text"]):
+                 while token:
+                     token = token[len(flat_tokens_with_style[token_idx]["text"]):]
+                     sentence_with_style.append(flat_tokens_with_style[token_idx])
+                     sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                     token_idx += 1
+             else:
+                 print(token)
+                 print(sentence)
+                 print(token_idx)
+                 print(flat_tokens_with_style)
+                 raise Exception("Something unexpected happened")
+         tokenized_sentences_with_style.append(sentence_with_style)
+         tokenized_sentences_spaces_with_style.append(sentence_spaces_with_style)
+
+     return tokenized_sentences_with_style, tokenized_sentences_spaces_with_style
+
+
+ def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[str, str]]],
+                         translated_sentences: list[str], aligner, temp_folder: str):
+     """
+     Given some original sentences with style and formatting and their translation without formatting, try to match
+     the translated text formatting with the original. Since we only want to run fastalign once, we temporarily
+     forget about paragraphs and work only with sentences, so the output is a list of sentences with information
+     about which paragraph each sentence came from
+
+     Parameters:
+     original_tokenized_sentences_with_style: Original text split into sentences with style information
+     translated_sentences: Translated text, split into sentences
+     aligner: Object of the aligner class, uses fastalign
+     temp_folder: Path to folder where to put all the intermediate files
+
+     Returns:
+     list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated
+     original token
+     """
+     # clean temp folder
+     for f in glob.glob(os.path.join(temp_folder, "*align*")):
+         os.remove(f)
+
+     # tokenize the translated text by sentence and word
+     translated_tokenized_sentences = []
+     # keep spacing information to detokenize properly later
+     translated_tokenized_sentences_spaces = []
+     for sentence in translated_sentences:
+         tokens = spacy_nlp(sentence)
+         translated_tokenized_sentences_spaces.append([token.whitespace_ != '' for token in tokens])
+         translated_tokenized_sentences.append([token.text for token in tokens])
+
+     assert len(translated_tokenized_sentences) == len(
+         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
+
+     original_sentences = []
+     translated_sentences = []
+     for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+         original_sentences.append(' '.join(item['text'] for item in original))
+         translated_sentences.append(' '.join(translated))
+
+     alignments = aligner.align(original_sentences, translated_sentences)
+
+     # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
+     translated_sentences_with_style = []
+     for sentence_idx, sentence_alignments in enumerate(alignments):
+
+         # reverse the order of the alignments and build a dict with it
+         sentence_alignments = {target: source for source, target in sentence_alignments}
+
+         translated_sentence_with_style: list[dict[str, str]] = []
+         for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
+             # fastalign has found a token aligned with the translated one
+             if token_idx in sentence_alignments.keys():
+                 # get the aligned token
+                 original_idx = sentence_alignments[token_idx]
+                 new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+             else:
+                 # WARNING this is a test
+                 # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
+                 new_entry = translated_sentence_with_style[-1].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+
+         translated_sentences_with_style.append(translated_sentence_with_style)
+
+     return translated_sentences_with_style, translated_tokenized_sentences_spaces
+
+
+ def group_by_style(tokens: list[dict[str, str]], spaces: list[bool]) -> list[dict[str, str]]:
+     """
+     To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
+     reconstruct the runs.
+
+     Parameters:
+     tokens: Tokens with style information
+
+     Returns:
+     list[dict]: A list of translated runs with format and style
+     """
+     groups = []
+     zipped = zip(tokens, spaces)
+     for key, group in groupby(zipped, key=lambda x: (x[0]["id"], x[0]["paragraph_index"])):
+         group = list(group)
+         tokens = [item[0]['text'] for item in group]
+         spaces = [item[1] for item in group]
+
+         text = Doc(spacy_nlp.vocab, words=tokens, spaces=spaces).text
+
+         groups.append({"text": text,
+                        "id": key[0],
+                        "paragraph_index": key[1]})
+     return groups
+
+
+ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str]]], out_file_path: str):
+     """
+     Generate a plain text file restoring the original tag structure like <g id=1> </g>
+
+     Parameters:
+     paragraphs_with_style: Dictionary where each key is the paragraph_index and its contents are a list of runs
+     out_file_path: Path to the file where the plain text will be saved
+     """
+     with open(out_file_path, "w") as out_file:
+
+         def close_tags(ids):
+             tag = ""
+             for gid in ids:
+                 tag_type, tag_id = gid.split("_")
+                 tag += f'</{tag_type}>'
+             return tag
+
+         def open_tags(ids):
+             tag = ""
+             for gid in ids:
+                 tag_type, tag_id = gid.split("_")
+                 tag += f'<{tag_type} id="{tag_id}">'
+             return tag
+
+         for key, paragraph in paragraphs_with_style.items():
+             for run in paragraph:
+                 ids = list(run["id"]) if run["id"] else []
+
+                 if ids:
+                     output = open_tags(ids) + run["text"] + close_tags(ids)
+                     out_file.write(output)
+
+                 else:
+                     out_file.write("".join(run["text"]))
+
+             out_file.write("\n")
+
+
+ def translate_document(input_file: str, source_lang: str, target_lang: str,
+                        translator,
+                        aligner: Aligner,
+                        temp_folder: str = "tmp",
+                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> str:
+     input_filename = input_file.split("/")[-1]
+     os.makedirs(temp_folder, exist_ok=True)
+
+     # copy the original file to the temp folder to avoid common issues with tikal
+     temp_input_file = os.path.join(temp_folder, input_filename)
+     shutil.copy(input_file, temp_input_file)
+
+     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
+     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
+
+     # get paragraphs with runs
+     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
+                             enumerate(open(plain_text_file).readlines())]
+
+     # translate using plaintext file
+     original_tokenized_sentences_with_style = []
+     original_spacing = []
+     for run in paragraphs_with_runs:
+         tokens, spaces = tokenize_with_runs(run)
+         original_tokenized_sentences_with_style += tokens
+         original_spacing += spaces
+
+     translated_sentences = []
+     for sentence, spacing in tqdm.tqdm(zip(original_tokenized_sentences_with_style, original_spacing),
+                                        desc="Translating paragraphs...",
+                                        total=len(original_tokenized_sentences_with_style)):
+         text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
+
+         while True:
+             try:
+                 translated_sentences.append(translator.translate(text, source_lang, target_lang))
+                 break
+             except Exception:  # retry until the translation request succeeds (e.g. after transient API errors)
+                 continue
+
+     # time to align the translation with the original
+     print("Generating alignments...")
+     start_time = time.time()
+     translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
+         original_tokenized_sentences_with_style,
+         translated_sentences, aligner,
+         temp_folder)
+     print(f"Finished alignments in {time.time() - start_time} seconds")
+
+     # since we tokenized these sentences independently, the spacing information does not contain spaces after punctuation
+     # at the end of the sentence (there's no space at the end of a sentence that ends with ".", unless there's a sentence
+     # right after)
+     for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
+         if sentence[-1]["text"] in string.punctuation:
+             sentence_spaces[-1] = True
+
+     # flatten the sentences into a list of tokens
+     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+     tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
+
+     # group the tokens by style/run
+     translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
+
+     # group the runs by original paragraph
+     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
+                                         range(len(paragraphs_with_runs))}
+
+     for item in translated_runs_with_style:
+         # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+         # didn't know where paragraphs started and ended
+         if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
+             first_item_in_paragraph = item.copy()
+             first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
+             translated_paragraphs_with_style[item['paragraph_index']] = []
+             translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
+         else:
+             translated_paragraphs_with_style[item['paragraph_index']].append(item)
+
+     # save to new plain text file
+     translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
+     runs_to_plain_text(translated_paragraphs_with_style, translated_moses_file)
+
+     # put the translations into the xlf
+     tikal_moses_to_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl",
+                                     source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
+                                     "-noalttrans", "-to", original_xliff_file]
+     Popen(tikal_moses_to_xliff_command).wait()
+
+     # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
+     # them. This may happen if a word in the original language has been split into more than one word with other
+     # words in between, or if fastalign made an error
+     text = open(original_xliff_file).read()
+     result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
+     open(original_xliff_file, "w").write(result)
+
+     # merge the translated xliff back into the original document format
+     tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
+     final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
+     stdout, stderr = final_process.communicate()
+     final_process.wait()
+
+     # get the path to the output file
+     output = stdout.decode('utf-8')
+     translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
+
+     print(f"Saved file in {translated_file_path}")
+     return translated_file_path
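To make the intermediate run structure concrete, a small sketch of what `get_runs_from_paragraph` above produces for one tagged line of the Moses-style plain text (assumes the module can be imported from the repo root and that the xx_ent_wiki_sm spaCy model is installed, since it is loaded at import time):

```python
from src.translate_any_doc import get_runs_from_paragraph

runs = get_runs_from_paragraph('Hello <g id="1">world</g>!', paragraph_index=0)
# [{'text': 'Hello ', 'id': [], 'paragraph_index': 0},
#  {'text': 'world', 'id': ['g_1'], 'paragraph_index': 0},
#  {'text': '!', 'id': [], 'paragraph_index': 0}]
print(runs)
```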
translate_docx.py → src/translate_docx.py RENAMED
@@ -8,17 +8,13 @@ from docx import Document
  from docx.text.hyperlink import Hyperlink
  from docx.text.run import Run
  import nltk
- import platform
 
  nltk.download('punkt')
  nltk.download('punkt_tab')
 
  from nltk.tokenize import sent_tokenize, word_tokenize
 
- from subprocess import Popen, PIPE
-
  from itertools import groupby
- import fileinput
 
  ip = "192.168.20.216"
  port = "8000"
@@ -36,85 +32,6 @@ def translate(text, ip, port):
  return json_response['tgt']
 
 
- # Class to align original and translated sentences
- # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
- class Aligner():
-     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
-         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
-         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
-
-         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
-         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
-
-         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
-         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
-
-         if platform.system().lower() == "windows":
-             fastalign_bin = "fast_align.exe"
-             atools_bin = "atools.exe"
-         else:
-             fastalign_bin = "./fast_align"
-             atools_bin = "./atools"
-
-         self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
-
-         self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
-                                 forward_params_path]
-         self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
-                                 reverse_params_path, "r"]
-
-         self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
-                                   self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
-
-     def __simplify_alignment_file(self, file):
-         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
-             for line in f:
-                 print(line.split('|||')[2].strip())
-
-     def __read_err(self, err):
-         (T, m) = ('', '')
-         for line in open(err):
-             # expected target length = source length * N
-             if 'expected target length' in line:
-                 m = line.split()[-1]
-             # final tension: N
-             elif 'final tension' in line:
-                 T = line.split()[-1]
-         return T, m
-
-     def align(self, original_sentences, translated_sentences):
-         # create temporary file which fastalign will use
-         with open(self.temp_file_path, "w") as temp_file:
-             for original, translated in zip(original_sentences, translated_sentences):
-                 temp_file.write(f"{original} ||| {translated}\n")
-
-         # generate forward alignment
-         with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
-             fw_process = Popen(self.forward_command, stdout=f_out)
-             # generate reverse alignment
-             r_process = Popen(self.reverse_command, stdout=r_out)
-
-             # wait for both to finish
-             fw_process.wait()
-             r_process.wait()
-
-         # for some reason the output file contains more information than needed, remove it
-         self.__simplify_alignment_file(self.forward_alignment_file_path)
-         self.__simplify_alignment_file(self.reverse_alignment_file_path)
-
-         # generate symmetrical alignment
-         process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
-         process.wait()
-
-         # get final alignments and format them
-         alignments_str = process.communicate()[0].decode('utf-8')
-         alignments = []
-         for line in alignments_str.splitlines():
-             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
-
-         return alignments
-
-
-
  # Function to extract paragraphs with their runs
  def extract_paragraphs_with_runs(doc):
      paragraphs_with_runs = []
@@ -200,6 +117,10 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
      translated_tokenized_sentences = [word_tokenize(sentence) for
                                        translated_paragraph in translated_paragraphs for sentence in
                                        sent_tokenize(translated_paragraph)]
+
+     assert len(translated_tokenized_sentences) == len(
+         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"
+
      original_sentences = []
      translated_sentences = []
      for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):