Merge pull request #3 from langtech-bsc/any-doc
Files changed:
- Dockerfile +21 -0
- gradio_app.py +27 -19
- readme.md +15 -1
- requirements.txt +6 -5
- src/aligner.py +82 -0
- src/mtuoc_aina_translator.py +19 -0
- src/salamandraTA7b_translator.py +24 -0
- src/translate_any_doc.py +471 -0
- translate_docx.py → src/translate_docx.py +4 -83
Dockerfile
ADDED
FROM python:3.12-slim

WORKDIR /app

COPY fast_align_config ./fast_align_config
COPY src ./src
COPY okapi-apps_gtk2-linux-x86_64_1.47.0 ./okapi-apps_gtk2-linux-x86_64_1.47.0
COPY gradio_app.py .
COPY requirements.txt .

COPY fast_align .
COPY atools .

RUN pip install --no-cache-dir -r requirements.txt
RUN python -m spacy download xx_ent_wiki_sm

RUN apt-get update && \
    apt-get install libgomp1 && \
    apt-get install -y openjdk-17-jre-headless

CMD ["python", "gradio_app.py"]
gradio_app.py
CHANGED
The previous version hard-coded the MTUOC server IP (10.192.31.127), the language pair (en/ca) and nltk's TreebankWordDetokenizer; the new version reads the HF token from the environment, adds source/target language dropdowns, and delegates to the new src modules. New contents:

import gradio as gr
from src.translate_any_doc import translate_document
from src.salamandraTA7b_translator import SalamandraTA7bTranslator
from src.aligner import Aligner
import os

config_folder = 'fast_align_config'
temp_folder = 'tmp'
hf_token = os.getenv('HF_TOKEN')

translator = SalamandraTA7bTranslator(hf_token)


def upload_file(filepath, source_lang, target_lang):
    aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
    translated_file_name = translate_document(filepath, source_lang, target_lang, translator, aligner)
    return [gr.UploadButton(visible=False),
            gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]


def before_processing():
    return [
        gr.UploadButton("Processing...", interactive=False),
        gr.DownloadButton(visible=False)  # Keep download hidden until processing finishes
    ]


def download_file():
    return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]


with gr.Blocks() as demo:
    with gr.Tab("Text"):
        gr.Interface(fn=translator.translate, inputs=["text", "text", "text"], outputs="text")
    with gr.Tab("Documents"):
        with gr.Row():
            dropdown1 = gr.Dropdown(label="Source language", choices=["en", "ca"], value=None, interactive=True)
            dropdown2 = gr.Dropdown(label="Target language", choices=["en", "ca"], value=None, interactive=True)
        gr.Markdown("First upload a file and and then you'll be able download it (but only once!)")
        with gr.Row():
            u = gr.UploadButton("Upload a file", file_count="single")
            d = gr.DownloadButton("Download the file", visible=False)

        u.upload(fn=before_processing, inputs=None, outputs=[u, d]).then(upload_file, [u, dropdown1, dropdown2], [u, d])
        d.click(download_file, None, [u, d])
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
readme.md
CHANGED
@@ -1,6 +1,6 @@
 # document_translator
 
-Project to translate files
+Project to translate files using BSC's models while keeping the formatting and style of the original file.
 
 ## Requirements
 ### python 3.12
@@ -16,3 +16,17 @@ I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from ht
 ### python requirements
 
 pip install -r requirements.txt
+
+### mtuoc_aina_translator
+
+To use this class you also need to be running MTUOC's translation server with the proper translation models. There's also no
+need to use fastalign on that side since the current project already runs it.
+
+### salamandrata7b_translator
+
+Class that uses huggingface's demo.
+
+## Docker
+
+sudo docker build -t document-translator .
+docker run -p 7860:7860 -e HF_TOKEN=your_token_here --rm -it document-translator
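For orientation, here is a minimal sketch of how the pieces documented above fit together outside of Gradio. The class and function signatures come from the files in this PR; the document path, server address and token handling are placeholder assumptions:

import os

from src.aligner import Aligner
from src.salamandraTA7b_translator import SalamandraTA7bTranslator
# from src.mtuoc_aina_translator import MTUOCAinaTranslator  # alternative backend, needs a running MTUOC server
from src.translate_any_doc import translate_document

source_lang, target_lang = "en", "ca"

# Placeholder: HF_TOKEN must grant access to the BSC-LT/SalamandraTA-7B-Demo Space
translator = SalamandraTA7bTranslator(os.getenv("HF_TOKEN"))
# translator = MTUOCAinaTranslator("192.168.20.216", "8000")  # hypothetical server address/port

aligner = Aligner("fast_align_config", source_lang, target_lang, "tmp")

# "example.docx" is a placeholder; tikal, fast_align and atools must be available as set up in the Dockerfile
translated_path = translate_document("example.docx", source_lang, target_lang, translator, aligner)
print(translated_path)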
requirements.txt
CHANGED
@@ -1,7 +1,8 @@
-nltk~=3.9.1
-python-docx~=1.1.2
-torch~=2.6.0
-transformers~=4.51.2
 iso-639~=0.4.5
 protobuf~=6.30.2
-
+requests~=2.32.3
+tqdm~=4.67.1
+gradio~=5.25.1
+gradio_client~=1.8.0
+setuptools~=80.0.0
+spacy~=3.8.6
src/aligner.py
ADDED
import fileinput
import os
import platform
from subprocess import Popen, PIPE

# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
                                forward_params_path]
        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
                                reverse_params_path, "r"]

        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]

    def __simplify_alignment_file(self, file):
        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
            for line in f:
                print(line.split('|||')[2].strip())

    def __read_err(self, err):
        (T, m) = ('', '')
        for line in open(err):
            # expected target length = source length * N
            if 'expected target length' in line:
                m = line.split()[-1]
            # final tension: N
            elif 'final tension' in line:
                T = line.split()[-1]
        return T, m

    def align(self, original_sentences, translated_sentences):
        # create temporary file which fastalign will use
        with open(self.temp_file_path, "w") as temp_file:
            for original, translated in zip(original_sentences, translated_sentences):
                temp_file.write(f"{original} ||| {translated}\n")

        # generate forward alignment
        with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            # generate reverse alignment
            r_process = Popen(self.reverse_command, stdout=r_out)

            # wait for both to finish
            fw_process.wait()
            r_process.wait()

        # for some reason the output file contains more information than needed, remove it
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        process.wait()

        # get final alignments and format them
        alignments_str = process.communicate()[0].decode('utf-8')
        alignments = []
        for line in alignments_str.splitlines():
            alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])

        return alignments
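As a rough usage sketch (assuming the fast_align/atools binaries, the .params/.err files and a tmp folder are in place as set up by the Dockerfile), the aligner takes pre-tokenized, space-joined sentence pairs and returns one list of (source_index, target_index) pairs per sentence:

import os
from src.aligner import Aligner

os.makedirs("tmp", exist_ok=True)
aligner = Aligner("fast_align_config", "en", "ca", "tmp")

# Hypothetical sentence pair, already tokenized and space-joined
alignments = aligner.align(["this is a test ."], ["això és una prova ."])
print(alignments[0])  # e.g. [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)] for a one-to-one alignment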
src/mtuoc_aina_translator.py
ADDED
import requests
import json


class MTUOCAinaTranslator:
    def __init__(self, ip: str, port: str):
        self.ip = ip
        self.port = port

    def translate(self, text, source_lang=None, target_lang=None):
        myobj = {
            'id': '1',
            'src': text,
        }
        url = f'http://{self.ip}:{self.port}/translate'
        #url = 'http://' + self.ip + ':' + self.port + '/translate'
        x = requests.post(url, json=myobj)
        json_response = json.loads(x.text)
        return json_response['tgt']
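A short usage sketch, assuming an MTUOC translation server is reachable (the address below is a placeholder; the server decides the language pair, so the language arguments are unused by this class):

from src.mtuoc_aina_translator import MTUOCAinaTranslator

# Hypothetical address/port; the class POSTs {'id': '1', 'src': text} to /translate and returns the 'tgt' field
translator = MTUOCAinaTranslator("192.168.20.216", "8000")
print(translator.translate("Hello world"))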
src/salamandraTA7b_translator.py
ADDED
from gradio_client import Client
from iso639 import languages


class SalamandraTA7bTranslator:
    def __init__(self, hf_token):
        self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)

    def translate(self, text, source_lang, target_lang):
        if not text:
            return ""

        # we assume that they are specifying the language by code so we need to convert it to name
        lang1 = languages.get(alpha2=source_lang).name
        lang2 = languages.get(alpha2=target_lang).name
        result = self.client.predict(
            task="Translation",
            source=lang1,
            target=lang2,
            input_text=text,
            mt_text=None,
            api_name="/generate_output"
        )
        return result[0]
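A minimal usage sketch, assuming HF_TOKEN grants access to the BSC-LT/SalamandraTA-7B-Demo Space; the language arguments are ISO 639-1 codes that the class converts to full language names for the Space API:

import os
from src.salamandraTA7b_translator import SalamandraTA7bTranslator

translator = SalamandraTA7bTranslator(os.getenv("HF_TOKEN"))
print(translator.translate("Good morning", "en", "ca"))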
src/translate_any_doc.py
ADDED
import shutil
import string
import time
import os
from itertools import groupby
from subprocess import Popen, PIPE
import re

from src.aligner import Aligner

import glob
import spacy
from spacy.tokens import Doc

import tqdm

# Load multilingual model to use as sentence tokenizer
spacy_nlp = spacy.load("xx_ent_wiki_sm")
# Add the rule-based sentencizer
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")


def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                      original_xliff_file_path: str) -> str:
    """
    Given a document, this function generates an xliff file and then a plain text file with the text contents
    while keeping style and formatting using tags like <g id=1> </g>

    Parameters:
    input_file: Path to document to process
    source_lang: Source language of the document
    target_lang: Target language of the document
    tikal_folder: Folder where tikal.sh is located
    original_xliff_file_path: Path to xliff file to generate, which will be use later

    Returns:
    string: Path to plain text file
    """

    tikal_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-x", input_file, "-nocopy", "-sl", source_lang,
                           "-tl", target_lang]
    Popen(tikal_xliff_command).wait()

    tikal_moses_command = [os.path.join(tikal_folder, "tikal.sh"), "-xm", original_xliff_file_path, "-sl", source_lang,
                           "-tl", target_lang]
    Popen(tikal_moses_command).wait()

    return os.path.join(original_xliff_file_path + f".{source_lang}")


def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
    """
    Given some text that may or may not contain some chunks tagged with something like <g id=1> </g>, extract each
    of the runs of text and convert them into dictionaries to keep this information

    Parameters:
    text: Text to process
    paragraph_index: Index of the paragraph in the file

    Returns:
    list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
    """

    tag_stack = []
    runs = []
    pos = 0

    # Match any tag: <tag id="123"/>, </tag>, or <tag id="123">
    tag_pattern = re.compile(r'<(/?)(\w+)(?:\s+id="(\d+)")?\s*(/?)>')

    for match in tag_pattern.finditer(paragraph):
        start, end = match.span()
        is_closing = match.group(1) == "/"
        tag_name = match.group(2)
        tag_id = match.group(3)
        is_self_closing = match.group(4) == "/"

        # Text before this tag
        if start > pos:
            text = paragraph[pos:start]
            if text:
                runs.append({
                    "text": text,
                    "id": tag_stack.copy(),
                    "paragraph_index": paragraph_index
                })

        if is_closing:
            # Closing tag </tag>
            expected_prefix = f"{tag_name}_"
            if tag_stack and tag_stack[-1].startswith(expected_prefix):
                tag_stack.pop()
            else:
                raise ValueError(f"Mismatched closing tag </{tag_name}>")
        elif is_self_closing:
            # Self-closing tag like <x id="1"/>
            if tag_id is None:
                raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
            runs.append({
                "text": "",
                "id": [f"{tag_name}_{tag_id}"],
                "paragraph_index": paragraph_index
            })
        else:
            # Opening tag <tag id="...">
            if tag_id is None:
                raise ValueError(f"Opening tag <{tag_name}> missing id")
            tag_stack.append(f"{tag_name}_{tag_id}")

        pos = end

    # Final trailing text
    if pos < len(paragraph):
        text = paragraph[pos:]
        if text:
            runs.append({
                "text": text,
                "id": tag_stack.copy(),
                "paragraph_index": paragraph_index
            })

    return runs


def tokenize_text(text, tokenizer):
    # To avoid the tokenizer destroying the url
    def preserve_urls(text):
        url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
        # Find URLs using regex and replace them with a placeholder
        urls = re.findall(url_pattern, text)
        for idx, url in enumerate(urls):
            placeholder = f"URL{idx}"
            text = text.replace(url, placeholder)

        return text, urls

    # Replace URLs with placeholders
    text, urls = preserve_urls(text)

    # Tokenize using Sacremoses
    tokens = tokenizer.tokenize(text)

    # Revert placeholders back to original URLs
    for idx, url in enumerate(urls):
        placeholder = f"URL{idx}"
        tokens = [token.replace(placeholder, url) for token in tokens]

    return tokens


def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str, str]]], list[list[bool]]]:
    """
    Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
    to its original run

    Parameters:
    runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
    source_lang: Language of the document

    Returns:
    list[list[dict]]: A list of tokenized sentences where each token contains the style of its original run
    """

    # it's a bit of a mess but first we get the tokenized sentences
    # join runs and send through spacy to split into clean tokens
    doc_from_runs = spacy_nlp("".join([run["text"] for run in runs]).strip())

    # extract sentences and tokenize each into words
    tokenized_sentences = [[token.text.strip() for token in sent if token.text.strip()] for sent in doc_from_runs.sents]
    tokenized_sentences_spaces = [[token.whitespace_ != '' for token in sent if token.text.strip()] for sent in
                                  doc_from_runs.sents]

    flat_tokens = [token for sentence in tokenized_sentences for token in sentence]
    flat_spaces = [token for sentence in tokenized_sentences_spaces for token in sentence]

    flat_tokens_with_style = []
    flat_spaces_with_style = []
    token_idx = 0
    for run in runs:
        run["text"] = run["text"].strip()
        while run["text"]:
            if run["text"].startswith(flat_tokens[token_idx]):
                run["text"] = run["text"][len(flat_tokens[token_idx]):]
                if flat_spaces[token_idx]:
                    run["text"] = run["text"].lstrip()
                item = run.copy()
                item["text"] = flat_tokens[token_idx]
                flat_tokens_with_style.append(item)
                flat_spaces_with_style.append(flat_spaces[token_idx])
                token_idx += 1
            elif flat_tokens[token_idx].startswith(run["text"]):
                subtoken = flat_tokens[token_idx][:len(run["text"])]
                item = run.copy()
                item["text"] = subtoken
                flat_tokens_with_style.append(item)
                flat_spaces_with_style.append(False)
                flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
                run["text"] = run["text"][len(subtoken):]

    # reconstruct the sentences
    token_idx = 0
    tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
    for sentence, sentence_spaces in zip(tokenized_sentences, tokenized_sentences_spaces):
        sentence_with_style, sentence_spaces_with_style = [], []
        for token in sentence:
            if token == flat_tokens_with_style[token_idx]["text"]:
                sentence_with_style.append(flat_tokens_with_style[token_idx])
                sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
                token_idx += 1
            elif token.startswith(flat_tokens_with_style[token_idx]["text"]):
                while token:
                    token = token[len(flat_tokens_with_style[token_idx]["text"]):]
                    sentence_with_style.append(flat_tokens_with_style[token_idx])
                    sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
                    token_idx += 1
            else:
                print(token)
                print(sentence)
                print(token_idx)
                print(flat_tokens_with_style)
                raise Exception(f"Something unexpected happened")
        tokenized_sentences_with_style.append(sentence_with_style)
        tokenized_sentences_spaces_with_style.append(sentence_spaces_with_style)

    return tokenized_sentences_with_style, tokenized_sentences_spaces_with_style


def generate_alignments(original_tokenized_sentences_with_style: list[list[dict[str, str]]],
                        translated_sentences: list[str], aligner, temp_folder: str):
    """
    Given some original sentences with style and formatting and its translation without formatting, try to match
    the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily
    forget about paragraphs and work only in sentences, so the output is a list of sentences but with information about
    from which paragraph that sentence came from

    Parameters:
    original_tokenized_sentences_with_style: Original text split into sentences with style information
    translated_sentences: Translated text, split into sentences
    aligner: Object of the aligner class, uses fastalign
    temp_folder: Path to folder where to put all the intermediate files
    source_lang: original language of the document
    target_lang: target language of the translation

    Returns:
    list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated
    original token
    """
    # clean temp folder
    for f in glob.glob(os.path.join(temp_folder, "*align*")):
        os.remove(f)

    # tokenize the translated text by sentence and word
    translated_tokenized_sentences = []
    # keep spacing information to detokenize properly later
    translated_tokenized_sentences_spaces = []
    for sentence in translated_sentences:
        tokens = spacy_nlp(sentence)
        translated_tokenized_sentences_spaces.append([token.whitespace_ != '' for token in tokens])
        translated_tokenized_sentences.append([token.text for token in tokens])

    assert len(translated_tokenized_sentences) == len(
        original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentences, likely due to a translation error"

    original_sentences = []
    translated_sentences = []
    for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
        original_sentences.append(' '.join(item['text'] for item in original))
        translated_sentences.append(' '.join(translated))

    alignments = aligner.align(original_sentences, translated_sentences)

    # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
    translated_sentences_with_style = []
    for sentence_idx, sentence_alignments in enumerate(alignments):

        # reverse the order of the alignments and build a dict with it
        sentence_alignments = {target: source for source, target in sentence_alignments}

        translated_sentence_with_style: list[dict[str, str]] = []
        for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
            # fastalign has found a token aligned with the translated one
            if token_idx in sentence_alignments.keys():
                # get the aligned token
                original_idx = sentence_alignments[token_idx]
                new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)
            else:
                # WARNING this is a test
                # since fastalign doesn't know from which word to reference this token, copy the style of the previous word
                new_entry = translated_sentence_with_style[-1].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)

        translated_sentences_with_style.append(translated_sentence_with_style)

    return translated_sentences_with_style, translated_tokenized_sentences_spaces


def group_by_style(tokens: list[dict[str, str]], spaces: list[bool]) -> list[dict[str, str]]:
    """
    To avoid having issues in the future, we group the contiguous tokens that have the same style. Basically, we
    reconstruct the runs.

    Parameters:
    tokens: Tokens with style information

    Returns:
    list[dict]: A list of translated runs with format and style
    """
    groups = []
    zipped = zip(tokens, spaces)
    for key, group in groupby(zipped, key=lambda x: (x[0]["id"], x[0]["paragraph_index"])):
        group = list(group)
        tokens = [item[0]['text'] for item in group]
        spaces = [item[1] for item in group]

        text = Doc(spacy_nlp.vocab, words=tokens, spaces=spaces).text

        groups.append({"text": text,
                       "id": key[0],
                       "paragraph_index": key[1]})
    return groups


def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]]], out_file_path: str):
    """
    Generate a plain text file restoring the original tag structure like <g id=1> </g>

    Parameters:
    paragraphs_with_style: Dictionary where each key is the paragraph_index and its contents are a list of runs
    out_file_path: Path to the file where the plain text will be saved
    """
    with open(out_file_path, "w") as out_file:

        def close_tags(ids):
            tag = ""
            for gid in ids:
                tag_type, tag_id = gid.split("_")
                tag += f'</{tag_type}>'
            return tag

        def open_tags(ids):
            tag = ""
            for gid in ids:
                tag_type, tag_id = gid.split("_")
                tag += f'<{tag_type} id="{tag_id}">'
            return tag

        for key, paragraph in paragraphs_with_style.items():
            for run in paragraph:
                ids = list(run["id"]) if run["id"] else []

                if ids:
                    output = open_tags(ids) + run["text"] + close_tags(ids)
                    out_file.write(output)

                else:
                    out_file.write("".join(run["text"]))

            out_file.write("\n")


def translate_document(input_file: str, source_lang: str, target_lang: str,
                       translator,
                       aligner: Aligner,
                       temp_folder: str = "tmp",
                       tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> str:
    input_filename = input_file.split("/")[-1]
    os.makedirs(temp_folder, exist_ok=True)

    # copy the original file to the temporal folder to avoid common issues with tikal
    temp_input_file = os.path.join(temp_folder, input_filename)
    shutil.copy(input_file, temp_input_file)

    original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
    plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)

    # get paragraphs with runs
    paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                            enumerate(open(plain_text_file).readlines())]

    # translate using plaintext file
    original_tokenized_sentences_with_style = []
    original_spacing = []
    for run in paragraphs_with_runs:
        tokens, spaces = tokenize_with_runs(run)
        original_tokenized_sentences_with_style += tokens
        original_spacing += spaces

    translated_sentences = []
    for sentence, spacing in tqdm.tqdm(zip(original_tokenized_sentences_with_style, original_spacing),
                                       desc="Translating paragraphs...",
                                       total=len(original_tokenized_sentences_with_style)):
        text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text

        while True:
            try:
                translated_sentences.append(translator.translate(text, source_lang, target_lang))
                break
            except:
                continue

    # time to align the translation with the original
    print("Generating alignments...")
    start_time = time.time()
    translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
        original_tokenized_sentences_with_style,
        translated_sentences, aligner,
        temp_folder)
    print(f"Finished alignments in {time.time() - start_time} seconds")

    # since we tokenized these sentences independently, the spacing information does not contain spaces after punctuation
    # at the end of the sentence (there's no space at the end of a sentence that ends with ".", unless there's a sentence
    # right after
    for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
        if sentence[-1]["text"] in string.punctuation:
            sentence_spaces[-1] = True

    # flatten the sentences into a list of tokens
    translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
    tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]

    # group the tokens by style/run
    translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)

    # group the runs by original paragraph
    translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
                                        range(len(paragraphs_with_runs))}

    for item in translated_runs_with_style:
        # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
        # didn't know where paragraphs started and ended
        if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
            first_item_in_paragraph = item.copy()
            first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
            translated_paragraphs_with_style[item['paragraph_index']] = []
            translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
        else:
            translated_paragraphs_with_style[item['paragraph_index']].append(item)

    # save to new plain text file
    translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
    runs_to_plain_text(translated_paragraphs_with_style, translated_moses_file)

    # put the translations into the xlf
    tikal_moses_to_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl",
                                    source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
                                    "-noalttrans", "-to", original_xliff_file]
    Popen(tikal_moses_to_xliff_command).wait()

    # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
    # them. This may happen if a word in the original language has been split in more that one words that have other
    # words in between, or an error in fastalign
    text = open(original_xliff_file).read()
    result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
    open(original_xliff_file, "w").write(result)

    # merge into a docx again
    tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
    final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
    stdout, stderr = final_process.communicate()
    final_process.wait()

    # get the path to the output file
    output = stdout.decode('utf-8')
    translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]

    print(f"Saved file in {translated_file_path}")
    return translated_file_path
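To illustrate the run extraction this pipeline is built on, here is a hypothetical call to get_runs_from_paragraph (importing the module loads the xx_ent_wiki_sm spaCy model, so it must be installed):

from src.translate_any_doc import get_runs_from_paragraph

# A line of the Moses export produced by tikal, with <g> tags marking formatting runs
runs = get_runs_from_paragraph('<g id="1">Hello</g> world', paragraph_index=0)
# Roughly: [{'text': 'Hello', 'id': ['g_1'], 'paragraph_index': 0},
#           {'text': ' world', 'id': [], 'paragraph_index': 0}]
print(runs)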
translate_docx.py → src/translate_docx.py
RENAMED
@@ -8,17 +8,13 @@ from docx import Document
 from docx.text.hyperlink import Hyperlink
 from docx.text.run import Run
 import nltk
-import platform
 
 nltk.download('punkt')
 nltk.download('punkt_tab')
 
 from nltk.tokenize import sent_tokenize, word_tokenize
 
-from subprocess import Popen, PIPE
-
 from itertools import groupby
-import fileinput
 
 ip = "192.168.20.216"
 port = "8000"
@@ -36,85 +32,6 @@ def translate(text, ip, port):
     return json_response['tgt']
 
 
-# Class to align original and translated sentences
-# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
-class Aligner():
-    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
-        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
-        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
-
-        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
-        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
-
-        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
-        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
-
-        if platform.system().lower() == "windows":
-            fastalign_bin = "fast_align.exe"
-            atools_bin = "atools.exe"
-        else:
-            fastalign_bin = "./fast_align"
-            atools_bin = "./atools"
-
-        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
-
-        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
-                                forward_params_path]
-        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
-                                reverse_params_path, "r"]
-
-        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
-                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
-
-    def __simplify_alignment_file(self, file):
-        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
-            for line in f:
-                print(line.split('|||')[2].strip())
-
-    def __read_err(self, err):
-        (T, m) = ('', '')
-        for line in open(err):
-            # expected target length = source length * N
-            if 'expected target length' in line:
-                m = line.split()[-1]
-            # final tension: N
-            elif 'final tension' in line:
-                T = line.split()[-1]
-        return T, m
-
-    def align(self, original_sentences, translated_sentences):
-        # create temporary file which fastalign will use
-        with open(self.temp_file_path, "w") as temp_file:
-            for original, translated in zip(original_sentences, translated_sentences):
-                temp_file.write(f"{original} ||| {translated}\n")
-
-        # generate forward alignment
-        with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
-            fw_process = Popen(self.forward_command, stdout=f_out)
-            # generate reverse alignment
-            r_process = Popen(self.reverse_command, stdout=r_out)
-
-            # wait for both to finish
-            fw_process.wait()
-            r_process.wait()
-
-        # for some reason the output file contains more information than needed, remove it
-        self.__simplify_alignment_file(self.forward_alignment_file_path)
-        self.__simplify_alignment_file(self.reverse_alignment_file_path)
-
-        # generate symmetrical alignment
-        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
-        process.wait()
-
-        # get final alignments and format them
-        alignments_str = process.communicate()[0].decode('utf-8')
-        alignments = []
-        for line in alignments_str.splitlines():
-            alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
-
-        return alignments
-
-
 # Function to extract paragraphs with their runs
 def extract_paragraphs_with_runs(doc):
     paragraphs_with_runs = []
@@ -200,6 +117,10 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     translated_tokenized_sentences = [word_tokenize(sentence) for
                                       translated_paragraph in translated_paragraphs for sentence in
                                       sent_tokenize(translated_paragraph)]
+
+    assert len(translated_tokenized_sentences) == len(
+        original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentence, likely due to a translation error"
+
     original_sentences = []
     translated_sentences = []
     for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):