Spaces:
Sleeping
Sleeping
Both fast_align runs (forward and reverse) now execute in parallel. Also added some improvements to the code: avoid using the shell in Popen, and have the Aligner class create the temporary files itself instead of relying on the caller to create them externally.
Browse files- translate_docx.py +29 -20
translate_docx.py
CHANGED
@@ -59,12 +59,15 @@ class Aligner():
|
|
59 |
fastalign_bin = "./fast_align"
|
60 |
atools_bin = "./atools"
|
61 |
|
62 |
-
self.
|
63 |
-
x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
|
64 |
-
self.reverse_command = lambda \
|
65 |
-
x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
|
66 |
|
67 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
def __simplify_alignment_file(self, file):
|
70 |
with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
|
@@ -82,20 +85,28 @@ class Aligner():
|
|
82 |
T = line.split()[-1]
|
83 |
return T, m
|
84 |
|
85 |
-
def align(self,
|
|
|
|
|
|
|
|
|
|
|
86 |
# generate forward alignment
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
92 |
|
93 |
# for some reason the output file contains more information than needed, remove it
|
94 |
self.__simplify_alignment_file(self.forward_alignment_file_path)
|
95 |
self.__simplify_alignment_file(self.reverse_alignment_file_path)
|
96 |
|
97 |
# generate symmetrical alignment
|
98 |
-
process = Popen(self.symmetric_command,
|
99 |
process.wait()
|
100 |
|
101 |
# get final alignments and format them
|
@@ -180,8 +191,6 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
|
|
180 |
for f in os.listdir(temp_folder):
|
181 |
os.remove(os.path.join(temp_folder, f))
|
182 |
|
183 |
-
temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
|
184 |
-
|
185 |
# tokenize the original text by sentence and words while keeping the style
|
186 |
original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
|
187 |
original_paragraphs_with_runs]
|
@@ -194,13 +203,13 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
|
|
194 |
translated_tokenized_sentences = [word_tokenize(sentence) for
|
195 |
translated_paragraph in translated_paragraphs for sentence in
|
196 |
sent_tokenize(translated_paragraph)]
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
-
|
199 |
-
with open(temp_file_path, "w") as out_file:
|
200 |
-
for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
|
201 |
-
out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
|
202 |
-
|
203 |
-
alignments = aligner.align(temp_file_path)
|
204 |
|
205 |
# using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
|
206 |
translated_sentences_with_style = []
|
|
|
59 |
fastalign_bin = "./fast_align"
|
60 |
atools_bin = "./atools"
|
61 |
|
62 |
+
self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
|
|
|
|
|
|
|
63 |
|
64 |
+
self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
|
65 |
+
forward_params_path]
|
66 |
+
self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
|
67 |
+
reverse_params_path, "r"]
|
68 |
+
|
69 |
+
self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
|
70 |
+
self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
|
71 |
|
72 |
def __simplify_alignment_file(self, file):
|
73 |
with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
|
|
|
85 |
T = line.split()[-1]
|
86 |
return T, m
|
87 |
|
88 |
+
def align(self, original_sentences, translated_sentences):
|
89 |
+
# create temporary file which fastalign will use
|
90 |
+
with open(self.temp_file_path, "w") as temp_file:
|
91 |
+
for original, translated in zip(original_sentences, translated_sentences):
|
92 |
+
temp_file.write(f"{original} ||| {translated}\n")
|
93 |
+
|
94 |
# generate forward alignment
|
95 |
+
with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
|
96 |
+
fw_process = Popen(self.forward_command, stdout=f_out)
|
97 |
+
# generate reverse alignment
|
98 |
+
r_process = Popen(self.reverse_command, stdout=r_out)
|
99 |
+
|
100 |
+
# wait for both to finish
|
101 |
+
fw_process.wait()
|
102 |
+
r_process.wait()
|
103 |
|
104 |
# for some reason the output file contains more information than needed, remove it
|
105 |
self.__simplify_alignment_file(self.forward_alignment_file_path)
|
106 |
self.__simplify_alignment_file(self.reverse_alignment_file_path)
|
107 |
|
108 |
# generate symmetrical alignment
|
109 |
+
process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
|
110 |
process.wait()
|
111 |
|
112 |
# get final alignments and format them
|
|
|
191 |
for f in os.listdir(temp_folder):
|
192 |
os.remove(os.path.join(temp_folder, f))
|
193 |
|
|
|
|
|
194 |
# tokenize the original text by sentence and words while keeping the style
|
195 |
original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
|
196 |
original_paragraphs_with_runs]
|
|
|
203 |
translated_tokenized_sentences = [word_tokenize(sentence) for
|
204 |
translated_paragraph in translated_paragraphs for sentence in
|
205 |
sent_tokenize(translated_paragraph)]
|
206 |
+
original_sentences = []
|
207 |
+
translated_sentences = []
|
208 |
+
for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
|
209 |
+
original_sentences.append(' '.join(item['text'] for item in original))
|
210 |
+
translated_sentences.append(' '.join(translated))
|
211 |
|
212 |
+
alignments = aligner.align(original_sentences, translated_sentences)
|
|
|
|
|
|
|
|
|
|
|
213 |
|
214 |
# using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
|
215 |
translated_sentences_with_style = []
|