Spaces:

LangTech-MT
/

document-translator

Sleeping

App Files Files Community

mjuvilla committed on Apr 15

Commit

0efc9da

1 Parent(s): fd61039

Both fastalign runs (forward and reverse) now execute in parallel. Also added some improvements to the code: Popen no longer uses a shell, and the Aligner class now creates the temporary files itself instead of having them created externally.

Browse files

Files changed (1) hide show

translate_docx.py +29 -20

translate_docx.py CHANGED Viewed

@@ -59,12 +59,15 @@ class Aligner():
             fastalign_bin = "./fast_align"
             atools_bin = "./atools"
-        self.forward_command = lambda \
-                x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
-        self.reverse_command = lambda \
-                x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
-        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
@@ -82,20 +85,28 @@ class Aligner():
                 T = line.split()[-1]
         return T, m
-    def align(self, file):
         # generate forward alignment
-        process = Popen(self.forward_command(file), shell=True)
-        process.wait()
-        # generate reverse alignment
-        process = Popen(self.reverse_command(file), shell=True)
-        process.wait()
         # for some reason the output file contains more information than needed, remove it
         self.__simplify_alignment_file(self.forward_alignment_file_path)
         self.__simplify_alignment_file(self.reverse_alignment_file_path)
         # generate symmetrical alignment
-        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
         process.wait()
         # get final alignments and format them
@@ -180,8 +191,6 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     for f in os.listdir(temp_folder):
         os.remove(os.path.join(temp_folder, f))
-    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
     # tokenize the original text by sentence and words while keeping the style
     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
                                                original_paragraphs_with_runs]
@@ -194,13 +203,13 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     translated_tokenized_sentences = [word_tokenize(sentence) for
                                       translated_paragraph in translated_paragraphs for sentence in
                                       sent_tokenize(translated_paragraph)]
-    # write the file that fastalign will use
-    with open(temp_file_path, "w") as out_file:
-        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
-            out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
-    alignments = aligner.align(temp_file_path)
     # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
     translated_sentences_with_style = []

             fastalign_bin = "./fast_align"
             atools_bin = "./atools"
+        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
+        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
+                                forward_params_path]
+        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
+                                reverse_params_path, "r"]
+        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
+                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
                 T = line.split()[-1]
         return T, m
+    def align(self, original_sentences, translated_sentences):
+        # create temporary file which fastalign will use
+        with open(self.temp_file_path, "w") as temp_file:
+            for original, translated in zip(original_sentences, translated_sentences):
+                temp_file.write(f"{original} ||| {translated}\n")
         # generate forward alignment
+        with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
+            fw_process = Popen(self.forward_command, stdout=f_out)
+            # generate reverse alignment
+            r_process = Popen(self.reverse_command, stdout=r_out)
+            # wait for both to finish
+            fw_process.wait()
+            r_process.wait()
         # for some reason the output file contains more information than needed, remove it
         self.__simplify_alignment_file(self.forward_alignment_file_path)
         self.__simplify_alignment_file(self.reverse_alignment_file_path)
         # generate symmetrical alignment
+        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
         process.wait()
         # get final alignments and format them
     for f in os.listdir(temp_folder):
         os.remove(os.path.join(temp_folder, f))
     # tokenize the original text by sentence and words while keeping the style
     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
                                                original_paragraphs_with_runs]
     translated_tokenized_sentences = [word_tokenize(sentence) for
                                       translated_paragraph in translated_paragraphs for sentence in
                                       sent_tokenize(translated_paragraph)]
+    original_sentences = []
+    translated_sentences = []
+    for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+        original_sentences.append(' '.join(item['text'] for item in original))
+        translated_sentences.append(' '.join(translated))
+    alignments = aligner.align(original_sentences, translated_sentences)
     # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
     translated_sentences_with_style = []