mjuvilla committed on
Commit
0efc9da
·
1 Parent(s): fd61039

Both fast_align runs now execute in parallel. Also added some improvements to the code: avoid using the shell in Popen, and the Aligner class now creates the temporary files itself instead of having them created externally.

Browse files
Files changed (1) hide show
  1. translate_docx.py +29 -20
translate_docx.py CHANGED
@@ -59,12 +59,15 @@ class Aligner():
59
  fastalign_bin = "./fast_align"
60
  atools_bin = "./atools"
61
 
62
- self.forward_command = lambda \
63
- x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
64
- self.reverse_command = lambda \
65
- x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
66
 
67
- self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
 
 
 
 
 
 
68
 
69
  def __simplify_alignment_file(self, file):
70
  with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
@@ -82,20 +85,28 @@ class Aligner():
82
  T = line.split()[-1]
83
  return T, m
84
 
85
- def align(self, file):
 
 
 
 
 
86
  # generate forward alignment
87
- process = Popen(self.forward_command(file), shell=True)
88
- process.wait()
89
- # generate reverse alignment
90
- process = Popen(self.reverse_command(file), shell=True)
91
- process.wait()
 
 
 
92
 
93
  # for some reason the output file contains more information than needed, remove it
94
  self.__simplify_alignment_file(self.forward_alignment_file_path)
95
  self.__simplify_alignment_file(self.reverse_alignment_file_path)
96
 
97
  # generate symmetrical alignment
98
- process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
99
  process.wait()
100
 
101
  # get final alignments and format them
@@ -180,8 +191,6 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
180
  for f in os.listdir(temp_folder):
181
  os.remove(os.path.join(temp_folder, f))
182
 
183
- temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
184
-
185
  # tokenize the original text by sentence and words while keeping the style
186
  original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
187
  original_paragraphs_with_runs]
@@ -194,13 +203,13 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
194
  translated_tokenized_sentences = [word_tokenize(sentence) for
195
  translated_paragraph in translated_paragraphs for sentence in
196
  sent_tokenize(translated_paragraph)]
 
 
 
 
 
197
 
198
- # write the file that fastalign will use
199
- with open(temp_file_path, "w") as out_file:
200
- for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
201
- out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
202
-
203
- alignments = aligner.align(temp_file_path)
204
 
205
  # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
206
  translated_sentences_with_style = []
 
59
  fastalign_bin = "./fast_align"
60
  atools_bin = "./atools"
61
 
62
+ self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
 
 
 
63
 
64
+ self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
65
+ forward_params_path]
66
+ self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
67
+ reverse_params_path, "-r"]
68
+
69
+ self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
70
+ self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
71
 
72
  def __simplify_alignment_file(self, file):
73
  with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
 
85
  T = line.split()[-1]
86
  return T, m
87
 
88
+ def align(self, original_sentences, translated_sentences):
89
+ # create temporary file which fastalign will use
90
+ with open(self.temp_file_path, "w") as temp_file:
91
+ for original, translated in zip(original_sentences, translated_sentences):
92
+ temp_file.write(f"{original} ||| {translated}\n")
93
+
94
  # generate forward alignment
95
+ with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
96
+ fw_process = Popen(self.forward_command, stdout=f_out)
97
+ # generate reverse alignment
98
+ r_process = Popen(self.reverse_command, stdout=r_out)
99
+
100
+ # wait for both to finish
101
+ fw_process.wait()
102
+ r_process.wait()
103
 
104
  # for some reason the output file contains more information than needed, remove it
105
  self.__simplify_alignment_file(self.forward_alignment_file_path)
106
  self.__simplify_alignment_file(self.reverse_alignment_file_path)
107
 
108
  # generate symmetrical alignment
109
+ process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
110
  process.wait()
111
 
112
  # get final alignments and format them
 
191
  for f in os.listdir(temp_folder):
192
  os.remove(os.path.join(temp_folder, f))
193
 
 
 
194
  # tokenize the original text by sentence and words while keeping the style
195
  original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
196
  original_paragraphs_with_runs]
 
203
  translated_tokenized_sentences = [word_tokenize(sentence) for
204
  translated_paragraph in translated_paragraphs for sentence in
205
  sent_tokenize(translated_paragraph)]
206
+ original_sentences = []
207
+ translated_sentences = []
208
+ for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
209
+ original_sentences.append(' '.join(item['text'] for item in original))
210
+ translated_sentences.append(' '.join(translated))
211
 
212
+ alignments = aligner.align(original_sentences, translated_sentences)
 
 
 
 
 
213
 
214
  # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
215
  translated_sentences_with_style = []