mjuvilla committed
Commit 580106a · unverified · 2 parents: fd61039 36f2ac1

Merge pull request #2 from langtech-bsc/multithreading-and-optimizations

Files changed (1):
  translate_docx.py (+42 -28)
translate_docx.py CHANGED
@@ -3,8 +3,6 @@ import json
 import requests
 import tqdm
 import os
-import string
-from collections import defaultdict
 
 from docx import Document
 from docx.text.hyperlink import Hyperlink
@@ -16,7 +14,6 @@ nltk.download('punkt')
 nltk.download('punkt_tab')
 
 from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.tokenize.treebank import TreebankWordDetokenizer
 
 from subprocess import Popen, PIPE
 
 
@@ -59,12 +56,15 @@ class Aligner():
         fastalign_bin = "./fast_align"
         atools_bin = "./atools"
 
-        self.forward_command = lambda \
-            x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
-        self.reverse_command = lambda \
-            x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
-        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
+
+        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
+                                forward_params_path]
+        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
+                                reverse_params_path, "-r"]
+
+        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
+                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
 
 
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
@@ -82,20 +82,28 @@ class Aligner():
                 T = line.split()[-1]
         return T, m
 
-    def align(self, file):
+    def align(self, original_sentences, translated_sentences):
+        # create temporary file which fastalign will use
+        with open(self.temp_file_path, "w") as temp_file:
+            for original, translated in zip(original_sentences, translated_sentences):
+                temp_file.write(f"{original} ||| {translated}\n")
+
         # generate forward alignment
-        process = Popen(self.forward_command(file), shell=True)
-        process.wait()
-        # generate reverse alignment
-        process = Popen(self.reverse_command(file), shell=True)
-        process.wait()
+        with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
+            fw_process = Popen(self.forward_command, stdout=f_out)
+            # generate reverse alignment
+            r_process = Popen(self.reverse_command, stdout=r_out)
+
+        # wait for both to finish
+        fw_process.wait()
+        r_process.wait()
 
         # for some reason the output file contains more information than needed, remove it
         self.__simplify_alignment_file(self.forward_alignment_file_path)
         self.__simplify_alignment_file(self.reverse_alignment_file_path)
 
         # generate symmetrical alignment
-        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
         process.wait()
 
         # get final alignments and format them
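Note on the hunk above: moving from shell=True command strings to argv lists takes the shell out of the loop, and launching both Popen calls before waiting on either lets the forward and reverse fast_align passes run concurrently. A minimal, self-contained sketch of that pattern (the echo commands are placeholders, not the real fast_align invocations):

    from subprocess import Popen

    # start both children first, then wait: they run in parallel, and each
    # child's stdout is redirected to its own file (replacing the shell's `>`)
    with open("forward.align", "w") as f_out, open("reverse.align", "w") as r_out:
        fwd = Popen(["echo", "forward pass"], stdout=f_out)
        rev = Popen(["echo", "reverse pass"], stdout=r_out)

    # wait for both to finish
    fwd.wait()
    rev.wait()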
@@ -180,8 +188,6 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     for f in os.listdir(temp_folder):
         os.remove(os.path.join(temp_folder, f))
 
-    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
-
     # tokenize the original text by sentence and words while keeping the style
     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
                                                original_paragraphs_with_runs]
@@ -194,13 +200,13 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     translated_tokenized_sentences = [word_tokenize(sentence) for
                                       translated_paragraph in translated_paragraphs for sentence in
                                       sent_tokenize(translated_paragraph)]
+    original_sentences = []
+    translated_sentences = []
+    for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+        original_sentences.append(' '.join(item['text'] for item in original))
+        translated_sentences.append(' '.join(translated))
 
-    # write the file that fastalign will use
-    with open(temp_file_path, "w") as out_file:
-        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
-            out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
-
-    alignments = aligner.align(temp_file_path)
+    alignments = aligner.align(original_sentences, translated_sentences)
 
     # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
     translated_sentences_with_style = []
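For reference, align() now receives the sentence pairs directly instead of a pre-written file path, and writes the temporary file itself in fast_align's "source ||| target" input format. A toy illustration of the lines it produces (the sentence pairs are invented):

    original_sentences = ["the house is small", "the book"]
    translated_sentences = ["la casa es pequeña", "el libro"]

    # one "source ||| target" line per sentence pair, exactly what
    # Aligner.align() writes to its temporary file
    for original, translated in zip(original_sentences, translated_sentences):
        print(f"{original} ||| {translated}")
    # the house is small ||| la casa es pequeña
    # the book ||| el libro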
@@ -238,7 +244,7 @@ def group_by_style(values, detokenizer):
                                                x['paragraph_index'])):
         text = detokenizer.detokenize([item['text'] for item in group])
 
-        if groups and not text.startswith((",", ";", ":", ".", ")")):
+        if groups and not text.startswith((",", ";", ":", ".", ")", "!", "?")):
             text = " " + text
 
         groups.append({"text": text,
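(str.startswith accepts a tuple of prefixes, so the one-token change above is enough to stop a space being glued in front of "!" and "?" as well: for example, "! hello".startswith((",", "!")) is True.)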
@@ -309,21 +315,29 @@ def translate_document(input_file,
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
     print("Generating alignments...")
+    start_time = time.time()
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                           translated_paragraphs, aligner,
                                                           temp_folder, detokenizer)
-    print("Finished alignments")
+    print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
-    print("Grouped by style")
 
     # group the runs by original paragraph
-    translated_paragraphs_with_style = defaultdict(list)
+    translated_paragraphs_with_style = dict()
     for item in translated_runs_with_style:
-        translated_paragraphs_with_style[item['paragraph_index']].append(item)
+        if item['paragraph_index'] in translated_paragraphs_with_style:
+            translated_paragraphs_with_style[item['paragraph_index']].append(item)
+        else:
+            # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+            # didn't know where paragraphs started and ended
+            first_item_in_paragraph = item.copy()
+            first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
+            translated_paragraphs_with_style[item['paragraph_index']] = []
+            translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
 
     for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
         # in case there are empty paragraphs
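The plain dict replacing defaultdict(list) above exists so the first run of each paragraph can be special-cased: group_by_style() prepends a space to every group that does not start with closing punctuation, and that space must be stripped once paragraph boundaries are known. A toy run of the same logic (the sample runs are invented):

    # invented runs-with-style, mimicking group_by_style() output
    translated_runs_with_style = [
        {"paragraph_index": 0, "text": "Hello"},
        {"paragraph_index": 0, "text": " world"},
        {"paragraph_index": 1, "text": " Second paragraph"},  # leading space to drop
    ]

    translated_paragraphs_with_style = dict()
    for item in translated_runs_with_style:
        if item['paragraph_index'] in translated_paragraphs_with_style:
            translated_paragraphs_with_style[item['paragraph_index']].append(item)
        else:
            # first run of the paragraph: strip the space added by group_by_style()
            first_item = item.copy()
            first_item["text"] = first_item["text"].lstrip(" ")
            translated_paragraphs_with_style[item['paragraph_index']] = [first_item]

    print(translated_paragraphs_with_style[1][0]["text"])  # Second paragraph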
 