mjuvilla committed
Commit 1792639 · 1 Parent(s): 100f3e3

Added support for Windows and Linux, removed an unused function, added more logs

Files changed (1)
translate_docx.py +27 -47
translate_docx.py CHANGED
@@ -10,6 +10,7 @@ from docx import Document
 from docx.text.hyperlink import Hyperlink
 from docx.text.run import Run
 import nltk
+import platform
 
 nltk.download('punkt')
 nltk.download('punkt_tab')
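
Note on this hunk: the module still calls nltk.download() at import time, which re-checks the punkt models on every run. A minimal sketch of a quieter variant (the quiet flag is part of NLTK's download API; using it here is my suggestion, not part of the commit):

    import nltk

    # Check/fetch the punkt models without progress output on every import;
    # packages that are already present are skipped.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
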
@@ -22,21 +23,22 @@ from subprocess import Popen, PIPE
 from itertools import groupby
 import fileinput
 
-ip="192.168.20.216"
-port="8000"
+ip = "192.168.20.216"
+port = "8000"
 
-def translate(text, ip, port):
 
+def translate(text, ip, port):
     myobj = {
-            'id': '1',
-            'src': text,
-            }
+        'id': '1',
+        'src': text,
+    }
     port = str(int(port))
     url = 'http://' + ip + ':' + port + '/translate'
-    x = requests.post(url, json = myobj)
+    x = requests.post(url, json=myobj)
     json_response = json.loads(x.text)
     return json_response['tgt']
 
+
 # Class to align original and translated sentences
 # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
 class Aligner():
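
For context, translate() posts a single segment to an MTUOC-style translation server and returns the 'tgt' field of the JSON reply. A hedged sketch of a sturdier call site; the timeout and the raise_for_status() check are assumptions layered on top of the committed code:

    import requests

    def translate(text, ip, port, timeout=60):
        # POST one source segment; the server returns the translation under 'tgt'.
        payload = {'id': '1', 'src': text}
        url = f'http://{ip}:{int(port)}/translate'
        response = requests.post(url, json=payload, timeout=timeout)  # timeout is an assumption
        response.raise_for_status()  # fail on HTTP errors instead of on a missing 'tgt' key later
        return response.json()['tgt']

    # Usage, with the module-level defaults:
    # translate("Hello world", ip="192.168.20.216", port="8000")
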
@@ -50,12 +52,19 @@ class Aligner():
         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
 
+        if platform.system().lower() == "windows":
+            fastalign_bin = "fast_align.exe"
+            atools_bin = "atools.exe"
+        else:
+            fastalign_bin = "./fast_align"
+            atools_bin = "./atools"
+
         self.forward_command = lambda \
-            x: f'fast_align.exe -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
+            x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
         self.reverse_command = lambda \
-            x: f'fast_align.exe -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+            x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
 
-        self.symmetric_command = f'atools.exe -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
 
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
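
The platform switch above assumes the binaries are fast_align.exe/atools.exe on PATH under Windows and ./fast_align, ./atools next to the script elsewhere. One alternative, sketched below with a hypothetical helper, is to resolve the binaries with shutil.which so a missing build fails fast with a clear message:

    import platform
    import shutil

    def find_binary(name):
        # Hypothetical helper, not part of the commit: map "fast_align" to
        # "fast_align.exe" on Windows, look on PATH first, then in the CWD.
        candidate = name + ".exe" if platform.system().lower() == "windows" else name
        path = shutil.which(candidate) or shutil.which("./" + candidate)
        if path is None:
            raise FileNotFoundError(f"{candidate} not found; build fast_align and put it on PATH")
        return path
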
@@ -131,39 +140,6 @@ def extract_paragraphs_with_runs(doc):
     return paragraphs_with_runs
 
 
-def tokenize_paragraph_with_runs2(runs_in_paragraph):
-    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
-    sentences = sent_tokenize(text_paragraph)
-    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
-
-    tokenized_sentences_with_style = []
-    for tokenized_sentence in tokenized_sentences:
-        tokenized_sentence_with_style = []
-        token_idx = 0
-        for run in runs_in_paragraph:
-            text_in_run = run["text"].strip()
-
-            if text_in_run == tokenized_sentence[token_idx]:
-                new_run = run.copy()
-                new_run["text"] = text_in_run
-                tokenized_sentence_with_style.append(new_run)
-                token_idx += 1
-                if token_idx >= len(tokenized_sentence):
-                    break
-            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
-                if text_in_run.startswith(tokenized_sentence[token_idx]):
-                    for token in word_tokenize(text_in_run):
-                        if token == tokenized_sentence[token_idx]:
-                            new_run = run.copy()
-                            new_run["text"] = token
-                            tokenized_sentence_with_style.append(new_run)
-                            token_idx += 1
-            else:
-                raise "oops"
-        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
-    return tokenized_sentences_with_style
-
-
 def tokenize_with_runs(runs, detokenizer):
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
     sentences = sent_tokenize(text_paragraph)
@@ -194,7 +170,7 @@ def tokenize_with_runs(runs, detokenizer):
                 word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                 token_index += 1
             else:
-                raise "oops"
+                raise "Something unexpected happened I'm afraid"
         tokenized_sentences_with_style.append(sentence_with_style)
     return tokenized_sentences_with_style
 
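One caveat on this hunk: raise with a string literal is not valid Python 3, so both the old and the new line actually produce TypeError: exceptions must derive from BaseException rather than the friendlier message. A sketch of what the branch presumably intends; the exception class is hypothetical:

    class AlignmentError(Exception):
        # Hypothetical exception type; in Python 3 only BaseException subclasses can be raised.
        pass

    def fail_on_unmatched_token(token_text):
        # Stand-in for the bare `raise "..."`: carries a real type and a real message.
        raise AlignmentError(f"could not match token {token_text!r} back to a run")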
 
@@ -311,8 +287,7 @@ def preprocess_runs(runs_in_paragraph):
     return new_runs
 
 
-
-def translate_document(input_file,
+def translate_document(input_file,
                        aligner,
                        detokenizer,
                        ip="192.168.20.216",
@@ -322,7 +297,7 @@ def translate_document(input_file,
     # load original file, extract the paragraphs with their runs (which include style and formatting)
     doc = Document(input_file)
     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
-    
+
     # translate each paragraph
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
@@ -333,13 +308,17 @@
 
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
+    print("Generating alignments...")
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                           translated_paragraphs, aligner,
                                                           temp_folder, detokenizer)
+    print("Finished alignments")
+
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+    print("Grouped by style")
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = defaultdict(list)
@@ -365,4 +344,5 @@ def translate_document(input_file,
             run.font.color.rgb = item['font_color']
 
     out_doc.save("translated.docx")
+    print("Saved file")
     return "translated.docx"
 