Added support for Windows and Linux, removed unused function, added more logs

translate_docx.py (CHANGED: +27 −47)
@@ -10,6 +10,7 @@ from docx import Document
 from docx.text.hyperlink import Hyperlink
 from docx.text.run import Run
 import nltk
+import platform
 
 nltk.download('punkt')
 nltk.download('punkt_tab')
@@ -22,21 +23,22 @@ from subprocess import Popen, PIPE
 from itertools import groupby
 import fileinput
 
-ip="192.168.20.216"
-port="8000"
+ip = "192.168.20.216"
+port = "8000"
 
-def translate(text, ip, port):
+
+def translate(text, ip, port):
     myobj = {
-        …
-        …
-        …
+        'id': '1',
+        'src': text,
+    }
     port = str(int(port))
     url = 'http://' + ip + ':' + port + '/translate'
-    x = requests.post(url, json…
+    x = requests.post(url, json=myobj)
     json_response = json.loads(x.text)
     return json_response['tgt']
 
+
 # Class to align original and translated sentences
 # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
 class Aligner():
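The request body the reworked translate() sends matches what an MTUOC-style server (the Aligner comment below links to that project) expects: a JSON object with 'id' and 'src', answered by JSON carrying the translation under 'tgt'. A minimal standalone sketch for exercising the endpoint; the timeout and status check are defensive assumptions, not part of the commit:

    import requests

    def check_translate(ip: str, port: str, text: str) -> str:
        # 'id'/'src' in the request and 'tgt' in the response mirror the
        # diff above; timeout= and raise_for_status() are additions.
        url = f"http://{ip}:{port}/translate"
        response = requests.post(url, json={"id": "1", "src": text}, timeout=30)
        response.raise_for_status()
        return response.json()["tgt"]

    if __name__ == "__main__":
        print(check_translate("192.168.20.216", "8000", "Hello, world!"))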
@@ -50,12 +52,19 @@ class Aligner():
         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
 
+        if platform.system().lower() == "windows":
+            fastalign_bin = "fast_align.exe"
+            atools_bin = "atools.exe"
+        else:
+            fastalign_bin = "./fast_align"
+            atools_bin = "./atools"
+
         self.forward_command = lambda \
-            x: f'…
+            x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
         self.reverse_command = lambda \
-            x: f'…
+            x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
 
-        self.symmetric_command = f'…
+        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
 
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
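This is the Windows/Linux support from the commit message: platform.system() returns "Windows", "Linux", or "Darwin", so lowercasing it keeps the comparison simple, and the else branch also covers macOS. The same selection pattern in isolation, with an illustrative invocation; only the binary names come from the diff, the corpus path and flag values below are placeholders:

    import platform
    import subprocess

    # Same binary selection as the commit: bare .exe name on Windows
    # (resolved via PATH/cwd), ./-prefixed path on Linux/macOS.
    if platform.system().lower() == "windows":
        fastalign_bin = "fast_align.exe"
    else:
        fastalign_bin = "./fast_align"

    # Illustrative only: align a parallel corpus in fast_align's
    # "source ||| target" format, writing forward alignments via the shell.
    subprocess.run(f"{fastalign_bin} -i corpus.src-tgt -d -o -v > forward.align",
                   shell=True, check=True)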
@@ -131,39 +140,6 @@ def extract_paragraphs_with_runs(doc):
     return paragraphs_with_runs
 
 
-def tokenize_paragraph_with_runs2(runs_in_paragraph):
-    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
-    sentences = sent_tokenize(text_paragraph)
-    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
-
-    tokenized_sentences_with_style = []
-    for tokenized_sentence in tokenized_sentences:
-        tokenized_sentence_with_style = []
-        token_idx = 0
-        for run in runs_in_paragraph:
-            text_in_run = run["text"].strip()
-
-            if text_in_run == tokenized_sentence[token_idx]:
-                new_run = run.copy()
-                new_run["text"] = text_in_run
-                tokenized_sentence_with_style.append(new_run)
-                token_idx += 1
-                if token_idx >= len(tokenized_sentence):
-                    break
-            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
-                if text_in_run.startswith(tokenized_sentence[token_idx]):
-                    for token in word_tokenize(text_in_run):
-                        if token == tokenized_sentence[token_idx]:
-                            new_run = run.copy()
-                            new_run["text"] = token
-                            tokenized_sentence_with_style.append(new_run)
-                            token_idx += 1
-            else:
-                raise "oops"
-        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
-    return tokenized_sentences_with_style
-
-
 def tokenize_with_runs(runs, detokenizer):
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
     sentences = sent_tokenize(text_paragraph)
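The deleted tokenize_paragraph_with_runs2 is the unused function from the commit message; the surviving tokenize_with_runs rests on the same NLTK pipeline: split the paragraph into sentences, then each sentence into word tokens. That pipeline in isolation (the sample string is illustrative):

    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize

    nltk.download('punkt')
    nltk.download('punkt_tab')

    paragraph = "This is one sentence. Here is another."
    sentences = sent_tokenize(paragraph)
    tokens = [word_tokenize(s) for s in sentences]
    print(tokens)  # [['This', 'is', 'one', 'sentence', '.'], ['Here', 'is', 'another', '.']]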
@@ -194,7 +170,7 @@ def tokenize_with_runs(runs, detokenizer):
                 word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                 token_index += 1
             else:
-                raise "…
+                raise "Something unexpected happened I'm afraid"
         tokenized_sentences_with_style.append(sentence_with_style)
     return tokenized_sentences_with_style
 
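Worth flagging for a follow-up: raise with a bare string is invalid in Python 3, so this branch would die with "TypeError: exceptions must derive from BaseException" rather than surface its message. A possible fix, not part of this commit (ValueError is an arbitrary but reasonable choice):

    # Replace the bare-string raise with a real exception type.
    raise ValueError("Something unexpected happened I'm afraid")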
@@ -311,8 +287,7 @@ def preprocess_runs(runs_in_paragraph):
     return new_runs
 
 
-
-def translate_document(input_file,
+def translate_document(input_file,
                        aligner,
                        detokenizer,
                        ip="192.168.20.216",
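For reference, a call site for this signature could look like the sketch below; the Aligner constructor arguments are not visible in this diff, and the detokenizer is assumed to be NLTK's Treebank detokenizer, so both are hypothetical:

    from nltk.tokenize.treebank import TreebankWordDetokenizer

    detokenizer = TreebankWordDetokenizer()
    aligner = Aligner(...)  # constructor arguments elided; not shown in this diff
    output_path = translate_document("input.docx", aligner, detokenizer,
                                     ip="192.168.20.216", port="8000")
    print(output_path)  # "translated.docx"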
@@ -322,7 +297,7 @@ def translate_document(input_file,
     # load original file, extract the paragraphs with their runs (which include style and formatting)
     doc = Document(input_file)
     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
-
+
     # translate each paragraph
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
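The per-paragraph loop reports progress through tqdm, which wraps any iterable and prints a progress bar labelled with the given description. The same pattern in isolation (the list and the loop body are placeholders for the real paragraphs and translate() call):

    import tqdm

    paragraphs = ["First paragraph.", "Second paragraph."]
    translated = []
    for paragraph in tqdm.tqdm(paragraphs, desc="Translating paragraphs..."):
        translated.append(paragraph.upper())  # stand-in for the real translate() call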
@@ -333,13 +308,17 @@ def translate_document(input_file,
 
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
+    print("Generating alignments...")
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                           translated_paragraphs, aligner,
                                                           temp_folder, detokenizer)
+    print("Finished alignments")
+
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+    print("Grouped by style")
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = defaultdict(list)
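Two small idioms sit between the new log lines: the nested comprehension flattens a list of sentences (each a list of tokens) into one token list, and group_by_style presumably leans on the itertools.groupby imported at the top of the file. Both in isolation; the token dicts are simplified stand-ins for the real run dicts:

    from itertools import groupby

    sentences = [[{"text": "Hello", "bold": True}, {"text": "world", "bold": True}],
                 [{"text": "Bye", "bold": False}]]

    # flatten the sentences into a list of tokens, as in the diff
    tokens = [item for sublist in sentences for item in sublist]

    # group consecutive tokens that share a style, in the spirit of group_by_style
    for style, group in groupby(tokens, key=lambda t: t["bold"]):
        print(style, [t["text"] for t in group])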
@@ -365,4 +344,5 @@ def translate_document(input_file,
             run.font.color.rgb = item['font_color']
 
     out_doc.save("translated.docx")
+    print("Saved file")
     return "translated.docx"
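The new print confirms the write, but the output name is still the fixed "translated.docx" in the working directory, so two concurrent translations would overwrite each other. A hypothetical variant of the function's closing lines using a unique temp directory (not part of this commit):

    import os
    import tempfile

    # out_doc as built earlier in translate_document()
    out_path = os.path.join(tempfile.mkdtemp(), "translated.docx")
    out_doc.save(out_path)
    print(f"Saved file to {out_path}")
    return out_path  # instead of the fixed "translated.docx"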