carlosep93 committed
Commit 100f3e3 · Parent(s): 8030df1

gradio app for windows
Browse files
- gradio_app.py +39 -0
- translate_docx.py +368 -0
gradio_app.py
ADDED
@@ -0,0 +1,39 @@
import gradio as gr
from pathlib import Path
import requests
import json
from translate_docx import translate_document, translate, Aligner
from nltk.tokenize.treebank import TreebankWordDetokenizer


# Translation server address and fast_align configuration
ip = '10.192.31.127'
config_folder = 'fast_align_config'
source_lang = 'en'
target_lang = 'ca'
temp_folder = 'tmp'
aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
detokenizer = TreebankWordDetokenizer()


def upload_file(filepath):
    # Translate the uploaded .docx and swap the upload button for a download button
    translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
    return [gr.UploadButton(visible=False),
            gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]


def download_file():
    # After the download, restore the upload button so another file can be translated
    return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]


with gr.Blocks() as demo:

    with gr.Tab("Text"):
        gr.Interface(fn=translate, inputs=["text", "text", "text"], outputs="text")
    with gr.Tab("Docx documents"):
        gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
        with gr.Row():
            u = gr.UploadButton("Upload a file", file_count="single")
            d = gr.DownloadButton("Download the file", visible=False)

        u.upload(upload_file, u, [u, d])
        d.click(download_file, None, [u, d])

if __name__ == "__main__":
    demo.launch()
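
Note: the app assumes an MT server is already running at the `ip` configured above (port 8000 by default, see translate_docx.py) and exposing the /translate endpoint used by translate(). A minimal smoke test of that request/response contract, useful before launching the UI (host and example text are placeholders, not part of the commit):

import requests

# Hypothetical check of the backend this app expects; adjust host/port to your server.
resp = requests.post("http://10.192.31.127:8000/translate",
                     json={"id": "1", "src": "Hello, world."})
print(resp.json()["tgt"])  # the server is expected to return the translation here
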
translate_docx.py
ADDED
@@ -0,0 +1,368 @@
import time
import json
import requests
import tqdm
import os
import string
from collections import defaultdict

from docx import Document
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from subprocess import Popen, PIPE

from itertools import groupby
import fileinput

ip = "192.168.20.216"
port = "8000"


def translate(text, ip, port):
    # Send the text to the MT server and return the translated string
    myobj = {
        'id': '1',
        'src': text,
    }
    port = str(int(port))
    url = 'http://' + ip + ':' + port + '/translate'
    x = requests.post(url, json=myobj)
    json_response = json.loads(x.text)
    return json_response['tgt']

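
Aside (not part of the file): for local testing without the real MT backend, a minimal stand-in that honours the same request/response schema could look like the sketch below. This is an assumption for development convenience; it merely echoes the source text back as `tgt`.

# Hypothetical mock of the /translate endpoint used above. Assuming this file is
# saved as mock_server.py, run it with: uvicorn mock_server:app --port 8000
from fastapi import FastAPI

app = FastAPI()

@app.post("/translate")
def mock_translate(payload: dict):
    # A real server would return the translation; this mock echoes the input.
    return {"id": payload.get("id", "1"), "tgt": payload["src"]}
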
# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        self.forward_command = lambda \
            x: f'fast_align.exe -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
        self.reverse_command = lambda \
            x: f'fast_align.exe -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'

        self.symmetric_command = f'atools.exe -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'

    def __simplify_alignment_file(self, file):
        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
            for line in f:
                print(line.split('|||')[2].strip())

    def __read_err(self, err):
        (T, m) = ('', '')
        for line in open(err):
            # expected target length = source length * N
            if 'expected target length' in line:
                m = line.split()[-1]
            # final tension: N
            elif 'final tension' in line:
                T = line.split()[-1]
        return T, m

    def align(self, file):
        # generate forward alignment
        process = Popen(self.forward_command(file), shell=True)
        process.wait()
        # generate reverse alignment
        process = Popen(self.reverse_command(file), shell=True)
        process.wait()

        # for some reason the output file contains more information than needed, remove it
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment
        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        process.wait()

        # get final alignments and format them
        alignments_str = process.communicate()[0].decode('utf-8')
        alignments = []
        for line in alignments_str.splitlines():
            alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])

        return alignments

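
Aside (not part of the file): align() expects a plain-text file with one sentence pair per line, source and target tokens separated by " ||| " (the same format generate_alignments() writes further down). A sketch of how the class is driven, assuming the fast_align.exe/atools.exe binaries and the .params/.err files are available:

# Illustrative only: the config folder, binaries and tmp folder must exist.
aligner = Aligner("fast_align_config", "en", "ca", "tmp")

with open("tmp/pairs.txt", "w") as f:
    f.write("Hello world . ||| Hola món .\n")

print(aligner.align("tmp/pairs.txt"))  # e.g. [[(0, 0), (1, 1), (2, 2)]]
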

# Function to extract paragraphs with their runs
def extract_paragraphs_with_runs(doc):
    paragraphs_with_runs = []
    for idx, paragraph in enumerate(doc.paragraphs):
        runs = []
        for item in paragraph.iter_inner_content():
            if isinstance(item, Run):
                runs.append({
                    'text': item.text,
                    'bold': item.bold,
                    'italic': item.italic,
                    'underline': item.underline,
                    'font_name': item.font.name,
                    'font_size': item.font.size,
                    'font_color': item.font.color.rgb,
                    'paragraph_index': idx
                })
            elif isinstance(item, Hyperlink):
                runs.append({
                    'text': item.runs[0].text,
                    'bold': item.runs[0].bold,
                    'italic': item.runs[0].italic,
                    'underline': item.runs[0].underline,
                    'font_name': item.runs[0].font.name,
                    'font_size': item.runs[0].font.size,
                    'font_color': item.runs[0].font.color.rgb,
                    'paragraph_index': idx
                })

        paragraphs_with_runs.append(runs)
    return paragraphs_with_runs

def tokenize_paragraph_with_runs2(runs_in_paragraph):
    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    tokenized_sentences_with_style = []
    for tokenized_sentence in tokenized_sentences:
        tokenized_sentence_with_style = []
        token_idx = 0
        for run in runs_in_paragraph:
            text_in_run = run["text"].strip()

            if text_in_run == tokenized_sentence[token_idx]:
                new_run = run.copy()
                new_run["text"] = text_in_run
                tokenized_sentence_with_style.append(new_run)
                token_idx += 1
                if token_idx >= len(tokenized_sentence):
                    break
            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
                if text_in_run.startswith(tokenized_sentence[token_idx]):
                    # the run spans several tokens: emit one styled entry per token
                    for token in word_tokenize(text_in_run):
                        if token == tokenized_sentence[token_idx]:
                            new_run = run.copy()
                            new_run["text"] = token
                            tokenized_sentence_with_style.append(new_run)
                            token_idx += 1
                else:
                    raise ValueError("could not match run text against the tokenized sentence")
        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
    return tokenized_sentences_with_style

def tokenize_with_runs(runs, detokenizer):
    text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    # explode every run into one entry per token, keeping the run's style
    tokens_with_style = []
    for run in runs:
        tokens = word_tokenize(run["text"])
        for token in tokens:
            tokens_with_style.append(run.copy())
            tokens_with_style[-1]["text"] = token

    token_index = 0
    tokenized_sentences_with_style = []
    for sentence in tokenized_sentences:
        sentence_with_style = []
        for word in sentence:
            if word == tokens_with_style[token_index]["text"]:
                sentence_with_style.append(tokens_with_style[token_index])
                token_index += 1
            else:
                if word.startswith(tokens_with_style[token_index]["text"]):
                    # this token might be split into several runs
                    word_left = word
                    while word_left:
                        sentence_with_style.append(tokens_with_style[token_index])
                        word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                        token_index += 1
                else:
                    raise ValueError("could not match token against the styled runs")
        tokenized_sentences_with_style.append(sentence_with_style)
    return tokenized_sentences_with_style

def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
    # clean temp folder
    for f in os.listdir(temp_folder):
        os.remove(os.path.join(temp_folder, f))

    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")

    # tokenize the original text by sentence and words while keeping the style
    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
                                               original_paragraphs_with_runs]

    # flatten all the runs so we can align with just one call instead of one per paragraph
    original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
                                               sublist]

    # tokenize the translated text by sentence and word
    translated_tokenized_sentences = [word_tokenize(sentence) for
                                      translated_paragraph in translated_paragraphs for sentence in
                                      sent_tokenize(translated_paragraph)]

    # write the file that fastalign will use
    with open(temp_file_path, "w") as out_file:
        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
            out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")

    alignments = aligner.align(temp_file_path)

    # using the alignments generated by fastalign, copy the style of each original token to the translated one
    translated_sentences_with_style = []
    for sentence_idx, sentence_alignments in enumerate(alignments):

        # reverse the order of the alignments and build a dict with it
        sentence_alignments = {target: source for source, target in sentence_alignments}

        translated_sentence_with_style = []
        for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
            # fastalign has found a token aligned with the translated one
            if token_idx in sentence_alignments.keys():
                # get the aligned token
                original_idx = sentence_alignments[token_idx]
                new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)
            else:
                # fastalign did not align this token: reuse the style of the previous translated
                # token, falling back to the first original token when there is no previous one
                if translated_sentence_with_style:
                    new_entry = translated_sentence_with_style[-1].copy()
                else:
                    new_entry = original_tokenized_sentences_with_style[sentence_idx][0].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)

        translated_sentences_with_style.append(translated_sentence_with_style)

    return translated_sentences_with_style

# group contiguous elements with the same boolean values
def group_by_style(values, detokenizer):
    groups = []
    for key, group in groupby(values, key=lambda x: (
            x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'],
            x['paragraph_index'])):
        text = detokenizer.detokenize([item['text'] for item in group])

        if groups and not text.startswith((",", ";", ":", ".", ")")):
            text = " " + text

        groups.append({"text": text,
                       "bold": key[0],
                       "italic": key[1],
                       "underline": key[2],
                       "font_name": key[3],
                       "font_size": key[4],
                       "font_color": key[5],
                       'paragraph_index': key[6]})
    return groups

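
Aside (not part of the file): to make the grouping behaviour concrete, a small self-contained illustration. The style values below are made up; only the keys group_by_style() looks at matter.

from nltk.tokenize.treebank import TreebankWordDetokenizer
from translate_docx import group_by_style

style = {"bold": True, "italic": False, "underline": False,
         "font_name": None, "font_size": None, "font_color": None,
         "paragraph_index": 0}
tokens = [dict(style, text="Hello"), dict(style, text="world"),
          dict(style, text="!", bold=False)]

# Contiguous tokens with identical style merge into one run: the two bold tokens
# collapse into a single "Hello world" entry, while "!" starts a new group.
print(group_by_style(tokens, TreebankWordDetokenizer()))
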
def preprocess_runs(runs_in_paragraph):
    new_runs = []

    for run in runs_in_paragraph:

        # sometimes the parameters are False and sometimes they are None, set them all to False
        for key, value in run.items():
            if value is None and not key.startswith("font"):
                run[key] = False

        if not new_runs:
            new_runs.append(run)
        else:
            # if the previous run has the same format as the current run, merge the two runs together
            if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"]
                    and new_runs[-1]["font_name"] == run["font_name"]
                    and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
                    and new_runs[-1]["underline"] == run["underline"]
                    and new_runs[-1]["paragraph_index"] == run["paragraph_index"]):
                new_runs[-1]["text"] += run["text"]
            else:
                new_runs.append(run)

        # split runs that contain more than one sentence to avoid problems later when aligning styles
        sentences = sent_tokenize(new_runs[-1]["text"])
        if len(sentences) > 1:
            new_runs[-1]["text"] = sentences[0]
            for sentence in sentences[1:]:
                new_run = new_runs[-1].copy()
                new_run["text"] = sentence
                new_runs.append(new_run)

    return new_runs

def translate_document(input_file,
                       aligner,
                       detokenizer,
                       ip="192.168.20.216",
                       temp_folder="tmp",
                       port="8000"):
    os.makedirs(temp_folder, exist_ok=True)
    # load original file, extract the paragraphs with their runs (which include style and formatting)
    doc = Document(input_file)
    paragraphs_with_runs = extract_paragraphs_with_runs(doc)

    # translate each paragraph
    translated_paragraphs = []
    for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
        translated_paragraphs.append(translate(paragraph_text, ip, port))

    out_doc = Document()

    processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]

    translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                          translated_paragraphs, aligner,
                                                          temp_folder, detokenizer)
    # flatten the sentences into a list of tokens
    translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
    # group the tokens by style/run
    translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)

    # group the runs by original paragraph
    translated_paragraphs_with_style = defaultdict(list)
    for item in translated_runs_with_style:
        translated_paragraphs_with_style[item['paragraph_index']].append(item)

    for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
        # in case there are empty paragraphs
        if not original_paragraph.text:
            out_doc.add_paragraph(style=original_paragraph.style)
            continue

        para = out_doc.add_paragraph(style=original_paragraph.style)

        for item in translated_paragraphs_with_style[paragraph_index]:
            run = para.add_run(item["text"])
            # Preserve original run formatting
            run.bold = item['bold']
            run.italic = item['italic']
            run.underline = item['underline']
            run.font.name = item['font_name']
            run.font.size = item['font_size']
            run.font.color.rgb = item['font_color']

    out_doc.save("translated.docx")
    return "translated.docx"
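
Aside (not part of the file): the document pipeline can also be driven without the Gradio UI. A minimal sketch mirroring what gradio_app.py does; the input path, server address and config folder are placeholders.

from nltk.tokenize.treebank import TreebankWordDetokenizer
from translate_docx import Aligner, translate_document

# Placeholder paths/host; point these at your fast_align config and MT server.
aligner = Aligner("fast_align_config", "en", "ca", "tmp")
detokenizer = TreebankWordDetokenizer()
output = translate_document("input.docx", aligner, detokenizer, ip="10.192.31.127")
print(f"Saved translation to {output}")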