Modified the script so we only run fastalign once instead of once per paragraph, significantly reducing the run time. This involves flattening all the text while keeping the original paragraph index so that the original structure can be reconstructed afterwards.
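The core of the change, as a minimal sketch (illustrative names, not the exact ones in main.py; the real script carries the full run styling through the same index):

    from collections import defaultdict

    def flatten_with_index(paragraphs):
        # tag every item with the index of the paragraph it came from
        flat = []
        for idx, items in enumerate(paragraphs):
            for item in items:
                flat.append({"text": item, "paragraph_index": idx})
        return flat

    def regroup_by_index(flat_items):
        # rebuild the per-paragraph structure after the single alignment pass
        grouped = defaultdict(list)
        for item in flat_items:
            grouped[item["paragraph_index"]].append(item)
        return grouped

    paragraphs = [["Hello", "world"], ["Second", "paragraph"]]
    flat = flatten_with_index(paragraphs)  # one flat list -> one fastalign call
    restored = regroup_by_index(flat)      # restored[0] and restored[1] mirror the input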
main.py CHANGED
@@ -1,5 +1,5 @@
 import os
-import …
+from collections import defaultdict
 
 from docx import Document
 from docx.text.hyperlink import Hyperlink
@@ -82,7 +82,7 @@ class Aligner():
 # Function to extract paragraphs with their runs
 def extract_paragraphs_with_runs(doc):
     paragraphs_with_runs = []
-    for paragraph in doc.paragraphs:
+    for idx, paragraph in enumerate(doc.paragraphs):
         runs = []
         for item in paragraph.iter_inner_content():
             if isinstance(item, Run):
@@ -93,7 +93,8 @@ def extract_paragraphs_with_runs(doc):
                 'underline': item.underline,
                 'font_name': item.font.name,
                 'font_size': item.font.size,
-                'font_color': item.font.color.rgb
+                'font_color': item.font.color.rgb,
+                'paragraph_index': idx
             })
         elif isinstance(item, Hyperlink):
             runs.append({
@@ -103,10 +104,10 @@ def extract_paragraphs_with_runs(doc):
                 'underline': item.runs[0].underline,
                 'font_name': item.runs[0].font.name,
                 'font_size': item.runs[0].font.size,
-                'font_color': item.runs[0].font.color.rgb
+                'font_color': item.runs[0].font.color.rgb,
+                'paragraph_index': idx
             })
 
-
         paragraphs_with_runs.append(runs)
     return paragraphs_with_runs
 
@@ -144,13 +145,13 @@ def tokenize_paragraph_with_runs2(runs_in_paragraph):
     return tokenized_sentences_with_style
 
 
-def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
-    text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
+def tokenize_with_runs(runs, detokenizer):
+    text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
     sentences = sent_tokenize(text_paragraph)
     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
 
     tokens_with_style = []
-    for run in runs_in_paragraph:
+    for run in runs:
         tokens = word_tokenize(run["text"])
         for token in tokens:
             tokens_with_style.append(run.copy())
@@ -179,7 +180,7 @@ def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
     return tokenized_sentences_with_style
 
 
-def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
+def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
     # clean temp folder
     for f in os.listdir(temp_folder):
         os.remove(os.path.join(temp_folder, f))
@@ -187,9 +188,17 @@ def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
     temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
 
     # tokenize the original text by sentence and words while keeping the style
-    original_tokenized_sentences_with_style = …
+    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
+                                               original_paragraphs_with_runs]
+
+    # flatten all the runs so we can align with just one call instead of one per paragraph
+    original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
+                                               sublist]
+
     # tokenize the translated text by sentence and word
-    translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]
+    translated_tokenized_sentences = [word_tokenize(sentence) for
+                                      translated_paragraph in translated_paragraphs for sentence in
+                                      sent_tokenize(translated_paragraph)]
 
     # write the file that fastalign will use
     with open(temp_file_path, "w") as out_file:
@@ -236,7 +245,8 @@ def translate_paragraph(paragraph_text):
 def group_by_style(values, detokenizer):
     groups = []
     for key, group in groupby(values, key=lambda x: (
-            x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
+            x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'],
+            x['paragraph_index'])):
         text = detokenizer.detokenize([item['text'] for item in group])
 
         if groups and not text.startswith((",", ";", ":", ".", ")")):
@@ -248,7 +258,8 @@ def group_by_style(values, detokenizer):
             "underline": key[2],
             "font_name": key[3],
             "font_size": key[4],
-            "font_color": key[5]})
+            "font_color": key[5],
+            'paragraph_index': key[6]})
     return groups
 
 
@@ -269,7 +280,8 @@ def preprocess_runs(runs_in_paragraph):
         if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"] and
                 new_runs[-1]["font_color"] == run["font_color"] and new_runs[-1]["font_name"] == run["font_name"]
                 and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
-                and new_runs[-1]["underline"] == run["underline"]):
+                and new_runs[-1]["underline"] == run["underline"]
+                and new_runs[-1]["paragraph_index"] == run["paragraph_index"]):
             new_runs[-1]["text"] += run["text"]
         else:
             new_runs.append(run)
@@ -309,31 +321,33 @@ if __name__ == "__main__":
     for paragraph in paragraphs_with_runs:
         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
         translated_paragraphs.append(translate_paragraph(paragraph_text))
-
+
     out_doc = Document()
 
-    …
-    …
-    …
+    processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
+
+    translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
+                                                          translated_paragraphs, aligner,
+                                                          temp_folder, detokenizer)
+    # flatten the sentences into a list of tokens
+    translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+    # group the tokens by style/run
+    translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+
+    # group the runs by original paragraph
+    translated_paragraphs_with_style = defaultdict(list)
+    for item in translated_runs_with_style:
+        translated_paragraphs_with_style[item['paragraph_index']].append(item)
+
+    for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
         # in case there are empty paragraphs
-        if …
+        if not original_paragraph.text:
             out_doc.add_paragraph(style=original_paragraph.style)
             continue
 
-        original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
-
-        paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
-                                                   temp_folder, detokenizer)
-
         para = out_doc.add_paragraph(style=original_paragraph.style)
 
-
-        paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
-
-        # merge tokens into runs and detokenize
-        paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
-
-        for item in paragraph_with_runs:
+        for item in translated_paragraphs_with_style[paragraph_index]:
             run = para.add_run(item["text"])
             # Preserve original run formatting
             run.bold = item['bold']
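A note on why one call is enough: fast_align reads one "source tokens ||| target tokens" pair per line and emits one line of i-j index pairs per input line, so the flattened original and translated sentence lists only need to stay in the same order and length. A sketch of that round trip (the diff shows only the input file being written, so the invocation details here are assumptions):

    src_sentences = [["Hello", "world", "!"], ["Bye", "."]]
    tgt_sentences = [["Hola", "mundo", "!"], ["Adiós", "."]]

    # fast_align input: one tokenized sentence pair per line, separated by |||
    with open("tokenized_sentences.txt", "w") as out_file:
        for src, tgt in zip(src_sentences, tgt_sentences):
            out_file.write(" ".join(src) + " ||| " + " ".join(tgt) + "\n")

    # fast_align output: one line per pair, e.g. "0-0 1-1 2-2", where i-j
    # means source token i aligns with target token j
    pairs = [tuple(map(int, p.split("-"))) for p in "0-0 1-1 2-2".split()]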
|