Spaces:
Sleeping
Sleeping
Fixed issues with hyperlinks (for now we keep their text and formatting but drop the link itself). Also improved format handling and sped things up a bit by not loading fastalign for empty paragraphs.
Browse files
main.py
CHANGED
@@ -2,6 +2,8 @@ import os
|
|
2 |
import string
|
3 |
|
4 |
from docx import Document
|
|
|
|
|
5 |
import nltk
|
6 |
|
7 |
nltk.download('punkt')
|
@@ -80,18 +82,31 @@ class Aligner():
|
|
80 |
# Function to extract paragraphs with their runs
|
81 |
def extract_paragraphs_with_runs(doc):
|
82 |
paragraphs_with_runs = []
|
83 |
-
for
|
84 |
runs = []
|
85 |
-
for
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
paragraphs_with_runs.append(runs)
|
96 |
return paragraphs_with_runs
|
97 |
|
@@ -241,6 +256,12 @@ def preprocess_runs(runs_in_paragraph):
|
|
241 |
new_runs = []
|
242 |
|
243 |
for run in runs_in_paragraph:
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
if not new_runs:
|
245 |
new_runs.append(run)
|
246 |
else:
|
@@ -266,7 +287,7 @@ def preprocess_runs(runs_in_paragraph):
|
|
266 |
|
267 |
|
268 |
if __name__ == "__main__":
|
269 |
-
input_file = 'data/
|
270 |
output_file = 'data/translated_output.docx'
|
271 |
source_lang = 'ca'
|
272 |
target_lang = 'en'
|
@@ -294,9 +315,9 @@ if __name__ == "__main__":
|
|
294 |
for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
|
295 |
translated_paragraphs,
|
296 |
doc.paragraphs):
|
297 |
-
#
|
298 |
-
if not original_runs_in_paragraph:
|
299 |
-
|
300 |
|
301 |
original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
|
302 |
|
|
|
2 |
import string
|
3 |
|
4 |
from docx import Document
|
5 |
+
from docx.text.hyperlink import Hyperlink
|
6 |
+
from docx.text.run import Run
|
7 |
import nltk
|
8 |
|
9 |
nltk.download('punkt')
|
|
|
82 |
# Function to extract paragraphs with their runs
|
83 |
def extract_paragraphs_with_runs(doc):
    """Collect the text and character formatting of every run in ``doc``.

    Each paragraph of ``doc.paragraphs`` is walked with
    ``iter_inner_content()`` so that runs nested inside hyperlinks are seen
    as well as the paragraph's direct runs. The link target itself is not
    kept; only the text and formatting of the runs inside the hyperlink are.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph must provide
            ``iter_inner_content()`` (a python-docx ``Document`` is expected).

    Returns:
        A list with one entry per paragraph. Each entry is a list of dicts
        with the keys ``text``, ``bold``, ``italic``, ``underline``,
        ``font_name``, ``font_size`` and ``font_color``, one dict per run,
        in the order the runs were yielded.
    """

    def describe_run(run):
        # Snapshot of the run attributes that the rest of the pipeline reads.
        return {
            'text': run.text,
            'bold': run.bold,
            'italic': run.italic,
            'underline': run.underline,
            'font_name': run.font.name,
            'font_size': run.font.size,
            'font_color': run.font.color.rgb
        }

    paragraphs_with_runs = []
    for paragraph in doc.paragraphs:
        runs = []
        for item in paragraph.iter_inner_content():
            if isinstance(item, Run):
                runs.append(describe_run(item))
            elif isinstance(item, Hyperlink):
                # A hyperlink may wrap several runs (or none). Record every
                # one of them instead of only the first, so no link text is
                # lost and an empty hyperlink does not raise IndexError.
                runs.extend(describe_run(run) for run in item.runs)

        paragraphs_with_runs.append(runs)
    return paragraphs_with_runs
|
112 |
|
|
|
256 |
new_runs = []
|
257 |
|
258 |
for run in runs_in_paragraph:
|
259 |
+
|
260 |
+
# sometimes the parameters are False and sometimes they are None, set them all to False
|
261 |
+
for key, value in run.items():
|
262 |
+
if value is None and not key.startswith("font"):
|
263 |
+
run[key] = False
|
264 |
+
|
265 |
if not new_runs:
|
266 |
new_runs.append(run)
|
267 |
else:
|
|
|
287 |
|
288 |
|
289 |
if __name__ == "__main__":
|
290 |
+
input_file = 'data/test3.docx'
|
291 |
output_file = 'data/translated_output.docx'
|
292 |
source_lang = 'ca'
|
293 |
target_lang = 'en'
|
|
|
315 |
for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
|
316 |
translated_paragraphs,
|
317 |
doc.paragraphs):
|
318 |
+
# in case there are empty paragraphs
|
319 |
+
if len(original_runs_in_paragraph) == 1 and not original_runs_in_paragraph[0]["text"]:
|
320 |
+
out_doc.add_paragraph(style=original_paragraph.style)
|
321 |
|
322 |
original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
|
323 |
|