mjuvilla committed on
Commit
595da73
·
1 Parent(s): b568903

Fixed issues when dealing with hyperlinks (for now we keep the text and formatting, but not the link). Also improved format handling and sped things up a bit by not loading fastalign for empty paragraphs.

Browse files
Files changed (1) hide show
  1. main.py +36 -15
main.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import string
3
 
4
  from docx import Document
 
 
5
  import nltk
6
 
7
  nltk.download('punkt')
@@ -80,18 +82,31 @@ class Aligner():
80
  # Function to extract paragraphs with their runs
81
  def extract_paragraphs_with_runs(doc):
82
  paragraphs_with_runs = []
83
- for para in doc.paragraphs:
84
  runs = []
85
- for run in para.runs:
86
- runs.append({
87
- 'text': run.text,
88
- 'bold': run.bold,
89
- 'italic': run.italic,
90
- 'underline': run.underline,
91
- 'font_name': run.font.name,
92
- 'font_size': run.font.size,
93
- 'font_color': run.font.color.rgb
94
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  paragraphs_with_runs.append(runs)
96
  return paragraphs_with_runs
97
 
@@ -241,6 +256,12 @@ def preprocess_runs(runs_in_paragraph):
241
  new_runs = []
242
 
243
  for run in runs_in_paragraph:
 
 
 
 
 
 
244
  if not new_runs:
245
  new_runs.append(run)
246
  else:
@@ -266,7 +287,7 @@ def preprocess_runs(runs_in_paragraph):
266
 
267
 
268
  if __name__ == "__main__":
269
- input_file = 'data/test2.docx'
270
  output_file = 'data/translated_output.docx'
271
  source_lang = 'ca'
272
  target_lang = 'en'
@@ -294,9 +315,9 @@ if __name__ == "__main__":
294
  for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
295
  translated_paragraphs,
296
  doc.paragraphs):
297
- # sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
298
- if not original_runs_in_paragraph:
299
- continue
300
 
301
  original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
302
 
 
2
  import string
3
 
4
  from docx import Document
5
+ from docx.text.hyperlink import Hyperlink
6
+ from docx.text.run import Run
7
  import nltk
8
 
9
  nltk.download('punkt')
 
82
  # Function to extract paragraphs with their runs
83
  def extract_paragraphs_with_runs(doc):
84
  paragraphs_with_runs = []
85
+ for paragraph in doc.paragraphs:
86
  runs = []
87
+ for item in paragraph.iter_inner_content():
88
+ if isinstance(item, Run):
89
+ runs.append({
90
+ 'text': item.text,
91
+ 'bold': item.bold,
92
+ 'italic': item.italic,
93
+ 'underline': item.underline,
94
+ 'font_name': item.font.name,
95
+ 'font_size': item.font.size,
96
+ 'font_color': item.font.color.rgb
97
+ })
98
+ elif isinstance(item, Hyperlink):
99
+ runs.append({
100
+ 'text': item.runs[0].text,
101
+ 'bold': item.runs[0].bold,
102
+ 'italic': item.runs[0].italic,
103
+ 'underline': item.runs[0].underline,
104
+ 'font_name': item.runs[0].font.name,
105
+ 'font_size': item.runs[0].font.size,
106
+ 'font_color': item.runs[0].font.color.rgb
107
+ })
108
+
109
+
110
  paragraphs_with_runs.append(runs)
111
  return paragraphs_with_runs
112
 
 
256
  new_runs = []
257
 
258
  for run in runs_in_paragraph:
259
+
260
+ # sometimes the parameters are False and sometimes they are None, set them all to False
261
+ for key, value in run.items():
262
+ if value is None and not key.startswith("font"):
263
+ run[key] = False
264
+
265
  if not new_runs:
266
  new_runs.append(run)
267
  else:
 
287
 
288
 
289
  if __name__ == "__main__":
290
+ input_file = 'data/test3.docx'
291
  output_file = 'data/translated_output.docx'
292
  source_lang = 'ca'
293
  target_lang = 'en'
 
315
  for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
316
  translated_paragraphs,
317
  doc.paragraphs):
318
+ # in case there are empty paragraphs
319
+ if len(original_runs_in_paragraph) == 1 and not original_runs_in_paragraph[0]["text"]:
320
+ out_doc.add_paragraph(style=original_paragraph.style)
321
 
322
  original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
323