mjuvilla committed on
Commit
0fc4acd
·
1 Parent(s): 08ca2fd

Modified the script so we only run fastalign once instead of once per paragraph, significantly reducing the run time. This involves flattening all the text while keeping the original paragraph index so that the original structure can be reconstructed afterwards.

Browse files
Files changed (1) hide show
  1. main.py +45 -31
main.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- import string
3
 
4
  from docx import Document
5
  from docx.text.hyperlink import Hyperlink
@@ -82,7 +82,7 @@ class Aligner():
82
  # Function to extract paragraphs with their runs
83
  def extract_paragraphs_with_runs(doc):
84
  paragraphs_with_runs = []
85
- for paragraph in doc.paragraphs:
86
  runs = []
87
  for item in paragraph.iter_inner_content():
88
  if isinstance(item, Run):
@@ -93,7 +93,8 @@ def extract_paragraphs_with_runs(doc):
93
  'underline': item.underline,
94
  'font_name': item.font.name,
95
  'font_size': item.font.size,
96
- 'font_color': item.font.color.rgb
 
97
  })
98
  elif isinstance(item, Hyperlink):
99
  runs.append({
@@ -103,10 +104,10 @@ def extract_paragraphs_with_runs(doc):
103
  'underline': item.runs[0].underline,
104
  'font_name': item.runs[0].font.name,
105
  'font_size': item.runs[0].font.size,
106
- 'font_color': item.runs[0].font.color.rgb
 
107
  })
108
 
109
-
110
  paragraphs_with_runs.append(runs)
111
  return paragraphs_with_runs
112
 
@@ -144,13 +145,13 @@ def tokenize_paragraph_with_runs2(runs_in_paragraph):
144
  return tokenized_sentences_with_style
145
 
146
 
147
- def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
148
- text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
149
  sentences = sent_tokenize(text_paragraph)
150
  tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
151
 
152
  tokens_with_style = []
153
- for run in runs_in_paragraph:
154
  tokens = word_tokenize(run["text"])
155
  for token in tokens:
156
  tokens_with_style.append(run.copy())
@@ -179,7 +180,7 @@ def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
179
  return tokenized_sentences_with_style
180
 
181
 
182
- def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
183
  # clean temp folder
184
  for f in os.listdir(temp_folder):
185
  os.remove(os.path.join(temp_folder, f))
@@ -187,9 +188,17 @@ def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligne
187
  temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
188
 
189
  # tokenize the original text by sentence and words while keeping the style
190
- original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
 
 
 
 
 
 
191
  # tokenize the translated text by sentence and word
192
- translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]
 
 
193
 
194
  # write the file that fastalign will use
195
  with open(temp_file_path, "w") as out_file:
@@ -236,7 +245,8 @@ def translate_paragraph(paragraph_text):
236
  def group_by_style(values, detokenizer):
237
  groups = []
238
  for key, group in groupby(values, key=lambda x: (
239
- x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
 
240
  text = detokenizer.detokenize([item['text'] for item in group])
241
 
242
  if groups and not text.startswith((",", ";", ":", ".", ")")):
@@ -248,7 +258,8 @@ def group_by_style(values, detokenizer):
248
  "underline": key[2],
249
  "font_name": key[3],
250
  "font_size": key[4],
251
- "font_color": key[5]})
 
252
  return groups
253
 
254
 
@@ -269,7 +280,8 @@ def preprocess_runs(runs_in_paragraph):
269
  if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"] and
270
  new_runs[-1]["font_color"] == run["font_color"] and new_runs[-1]["font_name"] == run["font_name"]
271
  and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
272
- and new_runs[-1]["underline"] == run["underline"]):
 
273
  new_runs[-1]["text"] += run["text"]
274
  else:
275
  new_runs.append(run)
@@ -309,31 +321,33 @@ if __name__ == "__main__":
309
  for paragraph in paragraphs_with_runs:
310
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
311
  translated_paragraphs.append(translate_paragraph(paragraph_text))
312
-
313
  out_doc = Document()
314
 
315
- for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
316
- translated_paragraphs,
317
- doc.paragraphs):
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  # in case there are empty paragraphs
319
- if len(original_runs_in_paragraph) == 1 and not original_runs_in_paragraph[0]["text"]:
320
  out_doc.add_paragraph(style=original_paragraph.style)
321
  continue
322
 
323
- original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
324
-
325
- paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
326
- temp_folder, detokenizer)
327
-
328
  para = out_doc.add_paragraph(style=original_paragraph.style)
329
 
330
- # flatten the paragraph, we don't need it to split into sentences anymore
331
- paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
332
-
333
- # merge tokens into runs and detokenize
334
- paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
335
-
336
- for item in paragraph_with_runs:
337
  run = para.add_run(item["text"])
338
  # Preserve original run formatting
339
  run.bold = item['bold']
 
1
  import os
2
+ from collections import defaultdict
3
 
4
  from docx import Document
5
  from docx.text.hyperlink import Hyperlink
 
82
  # Function to extract paragraphs with their runs
83
  def extract_paragraphs_with_runs(doc):
84
  paragraphs_with_runs = []
85
+ for idx, paragraph in enumerate(doc.paragraphs):
86
  runs = []
87
  for item in paragraph.iter_inner_content():
88
  if isinstance(item, Run):
 
93
  'underline': item.underline,
94
  'font_name': item.font.name,
95
  'font_size': item.font.size,
96
+ 'font_color': item.font.color.rgb,
97
+ 'paragraph_index': idx
98
  })
99
  elif isinstance(item, Hyperlink):
100
  runs.append({
 
104
  'underline': item.runs[0].underline,
105
  'font_name': item.runs[0].font.name,
106
  'font_size': item.runs[0].font.size,
107
+ 'font_color': item.runs[0].font.color.rgb,
108
+ 'paragraph_index': idx
109
  })
110
 
 
111
  paragraphs_with_runs.append(runs)
112
  return paragraphs_with_runs
113
 
 
145
  return tokenized_sentences_with_style
146
 
147
 
148
+ def tokenize_with_runs(runs, detokenizer):
149
+ text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
150
  sentences = sent_tokenize(text_paragraph)
151
  tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
152
 
153
  tokens_with_style = []
154
+ for run in runs:
155
  tokens = word_tokenize(run["text"])
156
  for token in tokens:
157
  tokens_with_style.append(run.copy())
 
180
  return tokenized_sentences_with_style
181
 
182
 
183
+ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
184
  # clean temp folder
185
  for f in os.listdir(temp_folder):
186
  os.remove(os.path.join(temp_folder, f))
 
188
  temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
189
 
190
  # tokenize the original text by sentence and words while keeping the style
191
+ original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
192
+ original_paragraphs_with_runs]
193
+
194
+ # flatten all the runs so we can align with just one call instead of one per paragraph
195
+ original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
196
+ sublist]
197
+
198
  # tokenize the translated text by sentence and word
199
+ translated_tokenized_sentences = [word_tokenize(sentence) for
200
+ translated_paragraph in translated_paragraphs for sentence in
201
+ sent_tokenize(translated_paragraph)]
202
 
203
  # write the file that fastalign will use
204
  with open(temp_file_path, "w") as out_file:
 
245
  def group_by_style(values, detokenizer):
246
  groups = []
247
  for key, group in groupby(values, key=lambda x: (
248
+ x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'],
249
+ x['paragraph_index'])):
250
  text = detokenizer.detokenize([item['text'] for item in group])
251
 
252
  if groups and not text.startswith((",", ";", ":", ".", ")")):
 
258
  "underline": key[2],
259
  "font_name": key[3],
260
  "font_size": key[4],
261
+ "font_color": key[5],
262
+ 'paragraph_index': key[6]})
263
  return groups
264
 
265
 
 
280
  if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"] and
281
  new_runs[-1]["font_color"] == run["font_color"] and new_runs[-1]["font_name"] == run["font_name"]
282
  and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
283
+ and new_runs[-1]["underline"] == run["underline"]
284
+ and new_runs[-1]["paragraph_index"] == run["paragraph_index"]):
285
  new_runs[-1]["text"] += run["text"]
286
  else:
287
  new_runs.append(run)
 
321
  for paragraph in paragraphs_with_runs:
322
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
323
  translated_paragraphs.append(translate_paragraph(paragraph_text))
324
+
325
  out_doc = Document()
326
 
327
+ processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
328
+
329
+ translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
330
+ translated_paragraphs, aligner,
331
+ temp_folder, detokenizer)
332
+ # flatten the sentences into a list of tokens
333
+ translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
334
+ # group the tokens by style/run
335
+ translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
336
+
337
+ # group the runs by original paragraph
338
+ translated_paragraphs_with_style = defaultdict(list)
339
+ for item in translated_runs_with_style:
340
+ translated_paragraphs_with_style[item['paragraph_index']].append(item)
341
+
342
+ for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
343
  # in case there are empty paragraphs
344
+ if not original_paragraph.text:
345
  out_doc.add_paragraph(style=original_paragraph.style)
346
  continue
347
 
 
 
 
 
 
348
  para = out_doc.add_paragraph(style=original_paragraph.style)
349
 
350
+ for item in translated_paragraphs_with_style[paragraph_index]:
 
 
 
 
 
 
351
  run = para.add_run(item["text"])
352
  # Preserve original run formatting
353
  run.bold = item['bold']