mjuvilla committed on
Commit
0348f21
·
1 Parent(s): 0efc9da

fixed some formatting issues

Browse files
Files changed (1) hide show
  1. translate_docx.py +11 -3
translate_docx.py CHANGED
@@ -247,7 +247,7 @@ def group_by_style(values, detokenizer):
247
  x['paragraph_index'])):
248
  text = detokenizer.detokenize([item['text'] for item in group])
249
 
250
- if groups and not text.startswith((",", ";", ":", ".", ")")):
251
  text = " " + text
252
 
253
  groups.append({"text": text,
@@ -330,9 +330,17 @@ def translate_document(input_file,
330
  print("Grouped by style")
331
 
332
  # group the runs by original paragraph
333
- translated_paragraphs_with_style = defaultdict(list)
334
  for item in translated_runs_with_style:
335
- translated_paragraphs_with_style[item['paragraph_index']].append(item)
 
 
 
 
 
 
 
 
336
 
337
  for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
338
  # in case there are empty paragraphs
 
247
  x['paragraph_index'])):
248
  text = detokenizer.detokenize([item['text'] for item in group])
249
 
250
+ if groups and not text.startswith((",", ";", ":", ".", ")", "!", "?")):
251
  text = " " + text
252
 
253
  groups.append({"text": text,
 
330
  print("Grouped by style")
331
 
332
  # group the runs by original paragraph
333
+ translated_paragraphs_with_style = dict()
334
  for item in translated_runs_with_style:
335
+ if item['paragraph_index'] in translated_paragraphs_with_style:
336
+ translated_paragraphs_with_style[item['paragraph_index']].append(item)
337
+ else:
338
+ # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
339
+ # didn't know where paragraphs started and ended
340
+ first_item_in_paragraph = item.copy()
341
+ first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
342
+ translated_paragraphs_with_style[item['paragraph_index']] = []
343
+ translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
344
 
345
  for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
346
  # in case there are empty paragraphs