Spaces:

LangTech-MT
/

document-translator

Sleeping

mjuvilla committed on Apr 16

Commit

0348f21

1 Parent(s): 0efc9da

fixed some formatting issues

Files changed (1) hide show

translate_docx.py CHANGED Viewed

@@ -247,7 +247,7 @@ def group_by_style(values, detokenizer):
             x['paragraph_index'])):
         text = detokenizer.detokenize([item['text'] for item in group])
-        if groups and not text.startswith((",", ";", ":", ".", ")")):
             text = " " + text
         groups.append({"text": text,
@@ -330,9 +330,17 @@ def translate_document(input_file,
     print("Grouped by style")
     # group the runs by original paragraph
-    translated_paragraphs_with_style = defaultdict(list)
     for item in translated_runs_with_style:
-        translated_paragraphs_with_style[item['paragraph_index']].append(item)
     for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
         # in case there are empty paragraphs

             x['paragraph_index'])):
         text = detokenizer.detokenize([item['text'] for item in group])
+        if groups and not text.startswith((",", ";", ":", ".", ")", "!", "?")):
             text = " " + text
         groups.append({"text": text,
     print("Grouped by style")
     # group the runs by original paragraph
+    translated_paragraphs_with_style = dict()
     for item in translated_runs_with_style:
+        if item['paragraph_index'] in translated_paragraphs_with_style:
+            translated_paragraphs_with_style[item['paragraph_index']].append(item)
+        else:
+            # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+            # didn't know where paragraphs started and ended
+            first_item_in_paragraph = item.copy()
+            first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
+            translated_paragraphs_with_style[item['paragraph_index']] = []
+            translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
     for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
         # in case there are empty paragraphs