mjuvilla committed on
Commit
0fc4acd
·
1 Parent(s): 08ca2fd

Modified the script so we only run fastalign once instead of once per paragraph, significantly reducing the run time. This involves flattening all the text while keeping the original paragraph index so that the original structure can be reconstructed afterwards.

Browse files
Files changed (1) hide show
  1. main.py +45 -31
main.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- import string
3
 
4
  from docx import Document
5
  from docx.text.hyperlink import Hyperlink
@@ -82,7 +82,7 @@ class Aligner():
82
  # Function to extract paragraphs with their runs
83
  def extract_paragraphs_with_runs(doc):
84
  paragraphs_with_runs = []
85
- for paragraph in doc.paragraphs:
86
  runs = []
87
  for item in paragraph.iter_inner_content():
88
  if isinstance(item, Run):
@@ -93,7 +93,8 @@ def extract_paragraphs_with_runs(doc):
93
  'underline': item.underline,
94
  'font_name': item.font.name,
95
  'font_size': item.font.size,
96
- 'font_color': item.font.color.rgb
 
97
  })
98
  elif isinstance(item, Hyperlink):
99
  runs.append({
@@ -103,10 +104,10 @@ def extract_paragraphs_with_runs(doc):
103
  'underline': item.runs[0].underline,
104
  'font_name': item.runs[0].font.name,
105
  'font_size': item.runs[0].font.size,
106
- 'font_color': item.runs[0].font.color.rgb
 
107
  })
108
 
109
-
110
  paragraphs_with_runs.append(runs)
111
  return paragraphs_with_runs
112
 
@@ -144,13 +145,13 @@ def tokenize_paragraph_with_runs2(runs_in_paragraph):
144
  return tokenized_sentences_with_style
145
 
146
 
147
- def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
148
- text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
149
  sentences = sent_tokenize(text_paragraph)
150
  tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
151
 
152
  tokens_with_style = []
153
- for run in runs_in_paragraph:
154
  tokens = word_tokenize(run["text"])
155
  for token in tokens:
156
  tokens_with_style.append(run.copy())
@@ -179,7 +180,7 @@ def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
179
  return tokenized_sentences_with_style
180
 
181
 
182
- def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
183
  # clean temp folder
184
  for f in os.listdir(temp_folder):
185
  os.remove(os.path.join(temp_folder, f))
@@ -187,9 +188,17 @@ def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligne
187
  temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
188
 
189
  # tokenize the original text by sentence and words while keeping the style
190
- original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
 
 
 
 
 
 
191
  # tokenize the translated text by sentence and word
192
- translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]
 
 
193
 
194
  # write the file that fastalign will use
195
  with open(temp_file_path, "w") as out_file:
@@ -236,7 +245,8 @@ def translate_paragraph(paragraph_text):
236
  def group_by_style(values, detokenizer):
237
  groups = []
238
  for key, group in groupby(values, key=lambda x: (
239
- x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
 
240
  text = detokenizer.detokenize([item['text'] for item in group])
241
 
242
  if groups and not text.startswith((",", ";", ":", ".", ")")):
@@ -248,7 +258,8 @@ def group_by_style(values, detokenizer):
248
  "underline": key[2],
249
  "font_name": key[3],
250
  "font_size": key[4],
251
- "font_color": key[5]})
 
252
  return groups
253
 
254
 
@@ -269,7 +280,8 @@ def preprocess_runs(runs_in_paragraph):
269
  if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"] and
270
  new_runs[-1]["font_color"] == run["font_color"] and new_runs[-1]["font_name"] == run["font_name"]
271
  and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
272
- and new_runs[-1]["underline"] == run["underline"]):
 
273
  new_runs[-1]["text"] += run["text"]
274
  else:
275
  new_runs.append(run)
@@ -309,31 +321,33 @@ if __name__ == "__main__":
309
  for paragraph in paragraphs_with_runs:
310
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
311
  translated_paragraphs.append(translate_paragraph(paragraph_text))
312
-
313
  out_doc = Document()
314
 
315
- for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
316
- translated_paragraphs,
317
- doc.paragraphs):
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  # in case there are empty paragraphs
319
- if len(original_runs_in_paragraph) == 1 and not original_runs_in_paragraph[0]["text"]:
320
  out_doc.add_paragraph(style=original_paragraph.style)
321
  continue
322
 
323
- original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
324
-
325
- paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
326
- temp_folder, detokenizer)
327
-
328
  para = out_doc.add_paragraph(style=original_paragraph.style)
329
 
330
- # flatten the paragraph, we don't need it to split into sentences anymore
331
- paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
332
-
333
- # merge tokens into runs and detokenize
334
- paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
335
-
336
- for item in paragraph_with_runs:
337
  run = para.add_run(item["text"])
338
  # Preserve original run formatting
339
  run.bold = item['bold']
 
1
  import os
2
+ from collections import defaultdict
3
 
4
  from docx import Document
5
  from docx.text.hyperlink import Hyperlink
 
82
  # Function to extract paragraphs with their runs
83
  def extract_paragraphs_with_runs(doc):
84
  paragraphs_with_runs = []
85
+ for idx, paragraph in enumerate(doc.paragraphs):
86
  runs = []
87
  for item in paragraph.iter_inner_content():
88
  if isinstance(item, Run):
 
93
  'underline': item.underline,
94
  'font_name': item.font.name,
95
  'font_size': item.font.size,
96
+ 'font_color': item.font.color.rgb,
97
+ 'paragraph_index': idx
98
  })
99
  elif isinstance(item, Hyperlink):
100
  runs.append({
 
104
  'underline': item.runs[0].underline,
105
  'font_name': item.runs[0].font.name,
106
  'font_size': item.runs[0].font.size,
107
+ 'font_color': item.runs[0].font.color.rgb,
108
+ 'paragraph_index': idx
109
  })
110
 
 
111
  paragraphs_with_runs.append(runs)
112
  return paragraphs_with_runs
113
 
 
145
  return tokenized_sentences_with_style
146
 
147
 
148
+ def tokenize_with_runs(runs, detokenizer):
149
+ text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
150
  sentences = sent_tokenize(text_paragraph)
151
  tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
152
 
153
  tokens_with_style = []
154
+ for run in runs:
155
  tokens = word_tokenize(run["text"])
156
  for token in tokens:
157
  tokens_with_style.append(run.copy())
 
180
  return tokenized_sentences_with_style
181
 
182
 
183
+ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
184
  # clean temp folder
185
  for f in os.listdir(temp_folder):
186
  os.remove(os.path.join(temp_folder, f))
 
188
  temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
189
 
190
  # tokenize the original text by sentence and words while keeping the style
191
+ original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
192
+ original_paragraphs_with_runs]
193
+
194
+ # flatten all the runs so we can align with just one call instead of one per paragraph
195
+ original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
196
+ sublist]
197
+
198
  # tokenize the translated text by sentence and word
199
+ translated_tokenized_sentences = [word_tokenize(sentence) for
200
+ translated_paragraph in translated_paragraphs for sentence in
201
+ sent_tokenize(translated_paragraph)]
202
 
203
  # write the file that fastalign will use
204
  with open(temp_file_path, "w") as out_file:
 
245
  def group_by_style(values, detokenizer):
246
  groups = []
247
  for key, group in groupby(values, key=lambda x: (
248
+ x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'],
249
+ x['paragraph_index'])):
250
  text = detokenizer.detokenize([item['text'] for item in group])
251
 
252
  if groups and not text.startswith((",", ";", ":", ".", ")")):
 
258
  "underline": key[2],
259
  "font_name": key[3],
260
  "font_size": key[4],
261
+ "font_color": key[5],
262
+ 'paragraph_index': key[6]})
263
  return groups
264
 
265
 
 
280
  if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"] and
281
  new_runs[-1]["font_color"] == run["font_color"] and new_runs[-1]["font_name"] == run["font_name"]
282
  and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
283
+ and new_runs[-1]["underline"] == run["underline"]
284
+ and new_runs[-1]["paragraph_index"] == run["paragraph_index"]):
285
  new_runs[-1]["text"] += run["text"]
286
  else:
287
  new_runs.append(run)
 
321
  for paragraph in paragraphs_with_runs:
322
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
323
  translated_paragraphs.append(translate_paragraph(paragraph_text))
324
+
325
  out_doc = Document()
326
 
327
+ processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
328
+
329
+ translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
330
+ translated_paragraphs, aligner,
331
+ temp_folder, detokenizer)
332
+ # flatten the sentences into a list of tokens
333
+ translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
334
+ # group the tokens by style/run
335
+ translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
336
+
337
+ # group the runs by original paragraph
338
+ translated_paragraphs_with_style = defaultdict(list)
339
+ for item in translated_runs_with_style:
340
+ translated_paragraphs_with_style[item['paragraph_index']].append(item)
341
+
342
+ for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
343
  # in case there are empty paragraphs
344
+ if not original_paragraph.text:
345
  out_doc.add_paragraph(style=original_paragraph.style)
346
  continue
347
 
 
 
 
 
 
348
  para = out_doc.add_paragraph(style=original_paragraph.style)
349
 
350
+ for item in translated_paragraphs_with_style[paragraph_index]:
 
 
 
 
 
 
351
  run = para.add_run(item["text"])
352
  # Preserve original run formatting
353
  run.bold = item['bold']