Spaces:

LangTech-MT
/

document-translator

Sleeping

App Files Files Community

mjuvilla committed on Apr 11

Commit

595da73

1 Parent(s): b568903

Fixed issues when dealing with hyperlinks (for now we keep the text and formatting, but not the link). Also improved format handling and sped things up a bit by not loading fastalign for empty paragraphs.

Browse files

Files changed (1) hide show

main.py +36 -15

main.py CHANGED Viewed

@@ -2,6 +2,8 @@ import os
 import string
 from docx import Document
 import nltk
 nltk.download('punkt')
@@ -80,18 +82,31 @@ class Aligner():
 # Function to extract paragraphs with their runs
 def extract_paragraphs_with_runs(doc):
     paragraphs_with_runs = []
-    for para in doc.paragraphs:
         runs = []
-        for run in para.runs:
-            runs.append({
-                'text': run.text,
-                'bold': run.bold,
-                'italic': run.italic,
-                'underline': run.underline,
-                'font_name': run.font.name,
-                'font_size': run.font.size,
-                'font_color': run.font.color.rgb
-            })
         paragraphs_with_runs.append(runs)
     return paragraphs_with_runs
@@ -241,6 +256,12 @@ def preprocess_runs(runs_in_paragraph):
     new_runs = []
     for run in runs_in_paragraph:
         if not new_runs:
             new_runs.append(run)
         else:
@@ -266,7 +287,7 @@ def preprocess_runs(runs_in_paragraph):
 if __name__ == "__main__":
-    input_file = 'data/test2.docx'
     output_file = 'data/translated_output.docx'
     source_lang = 'ca'
     target_lang = 'en'
@@ -294,9 +315,9 @@ if __name__ == "__main__":
     for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
                                                                                     translated_paragraphs,
                                                                                     doc.paragraphs):
-        # sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
-        if not original_runs_in_paragraph:
-            continue
         original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)

 import string
 from docx import Document
+from docx.text.hyperlink import Hyperlink
+from docx.text.run import Run
 import nltk
 nltk.download('punkt')
 # Function to extract paragraphs with their runs
 def extract_paragraphs_with_runs(doc):
     """Collect the text and character formatting of every run, per paragraph.

     Hyperlinks are flattened: each run inside a hyperlink is recorded like
     an ordinary run, so its text and formatting are kept while the link
     itself is dropped.

     Args:
         doc: Document whose ``paragraphs`` are scanned in order.

     Returns:
         One list per paragraph, in document order. Each list holds one dict
         per run with the keys ``text``, ``bold``, ``italic``, ``underline``,
         ``font_name``, ``font_size`` and ``font_color``.
     """
     paragraphs_with_runs = []
     for paragraph in doc.paragraphs:
         runs = []
         for item in paragraph.iter_inner_content():
             if isinstance(item, Run):
                 inner_runs = [item]
             elif isinstance(item, Hyperlink):
                 # A hyperlink may wrap several runs (or none). Record all of
                 # them rather than only the first, so no link text is lost
                 # and an empty hyperlink cannot raise IndexError.
                 inner_runs = item.runs
             else:
                 # Any other kind of inner content is ignored.
                 continue
             for run in inner_runs:
                 runs.append({
                     'text': run.text,
                     'bold': run.bold,
                     'italic': run.italic,
                     'underline': run.underline,
                     'font_name': run.font.name,
                     'font_size': run.font.size,
                     'font_color': run.font.color.rgb
                 })
         paragraphs_with_runs.append(runs)
     return paragraphs_with_runs
     new_runs = []
     for run in runs_in_paragraph:
+        # formatting flags are sometimes False and sometimes None; normalise None to False (font_* values are left as they are)
+        for key, value in run.items():
+            if value is None and not key.startswith("font"):
+                run[key] = False
         if not new_runs:
             new_runs.append(run)
         else:
 if __name__ == "__main__":
+    input_file = 'data/test3.docx'
     output_file = 'data/translated_output.docx'
     source_lang = 'ca'
     target_lang = 'en'
     for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
                                                                                     translated_paragraphs,
                                                                                     doc.paragraphs):
+        # in case there are empty paragraphs
+        if len(original_runs_in_paragraph) == 1 and not original_runs_in_paragraph[0]["text"]:
+            out_doc.add_paragraph(style=original_paragraph.style)
         original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)