mjuvilla committed on
Commit 4420a7f · 1 Parent(s): 7743917

Fixed bug when processing docx files

Files changed (1): src/translate_any_doc.py (+71 -40)
src/translate_any_doc.py CHANGED
@@ -44,7 +44,7 @@ def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal
     return os.path.join(original_xliff_file_path + f".{source_lang}")
 
 
-def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
+def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
     """
     Given some text that may or may not contain some chunks tagged with something like <g id=1> </g>, extract each
     of the runs of text and convert them into dictionaries to keep this information
@@ -59,39 +59,61 @@ def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str,
 
     tag_stack = []
     runs = []
-    buffer = ''
     pos = 0
 
-    tag_pattern = re.compile(r'<(/?)g(?: id="(\d+)")?>')
-
-    while pos < len(text):
-        match = tag_pattern.search(text, pos)
-        if match:
-            start, end = match.span()
-            # Add any text before this tag as a run
-            if start > pos:
-                buffer = text[pos:start]
-                if buffer:
-                    runs.append({"text": buffer, "id": tuple(tag_stack) if tag_stack else None,
-                                 "paragraph_index": paragraph_index})
-
-            is_closing, tag_id = match.groups()
-            if is_closing:
-                # Pop the last matching tag ID
-                if tag_stack:
-                    tag_stack.pop()
+    # Match any tag: <tag id="123"/>, </tag>, or <tag id="123">
+    tag_pattern = re.compile(r'<(/?)(\w+)(?:\s+id="(\d+)")?\s*(/?)>')
+
+    for match in tag_pattern.finditer(paragraph):
+        start, end = match.span()
+        is_closing = match.group(1) == "/"
+        tag_name = match.group(2)
+        tag_id = match.group(3)
+        is_self_closing = match.group(4) == "/"
+
+        # Text before this tag
+        if start > pos:
+            text = paragraph[pos:start]
+            if text:
+                runs.append({
+                    "text": text,
+                    "id": tag_stack.copy(),
+                    "paragraph_index": paragraph_index
+                })
+
+        if is_closing:
+            # Closing tag </tag>
+            expected_prefix = f"{tag_name}_"
+            if tag_stack and tag_stack[-1].startswith(expected_prefix):
+                tag_stack.pop()
             else:
-                # Opening tag
-                tag_stack.append(tag_id)
-
-            pos = end  # Move position past this tag
+                raise ValueError(f"Mismatched closing tag </{tag_name}>")
+        elif is_self_closing:
+            # Self-closing tag like <x id="1"/>
+            if tag_id is None:
+                raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
+            runs.append({
+                "text": "",
+                "id": [f"{tag_name}_{tag_id}"],
+                "paragraph_index": paragraph_index
+            })
         else:
-            # No more tags, capture the rest
-            buffer = text[pos:]
-            if buffer:
-                runs.append(
-                    {"text": buffer, "id": tuple(tag_stack) if tag_stack else None, "paragraph_index": paragraph_index})
-            break
+            # Opening tag <tag id="...">
+            if tag_id is None:
+                raise ValueError(f"Opening tag <{tag_name}> missing id")
+            tag_stack.append(f"{tag_name}_{tag_id}")
+
+        pos = end
+
+    # Final trailing text
+    if pos < len(paragraph):
+        text = paragraph[pos:]
+        if text:
+            runs.append({
+                "text": text,
+                "id": tag_stack.copy(),
+                "paragraph_index": paragraph_index
+            })
 
     return runs
 
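Review note: the old pattern only understood `<g>` tags, so the self-closing `<x id="..."/>` placeholders that tikal emits for inline objects in docx files never matched and leaked into the text, which is presumably the bug the commit message refers to. A sketch (not part of the commit) of what the rewritten parser should return on made-up inputs:

```python
# Hypothetical XLIFF-style input; tag names g/x as in Okapi/tikal output.
get_runs_from_paragraph('Hi <g id="1">bold <g id="2">both</g></g>!', 0)
# -> [{'text': 'Hi ',   'id': [],             'paragraph_index': 0},
#     {'text': 'bold ', 'id': ['g_1'],        'paragraph_index': 0},
#     {'text': 'both',  'id': ['g_1', 'g_2'], 'paragraph_index': 0},
#     {'text': '!',     'id': [],             'paragraph_index': 0}]

# Self-closing placeholders now survive as empty runs instead of leaking:
get_runs_from_paragraph('see <x id="7"/> here', 3)
# -> [{'text': 'see ',  'id': [],      'paragraph_index': 3},
#     {'text': '',      'id': ['x_7'], 'paragraph_index': 3},
#     {'text': ' here', 'id': [],      'paragraph_index': 3}]
```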
@@ -115,9 +137,12 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
     tokens_with_style = []
     for run in runs:
         tokens = word_tokenize(run["text"])
-        for token in tokens:
+        if tokens:
+            for token in tokens:
+                tokens_with_style.append(run.copy())
+                tokens_with_style[-1]["text"] = token
+        else:
             tokens_with_style.append(run.copy())
-            tokens_with_style[-1]["text"] = token
 
     token_index = 0
     tokenized_sentences_with_style = []
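Review note: this pairs with the parser change above. NLTK's `word_tokenize("")` returns `[]`, so the old loop body never ran for an empty run and silently dropped it, losing any placeholder tag attached to it. A minimal illustration with a hypothetical run:

```python
# Why the new else branch matters: empty runs carry placeholder tags.
run = {"text": "", "id": ["x_7"], "paragraph_index": 0}
tokens = []            # what word_tokenize returns for run["text"] == ""
tokens_with_style = []
if tokens:
    for token in tokens:
        tokens_with_style.append(run.copy())
        tokens_with_style[-1]["text"] = token
else:
    # new branch: keep the run as a single empty token so the tag survives
    tokens_with_style.append(run.copy())
assert tokens_with_style == [run]
```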
@@ -259,7 +284,11 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
         return ''.join(f'</g>' for _ in to_close)
 
     def open_tags(to_open):
-        return ''.join(f'<g id="{gid}">' for gid in to_open)
+        tag = ""
+        for gid in to_open:
+            tag_type, tag_id = gid.split("_")
+            tag += f'<{tag_type} id="{tag_id}">'
+        return tag
 
     for key, paragraph in paragraphs_with_style.items():
         output = []
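Review note: stack entries are now `"{name}_{id}"` strings, so `open_tags()` recovers the original tag name instead of hard-coding `<g>`:

```python
# Hypothetical round trip of the new stack entries:
open_tags(["g_1", "x_12"])  # -> '<g id="1"><x id="12">'
```

One hedge: `gid.split("_")` unpacks exactly two values, and the parser's `\w+` allows underscores in tag names, so `gid.rsplit("_", 1)` would be the defensive spelling. It may also be worth double-checking that `close_tags()` (unchanged, always emitting `</g>`) still pairs correctly with non-`g` tags.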
@@ -335,17 +364,19 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
 
     # group the runs by original paragraph
-    translated_paragraphs_with_style = dict()
+    translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
+                                        range(len(paragraphs_with_runs))}
+
     for item in translated_runs_with_style:
-        if item['paragraph_index'] in translated_paragraphs_with_style:
-            translated_paragraphs_with_style[item['paragraph_index']].append(item)
-        else:
-            # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
-            # didn't know where paragraphs started and ended
+        # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+        # didn't know where paragraphs started and ended
+        if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
             first_item_in_paragraph = item.copy()
             first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
             translated_paragraphs_with_style[item['paragraph_index']] = []
             translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
+        else:
+            translated_paragraphs_with_style[item['paragraph_index']].append(item)
 
     # save to new plain text file
     translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
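Review note: this looks like the core of the docx fix. Every source paragraph index is now pre-seeded with a sentinel run, so paragraphs that receive no translated runs no longer vanish and shift the rest of the document, and the sentinel's empty `text` doubles as the "no item seen yet" flag. A hypothetical mini-trace:

```python
# Assume a 3-paragraph source document whose middle paragraph is empty.
paragraphs_with_runs = [
    [{"text": "Hello", "id": [], "paragraph_index": 0}],
    [],                                    # nothing to translate here
    [{"text": "Bye", "id": [], "paragraph_index": 2}],
]
translated_paragraphs_with_style = {
    key: [{'id': None, 'paragraph_index': key, 'text': ""}]
    for key in range(len(paragraphs_with_runs))
}
# Index 1 exists up front as a sentinel, so the empty paragraph is written
# back in place instead of every following paragraph moving up by one.
```

Edge case that may deserve a test: if a paragraph's first translated run legitimately has empty text, `[0]["text"]` stays falsy and the next run would reset the list again.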
@@ -367,5 +398,5 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     output = stdout.decode('utf-8')
     translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
 
-    print("Saved file")
+    print(f"Saved file in {translated_file_path}")
     return translated_file_path
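Minor: the path in the new message appears to be scraped from tikal's stdout two lines above. For reference, a sketch of what the lookbehind captures (log line invented):

```python
import re

# Hypothetical tikal stdout; the lookbehind grabs the rest of the line.
output = "Merging...\nOutput: /tmp/report.docx.xlf.merged.docx\nDone.\n"
re.search(r'(?<=Output:\s)(.*)', output)[0]
# -> '/tmp/report.docx.xlf.merged.docx'
```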
 