Spaces:

LangTech-MT
/

document-translator

Sleeping

App Files Files Community

mjuvilla committed on Apr 28

Commit

c79a1ef

1 Parent(s): bc3b289

Turns out ODT files work a bit differently: a run can have more than one tag id, which complicates things quite a lot

Browse files

Files changed (1) hide show

src/translate_any_doc.py +76 -28

src/translate_any_doc.py CHANGED Viewed

@@ -57,30 +57,43 @@ def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str, s
     list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
     """
-    pattern = r'<g id="(\d+)">(.*?)</g>'
-    chunks = []
-    last_index = 0
-    for match in re.finditer(pattern, text):
-        start, end = match.span()
-        id_ = match.group(1)
-        content = match.group(2)
-        # Add plain text before the tag, if any
-        if start > last_index:
-            plain_text = text[last_index:start]
-            chunks.append({"text": plain_text, "id": None, "paragraph_index": paragraph_index})
-        # Add tagged content
-        if content != " ":
-            chunks.append({"text": content, "id": id_, "paragraph_index": paragraph_index})
-        last_index = end
-    # Add any remaining plain text after the last tag
-    if last_index < len(text):
-        chunks.append({"text": text[last_index:], "id": None, "paragraph_index": paragraph_index})
-    return chunks
 def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
@@ -239,14 +252,49 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
     out_file_path: Path to the file where the plain text will be saved
     """
     with open(out_file_path, "w") as out_file:
         for key, paragraph in paragraphs_with_style.items():
-            text_paragraph = ""
             for run in paragraph:
-                if run["id"]:
-                    text_paragraph += f'<g id="{run["id"]}">{run["text"]}</g>'
-                else:
-                    text_paragraph += run["text"]
-            out_file.write(text_paragraph + "\n")
 def translate_document(input_file: str, source_lang: str, target_lang: str,

     list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
     """
+    tag_stack = []
+    runs = []
+    buffer = ''
+    pos = 0
+    tag_pattern = re.compile(r'<(/?)g(?: id="(\d+)")?>')
+    while pos < len(text):
+        match = tag_pattern.search(text, pos)
+        if match:
+            start, end = match.span()
+            # Add any text before this tag as a run
+            if start > pos:
+                buffer = text[pos:start]
+                if buffer:
+                    runs.append({"text": buffer, "id": tuple(tag_stack) if tag_stack else None,
+                                 "paragraph_index": paragraph_index})
+            is_closing, tag_id = match.groups()
+            if is_closing:
+                # Pop the last matching tag ID
+                if tag_stack:
+                    tag_stack.pop()
+            else:
+                # Opening tag
+                tag_stack.append(tag_id)
+            pos = end  # Move position past this tag
+        else:
+            # No more tags, capture the rest
+            buffer = text[pos:]
+            if buffer:
+                runs.append(
+                    {"text": buffer, "id": tuple(tag_stack) if tag_stack else None, "paragraph_index": paragraph_index})
+            break
+    return runs
 def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
     out_file_path: Path to the file where the plain text will be saved
     """
     with open(out_file_path, "w") as out_file:
+        current_stack = []
+        def close_tags(to_close):
+            return ''.join(f'</g>' for _ in to_close)
+        def open_tags(to_open):
+            return ''.join(f'<g id="{gid}">' for gid in to_open)
         for key, paragraph in paragraphs_with_style.items():
+            output = []
             for run in paragraph:
+                ids = list(run["id"]) if run["id"] else []
+                # Find the point where current and new IDs diverge
+                common_prefix_len = 0
+                for a, b in zip(current_stack, ids):
+                    if a == b:
+                        common_prefix_len += 1
+                    else:
+                        break
+                # Close tags not in the new stack
+                to_close = current_stack[common_prefix_len:]
+                if to_close:
+                    output.append(close_tags(to_close))
+                # Open new tags
+                to_open = ids[common_prefix_len:]
+                if to_open:
+                    output.append(open_tags(to_open))
+                # Add text
+                output.append(run["text"])
+                # Update the stack
+                current_stack = ids
+            # Close any remaining open tags
+            if current_stack:
+                output.append(close_tags(current_stack))
+            out_file.write("".join(output) + "\n")
 def translate_document(input_file: str, source_lang: str, target_lang: str,