mjuvilla committed on
Commit
c79a1ef
·
1 Parent(s): bc3b289

Turns out ODT files work a bit differently: a run can have more than one tag id, which complicates things quite a lot

Browse files
Files changed (1) hide show
  1. src/translate_any_doc.py +76 -28
src/translate_any_doc.py CHANGED
@@ -57,30 +57,43 @@ def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str, s
57
  list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
58
  """
59
 
60
- pattern = r'<g id="(\d+)">(.*?)</g>'
61
- chunks = []
62
- last_index = 0
63
-
64
- for match in re.finditer(pattern, text):
65
- start, end = match.span()
66
- id_ = match.group(1)
67
- content = match.group(2)
68
-
69
- # Add plain text before the tag, if any
70
- if start > last_index:
71
- plain_text = text[last_index:start]
72
- chunks.append({"text": plain_text, "id": None, "paragraph_index": paragraph_index})
73
-
74
- # Add tagged content
75
- if content != " ":
76
- chunks.append({"text": content, "id": id_, "paragraph_index": paragraph_index})
77
- last_index = end
 
 
 
 
 
 
 
 
78
 
79
- # Add any remaining plain text after the last tag
80
- if last_index < len(text):
81
- chunks.append({"text": text[last_index:], "id": None, "paragraph_index": paragraph_index})
 
 
 
 
 
82
 
83
- return chunks
84
 
85
 
86
  def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
@@ -239,14 +252,49 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
239
  out_file_path: Path to the file where the plain text will be saved
240
  """
241
  with open(out_file_path, "w") as out_file:
 
 
 
 
 
 
 
 
 
242
  for key, paragraph in paragraphs_with_style.items():
243
- text_paragraph = ""
244
  for run in paragraph:
245
- if run["id"]:
246
- text_paragraph += f'<g id="{run["id"]}">{run["text"]}</g>'
247
- else:
248
- text_paragraph += run["text"]
249
- out_file.write(text_paragraph + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
 
252
  def translate_document(input_file: str, source_lang: str, target_lang: str,
 
57
  list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
58
  """
59
 
60
+ tag_stack = []
61
+ runs = []
62
+ buffer = ''
63
+ pos = 0
64
+
65
+ tag_pattern = re.compile(r'<(/?)g(?: id="(\d+)")?>')
66
+
67
+ while pos < len(text):
68
+ match = tag_pattern.search(text, pos)
69
+ if match:
70
+ start, end = match.span()
71
+ # Add any text before this tag as a run
72
+ if start > pos:
73
+ buffer = text[pos:start]
74
+ if buffer:
75
+ runs.append({"text": buffer, "id": tuple(tag_stack) if tag_stack else None,
76
+ "paragraph_index": paragraph_index})
77
+
78
+ is_closing, tag_id = match.groups()
79
+ if is_closing:
80
+ # Pop the last matching tag ID
81
+ if tag_stack:
82
+ tag_stack.pop()
83
+ else:
84
+ # Opening tag
85
+ tag_stack.append(tag_id)
86
 
87
+ pos = end # Move position past this tag
88
+ else:
89
+ # No more tags, capture the rest
90
+ buffer = text[pos:]
91
+ if buffer:
92
+ runs.append(
93
+ {"text": buffer, "id": tuple(tag_stack) if tag_stack else None, "paragraph_index": paragraph_index})
94
+ break
95
 
96
+ return runs
97
 
98
 
99
  def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
 
252
  out_file_path: Path to the file where the plain text will be saved
253
  """
254
  with open(out_file_path, "w") as out_file:
255
+
256
+ current_stack = []
257
+
258
+ def close_tags(to_close):
259
+ return ''.join(f'</g>' for _ in to_close)
260
+
261
+ def open_tags(to_open):
262
+ return ''.join(f'<g id="{gid}">' for gid in to_open)
263
+
264
  for key, paragraph in paragraphs_with_style.items():
265
+ output = []
266
  for run in paragraph:
267
+ ids = list(run["id"]) if run["id"] else []
268
+
269
+ # Find the point where current and new IDs diverge
270
+ common_prefix_len = 0
271
+ for a, b in zip(current_stack, ids):
272
+ if a == b:
273
+ common_prefix_len += 1
274
+ else:
275
+ break
276
+
277
+ # Close tags not in the new stack
278
+ to_close = current_stack[common_prefix_len:]
279
+ if to_close:
280
+ output.append(close_tags(to_close))
281
+
282
+ # Open new tags
283
+ to_open = ids[common_prefix_len:]
284
+ if to_open:
285
+ output.append(open_tags(to_open))
286
+
287
+ # Add text
288
+ output.append(run["text"])
289
+
290
+ # Update the stack
291
+ current_stack = ids
292
+
293
+ # Close any remaining open tags
294
+ if current_stack:
295
+ output.append(close_tags(current_stack))
296
+
297
+ out_file.write("".join(output) + "\n")
298
 
299
 
300
  def translate_document(input_file: str, source_lang: str, target_lang: str,