Spaces:
Sleeping
Sleeping
Turns out ODT files work a bit differently: a run can have more than one tag id, which complicates things quite a lot.
Browse files- src/translate_any_doc.py +76 -28
src/translate_any_doc.py
CHANGED
@@ -57,30 +57,43 @@ def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str, s
|
|
57 |
list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
|
58 |
"""
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
-
return
|
84 |
|
85 |
|
86 |
def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
|
@@ -239,14 +252,49 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
|
|
239 |
out_file_path: Path to the file where the plain text will be saved
|
240 |
"""
|
241 |
with open(out_file_path, "w") as out_file:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
for key, paragraph in paragraphs_with_style.items():
|
243 |
-
|
244 |
for run in paragraph:
|
245 |
-
if run["id"]
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
|
251 |
|
252 |
def translate_document(input_file: str, source_lang: str, target_lang: str,
|
|
|
57 |
list[dict]: Where each element is a run with text, tag id (if any, if not None) and paragraph_index
|
58 |
"""
|
59 |
|
60 |
+
tag_stack = []
|
61 |
+
runs = []
|
62 |
+
buffer = ''
|
63 |
+
pos = 0
|
64 |
+
|
65 |
+
tag_pattern = re.compile(r'<(/?)g(?: id="(\d+)")?>')
|
66 |
+
|
67 |
+
while pos < len(text):
|
68 |
+
match = tag_pattern.search(text, pos)
|
69 |
+
if match:
|
70 |
+
start, end = match.span()
|
71 |
+
# Add any text before this tag as a run
|
72 |
+
if start > pos:
|
73 |
+
buffer = text[pos:start]
|
74 |
+
if buffer:
|
75 |
+
runs.append({"text": buffer, "id": tuple(tag_stack) if tag_stack else None,
|
76 |
+
"paragraph_index": paragraph_index})
|
77 |
+
|
78 |
+
is_closing, tag_id = match.groups()
|
79 |
+
if is_closing:
|
80 |
+
# Pop the last matching tag ID
|
81 |
+
if tag_stack:
|
82 |
+
tag_stack.pop()
|
83 |
+
else:
|
84 |
+
# Opening tag
|
85 |
+
tag_stack.append(tag_id)
|
86 |
|
87 |
+
pos = end # Move position past this tag
|
88 |
+
else:
|
89 |
+
# No more tags, capture the rest
|
90 |
+
buffer = text[pos:]
|
91 |
+
if buffer:
|
92 |
+
runs.append(
|
93 |
+
{"text": buffer, "id": tuple(tag_stack) if tag_stack else None, "paragraph_index": paragraph_index})
|
94 |
+
break
|
95 |
|
96 |
+
return runs
|
97 |
|
98 |
|
99 |
def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
|
|
|
252 |
out_file_path: Path to the file where the plain text will be saved
|
253 |
"""
|
254 |
with open(out_file_path, "w") as out_file:
|
255 |
+
|
256 |
+
current_stack = []
|
257 |
+
|
258 |
+
def close_tags(to_close):
|
259 |
+
return ''.join(f'</g>' for _ in to_close)
|
260 |
+
|
261 |
+
def open_tags(to_open):
|
262 |
+
return ''.join(f'<g id="{gid}">' for gid in to_open)
|
263 |
+
|
264 |
for key, paragraph in paragraphs_with_style.items():
|
265 |
+
output = []
|
266 |
for run in paragraph:
|
267 |
+
ids = list(run["id"]) if run["id"] else []
|
268 |
+
|
269 |
+
# Find the point where current and new IDs diverge
|
270 |
+
common_prefix_len = 0
|
271 |
+
for a, b in zip(current_stack, ids):
|
272 |
+
if a == b:
|
273 |
+
common_prefix_len += 1
|
274 |
+
else:
|
275 |
+
break
|
276 |
+
|
277 |
+
# Close tags not in the new stack
|
278 |
+
to_close = current_stack[common_prefix_len:]
|
279 |
+
if to_close:
|
280 |
+
output.append(close_tags(to_close))
|
281 |
+
|
282 |
+
# Open new tags
|
283 |
+
to_open = ids[common_prefix_len:]
|
284 |
+
if to_open:
|
285 |
+
output.append(open_tags(to_open))
|
286 |
+
|
287 |
+
# Add text
|
288 |
+
output.append(run["text"])
|
289 |
+
|
290 |
+
# Update the stack
|
291 |
+
current_stack = ids
|
292 |
+
|
293 |
+
# Close any remaining open tags
|
294 |
+
if current_stack:
|
295 |
+
output.append(close_tags(current_stack))
|
296 |
+
|
297 |
+
out_file.write("".join(output) + "\n")
|
298 |
|
299 |
|
300 |
def translate_document(input_file: str, source_lang: str, target_lang: str,
|