mjuvilla committed on
Commit 4420a7f · 1 Parent(s): 7743917

Fixed bug when processing docx files

Files changed (1): src/translate_any_doc.py (+71 -40)
src/translate_any_doc.py CHANGED
@@ -44,7 +44,7 @@ def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal
     return os.path.join(original_xliff_file_path + f".{source_lang}")
 
 
-def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
+def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
     """
     Given some text that may or may not contain some chunks tagged with something like <g id=1> </g>, extract each
     of the runs of text and convert them into dictionaries to keep this information
@@ -59,39 +59,61 @@ def get_runs_from_paragraph(text: str, paragraph_index: int) -> list[dict[str,
 
     tag_stack = []
     runs = []
-    buffer = ''
     pos = 0
 
-    tag_pattern = re.compile(r'<(/?)g(?: id="(\d+)")?>')
-
-    while pos < len(text):
-        match = tag_pattern.search(text, pos)
-        if match:
-            start, end = match.span()
-            # Add any text before this tag as a run
-            if start > pos:
-                buffer = text[pos:start]
-                if buffer:
-                    runs.append({"text": buffer, "id": tuple(tag_stack) if tag_stack else None,
-                                 "paragraph_index": paragraph_index})
-
-            is_closing, tag_id = match.groups()
-            if is_closing:
-                # Pop the last matching tag ID
-                if tag_stack:
-                    tag_stack.pop()
+    # Match any tag: <tag id="123"/>, </tag>, or <tag id="123">
+    tag_pattern = re.compile(r'<(/?)(\w+)(?:\s+id="(\d+)")?\s*(/?)>')
+
+    for match in tag_pattern.finditer(paragraph):
+        start, end = match.span()
+        is_closing = match.group(1) == "/"
+        tag_name = match.group(2)
+        tag_id = match.group(3)
+        is_self_closing = match.group(4) == "/"
+
+        # Text before this tag
+        if start > pos:
+            text = paragraph[pos:start]
+            if text:
+                runs.append({
+                    "text": text,
+                    "id": tag_stack.copy(),
+                    "paragraph_index": paragraph_index
+                })
+
+        if is_closing:
+            # Closing tag </tag>
+            expected_prefix = f"{tag_name}_"
+            if tag_stack and tag_stack[-1].startswith(expected_prefix):
+                tag_stack.pop()
             else:
-                # Opening tag
-                tag_stack.append(tag_id)
-
-            pos = end  # Move position past this tag
+                raise ValueError(f"Mismatched closing tag </{tag_name}>")
+        elif is_self_closing:
+            # Self-closing tag like <x id="1"/>
+            if tag_id is None:
+                raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
+            runs.append({
+                "text": "",
+                "id": [f"{tag_name}_{tag_id}"],
+                "paragraph_index": paragraph_index
+            })
         else:
-            # No more tags, capture the rest
-            buffer = text[pos:]
-            if buffer:
-                runs.append(
-                    {"text": buffer, "id": tuple(tag_stack) if tag_stack else None, "paragraph_index": paragraph_index})
-            break
+            # Opening tag <tag id="...">
+            if tag_id is None:
+                raise ValueError(f"Opening tag <{tag_name}> missing id")
+            tag_stack.append(f"{tag_name}_{tag_id}")
+
+        pos = end
+
+    # Final trailing text
+    if pos < len(paragraph):
+        text = paragraph[pos:]
+        if text:
+            runs.append({
+                "text": text,
+                "id": tag_stack.copy(),
+                "paragraph_index": paragraph_index
+            })
 
     return runs
 
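Review note: the old pattern only understood `<g>` tags, so the self-closing `<x id="..."/>` placeholders that tikal emits for inline objects in docx files never matched and leaked into the text, which is presumably the bug the commit message refers to. A sketch (not part of the commit) of what the rewritten parser should return on made-up inputs:

```python
# Hypothetical XLIFF-style input; tag names g/x as in Okapi/tikal output.
get_runs_from_paragraph('Hi <g id="1">bold <g id="2">both</g></g>!', 0)
# -> [{'text': 'Hi ',   'id': [],             'paragraph_index': 0},
#     {'text': 'bold ', 'id': ['g_1'],        'paragraph_index': 0},
#     {'text': 'both',  'id': ['g_1', 'g_2'], 'paragraph_index': 0},
#     {'text': '!',     'id': [],             'paragraph_index': 0}]

# Self-closing placeholders now survive as empty runs instead of leaking:
get_runs_from_paragraph('see <x id="7"/> here', 3)
# -> [{'text': 'see ',  'id': [],      'paragraph_index': 3},
#     {'text': '',      'id': ['x_7'], 'paragraph_index': 3},
#     {'text': ' here', 'id': [],      'paragraph_index': 3}]
```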
@@ -115,9 +137,12 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
     tokens_with_style = []
     for run in runs:
         tokens = word_tokenize(run["text"])
-        for token in tokens:
+        if tokens:
+            for token in tokens:
+                tokens_with_style.append(run.copy())
+                tokens_with_style[-1]["text"] = token
+        else:
             tokens_with_style.append(run.copy())
-            tokens_with_style[-1]["text"] = token
 
     token_index = 0
     tokenized_sentences_with_style = []
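Review note: this pairs with the parser change above. NLTK's `word_tokenize("")` returns `[]`, so the old loop body never ran for an empty run and silently dropped it, losing any placeholder tag attached to it. A minimal illustration with a hypothetical run:

```python
# Why the new else branch matters: empty runs carry placeholder tags.
run = {"text": "", "id": ["x_7"], "paragraph_index": 0}
tokens = []            # what word_tokenize returns for run["text"] == ""
tokens_with_style = []
if tokens:
    for token in tokens:
        tokens_with_style.append(run.copy())
        tokens_with_style[-1]["text"] = token
else:
    # new branch: keep the run as a single empty token so the tag survives
    tokens_with_style.append(run.copy())
assert tokens_with_style == [run]
```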
@@ -259,7 +284,11 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
         return ''.join(f'</g>' for _ in to_close)
 
     def open_tags(to_open):
-        return ''.join(f'<g id="{gid}">' for gid in to_open)
+        tag = ""
+        for gid in to_open:
+            tag_type, tag_id = gid.split("_")
+            tag += f'<{tag_type} id="{tag_id}">'
+        return tag
 
     for key, paragraph in paragraphs_with_style.items():
         output = []
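Review note: stack entries are now `"{name}_{id}"` strings, so `open_tags()` recovers the original tag name instead of hard-coding `<g>`:

```python
# Hypothetical round trip of the new stack entries:
open_tags(["g_1", "x_12"])  # -> '<g id="1"><x id="12">'
```

One hedge: `gid.split("_")` unpacks exactly two values, and the parser's `\w+` allows underscores in tag names, so `gid.rsplit("_", 1)` would be the defensive spelling. It may also be worth double-checking that `close_tags()` (unchanged, always emitting `</g>`) still pairs correctly with non-`g` tags.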
@@ -335,17 +364,19 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
 
     # group the runs by original paragraph
-    translated_paragraphs_with_style = dict()
+    translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
+                                        range(len(paragraphs_with_runs))}
+
     for item in translated_runs_with_style:
-        if item['paragraph_index'] in translated_paragraphs_with_style:
-            translated_paragraphs_with_style[item['paragraph_index']].append(item)
-        else:
-            # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
-            # didn't know where paragraphs started and ended
+        # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+        # didn't know where paragraphs started and ended
+        if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
             first_item_in_paragraph = item.copy()
             first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
             translated_paragraphs_with_style[item['paragraph_index']] = []
             translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
+        else:
+            translated_paragraphs_with_style[item['paragraph_index']].append(item)
 
     # save to new plain text file
     translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
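Review note: this looks like the core of the docx fix. Every source paragraph index is now pre-seeded with a sentinel run, so paragraphs that receive no translated runs no longer vanish and shift the rest of the document, and the sentinel's empty `text` doubles as the "no item seen yet" flag. A hypothetical mini-trace:

```python
# Assume a 3-paragraph source document whose middle paragraph is empty.
paragraphs_with_runs = [
    [{"text": "Hello", "id": [], "paragraph_index": 0}],
    [],                                    # nothing to translate here
    [{"text": "Bye", "id": [], "paragraph_index": 2}],
]
translated_paragraphs_with_style = {
    key: [{'id': None, 'paragraph_index': key, 'text': ""}]
    for key in range(len(paragraphs_with_runs))
}
# Index 1 exists up front as a sentinel, so the empty paragraph is written
# back in place instead of every following paragraph moving up by one.
```

Edge case that may deserve a test: if a paragraph's first translated run legitimately has empty text, `[0]["text"]` stays falsy and the next run would reset the list again.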
@@ -367,5 +398,5 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     output = stdout.decode('utf-8')
     translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
 
-    print("Saved file")
+    print(f"Saved file in {translated_file_path}")
     return translated_file_path
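Minor: the path in the new message appears to be scraped from tikal's stdout two lines above. For reference, a sketch of what the lookbehind captures (log line invented):

```python
import re

# Hypothetical tikal stdout; the lookbehind grabs the rest of the line.
output = "Merging...\nOutput: /tmp/report.docx.xlf.merged.docx\nDone.\n"
re.search(r'(?<=Output:\s)(.*)', output)[0]
# -> '/tmp/report.docx.xlf.merged.docx'
```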
 