mjuvilla committed
Commit 978cbf1
0 Parent(s):

First commit. For now the translation has not been integrated, but reading a .docx and writing its translation while keeping the formatting and style should work.

Files changed (3)
  1. main.py +318 -0
  2. readme.md +18 -0
  3. requirements.txt +2 -0
main.py ADDED
@@ -0,0 +1,318 @@
+ import os
+
+ from docx import Document
+ import nltk
+
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+ from subprocess import Popen, PIPE
+
+ from itertools import groupby
+ import fileinput
+
+
+ # Class to align original and translated sentences,
+ # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
+ class Aligner:
+     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
+         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
+
+         fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
+         rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))
+
+         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
+         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
+
+         self.forward_command = lambda \
+             x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
+         self.reverse_command = lambda \
+             x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+
+         self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+
+     def __simplify_alignment_file(self, file):
+         # keep only the alignment pairs (the third '|||'-separated field) of each line
+         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
+             for line in f:
+                 print(line.split('|||')[2].strip())
+
+     def __read_err(self, err):
+         # recover the tension (T) and mean length ratio (m) that fast_align logged during training
+         (T, m) = ('', '')
+         with open(err) as f:
+             for line in f:
+                 # expected target length = source length * N
+                 if 'expected target length' in line:
+                     m = line.split()[-1]
+                 # final tension: N
+                 elif 'final tension' in line:
+                     T = line.split()[-1]
+         return T, m
+
+     def align(self, file):
+         # generate the forward alignment
+         process = Popen(self.forward_command(file), shell=True)
+         process.wait()
+         # generate the reverse alignment
+         process = Popen(self.reverse_command(file), shell=True)
+         process.wait()
+
+         # the output files contain more information than needed, keep only the alignment pairs
+         self.__simplify_alignment_file(self.forward_alignment_file_path)
+         self.__simplify_alignment_file(self.reverse_alignment_file_path)
+
+         # generate the symmetrical alignment; communicate() waits for the process and,
+         # unlike wait(), cannot deadlock when stdout is a pipe
+         process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+         alignments_str = process.communicate()[0].decode('utf-8')
+
+         # parse the final alignments into lists of (source, target) index pairs
+         alignments = []
+         for line in alignments_str.splitlines():
+             alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])
+
+         return alignments
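+
+ # For reference, each line of the file passed to align() holds one sentence pair in the form
+ #     source tokens ||| target tokens
+ # and the symmetrized output is one line of index pairs per sentence, e.g. "0-0 1-2 2-1",
+ # meaning source token 0 aligns with target token 0, source token 1 with target token 2, etc.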
+
+
+ # Extract the paragraphs of a document together with their runs (and each run's style)
+ def extract_paragraphs_with_runs(doc):
+     paragraphs_with_runs = []
+     for para in doc.paragraphs:
+         runs = []
+         for run in para.runs:
+             runs.append({
+                 'text': run.text,
+                 'bold': run.bold,
+                 'italic': run.italic,
+                 'underline': run.underline,
+                 'font_name': run.font.name,
+                 'font_size': run.font.size,
+                 'font_color': run.font.color.rgb
+             })
+         paragraphs_with_runs.append(runs)
+     return paragraphs_with_runs
+
+
+ # Earlier variant of tokenize_paragraph_with_runs; currently unused
+ def tokenize_paragraph_with_runs2(runs_in_paragraph):
+     text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
+     sentences = sent_tokenize(text_paragraph)
+     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+     tokenized_sentences_with_style = []
+     for tokenized_sentence in tokenized_sentences:
+         tokenized_sentence_with_style = []
+         token_idx = 0
+         for run in runs_in_paragraph:
+             text_in_run = run["text"].strip()
+
+             if text_in_run == tokenized_sentence[token_idx]:
+                 # the run is exactly one token
+                 new_run = run.copy()
+                 new_run["text"] = text_in_run
+                 tokenized_sentence_with_style.append(new_run)
+                 token_idx += 1
+                 if token_idx >= len(tokenized_sentence):
+                     break
+             elif len(text_in_run) > len(tokenized_sentence[token_idx]):
+                 # the run spans several tokens
+                 if text_in_run.startswith(tokenized_sentence[token_idx]):
+                     for token in word_tokenize(text_in_run):
+                         if token == tokenized_sentence[token_idx]:
+                             new_run = run.copy()
+                             new_run["text"] = token
+                             tokenized_sentence_with_style.append(new_run)
+                             token_idx += 1
+                 else:
+                     raise ValueError(f"could not match run text {text_in_run!r} against the tokenized sentence")
+         tokenized_sentences_with_style.append(tokenized_sentence_with_style)
+     return tokenized_sentences_with_style
+
+
+ # Tokenize a paragraph into sentences and words while keeping each token's style
+ def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
+     text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
+     sentences = sent_tokenize(text_paragraph)
+     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+     # flatten the runs into one styled token per word
+     tokens_with_style = []
+     for run in runs_in_paragraph:
+         tokens = word_tokenize(run["text"])
+         for token in tokens:
+             tokens_with_style.append(run.copy())
+             tokens_with_style[-1]["text"] = token
+
+     # match each sentence token against the styled tokens
+     token_index = 0
+     tokenized_sentences_with_style = []
+     for sentence in tokenized_sentences:
+         sentence_with_style = []
+         for word in sentence:
+             if word == tokens_with_style[token_index]["text"]:
+                 sentence_with_style.append(tokens_with_style[token_index])
+                 token_index += 1
+             elif word.startswith(tokens_with_style[token_index]["text"]):
+                 # this token might be split into several runs
+                 word_left = word
+
+                 while word_left:
+                     sentence_with_style.append(tokens_with_style[token_index])
+                     word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
+                     token_index += 1
+             else:
+                 raise ValueError(f"could not match token {word!r} against the paragraph's runs")
+         tokenized_sentences_with_style.append(sentence_with_style)
+     return tokenized_sentences_with_style
+
+
+ # Copy the style of the original tokens onto the translated tokens using fast_align word alignments
+ def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
+     # clean the temp folder
+     for f in os.listdir(temp_folder):
+         os.remove(os.path.join(temp_folder, f))
+
+     temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
+
+     # tokenize the original text into sentences and words while keeping the style
+     original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
+     # tokenize the translated text into sentences and words
+     translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]
+
+     # write the file that fast_align will use
+     with open(temp_file_path, "w") as out_file:
+         for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+             out_file.write(f'{" ".join(item["text"] for item in original)} ||| {" ".join(translated)}\n')
+
+     alignments = aligner.align(temp_file_path)
+
+     # using the alignments generated by fast_align, copy the style of each original token to the translated one
+     translated_sentences_with_style = []
+     for sentence_idx, sentence_alignments in enumerate(alignments):
+
+         # invert the alignments and build a target -> source dict from them
+         sentence_alignments = {target: source for source, target in sentence_alignments}
+
+         translated_sentence_with_style = []
+         for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
+             if token_idx in sentence_alignments:
+                 # fast_align found an original token aligned with this translated one: copy its style
+                 original_idx = sentence_alignments[token_idx]
+                 new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+             else:
+                 # WARNING this is a test
+                 # fast_align doesn't know which original word this token comes from, so copy the
+                 # style of the previous translated word (or of the first original token when the
+                 # very first word of the sentence is unaligned)
+                 if translated_sentence_with_style:
+                     new_entry = translated_sentence_with_style[-1].copy()
+                 else:
+                     new_entry = original_tokenized_sentences_with_style[sentence_idx][0].copy()
+                 new_entry["text"] = translated_token
+                 translated_sentence_with_style.append(new_entry)
+
+         translated_sentences_with_style.append(translated_sentence_with_style)
+
+     return translated_sentences_with_style
+
+
+ # TODO: hook up the actual translation model; for now this is a placeholder
+ def translate_paragraph(paragraph_text):
+     translated_paragraph = ""
+     return translated_paragraph
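+
+ # A minimal sketch of what translate_paragraph could look like once a model is wired in.
+ # This is an assumption, not part of this commit: it uses the Hugging Face transformers
+ # translation pipeline with a hypothetical checkpoint name; the actual BSC/aina models
+ # may require a different runtime or a remote endpoint.
+ #
+ # from transformers import pipeline
+ #
+ # translator = pipeline("translation", model="your-org/your-ca-en-model")  # hypothetical id
+ #
+ # def translate_paragraph(paragraph_text):
+ #     if not paragraph_text.strip():
+ #         return ""
+ #     return translator(paragraph_text)[0]["translation_text"]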
+
+
+ # Group contiguous tokens that share the same style attributes into a single run
+ def group_by_style(values, detokenizer):
+     groups = []
+     for key, group in groupby(values, key=lambda x: (
+             x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
+         text = detokenizer.detokenize([item['text'] for item in group])
+
+         groups.append({"text": text,
+                        "bold": key[0],
+                        "italic": key[1],
+                        "underline": key[2],
+                        "font_name": key[3],
+                        "font_size": key[4],
+                        "font_color": key[5]})
+     return groups
+
+
+ def preprocess_runs(runs_in_paragraph):
+     new_runs = []
+
+     for run in runs_in_paragraph:
+         if not new_runs:
+             new_runs.append(run)
+         # if the previous run has the same format as the current run, merge the two runs together
+         elif (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["italic"] == run["italic"]
+               and new_runs[-1]["underline"] == run["underline"] and new_runs[-1]["font_name"] == run["font_name"]
+               and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["font_color"] == run["font_color"]):
+             new_runs[-1]["text"] += run["text"]
+         else:
+             new_runs.append(run)
+
+         # split runs that contain more than one sentence, to avoid problems later when aligning styles
+         sentences = sent_tokenize(new_runs[-1]["text"])
+         if len(sentences) > 1:
+             new_runs[-1]["text"] = sentences[0]
+             for sentence in sentences[1:]:
+                 new_run = new_runs[-1].copy()
+                 new_run["text"] = sentence
+                 new_runs.append(new_run)
+
+     return new_runs
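+
+ # For example, two adjacent italic runs "Hello " and "world. Bye." are first merged into one
+ # run "Hello world. Bye." and then split back into the runs "Hello world." and "Bye.",
+ # so that no run crosses a sentence boundary.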
+
+
+ if __name__ == "__main__":
+     input_file = 'data/test2.docx'
+     output_file = 'data/translated_output.docx'
+     source_lang = 'ca'
+     target_lang = 'en'
+     config_folder = "fast_align_config"
+     temp_folder = "tmp"
+
+     os.makedirs(temp_folder, exist_ok=True)
+
+     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+
+     # load the original file and extract its paragraphs with their runs (which carry style and formatting)
+     doc = Document(input_file)
+     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
+
+     detokenizer = TreebankWordDetokenizer()
+
+     # translate each paragraph
+     translated_paragraphs = []
+     for paragraph in paragraphs_with_runs:
+         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
+         translated_paragraphs.append(translate_paragraph(paragraph_text))
+
+     out_doc = Document()
+
+     for original_runs_in_paragraph, translated_paragraph in zip(paragraphs_with_runs, translated_paragraphs):
+         # sometimes we get empty paragraphs, presumably just docx shenanigans; skip them
+         if not original_runs_in_paragraph:
+             continue
+
+         original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)
+
+         paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
+                                                    temp_folder, detokenizer)
+
+         para = out_doc.add_paragraph()
+
+         # flatten the paragraph, we don't need it split into sentences anymore
+         paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
+
+         # merge tokens back into runs and detokenize
+         paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
+
+         for item in paragraph_with_runs:
+             run = para.add_run(item["text"] + " ")
+             # preserve the original run formatting
+             run.bold = item['bold']
+             run.italic = item['italic']
+             run.underline = item['underline']
+             run.font.name = item['font_name']
+             run.font.size = item['font_size']
+             run.font.color.rgb = item['font_color']
+
+     out_doc.save(output_file)
readme.md ADDED
@@ -0,0 +1,18 @@
+ # document_translator
+
+ Project to translate files (for now .docx) using BSC's models while keeping the formatting and style of the original file.
+
+ ## Requirements
+
+ ### Python 3.12
+
+ ### fast_align
+
+ Clone https://github.com/clab/fast_align, run the compilation commands indicated in that project's readme, and place the resulting fast_align and atools binaries (.exe if on Windows) in this project's root.
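+
+ For reference, at the time of writing the upstream readme builds with cmake, roughly:
+
+     mkdir build && cd build
+     cmake ..
+     make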
11
+
+ ### fast_align fine-tuning files
+
+ I took the four files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from https://huggingface.co/projecte-aina/aina-translator-ca-en/tree/main. Maybe we could automate the download of these files. For now, place them in config_folder (defined in main.py).
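+
+ With the defaults in main.py, the expected layout is:
+
+     fast_align_config/ca-en.params
+     fast_align_config/ca-en.err
+     fast_align_config/en-ca.params
+     fast_align_config/en-ca.err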
15
+
+ ### Python requirements
+
+     pip install -r requirements.txt
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ nltk~=3.9.1
+ python-docx~=1.1.2