mjuvilla committed
Commit f5f4b70 · 1 parent: 1792639

removed old file

Files changed (1)
  1. main.py +0 -399
main.py DELETED
@@ -1,399 +0,0 @@
import os
from collections import defaultdict

from docx import Document
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from subprocess import Popen, PIPE

from itertools import groupby
import fileinput

from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from iso639 import languages
import tqdm

# Wraps a causal LM behind a chat-style translation prompt
class Translator:
    def __init__(self, model_path, source_lang, target_lang):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )

        self.prompt_f = lambda x: (f"Translate the following text from {source_lang} into "
                                   f"{target_lang}.\n{source_lang}: {x} \n{target_lang}:")

    def translate(self, text):
        message = [{"role": "user", "content": self.prompt_f(text)}]
        date_string = datetime.today().strftime('%Y-%m-%d')

        prompt = self.tokenizer.apply_chat_template(
            message,
            tokenize=False,
            add_generation_prompt=True,
            date_string=date_string
        )

        inputs = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
        input_length = inputs.shape[1]
        outputs = self.model.generate(input_ids=inputs.to(self.model.device),
                                      max_new_tokens=400,
                                      early_stopping=True,
                                      num_beams=5)

        return self.tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)

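# Illustrative usage sketch of Translator (the model path and language names mirror the
# __main__ block at the bottom of this file; the sample sentence is invented):
#
#     translator = Translator("BSC-LT/salamandraTA-7b-instruct", "Catalan", "English")
#     print(translator.translate("Bon dia, com esteu?"))
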
# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner:
    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        self.forward_command = lambda \
            x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
        self.reverse_command = lambda \
            x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'

        self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'

    def __simplify_alignment_file(self, file):
        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
            for line in f:
                print(line.split('|||')[2].strip())

    def __read_err(self, err):
        (T, m) = ('', '')
        with open(err) as f:
            for line in f:
                # expected target length = source length * N
                if 'expected target length' in line:
                    m = line.split()[-1]
                # final tension: N
                elif 'final tension' in line:
                    T = line.split()[-1]
        return T, m

    def align(self, file):
        # generate forward alignment
        process = Popen(self.forward_command(file), shell=True)
        process.wait()
        # generate reverse alignment
        process = Popen(self.reverse_command(file), shell=True)
        process.wait()

        # the alignment files contain more information than needed, keep only the alignment field
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate the symmetrical alignment; communicate() waits for the process to finish
        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        alignments_str = process.communicate()[0].decode('utf-8')

        # get the final alignments and format them as (source_index, target_index) pairs
        alignments = []
        for line in alignments_str.splitlines():
            alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])

        return alignments

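# Sketch of the fast_align setup Aligner expects, inferred from the paths built in __init__
# (file names assume the ca/en pair used in __main__; not otherwise verified):
#
#     fast_align_config/ca-en.params   fast_align_config/ca-en.err    # forward model
#     fast_align_config/en-ca.params   fast_align_config/en-ca.err    # reverse model
#     ./fast_align and ./atools binaries in the working directory
#
# align() takes a file with one "source tokens ||| target tokens" line per sentence
# (generate_alignments() below writes exactly that) and returns, per input line, a list of
# (source_index, target_index) pairs, e.g. [(0, 0), (1, 2), ...].
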
# Function to extract paragraphs with their runs
def extract_paragraphs_with_runs(doc):
    paragraphs_with_runs = []
    for idx, paragraph in enumerate(doc.paragraphs):
        runs = []
        for item in paragraph.iter_inner_content():
            if isinstance(item, Run):
                runs.append({
                    'text': item.text,
                    'bold': item.bold,
                    'italic': item.italic,
                    'underline': item.underline,
                    'font_name': item.font.name,
                    'font_size': item.font.size,
                    'font_color': item.font.color.rgb,
                    'paragraph_index': idx
                })
            elif isinstance(item, Hyperlink):
                runs.append({
                    'text': item.runs[0].text,
                    'bold': item.runs[0].bold,
                    'italic': item.runs[0].italic,
                    'underline': item.runs[0].underline,
                    'font_name': item.runs[0].font.name,
                    'font_size': item.runs[0].font.size,
                    'font_color': item.runs[0].font.color.rgb,
                    'paragraph_index': idx
                })

        paragraphs_with_runs.append(runs)
    return paragraphs_with_runs

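# Shape of the data returned above, one list of run dicts per paragraph (values are
# illustrative; font_size and font_color hold python-docx Length/RGBColor objects or None):
#
#     paragraphs_with_runs[0] == [
#         {'text': 'Hello ', 'bold': True, 'italic': None, 'underline': None,
#          'font_name': 'Calibri', 'font_size': Pt(12), 'font_color': None,
#          'paragraph_index': 0},
#         ...
#     ]
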
# Note: this helper is not called anywhere else in this file; tokenize_with_runs below is the one used
def tokenize_paragraph_with_runs2(runs_in_paragraph):
    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    tokenized_sentences_with_style = []
    for tokenized_sentence in tokenized_sentences:
        tokenized_sentence_with_style = []
        token_idx = 0
        for run in runs_in_paragraph:
            text_in_run = run["text"].strip()

            if text_in_run == tokenized_sentence[token_idx]:
                new_run = run.copy()
                new_run["text"] = text_in_run
                tokenized_sentence_with_style.append(new_run)
                token_idx += 1
                if token_idx >= len(tokenized_sentence):
                    break
            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
                if text_in_run.startswith(tokenized_sentence[token_idx]):
                    for token in word_tokenize(text_in_run):
                        if token == tokenized_sentence[token_idx]:
                            new_run = run.copy()
                            new_run["text"] = token
                            tokenized_sentence_with_style.append(new_run)
                            token_idx += 1
                else:
                    raise ValueError(f"could not match run text {text_in_run!r} "
                                     f"against token {tokenized_sentence[token_idx]!r}")
        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
    return tokenized_sentences_with_style

# Tokenize a paragraph's runs into sentences of word tokens, attaching to each token the style of the run it came from
def tokenize_with_runs(runs, detokenizer):
    text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    tokens_with_style = []
    for run in runs:
        tokens = word_tokenize(run["text"])
        for token in tokens:
            tokens_with_style.append(run.copy())
            tokens_with_style[-1]["text"] = token

    token_index = 0
    tokenized_sentences_with_style = []
    for sentence in tokenized_sentences:
        sentence_with_style = []
        for word in sentence:
            if word == tokens_with_style[token_index]["text"]:
                sentence_with_style.append(tokens_with_style[token_index])
                token_index += 1
            else:
                if word.startswith(tokens_with_style[token_index]["text"]):
                    # this token might be split into several runs
                    word_left = word

                    while word_left:
                        sentence_with_style.append(tokens_with_style[token_index])
                        word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                        token_index += 1
                else:
                    raise ValueError(f"could not match word {word!r} against run token "
                                     f"{tokens_with_style[token_index]['text']!r}")
        tokenized_sentences_with_style.append(sentence_with_style)
    return tokenized_sentences_with_style

def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, aligner, temp_folder, detokenizer):
    # clean temp folder
    for f in os.listdir(temp_folder):
        os.remove(os.path.join(temp_folder, f))

    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")

    # tokenize the original text by sentence and words while keeping the style
    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
                                               original_paragraphs_with_runs]

    # flatten all the runs so we can align with just one call instead of one per paragraph
    original_tokenized_sentences_with_style = [item for sublist in original_tokenized_sentences_with_style for item in
                                               sublist]

    # tokenize the translated text by sentence and word
    translated_tokenized_sentences = [word_tokenize(sentence) for
                                      translated_paragraph in translated_paragraphs for sentence in
                                      sent_tokenize(translated_paragraph)]

    # write the file that fast_align will use: one "source tokens ||| target tokens" line per sentence
    with open(temp_file_path, "w") as out_file:
        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
            source_text = " ".join(item["text"] for item in original)
            target_text = " ".join(translated)
            out_file.write(f"{source_text} ||| {target_text}\n")

    alignments = aligner.align(temp_file_path)

    # using the alignments generated by fast_align, copy the style of each original token to its translated token
    translated_sentences_with_style = []
    for sentence_idx, sentence_alignments in enumerate(alignments):

        # reverse the order of the alignments and build a dict with it
        sentence_alignments = {target: source for source, target in sentence_alignments}

        translated_sentence_with_style = []
        for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
            # fast_align has found a token aligned with the translated one
            if token_idx in sentence_alignments.keys():
                # get the aligned token
                original_idx = sentence_alignments[token_idx]
                new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)
            else:
                # WARNING this is a test
                # since fast_align doesn't know which word this token comes from, copy the style of the
                # previous word (or of the first original token when the sentence starts unaligned)
                if translated_sentence_with_style:
                    new_entry = translated_sentence_with_style[-1].copy()
                else:
                    new_entry = original_tokenized_sentences_with_style[sentence_idx][0].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)

        translated_sentences_with_style.append(translated_sentence_with_style)

    return translated_sentences_with_style

# group contiguous tokens that share the same style attributes
def group_by_style(values, detokenizer):
    groups = []
    for key, group in groupby(values, key=lambda x: (
            x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'],
            x['paragraph_index'])):
        text = detokenizer.detokenize([item['text'] for item in group])

        if groups and not text.startswith((",", ";", ":", ".", ")")):
            text = " " + text

        groups.append({"text": text,
                       "bold": key[0],
                       "italic": key[1],
                       "underline": key[2],
                       "font_name": key[3],
                       "font_size": key[4],
                       "font_color": key[5],
                       'paragraph_index': key[6]})
    return groups

# Normalize runs: turn None flags into False, merge consecutive runs with identical formatting,
# and split runs that span more than one sentence
def preprocess_runs(runs_in_paragraph):
    new_runs = []

    for run in runs_in_paragraph:

        # sometimes the parameters are False and sometimes they are None, set them all to False
        for key, value in run.items():
            if value is None and not key.startswith("font"):
                run[key] = False

        if not new_runs:
            new_runs.append(run)
        else:
            # if the previous run has the same format as the current run, we merge the two runs together
            if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"]
                    and new_runs[-1]["font_name"] == run["font_name"]
                    and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
                    and new_runs[-1]["underline"] == run["underline"]
                    and new_runs[-1]["paragraph_index"] == run["paragraph_index"]):
                new_runs[-1]["text"] += run["text"]
            else:
                new_runs.append(run)

        # we want to split runs that contain more than one sentence to avoid problems later when aligning styles
        sentences = sent_tokenize(new_runs[-1]["text"])
        if len(sentences) > 1:
            new_runs[-1]["text"] = sentences[0]
            for sentence in sentences[1:]:
                new_run = new_runs[-1].copy()
                new_run["text"] = sentence
                new_runs.append(new_run)

    return new_runs

if __name__ == "__main__":
    input_file = 'data/test3.docx'
    output_file = 'data/translated_output.docx'
    source_lang = 'ca'
    target_lang = 'en'
    config_folder = "fast_align_config"
    temp_folder = "tmp"

    aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)

    os.makedirs(temp_folder, exist_ok=True)

    # load original file, extract the paragraphs with their runs (which include style and formatting)
    doc = Document(input_file)
    paragraphs_with_runs = extract_paragraphs_with_runs(doc)

    detokenizer = TreebankWordDetokenizer()

    translator = Translator("BSC-LT/salamandraTA-7b-instruct", languages.get(alpha2=source_lang).name,
                            languages.get(alpha2=target_lang).name)

    # translate each paragraph
    translated_paragraphs = []
    for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
        translated_paragraphs.append(translator.translate(paragraph_text))

    print(translated_paragraphs)

    out_doc = Document()

    processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]

    translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                          translated_paragraphs, aligner,
                                                          temp_folder, detokenizer)
    # flatten the sentences into a list of tokens
    translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
    # group the tokens by style/run
    translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)

    # group the runs by original paragraph
    translated_paragraphs_with_style = defaultdict(list)
    for item in translated_runs_with_style:
        translated_paragraphs_with_style[item['paragraph_index']].append(item)

    for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
        # in case there are empty paragraphs
        if not original_paragraph.text:
            out_doc.add_paragraph(style=original_paragraph.style)
            continue

        para = out_doc.add_paragraph(style=original_paragraph.style)

        for item in translated_paragraphs_with_style[paragraph_index]:
            run = para.add_run(item["text"])
            # preserve original run formatting
            run.bold = item['bold']
            run.italic = item['italic']
            run.underline = item['underline']
            run.font.name = item['font_name']
            run.font.size = item['font_size']
            run.font.color.rgb = item['font_color']

    out_doc.save(output_file)