mjuvilla committed
Commit fd61039 · unverified · 2 Parent(s): 8030df1 f5f4b70

Merge pull request #1 from langtech-bsc/windows
Files changed (2)
  1. gradio_app.py +39 -0
  2. main.py → translate_docx.py +44 -95
gradio_app.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ from pathlib import Path
+ import requests
+ import json
+ from translate_docx import translate_document, translate, Aligner
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+
+ ip='10.192.31.127'
+ config_folder = 'fast_align_config'
+ source_lang = 'en'
+ target_lang = 'ca'
+ temp_folder = 'tmp'
+ aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+ detokenizer = TreebankWordDetokenizer()
+
+
+ def upload_file(filepath):
+     translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
+     return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
+
+ def download_file():
+     return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
+
+
+ with gr.Blocks() as demo:
+
+     with gr.Tab("Text"):
+         gr.Interface(fn=translate, inputs=["text","text","text"], outputs="text")
+     with gr.Tab("Docx documents"):
+         gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
+         with gr.Row():
+             u = gr.UploadButton("Upload a file", file_count="single")
+             d = gr.DownloadButton("Download the file", visible=False)
+
+         u.upload(upload_file, u, [u, d])
+         d.click(download_file, None, [u, d])
+ if __name__ == "__main__":
+     demo.launch()
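A quick way to sanity-check the new app without the UI is to call the same entry points gradio_app.py wires up: the Text tab maps directly onto translate(text, ip, port) and the Docx tab onto translate_document(...). A minimal sketch, assuming a reachable translation server and an example.docx in the working directory (the host, port and file name are placeholders):

# Sketch of exercising the same code paths as the Gradio app, outside the UI.
# The host/port and example.docx below are illustrative placeholders.
from nltk.tokenize.treebank import TreebankWordDetokenizer
from translate_docx import Aligner, translate, translate_document

aligner = Aligner("fast_align_config", "en", "ca", "tmp")
detokenizer = TreebankWordDetokenizer()

print(translate("Hello world", "10.192.31.127", "8000"))              # Text tab path
print(translate_document("example.docx", aligner, detokenizer,
                         ip="10.192.31.127", port="8000"))            # Docx tab path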
main.py → translate_docx.py RENAMED
@@ -1,10 +1,16 @@
+ import time
+ import json
+ import requests
+ import tqdm
  import os
+ import string
  from collections import defaultdict
  
  from docx import Document
  from docx.text.hyperlink import Hyperlink
  from docx.text.run import Run
  import nltk
+ import platform
  
  nltk.download('punkt')
  nltk.download('punkt_tab')
@@ -17,45 +23,20 @@ from subprocess import Popen, PIPE
  from itertools import groupby
  import fileinput
  
- from datetime import datetime
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- from iso639 import languages
- import tqdm
-
-
- class Translator():
-     def __init__(self, model_path, source_lang, target_lang):
-         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-         self.model = AutoModelForCausalLM.from_pretrained(
-             model_path,
-             device_map="auto",
-             torch_dtype=torch.bfloat16
-         )
-
-         self.prompt_f = lambda x: (f"Translate the following text from {source_lang} into "
-                                    f"{target_lang}.\n{source_lang}: {x} \n{target_lang}:")
-
-     def translate(self, text):
-         message = [{"role": "user", "content": self.prompt_f(text)}]
-         date_string = datetime.today().strftime('%Y-%m-%d')
-
-         prompt = self.tokenizer.apply_chat_template(
-             message,
-             tokenize=False,
-             add_generation_prompt=True,
-             date_string=date_string
-         )
-
-         inputs = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
-         input_length = inputs.shape[1]
-         outputs = self.model.generate(input_ids=inputs.to(self.model.device),
-                                       max_new_tokens=400,
-                                       early_stopping=True,
-                                       num_beams=5)
-
-         return self.tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)
+ ip = "192.168.20.216"
+ port = "8000"
+
+
+ def translate(text, ip, port):
+     myobj = {
+         'id': '1',
+         'src': text,
+     }
+     port = str(int(port))
+     url = 'http://' + ip + ':' + port + '/translate'
+     x = requests.post(url, json=myobj)
+     json_response = json.loads(x.text)
+     return json_response['tgt']
  
  
  # Class to align original and translated sentences
@@ -71,12 +52,19 @@ class Aligner():
          self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
          self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
  
+         if platform.system().lower() == "windows":
+             fastalign_bin = "fast_align.exe"
+             atools_bin = "atools.exe"
+         else:
+             fastalign_bin = "./fast_align"
+             atools_bin = "./atools"
+
          self.forward_command = lambda \
-             x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
+             x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
          self.reverse_command = lambda \
-             x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+             x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
  
-         self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+         self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
  
      def __simplify_alignment_file(self, file):
          with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
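A note on the platform switch above: because the command strings end in a '> file' redirection, they presumably have to be executed through a shell (the module imports Popen and PIPE from subprocess, but align() itself is outside this hunk, so this is an assumption). A minimal sketch of that pattern, with illustrative flags and file names rather than the exact strings Aligner builds:

# Sketch only: per-platform binary choice plus a redirecting command run via a shell.
import platform
from subprocess import Popen, PIPE

fastalign_bin = "fast_align.exe" if platform.system().lower() == "windows" else "./fast_align"
cmd = f"{fastalign_bin} -i corpus.src-tgt -d -v > tmp/forward.align"
with Popen(cmd, shell=True, stderr=PIPE) as proc:
    _, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"fast_align failed: {stderr.decode(errors='replace')}")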
@@ -152,39 +140,6 @@ def extract_paragraphs_with_runs(doc):
      return paragraphs_with_runs
  
  
- def tokenize_paragraph_with_runs2(runs_in_paragraph):
-     text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
-     sentences = sent_tokenize(text_paragraph)
-     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
-
-     tokenized_sentences_with_style = []
-     for tokenized_sentence in tokenized_sentences:
-         tokenized_sentence_with_style = []
-         token_idx = 0
-         for run in runs_in_paragraph:
-             text_in_run = run["text"].strip()
-
-             if text_in_run == tokenized_sentence[token_idx]:
-                 new_run = run.copy()
-                 new_run["text"] = text_in_run
-                 tokenized_sentence_with_style.append(new_run)
-                 token_idx += 1
-                 if token_idx >= len(tokenized_sentence):
-                     break
-             elif len(text_in_run) > len(tokenized_sentence[token_idx]):
-                 if text_in_run.startswith(tokenized_sentence[token_idx]):
-                     for token in word_tokenize(text_in_run):
-                         if token == tokenized_sentence[token_idx]:
-                             new_run = run.copy()
-                             new_run["text"] = token
-                             tokenized_sentence_with_style.append(new_run)
-                             token_idx += 1
-             else:
-                 raise "oops"
-         tokenized_sentences_with_style.append(tokenized_sentence_with_style)
-     return tokenized_sentences_with_style
-
-
  def tokenize_with_runs(runs, detokenizer):
      text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
      sentences = sent_tokenize(text_paragraph)
@@ -215,7 +170,7 @@ def tokenize_with_runs(runs, detokenizer):
                  word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                  token_index += 1
              else:
-                 raise "oops"
+                 raise "Something unexpected happened I'm afraid"
          tokenized_sentences_with_style.append(sentence_with_style)
      return tokenized_sentences_with_style
  
@@ -243,7 +198,7 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
      # write the file that fastalign will use
      with open(temp_file_path, "w") as out_file:
          for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
-             out_file.write(f"{" ".join(item["text"] for item in original)} ||| {" ".join(translated)}\n")
+             out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
  
      alignments = aligner.align(temp_file_path)
  
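For anyone unfamiliar with fast_align's input: the loop above writes one tokenized sentence pair per line, with the two sides separated by ' ||| '. An illustrative example of what the write expression produces (the runs and tokens here are made up):

# Illustrative only: one line of the fast_align input file.
original = [{"text": "Hello"}, {"text": "world"}, {"text": "!"}]   # tokens carrying style info
translated = ["Hola", "món", "!"]                                  # translated tokens
line = f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}"
print(line)  # -> Hello world ! ||| Hola món !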
@@ -332,46 +287,38 @@ def preprocess_runs(runs_in_paragraph):
      return new_runs
  
  
- if __name__ == "__main__":
-     input_file = 'data/test3.docx'
-     output_file = 'data/translated_output.docx'
-     source_lang = 'ca'
-     target_lang = 'en'
-     config_folder = "fast_align_config"
-     temp_folder = "tmp"
-
-     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
-
+ def translate_document(input_file,
+                        aligner,
+                        detokenizer,
+                        ip="192.168.20.216",
+                        temp_folder="tmp",
+                        port="8000"):
      os.makedirs(temp_folder, exist_ok=True)
-
      # load original file, extract the paragraphs with their runs (which include style and formatting)
      doc = Document(input_file)
      paragraphs_with_runs = extract_paragraphs_with_runs(doc)
  
-     detokenizer = TreebankWordDetokenizer()
-
-     translator = Translator("BSC-LT/salamandraTA-7b-instruct", languages.get(alpha2=source_lang).name,
-                             languages.get(alpha2=target_lang).name)
-
      # translate each paragraph
      translated_paragraphs = []
      for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
          paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
-         translated_paragraphs.append(translator.translate(paragraph_text))
-
-     print(translated_paragraphs)
+         translated_paragraphs.append(translate(paragraph_text, ip, port))
  
      out_doc = Document()
  
      processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
  
+     print("Generating alignments...")
      translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                            translated_paragraphs, aligner,
                                                            temp_folder, detokenizer)
+     print("Finished alignments")
+
      # flatten the sentences into a list of tokens
      translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
      # group the tokens by style/run
      translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+     print("Grouped by style")
  
      # group the runs by original paragraph
      translated_paragraphs_with_style = defaultdict(list)
396
  run.font.size = item['font_size']
397
  run.font.color.rgb = item['font_color']
398
 
399
- out_doc.save(output_file)
 
 
 
1
+ import time
2
+ import json
3
+ import requests
4
+ import tqdm
5
  import os
6
+ import string
7
  from collections import defaultdict
8
 
9
  from docx import Document
10
  from docx.text.hyperlink import Hyperlink
11
  from docx.text.run import Run
12
  import nltk
13
+ import platform
14
 
15
  nltk.download('punkt')
16
  nltk.download('punkt_tab')
 
23
  from itertools import groupby
24
  import fileinput
25
 
26
+ ip = "192.168.20.216"
27
+ port = "8000"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
 
 
 
 
 
 
29
 
30
+ def translate(text, ip, port):
31
+ myobj = {
32
+ 'id': '1',
33
+ 'src': text,
34
+ }
35
+ port = str(int(port))
36
+ url = 'http://' + ip + ':' + port + '/translate'
37
+ x = requests.post(url, json=myobj)
38
+ json_response = json.loads(x.text)
39
+ return json_response['tgt']
40
 
41
 
42
  # Class to align original and translated sentences
 
52
  self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
53
  self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
54
 
55
+ if platform.system().lower() == "windows":
56
+ fastalign_bin = "fast_align.exe"
57
+ atools_bin = "atools.exe"
58
+ else:
59
+ fastalign_bin = "./fast_align"
60
+ atools_bin = "./atools"
61
+
62
  self.forward_command = lambda \
63
+ x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
64
  self.reverse_command = lambda \
65
+ x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
66
 
67
+ self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
68
 
69
  def __simplify_alignment_file(self, file):
70
  with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
 
140
  return paragraphs_with_runs
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def tokenize_with_runs(runs, detokenizer):
144
  text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
145
  sentences = sent_tokenize(text_paragraph)
 
170
  word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
171
  token_index += 1
172
  else:
173
+ raise "Something unexpected happened I'm afraid"
174
  tokenized_sentences_with_style.append(sentence_with_style)
175
  return tokenized_sentences_with_style
176
 
 
198
  # write the file that fastalign will use
199
  with open(temp_file_path, "w") as out_file:
200
  for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
201
+ out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
202
 
203
  alignments = aligner.align(temp_file_path)
204
 
 
287
  return new_runs
288
 
289
 
290
+ def translate_document(input_file,
291
+ aligner,
292
+ detokenizer,
293
+ ip="192.168.20.216",
294
+ temp_folder="tmp",
295
+ port="8000"):
 
 
 
 
296
  os.makedirs(temp_folder, exist_ok=True)
 
297
  # load original file, extract the paragraphs with their runs (which include style and formatting)
298
  doc = Document(input_file)
299
  paragraphs_with_runs = extract_paragraphs_with_runs(doc)
300
 
 
 
 
 
 
301
  # translate each paragraph
302
  translated_paragraphs = []
303
  for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
304
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
305
+ translated_paragraphs.append(translate(paragraph_text, ip, port))
 
 
306
 
307
  out_doc = Document()
308
 
309
  processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
310
 
311
+ print("Generating alignments...")
312
  translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
313
  translated_paragraphs, aligner,
314
  temp_folder, detokenizer)
315
+ print("Finished alignments")
316
+
317
  # flatten the sentences into a list of tokens
318
  translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
319
  # group the tokens by style/run
320
  translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
321
+ print("Grouped by style")
322
 
323
  # group the runs by original paragraph
324
  translated_paragraphs_with_style = defaultdict(list)
 
343
  run.font.size = item['font_size']
344
  run.font.color.rgb = item['font_color']
345
 
346
+ out_doc.save("translated.docx")
347
+ print("Saved file")
348
+ return "translated.docx"
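The whole pipeline assumes a translation service listening at http://{ip}:{port}/translate that accepts JSON like {'id': '1', 'src': ...} and replies with {'tgt': ...}; that contract is only implied by the translate() client above, so the exact fields are an assumption. For testing the docx round trip without the internal server, a throwaway stand-in along those lines (it simply echoes the source text back as the "translation") could look like this:

# Hypothetical stand-in for the /translate service implied by translate(); not part of this PR.
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class TranslateStub(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/translate":
            self.send_error(404)
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        body = json.dumps({"tgt": payload.get("src", "")}).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), TranslateStub).serve_forever()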