Merge pull request #1 from langtech-bsc/windows
Files changed:
- gradio_app.py  +39 -0
- main.py → translate_docx.py  +44 -95
gradio_app.py
ADDED
@@ -0,0 +1,39 @@
+import gradio as gr
+from pathlib import Path
+import requests
+import json
+from translate_docx import translate_document, translate, Aligner
+from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+
+ip='10.192.31.127'
+config_folder = 'fast_align_config'
+source_lang = 'en'
+target_lang = 'ca'
+temp_folder = 'tmp'
+aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+detokenizer = TreebankWordDetokenizer()
+
+
+def upload_file(filepath):
+    translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
+    return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
+
+def download_file():
+    return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
+
+
+with gr.Blocks() as demo:
+
+    with gr.Tab("Text"):
+        gr.Interface(fn=translate, inputs=["text","text","text"], outputs="text")
+    with gr.Tab("Docx documents"):
+        gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
+        with gr.Row():
+            u = gr.UploadButton("Upload a file", file_count="single")
+            d = gr.DownloadButton("Download the file", visible=False)
+
+        u.upload(upload_file, u, [u, d])
+        d.click(download_file, None, [u, d])
+if __name__ == "__main__":
+    demo.launch()
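The Text tab above is a thin wrapper around translate() from translate_docx.py, so the translation server can also be exercised without Gradio. A minimal sketch, assuming the service is reachable; the IP matches the value hard-coded in gradio_app.py and the port is the default from translate_docx.py, both deployment-specific:

from translate_docx import translate

# Sends POST {'id': '1', 'src': text} to http://<ip>:<port>/translate and returns the 'tgt' field.
print(translate("Hello, world!", "10.192.31.127", "8000"))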
main.py → translate_docx.py
RENAMED
@@ -1,10 +1,16 @@
+import time
+import json
+import requests
+import tqdm
 import os
+import string
 from collections import defaultdict
 
 from docx import Document
 from docx.text.hyperlink import Hyperlink
 from docx.text.run import Run
 import nltk
+import platform
 
 nltk.download('punkt')
 nltk.download('punkt_tab')
@@ -17,45 +23,20 @@ from subprocess import Popen, PIPE
 from itertools import groupby
 import fileinput
 
-
-
-import torch
-from iso639 import languages
-import tqdm
-
-
-class Translator():
-    def __init__(self, model_path, source_lang, target_lang):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            device_map="auto",
-            torch_dtype=torch.bfloat16
-        )
-
-        self.prompt_f = lambda x: (f"Translate the following text from {source_lang} into "
-                                   f"{target_lang}.\n{source_lang}: {x} \n{target_lang}:")
-
-    def translate(self, text):
-        message = [{"role": "user", "content": self.prompt_f(text)}]
-        date_string = datetime.today().strftime('%Y-%m-%d')
+ip = "192.168.20.216"
+port = "8000"
 
-        prompt = self.tokenizer.apply_chat_template(
-            message,
-            tokenize=False,
-            add_generation_prompt=True,
-            date_string=date_string
-        )
 
-
-
-
-
-
-
-
+def translate(text, ip, port):
+    myobj = {
+        'id': '1',
+        'src': text,
+    }
+    port = str(int(port))
+    url = 'http://' + ip + ':' + port + '/translate'
+    x = requests.post(url, json=myobj)
+    json_response = json.loads(x.text)
+    return json_response['tgt']
 
 
 # Class to align original and translated sentences
@@ -71,12 +52,19 @@ class Aligner():
         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
 
+        if platform.system().lower() == "windows":
+            fastalign_bin = "fast_align.exe"
+            atools_bin = "atools.exe"
+        else:
+            fastalign_bin = "./fast_align"
+            atools_bin = "./atools"
+
         self.forward_command = lambda \
-            x: f'
+            x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
         self.reverse_command = lambda \
-            x: f'
+            x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
 
-        self.symmetric_command = f'
+        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
 
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
@@ -152,39 +140,6 @@ def extract_paragraphs_with_runs(doc):
     return paragraphs_with_runs
 
 
-def tokenize_paragraph_with_runs2(runs_in_paragraph):
-    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
-    sentences = sent_tokenize(text_paragraph)
-    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
-
-    tokenized_sentences_with_style = []
-    for tokenized_sentence in tokenized_sentences:
-        tokenized_sentence_with_style = []
-        token_idx = 0
-        for run in runs_in_paragraph:
-            text_in_run = run["text"].strip()
-
-            if text_in_run == tokenized_sentence[token_idx]:
-                new_run = run.copy()
-                new_run["text"] = text_in_run
-                tokenized_sentence_with_style.append(new_run)
-                token_idx += 1
-                if token_idx >= len(tokenized_sentence):
-                    break
-            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
-                if text_in_run.startswith(tokenized_sentence[token_idx]):
-                    for token in word_tokenize(text_in_run):
-                        if token == tokenized_sentence[token_idx]:
-                            new_run = run.copy()
-                            new_run["text"] = token
-                            tokenized_sentence_with_style.append(new_run)
-                            token_idx += 1
-            else:
-                raise "oops"
-        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
-    return tokenized_sentences_with_style
-
-
 def tokenize_with_runs(runs, detokenizer):
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
     sentences = sent_tokenize(text_paragraph)
@@ -215,7 +170,7 @@ def tokenize_with_runs(runs, detokenizer):
                 word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                 token_index += 1
             else:
-                raise "
+                raise Exception("Something unexpected happened I'm afraid")
        tokenized_sentences_with_style.append(sentence_with_style)
     return tokenized_sentences_with_style
 
@@ -243,7 +198,7 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     # write the file that fastalign will use
     with open(temp_file_path, "w") as out_file:
         for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
-            out_file.write(f"{
+            out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
 
     alignments = aligner.align(temp_file_path)
 
@@ -332,46 +287,38 @@ def preprocess_runs(runs_in_paragraph):
     return new_runs
 
 
-
-
-
-
-
-
-    temp_folder = "tmp"
-
-    aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
-
+def translate_document(input_file,
+                       aligner,
+                       detokenizer,
+                       ip="192.168.20.216",
+                       temp_folder="tmp",
+                       port="8000"):
     os.makedirs(temp_folder, exist_ok=True)
-
     # load original file, extract the paragraphs with their runs (which include style and formatting)
     doc = Document(input_file)
     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
 
-    detokenizer = TreebankWordDetokenizer()
-
-    translator = Translator("BSC-LT/salamandraTA-7b-instruct", languages.get(alpha2=source_lang).name,
-                            languages.get(alpha2=target_lang).name)
-
     # translate each paragraph
     translated_paragraphs = []
    for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
-        translated_paragraphs.append(
-
-    print(translated_paragraphs)
+        translated_paragraphs.append(translate(paragraph_text, ip, port))
 
     out_doc = Document()
 
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
+    print("Generating alignments...")
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                           translated_paragraphs, aligner,
                                                           temp_folder, detokenizer)
+    print("Finished alignments")
+
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+    print("Grouped by style")
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = defaultdict(list)
@@ -396,4 +343,6 @@ if __name__ == "__main__":
            run.font.size = item['font_size']
            run.font.color.rgb = item['font_color']
 
-    out_doc.save(
+    out_doc.save("translated.docx")
+    print("Saved file")
+    return "translated.docx"
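For reference, translate_document() can be driven directly, the same way the upload_file handler in gradio_app.py does it. A minimal sketch, assuming the fast_align config folder and binaries are in place; example.docx and the IP are placeholder values, not taken from the diff:

from nltk.tokenize.treebank import TreebankWordDetokenizer
from translate_docx import Aligner, translate_document

# Same constructor arguments gradio_app.py uses (config folder, language pair, temp dir).
aligner = Aligner("fast_align_config", "en", "ca", "tmp")
detokenizer = TreebankWordDetokenizer()

# Translates each paragraph over HTTP, re-aligns run styles with fast_align,
# and writes the result to translated.docx (the returned filename).
output_file = translate_document("example.docx", aligner, detokenizer, ip="192.168.20.216")
print(output_file)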
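The diff only shows the client side of the /translate call. For local testing without the real service, a throwaway stub that mirrors the request/response shape used by translate() could look like the following; this is a hypothetical helper, not part of the PR, and the use of Flask is an assumption:

# fake_translate_server.py — hypothetical stand-in for the translation backend.
# It only honours the contract translate() relies on: POST /translate with
# JSON {'id': ..., 'src': ...} and a JSON reply that contains a 'tgt' field.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/translate", methods=["POST"])
def translate_stub():
    payload = request.get_json()
    # Echo the source text back; a real deployment would return the actual translation.
    return jsonify({"id": payload.get("id"), "tgt": payload["src"]})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)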