mjuvilla committed
Commit fd61039 · unverified · 2 Parent(s): 8030df1 f5f4b70

Merge pull request #1 from langtech-bsc/windows
Files changed (2)
  1. gradio_app.py +39 -0
  2. main.py → translate_docx.py +44 -95
gradio_app.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ from pathlib import Path
+ import requests
+ import json
+ from translate_docx import translate_document, translate, Aligner
+ from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+
+ ip='10.192.31.127'
+ config_folder = 'fast_align_config'
+ source_lang = 'en'
+ target_lang = 'ca'
+ temp_folder = 'tmp'
+ aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
+ detokenizer = TreebankWordDetokenizer()
+
+
+ def upload_file(filepath):
+     translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
+     return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
+
+ def download_file():
+     return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
+
+
+ with gr.Blocks() as demo:
+
+     with gr.Tab("Text"):
+         gr.Interface(fn=translate, inputs=["text","text","text"], outputs="text")
+     with gr.Tab("Docx documents"):
+         gr.Markdown("First upload a file and then you'll be able to download it (but only once!)")
+         with gr.Row():
+             u = gr.UploadButton("Upload a file", file_count="single")
+             d = gr.DownloadButton("Download the file", visible=False)
+
+         u.upload(upload_file, u, [u, d])
+         d.click(download_file, None, [u, d])
+ if __name__ == "__main__":
+     demo.launch()
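A quick way to sanity-check the new app without the UI is to call the same entry points gradio_app.py wires up: the Text tab maps directly onto translate(text, ip, port) and the Docx tab onto translate_document(...). A minimal sketch, assuming a reachable translation server and an example.docx in the working directory (the host, port and file name are placeholders):

# Sketch of exercising the same code paths as the Gradio app, outside the UI.
# The host/port and example.docx below are illustrative placeholders.
from nltk.tokenize.treebank import TreebankWordDetokenizer
from translate_docx import Aligner, translate, translate_document

aligner = Aligner("fast_align_config", "en", "ca", "tmp")
detokenizer = TreebankWordDetokenizer()

print(translate("Hello world", "10.192.31.127", "8000"))              # Text tab path
print(translate_document("example.docx", aligner, detokenizer,
                         ip="10.192.31.127", port="8000"))            # Docx tab path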
main.py → translate_docx.py RENAMED
@@ -1,10 +1,16 @@
+ import time
+ import json
+ import requests
+ import tqdm
  import os
+ import string
  from collections import defaultdict
  
  from docx import Document
  from docx.text.hyperlink import Hyperlink
  from docx.text.run import Run
  import nltk
+ import platform
  
  nltk.download('punkt')
  nltk.download('punkt_tab')
@@ -17,45 +23,20 @@ from subprocess import Popen, PIPE
  from itertools import groupby
  import fileinput
  
- from datetime import datetime
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- from iso639 import languages
- import tqdm
-
-
- class Translator():
-     def __init__(self, model_path, source_lang, target_lang):
-         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-         self.model = AutoModelForCausalLM.from_pretrained(
-             model_path,
-             device_map="auto",
-             torch_dtype=torch.bfloat16
-         )
-
-         self.prompt_f = lambda x: (f"Translate the following text from {source_lang} into "
-                                    f"{target_lang}.\n{source_lang}: {x} \n{target_lang}:")
-
-     def translate(self, text):
-         message = [{"role": "user", "content": self.prompt_f(text)}]
-         date_string = datetime.today().strftime('%Y-%m-%d')
-
-         prompt = self.tokenizer.apply_chat_template(
-             message,
-             tokenize=False,
-             add_generation_prompt=True,
-             date_string=date_string
-         )
-
-         inputs = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
-         input_length = inputs.shape[1]
-         outputs = self.model.generate(input_ids=inputs.to(self.model.device),
-                                       max_new_tokens=400,
-                                       early_stopping=True,
-                                       num_beams=5)
-
-         return self.tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)
+ ip = "192.168.20.216"
+ port = "8000"
+
+
+ def translate(text, ip, port):
+     myobj = {
+         'id': '1',
+         'src': text,
+     }
+     port = str(int(port))
+     url = 'http://' + ip + ':' + port + '/translate'
+     x = requests.post(url, json=myobj)
+     json_response = json.loads(x.text)
+     return json_response['tgt']
  
  
  # Class to align original and translated sentences
@@ -71,12 +52,19 @@ class Aligner():
          self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
          self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
  
+         if platform.system().lower() == "windows":
+             fastalign_bin = "fast_align.exe"
+             atools_bin = "atools.exe"
+         else:
+             fastalign_bin = "./fast_align"
+             atools_bin = "./atools"
+
          self.forward_command = lambda \
-             x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
+             x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
          self.reverse_command = lambda \
-             x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+             x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
  
-         self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+         self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
  
      def __simplify_alignment_file(self, file):
          with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
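A note on the platform switch above: because the command strings end in a '> file' redirection, they presumably have to be executed through a shell (the module imports Popen and PIPE from subprocess, but align() itself is outside this hunk, so this is an assumption). A minimal sketch of that pattern, with illustrative flags and file names rather than the exact strings Aligner builds:

# Sketch only: per-platform binary choice plus a redirecting command run via a shell.
import platform
from subprocess import Popen, PIPE

fastalign_bin = "fast_align.exe" if platform.system().lower() == "windows" else "./fast_align"
cmd = f"{fastalign_bin} -i corpus.src-tgt -d -v > tmp/forward.align"
with Popen(cmd, shell=True, stderr=PIPE) as proc:
    _, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"fast_align failed: {stderr.decode(errors='replace')}")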
@@ -152,39 +140,6 @@ def extract_paragraphs_with_runs(doc):
      return paragraphs_with_runs
  
  
- def tokenize_paragraph_with_runs2(runs_in_paragraph):
-     text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
-     sentences = sent_tokenize(text_paragraph)
-     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
-
-     tokenized_sentences_with_style = []
-     for tokenized_sentence in tokenized_sentences:
-         tokenized_sentence_with_style = []
-         token_idx = 0
-         for run in runs_in_paragraph:
-             text_in_run = run["text"].strip()
-
-             if text_in_run == tokenized_sentence[token_idx]:
-                 new_run = run.copy()
-                 new_run["text"] = text_in_run
-                 tokenized_sentence_with_style.append(new_run)
-                 token_idx += 1
-                 if token_idx >= len(tokenized_sentence):
-                     break
-             elif len(text_in_run) > len(tokenized_sentence[token_idx]):
-                 if text_in_run.startswith(tokenized_sentence[token_idx]):
-                     for token in word_tokenize(text_in_run):
-                         if token == tokenized_sentence[token_idx]:
-                             new_run = run.copy()
-                             new_run["text"] = token
-                             tokenized_sentence_with_style.append(new_run)
-                             token_idx += 1
-             else:
-                 raise "oops"
-         tokenized_sentences_with_style.append(tokenized_sentence_with_style)
-     return tokenized_sentences_with_style
-
-
  def tokenize_with_runs(runs, detokenizer):
      text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
      sentences = sent_tokenize(text_paragraph)
@@ -215,7 +170,7 @@ def tokenize_with_runs(runs, detokenizer):
                  word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                  token_index += 1
              else:
-                 raise "oops"
+                 raise "Something unexpected happened I'm afraid"
          tokenized_sentences_with_style.append(sentence_with_style)
      return tokenized_sentences_with_style
  
@@ -243,7 +198,7 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
      # write the file that fastalign will use
      with open(temp_file_path, "w") as out_file:
          for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
-             out_file.write(f"{" ".join(item["text"] for item in original)} ||| {" ".join(translated)}\n")
+             out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
  
      alignments = aligner.align(temp_file_path)
  
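For anyone unfamiliar with fast_align's input: the loop above writes one tokenized sentence pair per line, with the two sides separated by ' ||| '. An illustrative example of what the write expression produces (the runs and tokens here are made up):

# Illustrative only: one line of the fast_align input file.
original = [{"text": "Hello"}, {"text": "world"}, {"text": "!"}]   # tokens carrying style info
translated = ["Hola", "món", "!"]                                  # translated tokens
line = f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}"
print(line)  # -> Hello world ! ||| Hola món !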
@@ -332,46 +287,38 @@ def preprocess_runs(runs_in_paragraph):
      return new_runs
  
  
- if __name__ == "__main__":
-     input_file = 'data/test3.docx'
-     output_file = 'data/translated_output.docx'
-     source_lang = 'ca'
-     target_lang = 'en'
-     config_folder = "fast_align_config"
-     temp_folder = "tmp"
-
-     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
-
+ def translate_document(input_file,
+                        aligner,
+                        detokenizer,
+                        ip="192.168.20.216",
+                        temp_folder="tmp",
+                        port="8000"):
      os.makedirs(temp_folder, exist_ok=True)
-
      # load original file, extract the paragraphs with their runs (which include style and formatting)
      doc = Document(input_file)
      paragraphs_with_runs = extract_paragraphs_with_runs(doc)
  
-     detokenizer = TreebankWordDetokenizer()
-
-     translator = Translator("BSC-LT/salamandraTA-7b-instruct", languages.get(alpha2=source_lang).name,
-                             languages.get(alpha2=target_lang).name)
-
      # translate each paragraph
      translated_paragraphs = []
      for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
          paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
-         translated_paragraphs.append(translator.translate(paragraph_text))
-
-     print(translated_paragraphs)
+         translated_paragraphs.append(translate(paragraph_text, ip, port))
  
      out_doc = Document()
  
      processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
  
+     print("Generating alignments...")
      translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                            translated_paragraphs, aligner,
                                                            temp_folder, detokenizer)
+     print("Finished alignments")
+
      # flatten the sentences into a list of tokens
      translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
      # group the tokens by style/run
      translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+     print("Grouped by style")
  
      # group the runs by original paragraph
      translated_paragraphs_with_style = defaultdict(list)
396
  run.font.size = item['font_size']
397
  run.font.color.rgb = item['font_color']
398
 
399
- out_doc.save(output_file)
 
 
 
1
+ import time
2
+ import json
3
+ import requests
4
+ import tqdm
5
  import os
6
+ import string
7
  from collections import defaultdict
8
 
9
  from docx import Document
10
  from docx.text.hyperlink import Hyperlink
11
  from docx.text.run import Run
12
  import nltk
13
+ import platform
14
 
15
  nltk.download('punkt')
16
  nltk.download('punkt_tab')
 
23
  from itertools import groupby
24
  import fileinput
25
 
26
+ ip = "192.168.20.216"
27
+ port = "8000"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
 
 
 
 
 
 
29
 
30
+ def translate(text, ip, port):
31
+ myobj = {
32
+ 'id': '1',
33
+ 'src': text,
34
+ }
35
+ port = str(int(port))
36
+ url = 'http://' + ip + ':' + port + '/translate'
37
+ x = requests.post(url, json=myobj)
38
+ json_response = json.loads(x.text)
39
+ return json_response['tgt']
40
 
41
 
42
  # Class to align original and translated sentences
 
52
  self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
53
  self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
54
 
55
+ if platform.system().lower() == "windows":
56
+ fastalign_bin = "fast_align.exe"
57
+ atools_bin = "atools.exe"
58
+ else:
59
+ fastalign_bin = "./fast_align"
60
+ atools_bin = "./atools"
61
+
62
  self.forward_command = lambda \
63
+ x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
64
  self.reverse_command = lambda \
65
+ x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
66
 
67
+ self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
68
 
69
  def __simplify_alignment_file(self, file):
70
  with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
 
140
  return paragraphs_with_runs
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def tokenize_with_runs(runs, detokenizer):
144
  text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
145
  sentences = sent_tokenize(text_paragraph)
 
170
  word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
171
  token_index += 1
172
  else:
173
+ raise "Something unexpected happened I'm afraid"
174
  tokenized_sentences_with_style.append(sentence_with_style)
175
  return tokenized_sentences_with_style
176
 
 
198
  # write the file that fastalign will use
199
  with open(temp_file_path, "w") as out_file:
200
  for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
201
+ out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
202
 
203
  alignments = aligner.align(temp_file_path)
204
 
 
287
  return new_runs
288
 
289
 
290
+ def translate_document(input_file,
291
+ aligner,
292
+ detokenizer,
293
+ ip="192.168.20.216",
294
+ temp_folder="tmp",
295
+ port="8000"):
 
 
 
 
296
  os.makedirs(temp_folder, exist_ok=True)
 
297
  # load original file, extract the paragraphs with their runs (which include style and formatting)
298
  doc = Document(input_file)
299
  paragraphs_with_runs = extract_paragraphs_with_runs(doc)
300
 
 
 
 
 
 
301
  # translate each paragraph
302
  translated_paragraphs = []
303
  for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
304
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
305
+ translated_paragraphs.append(translate(paragraph_text, ip, port))
 
 
306
 
307
  out_doc = Document()
308
 
309
  processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
310
 
311
+ print("Generating alignments...")
312
  translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
313
  translated_paragraphs, aligner,
314
  temp_folder, detokenizer)
315
+ print("Finished alignments")
316
+
317
  # flatten the sentences into a list of tokens
318
  translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
319
  # group the tokens by style/run
320
  translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
321
+ print("Grouped by style")
322
 
323
  # group the runs by original paragraph
324
  translated_paragraphs_with_style = defaultdict(list)
 
343
  run.font.size = item['font_size']
344
  run.font.color.rgb = item['font_color']
345
 
346
+ out_doc.save("translated.docx")
347
+ print("Saved file")
348
+ return "translated.docx"
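The whole pipeline assumes a translation service listening at http://{ip}:{port}/translate that accepts JSON like {'id': '1', 'src': ...} and replies with {'tgt': ...}; that contract is only implied by the translate() client above, so the exact fields are an assumption. For testing the docx round trip without the internal server, a throwaway stand-in along those lines (it simply echoes the source text back as the "translation") could look like this:

# Hypothetical stand-in for the /translate service implied by translate(); not part of this PR.
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class TranslateStub(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/translate":
            self.send_error(404)
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        body = json.dumps({"tgt": payload.get("src", "")}).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), TranslateStub).serve_forever()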