mjuvilla committed
Commit 8be9040 · 1 Parent(s): 44978d8

Created classes for running the translation models from either a local model or a Hugging Face endpoint. For now, main.py only supports local models.

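The new layout puts the backends behind a common interface. A minimal sketch of how the two local backends are meant to be instantiated (model paths are illustrative, not from this commit; the HF-endpoint class exists but is not selectable from main.py yet):

```python
# Hypothetical usage; the path is a placeholder.
from src.salamandraTA7b_translator import SalamandraTA7bTranslator, SalamandraTA7bQTranslator

translator = SalamandraTA7bTranslator("models/salamandraTA-7b-instruct")     # full-precision transformers backend
# translator = SalamandraTA7bQTranslator("models/salamandraTA-7b-instruct")  # quantized GGUF backend via vLLM
```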
main.py ADDED
@@ -0,0 +1,42 @@
+ from src.aligner import Aligner
+ from src.salamandraTA7b_translator import SalamandraTA7bTranslator, SalamandraTA7bQTranslator
+ from src.salamandraTA7b_translator_HF import SalamandraTA7bTranslatorHF
+ import os
+ import time
+ import argparse
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         prog='main',
+         description='Translate a file')
+     parser.add_argument("-s", '--source_lang', type=str, required=True)
+     parser.add_argument("-t", '--target_lang', type=str, required=True)
+     parser.add_argument("-f", '--file_path', type=str, required=True)
+
+     parser.add_argument("-m", '--model_path', type=str, required=True)
+     parser.add_argument("-tt", '--translator_type', type=str,
+                         choices=["normal", "quantized"], default="none",
+                         help="normal=regular model; quantized=quantized model")
+
+     parser.add_argument('--fastalign_config_folder', type=str, default="fast_align_config")
+     parser.add_argument('--temp_folder', type=str, default="tmp")
+
+     args = parser.parse_args()
+
+     os.makedirs(args.temp_folder, exist_ok=True)
+
+     if args.translator_type == "normal":
+         translator = SalamandraTA7bTranslator(args.model_path)
+     elif args.translator_type == "quantized":
+         translator = SalamandraTA7bQTranslator(args.model_path)
+     else:
+         raise NotImplementedError(f"Option {args.translator_type} is not implemented.")
+
+     aligner = Aligner(args.fastalign_config_folder, args.source_lang, args.target_lang, args.temp_folder)
+
+     start_time = time.time()
+     # the local translators return the output path directly; only the HF endpoint
+     # translator yields (status, path) tuples, and main.py does not support it yet
+     translated_file_name = translator.translate_document(args.file_path, args.source_lang,
+                                                          args.target_lang)
+     print(f"Finished document in {time.time() - start_time} seconds")
requirements.txt CHANGED
@@ -10,4 +10,5 @@ transformers~=4.57.1
  torch~=2.8.0
  huggingface-hub~=0.36.0
  vllm~=0.11.0
- iso-639~=0.4.5
+ iso-639~=0.4.5
+ accelerate~=1.11.0
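The accelerate pin is presumably what backs the `device_map="auto"` call added in SalamandraTA7bTranslator below, since transformers delegates device placement to accelerate. A quick sanity check, under that assumption:

```python
# Hypothetical check that the dependency behind device_map="auto" is importable.
import accelerate
print(accelerate.__version__)  # expected to satisfy ~=1.11.0 per requirements.txt
```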
src/salamandraTA7b_translator.py CHANGED
@@ -1,24 +1,158 @@
- from gradio_client import Client
  from iso639 import languages
+ from datetime import datetime
+ from tqdm import tqdm
+ from abc import ABC, abstractmethod
+ import os
+ import shutil
+ from src.utils import file_to_moses, moses_to_file


- class SalamandraTA7bTranslator:
-     def __init__(self, hf_token):
-         self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
-
-     def translate(self, text, source_lang, target_lang):
-         if not text:
-             return ""
-
-         # we assume the language is specified by its code, so we need to convert it to a name
+ def generate_batches(lines, size_batches):
+     return (lines[i:i + size_batches] for i in range(0, len(lines), size_batches))
+
+
+ def lines_to_moses(lines, out_file_path):
+     with open(out_file_path, "w") as out_file:
+         out_file.writelines(line + "\n" for line in lines)
+
+
+ class SalamandraTA7bTranslatorAbstract(ABC):
+     @abstractmethod
+     def __init__(self, model_path):
+         pass
+
+     @abstractmethod
+     def translate(self, lines, source_lang, target_lang):
+         pass
+
+     def translate_document(self, input_file, source_lang, target_lang,
+                            temp_folder: str = "tmp", tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0"):
+         input_filename = input_file.split("/")[-1]
+         os.makedirs(temp_folder, exist_ok=True)
+
+         # copy the original file to the temp folder to avoid common issues with tikal
+         temp_input_file = os.path.join(temp_folder, input_filename)
+         shutil.copy(input_file, temp_input_file)
+
+         original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
+         plain_text_file = file_to_moses(temp_input_file, source_lang, target_lang, tikal_folder,
+                                         original_xliff_file)
+
+         lines = open(plain_text_file, "r", encoding="utf-8").read().splitlines()
+
+         translated_lines = self.translate(lines, source_lang, target_lang)
+
+         # create a moses file with the translated lines
+         translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
+         lines_to_moses(translated_lines, translated_moses_file)
+
+         # recreate the document with the translations
+         translated_file_path = moses_to_file(translated_moses_file, source_lang, target_lang, tikal_folder,
+                                              original_xliff_file)
+
+         print(f"Saved file in {translated_file_path}")
+         return translated_file_path
+
+
+ class SalamandraTA7bTranslator(SalamandraTA7bTranslatorAbstract):
+
+     def __init__(self, model_path):
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+         import torch
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             device_map="auto",
+             dtype=torch.bfloat16
+         ).eval()
+
+     def translate(self, lines, source_lang, target_lang):
          lang1 = languages.get(alpha2=source_lang).name
          lang2 = languages.get(alpha2=target_lang).name
-         result = self.client.predict(
-             task="Translation",
-             source=lang1,
-             target=lang2,
-             input_text=text,
-             mt_text=None,
-             api_name="/generate_output"
-         )
-         return result[0]
+
+         prompt_template = lambda x: f"Translate the following text from {lang1} into {lang2}.\n{lang1}: {x} \n{lang2}:"
+
+         date_string = datetime.today().strftime('%Y-%m-%d')
+
+         # create prompts for each sentence and record the length of each prompt (before generation)
+
+         total_translated = []
+
+         batches = generate_batches(lines, 100)
+
+         with tqdm(total=len(lines), desc='Translating...') as pbar:
+             for batch in batches:
+                 prompts = []
+                 input_lengths = []
+                 for sentence in batch:
+                     text = prompt_template(sentence)
+                     message = [{"role": "user", "content": text}]
+                     prompt = self.tokenizer.apply_chat_template(
+                         message,
+                         tokenize=False,
+                         add_generation_prompt=True,
+                         date_string=date_string
+                     )
+                     prompts.append(prompt)
+                     # record the prompt length so we can later slice the prompt tokens off the generated output
+                     input_length = len(self.tokenizer.encode(prompt, add_special_tokens=False))
+                     input_lengths.append(input_length)
+
+                 # batch-encode the prompts with padding
+                 inputs = self.tokenizer(prompts, add_special_tokens=False, return_tensors="pt", padding=True)
+                 input_ids = inputs["input_ids"].to(self.model.device)
+                 attention_mask = inputs["attention_mask"].to(self.model.device)
+
+                 # generate translations in batch
+                 outputs = self.model.generate(
+                     input_ids=input_ids,
+                     attention_mask=attention_mask,
+                     max_new_tokens=100,
+                     early_stopping=True,
+                     num_beams=1
+                 )
+
+                 # decode each translation, slicing off the input prompt
+                 for i, output in enumerate(outputs):
+                     translation = self.tokenizer.decode(output[input_lengths[i]:], skip_special_tokens=True)
+                     total_translated.append(translation)
+                 pbar.update(len(batch))
+
+         return total_translated
+
+
+ class SalamandraTA7bQTranslator(SalamandraTA7bTranslatorAbstract):
+     def __init__(self, model_path):
+         from huggingface_hub import snapshot_download
+         from vllm import LLM
+         model_dir = snapshot_download(repo_id="BSC-LT/salamandraTA-7B-instruct-GGUF", revision="main")
+         model_name = "salamandrata_7b_inst_q4.gguf"
+
+         self.llm = LLM(model=model_dir + '/' + model_name, tokenizer=model_dir)
+
+     def translate(self, lines, source_lang, target_lang):
+         from vllm import SamplingParams
+         lang1 = languages.get(alpha2=source_lang).name
+         lang2 = languages.get(alpha2=target_lang).name
+
+         batches = generate_batches(lines, 100)
+
+         total_translated = []
+
+         prompt_template = lambda x: f"Translate the following text from {lang1} into {lang2}.\n{lang1}: {x} \n{lang2}:"
+
+         with tqdm(total=len(lines), desc='Translating...') as pbar:
+             for batch in batches:
+                 messages = [[{"role": "user", "content": prompt_template(item)}] for item in batch]
+
+                 outputs = self.llm.chat(messages,
+                                         sampling_params=SamplingParams(
+                                             temperature=0.1,
+                                             stop_token_ids=[5],
+                                             max_tokens=200)
+                                         )
+                 translations = [item.outputs[0].text for item in outputs]
+                 pbar.update(len(translations))
+                 total_translated += translations
+
+         return total_translated
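Taken together, translate_document drives the whole pipeline (document → moses plain text → batched translation → rebuilt document), while translate only maps a list of lines to translated lines in batches of 100. A minimal sketch of using the class directly, assuming local weights are available (the path and inputs are illustrative):

```python
from src.salamandraTA7b_translator import SalamandraTA7bTranslator

translator = SalamandraTA7bTranslator("models/salamandraTA-7b-instruct")  # hypothetical local path
print(translator.translate(["Hello, world!"], "en", "ca"))                # list of lines in, list of lines out
translator.translate_document("docs/report.docx", "en", "ca")             # full document round-trip via tikal
```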
src/salamandraTA7b_translator_HF.py CHANGED
@@ -8,6 +8,7 @@ from subprocess import Popen, PIPE
  import re

  from src.aligner import Aligner
+ from src.utils import file_to_moses, moses_to_file

  import glob
  import spacy
@@ -60,8 +61,8 @@ class SalamandraTA7bTranslatorHF:
          shutil.copy(input_file, temp_input_file)

          original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
-         plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder,
-                                             original_xliff_file)
+         plain_text_file = file_to_moses(temp_input_file, source_lang, target_lang, tikal_folder,
+                                         original_xliff_file)

          # get paragraphs with runs
          paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
@@ -137,28 +138,8 @@ class SalamandraTA7bTranslatorHF:
          translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
          runs_to_plain_text(translated_paragraphs_with_style, translated_moses_file)

-         # put the translations into the xlf
-         tikal_moses_to_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl",
-                                         source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
-                                         "-noalttrans", "-to", original_xliff_file]
-         Popen(tikal_moses_to_xliff_command).wait()
-
-         # any tags that are still <g> have not been paired between the original and translated texts by tikal, so
-         # we remove them. This may happen if a word in the original language has been split into more than one word
-         # with other words in between, or because of an error in fastalign
-         text = open(original_xliff_file).read()
-         result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
-         open(original_xliff_file, "w").write(result)
-
-         # merge into a docx again
-         tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
-         final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
-         stdout, stderr = final_process.communicate()
-         final_process.wait()
-
-         # get the path to the output file
-         output = stdout.decode('utf-8')
-         translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
+         translated_file_path = moses_to_file(translated_moses_file, source_lang, target_lang, tikal_folder,
+                                              original_xliff_file)

          print(f"Saved file in {translated_file_path}")
          yield "", translated_file_path
@@ -182,34 +163,6 @@ def get_leading_invisible(text):
      return text[:i]


- def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
-                       original_xliff_file_path: str) -> str:
-     """
-     Given a document, this function generates an xliff file and then a plain text file with the text contents
-     while keeping style and formatting using tags like <g id=1> </g>
-
-     Parameters:
-         input_file: Path to the document to process
-         source_lang: Source language of the document
-         target_lang: Target language of the document
-         tikal_folder: Folder where tikal.sh is located
-         original_xliff_file_path: Path to the xliff file to generate, which will be used later
-
-     Returns:
-         string: Path to the plain text file
-     """
-
-     tikal_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-x", input_file, "-nocopy", "-sl", source_lang,
-                            "-tl", target_lang]
-     Popen(tikal_xliff_command).wait()
-
-     tikal_moses_command = [os.path.join(tikal_folder, "tikal.sh"), "-xm", original_xliff_file_path, "-sl", source_lang,
-                            "-tl", target_lang]
-     Popen(tikal_moses_command).wait()
-
-     return os.path.join(original_xliff_file_path + f".{source_lang}")
-
-
  def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[str, str | tuple[str, ...]]]:
      """
      Given some text that may or may not contain some chunks tagged with something like <g id=1> </g>, extract each
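src/utils.py itself is not shown in this commit view, but the deleted doc_to_plain_text above and the inlined tikal block removed from the HF translator pin down what file_to_moses and moses_to_file must do. A sketch consistent with that removed code (signatures inferred from the call sites; not the actual file):

```python
# Hypothetical reconstruction of src/utils.py, based on the code removed in this commit.
import os
import re
from subprocess import Popen, PIPE


def file_to_moses(input_file, source_lang, target_lang, tikal_folder, original_xliff_file):
    # document -> xliff -> moses plain text, same tikal calls as the removed doc_to_plain_text
    Popen([os.path.join(tikal_folder, "tikal.sh"), "-x", input_file, "-nocopy",
           "-sl", source_lang, "-tl", target_lang]).wait()
    Popen([os.path.join(tikal_folder, "tikal.sh"), "-xm", original_xliff_file,
           "-sl", source_lang, "-tl", target_lang]).wait()
    return original_xliff_file + f".{source_lang}"


def moses_to_file(translated_moses_file, source_lang, target_lang, tikal_folder, original_xliff_file):
    # translations -> xliff -> merged document, same tikal calls as the block removed above
    Popen([os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl", source_lang,
           "-tl", target_lang, "-from", translated_moses_file, "-totrg", "-noalttrans",
           "-to", original_xliff_file]).wait()

    # drop <g> tags that tikal could not pair between source and target
    text = open(original_xliff_file).read()
    open(original_xliff_file, "w").write(re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text))

    # merge back into the original format and parse the output path from tikal's stdout
    stdout, _ = Popen([os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file],
                      stdout=PIPE, stderr=PIPE).communicate()
    return re.search(r'(?<=Output:\s)(.*)', stdout.decode('utf-8'))[0]
```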