Spaces:
Sleeping
Sleeping
File size: 1,135 Bytes
85387ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import ctranslate2
from subword_nmt.apply_bpe import BPE
import codecs
import re
def apply_subwording(sample_text, model_code_path):
    """Segment *sample_text* into BPE subwords using subword-nmt.

    Args:
        sample_text: Text to segment; it is processed one line at a time.
        model_code_path: Path to the BPE codes file, read as UTF-8.

    Returns:
        The segmented text, with every input line followed by a newline.
        An empty input yields an empty string.
    """
    # The codes file is only needed while the BPE object is being built
    # (subword-nmt reads the codes in the constructor), so close it right after.
    with codecs.open(model_code_path, encoding='utf-8') as codes_file:
        bpe = BPE(codes_file)

    return "".join(
        bpe.process_line(line) + "\n" for line in sample_text.splitlines()
    )
def remove_subwording_marks(translated_text):
    """Strip BPE continuation markers ("@@") from *translated_text*.

    A marker followed by a space is removed together with that space, which
    joins the two subword pieces. A marker sitting at the very end of a line
    (or of the text) is removed as well, so a hypothesis that ends on an
    unfinished subword does not leave a dangling "@@" in the output.

    Args:
        translated_text: Text containing "@@"-marked subword pieces.

    Returns:
        The text with the subword markers removed.
    """
    # MULTILINE lets `$` match before each newline, not only at end of text.
    return re.sub(r"@@(?: |$)", "", translated_text, flags=re.MULTILINE)
def translate_nos(sample_text, model):
    """Translate *sample_text* line by line with a CTranslate2 model.

    Args:
        sample_text: Source text; each line is translated independently.
        model: Indexable pair where ``model[0]`` is the BPE codes path handed
            to ``apply_subwording`` and ``model[1]`` is the model location
            handed to ``ctranslate2.Translator``.

    Returns:
        The translated text with subword markers removed; every translated
        line is followed by a newline.
    """
    tokenizer_model = model[0]
    translator_model = model[1]

    # Apply subwording
    subwording_text = apply_subwording(sample_text, tokenizer_model)

    # Translate entry
    translator = ctranslate2.Translator(translator_model, device="cpu")
    translated_lines = []
    for line in subwording_text.splitlines():
        # str.split() with no arguments already ignores surrounding whitespace.
        tokens = line.split()
        batch_result = translator.translate_batch(
            [tokens], replace_unknowns=True, beam_size=5, batch_type='examples'
        )
        # Keep only the best hypothesis of the single example in the batch.
        translated_lines.append(' '.join(batch_result[0].hypotheses[0]) + "\n")
    output = "".join(translated_lines)

    # Remove subwording
    return remove_subwording_marks(output)