File size: 1,135 Bytes
85387ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import ctranslate2
from subword_nmt.apply_bpe import BPE
import codecs
import re

def apply_subwording(sample_text, model_code_path):
	"""Segment text into BPE subword units using subword-nmt.

	Args:
		sample_text: Text to segment; it is processed one line at a time.
		model_code_path: Path to the BPE codes file, opened as UTF-8.

	Returns:
		The segmented text, where every input line is followed by a
		newline character. Empty input yields an empty string.
	"""
	# subword-nmt's BPE reads the whole codes file inside its constructor,
	# so the handle can be released as soon as the object is built.
	with codecs.open(model_code_path, encoding='utf-8') as codes_file:
		bpe = BPE(codes_file)
	return "".join(bpe.process_line(line) + "\n" for line in sample_text.splitlines())

def remove_subwording_marks(translated_text):
	"""Undo BPE segmentation by deleting the ``@@`` continuation markers.

	Removes every ``"@@ "`` that joins two subword pieces, and also a
	dangling ``"@@"`` (optionally followed by one space) at the end of any
	line, which occurs when the last token of a line is an unfinished
	subword piece.

	Args:
		translated_text: Subword-segmented text, possibly multi-line.

	Returns:
		The text with the subword markers removed.
	"""
	# re.MULTILINE lets ``$`` match before each newline, not only at the end
	# of the whole string, so trailing markers are cleaned on every line.
	return re.sub(r"@@ |@@ ?$", "", translated_text, flags=re.MULTILINE)

def translate_nos(sample_text, model):
	"""Translate text line by line with a CTranslate2 model on CPU.

	Args:
		sample_text: Text to translate; each line is translated separately.
		model: Indexable pair where ``model[0]`` is the BPE codes path
			handed to ``apply_subwording`` and ``model[1]`` is the model
			argument handed to ``ctranslate2.Translator``.

	Returns:
		The translated text with subword markers removed; each translated
		line is followed by a newline character.
	"""
	bpe_codes_path, ct2_model = model[0], model[1]

	# Segment the input into subword units before translation.
	segmented = apply_subwording(sample_text, bpe_codes_path)

	engine = ctranslate2.Translator(ct2_model, device="cpu")
	translated_lines = []
	for segmented_line in segmented.splitlines():
		# split() with no arguments already ignores leading/trailing whitespace.
		batch_result = engine.translate_batch(
			[segmented_line.split()],
			replace_unknowns=True,
			beam_size=5,
			batch_type='examples',
		)
		# Keep only the best hypothesis of the single sentence in the batch.
		translated_lines.append(' '.join(batch_result[0].hypotheses[0]) + "\n")

	# Merge the subword pieces back into whole words.
	return remove_subwording_marks("".join(translated_lines))