# kenlm-uk / create_tokens_and_lexicon_files.py
# Read the vocabulary: one word per line
with open("vocab.txt") as f:
    words = [line.strip() for line in f]
# Generate the tokens file: every character that appears in the
# vocabulary, one token per line, in sorted order
tokens = set()
for word in words:
    tokens.update(word)

# add "|", the word-boundary token used in the lexicon
tokens.add("|")

with open("model_tokens.txt", "w") as f:
    f.write("\n".join(sorted(tokens)))
# Generate the lexicon file: each line maps a word to its
# space-separated spelling terminated by "|",
# e.g. "слово\tс л о в о |"
with open("model_lexicon.txt", "w") as f:
    for word in words:
        spelling = " ".join(word + "|")
        f.write(f"{word}\t{spelling}\n")
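# The tokens/lexicon pair produced above matches the format used by
# flashlight-style CTC beam-search decoders (for example,
# torchaudio.models.decoder.ctc_decoder); that consumer is an assumption,
# since the script itself names none. A minimal sanity check of the files:
with open("model_tokens.txt") as f:
    token_set = set(f.read().splitlines())

with open("model_lexicon.txt") as f:
    for line in f:
        word, spelling = line.rstrip("\n").split("\t")
        # every symbol in a spelling must be a known token
        assert set(spelling.split(" ")) <= token_set, f"unknown token in {word!r}"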