# Generate a character-token file and a lexicon file from vocab.txt
# ("|" marks word boundaries, as used by CTC-style lexicon decoders).
def _read_words(vocab_path):
    """Return the non-empty, whitespace-stripped lines of *vocab_path*.

    Blank lines are skipped: an empty word would otherwise emit a
    meaningless "\t|" lexicon entry downstream.
    """
    with open(vocab_path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def _write_tokens(words, tokens_path):
    """Write the sorted set of characters appearing in *words*, plus the
    "|" word-boundary token, one token per line."""
    tokens = {ch for word in words for ch in word}
    tokens.add("|")  # word-boundary marker must always be present
    with open(tokens_path, "w", encoding="utf-8") as f:
        f.write("\n".join(sorted(tokens)))


def _write_lexicon(words, lexicon_path):
    """Write one "<word>\t<space-separated chars> |" line per word."""
    with open(lexicon_path, "w", encoding="utf-8") as f:
        for word in words:
            # "cab" -> "c a b |": each character spelled out, "|" appended
            spelled = " ".join(word + "|")
            f.write(f"{word}\t{spelled}\n")


def generate_files(vocab_path="vocab.txt",
                   tokens_path="model_tokens.txt",
                   lexicon_path="model_lexicon.txt"):
    """Build the token and lexicon files from a vocabulary file.

    Defaults reproduce the original script's hard-coded paths; the
    parameters exist so the pipeline can be pointed at other files
    (and exercised from tests) without editing the source.
    """
    words = _read_words(vocab_path)
    _write_tokens(words, tokens_path)
    _write_lexicon(words, lexicon_path)


if __name__ == "__main__":
    generate_files()