# Read the vocabulary, one word per line.
with open("vocab.txt") as f:
    words = [line.strip() for line in f]
# Generate the tokens file: one character per line,
# plus the word-boundary token "|".
tokens = set()
for word in words:
    tokens.update(word)
tokens.add("|")

with open("model_tokens.txt", "w") as f:
    f.write("\n".join(sorted(tokens)))
# Generate the lexicon file: each line maps a word to its
# space-separated character spelling, terminated by the "|" token.
with open("model_lexicon.txt", "w") as f:
    for word in words:
        spelling = " ".join(word + "|")
        f.write(f"{word}\t{spelling}\n")