# Read the vocabulary: one word per line.
with open("vocab.txt") as f:
  words = [line.strip() for line in f]

# Generate the tokens file: every character that appears in the vocabulary.
tokens = set()
for word in words:
  tokens.update(word)

# Add the "|" word-boundary token.
tokens.add("|")

with open("model_tokens.txt", "w") as f:
  tokens_ordered = sorted(tokens)
  f.write("\n".join(tokens_ordered))

# Generate the lexicon file: each word maps to its space-separated character
# spelling, terminated with the "|" word-boundary token.
with open("model_lexicon.txt", "w") as f:
  for word in words:
    spelling = " ".join(word + "|")
    f.write(f"{word}\t{spelling}\n")