AkashDataScience committed on
Commit
5021135
·
1 Parent(s): 5581e99

Update method to build vocab

Browse files
Files changed (1) hide show
  1. language_bpe/base.py +1 -0
language_bpe/base.py CHANGED
@@ -64,6 +64,7 @@ class Tokenizer:
64
  def _build_vocab(self):
65
  # vocab is simply and deterministically derived from merges
66
  vocab = {idx: bytes([idx]) for idx in range(256)}
 
67
  for (p0, p1), idx in self.merges.items():
68
  vocab[idx] = vocab[p0] + vocab[p1]
69
  for special, idx in self.special_tokens.items():
 
64
  def _build_vocab(self):
65
  # vocab is simply and deterministically derived from merges
66
  vocab = {idx: bytes([idx]) for idx in range(256)}
67
+ vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
68
  for (p0, p1), idx in self.merges.items():
69
  vocab[idx] = vocab[p0] + vocab[p1]
70
  for special, idx in self.special_tokens.items():