Spaces:
Sleeping
Sleeping
Commit
·
5021135
1
Parent(s):
5581e99
Update method to build vocab
Browse files
- language_bpe/base.py +1 -0
language_bpe/base.py
CHANGED
@@ -64,6 +64,7 @@ class Tokenizer:
|
|
64 |
def _build_vocab(self):
|
65 |
# vocab is simply and deterministically derived from merges
|
66 |
vocab = {idx: bytes([idx]) for idx in range(256)}
|
|
|
67 |
for (p0, p1), idx in self.merges.items():
|
68 |
vocab[idx] = vocab[p0] + vocab[p1]
|
69 |
for special, idx in self.special_tokens.items():
|
|
|
64 |
def _build_vocab(self):
|
65 |
# vocab is simply and deterministically derived from merges
|
66 |
vocab = {idx: bytes([idx]) for idx in range(256)}
|
67 |
+
vocab.update({idx: bytes(list(chr(value).encode('utf-8'))) for idx,value in zip(range(256, 384), range(2304, 2432))})
|
68 |
for (p0, p1), idx in self.merges.items():
|
69 |
vocab[idx] = vocab[p0] + vocab[p1]
|
70 |
for special, idx in self.special_tokens.items():
|