"""
## tiktoken版本的 unicode 错误,什么原因?
- gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1} 错误的
- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1} 正确的
"""
import sys
import pdb

sys.path.append("../../")  # make the repo-root packages (vocab, character_util) importable
from vocab import tokenizer_factory
from character_util import _decode_bytes_to_str
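
# Why the two tokenizers disagree on low ids like 189: GPT-2-style byte-level BPE
# maps every raw byte to a printable unicode character before BPE, so the control
# byte 0x01 is stored in the vocab as "ā" (U+0101, codepoint 257), not as "\u0001".
# gpt-oss-20b returns the mapped character; the gpt-4o (tiktoken) path returns the
# already-decoded raw byte. Below is a minimal sketch of that byte-to-unicode table
# (mirroring GPT-2's well-known bytes_to_unicode; the helper name here is made up):
def _bytes_to_unicode_sketch() -> dict:
    # Printable bytes keep their own codepoint.
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("¡"), ord("¬") + 1))
          + list(range(ord("®"), ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            # Non-printable bytes are shifted into the 256+ codepoint range.
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return {b: chr(c) for b, c in zip(bs, cs)}

assert _bytes_to_unicode_sketch()[0x01] == "ā"  # chr(257), matching gpt-oss-20b's token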
tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")

# Invert id -> token for both vocabularies and compare over the shared id range.
vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
min_vocab_size = min(len(vocab_1), len(vocab_2))
for i in range(min_vocab_size):
    if i == 188:
        pdb.set_trace()  # step in right before the problematic id 189
        print(i)
    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = _decode_bytes_to_str(token_str2)
    if token_str1 != token_str2:
        pdb.set_trace()  # inspect any id whose token string differs between the two vocabs
        print(i, token_str1, token_str2)