""" ## tiktoken版本的 unicode 错误,什么原因? - gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1} 错误的 - gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1} 正确的 """ import sys import pdb sys.path.append("../../") from vocab import tokenizer_factory from character_util import _decode_bytes_to_str tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b") tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o") vocab_1 = {v:k for k, v in tokenizer1.get_vocab().items()} vocab_2 = {v:k for k, v in tokenizer2.get_vocab().items()} min_vocab_size = min(len(vocab_1), len(vocab_2)) for i in range(min_vocab_size): if i == 188: import pdb; pdb.set_trace() print(i) token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0] token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0] token_str2 = _decode_bytes_to_str(token_str2) if token_str1 != token_str2: pdb.set_trace() print(i, token_str1, token_str2)