""" | |
## tiktoken版本的 unicode 错误,什么原因? | |
- gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1} 错误的 | |
- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1} 正确的 | |
""" | |
import sys
import pdb

sys.path.append("../../")  # make the project-local packages below importable
from vocab import tokenizer_factory
from character_util import _decode_bytes_to_str
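
# Background sketch (not part of the original script): byte-level BPE vocabularies in
# the GPT-2 lineage do not store raw bytes as token strings. Each of the 256 byte
# values is remapped to a printable unicode code point; printable bytes map to
# themselves, the rest are pushed up to 256, 257, ... in order. Under that mapping
# byte 0x01 becomes chr(0x101) == "ā", which is why the gpt-oss-20b (HF) vocab shows
# "ā" for id 189 while the tiktoken-backed gpt-4o path leaks the raw "\u0001".
# The helper below is an independent reimplementation of that standard mapping for
# illustration only, not code taken from tiktoken or transformers.
def _bytes_to_unicode_sketch():
    """Return the GPT-2-style byte -> printable-unicode mapping."""
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:  # non-printable byte: shift it above the byte range
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

assert _bytes_to_unicode_sketch()[0x01] == "ā"  # chr(0x101) == 257, matching gpt-oss-20b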
# Load both tokenizers and invert their vocabs (id -> token) to get comparable sizes.
tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")
vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
min_vocab_size = min(len(vocab_1), len(vocab_2))
# Walk the shared id range and compare the token string each tokenizer reports.
for i in range(min_vocab_size):
    if i == 188:
        pdb.set_trace()  # break just before id 189, the token described in the docstring
    print(i)
    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = _decode_bytes_to_str(token_str2)  # normalize the gpt-4o token to a printable string
    if token_str1 != token_str2:
        pdb.set_trace()  # inspect the mismatch interactively
        print(i, token_str1, token_str2)
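
# Cross-check sketch (not in the original script, requires `pip install tiktoken`):
# read the raw bytes of id 189 straight from tiktoken's o200k_base table, the
# encoding gpt-4o maps to. If this prints b'\x01', both vocabularies agree on the
# underlying byte and the mismatch above is purely in how the token *string* is
# rendered (raw control character vs. the GPT-2 byte-to-unicode alias "ā").
import tiktoken

o200k = tiktoken.get_encoding("o200k_base")
print("raw bytes of id 189:", o200k.decode_single_token_bytes(189))  # expected: b'\x01'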