"""

## tiktoken版本的 unicode 错误,什么原因?
- gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1}  错误的
- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1}   正确的
"""

import sys
import pdb

sys.path.append("../../")  # make the repo root importable

from vocab import tokenizer_factory
from character_util import _decode_bytes_to_str

# gpt-oss-20b ships a Hugging Face byte-level BPE vocab; gpt-4o is tiktoken-based.
tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")


# Reverse maps: token id -> token string, for each vocabulary.
vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}

# Only compare ids present in both vocabularies.
min_vocab_size = min(len(vocab_1), len(vocab_2))

for i in range(min_vocab_size):
    if i == 188:
        # Drop into the debugger just before the problematic id (see docstring).
        pdb.set_trace()
        print(i)

    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    # The gpt-4o token comes back in raw-byte form; normalize it to a
    # comparable str before checking for a mismatch.
    token_str2 = _decode_bytes_to_str(token_str2)
    if token_str1 != token_str2:
        pdb.set_trace()
        print(i, token_str1, token_str2)