"""

## tiktoken版本的 unicode 错误,什么原因?
- gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1}  错误的
- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1}   正确的
"""

import sys
import pdb

sys.path.append("../../")  # make the repo root importable

from vocab import tokenizer_factory
from character_util import _decode_bytes_to_str

# gpt-oss-20b ships a Hugging Face byte-level BPE vocab; gpt-4o is tiktoken-based.
tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")


# Reverse maps: token id -> token string, for each vocabulary.
vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}

# Only compare ids present in both vocabularies.
min_vocab_size = min(len(vocab_1), len(vocab_2))

for i in range(min_vocab_size):
    if i == 188:
        # Drop into the debugger just before the problematic id (see docstring).
        pdb.set_trace()
        print(i)

    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    # The gpt-4o token comes back in raw-byte form; normalize it to a
    # comparable str before checking for a mismatch.
    token_str2 = _decode_bytes_to_str(token_str2)
    if token_str1 != token_str2:
        pdb.set_trace()
        print(i, token_str1, token_str2)