""" | |
## tiktoken版本的 unicode 错误,什么原因? | |
- gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1} 错误的 | |
- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1} 正确的 | |
""" | |
import sys
import pdb

sys.path.append("../../")  # make the project-local packages below importable
from vocab import tokenizer_factory
from character_util import _decode_bytes_to_str
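
# Background sketch (not part of the original script): byte-level BPE vocabularies in
# the GPT-2 lineage do not store raw bytes as token strings. Each of the 256 byte
# values is remapped to a printable unicode code point; printable bytes map to
# themselves, the rest are pushed up to 256, 257, ... in order. Under that mapping
# byte 0x01 becomes chr(0x101) == "ā", which is why the gpt-oss-20b (HF) vocab shows
# "ā" for id 189 while the tiktoken-backed gpt-4o path leaks the raw "\u0001".
# The helper below is an independent reimplementation of that standard mapping for
# illustration only, not code taken from tiktoken or transformers.
def _bytes_to_unicode_sketch():
    """Return the GPT-2-style byte -> printable-unicode mapping."""
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:  # non-printable byte: shift it above the byte range
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

assert _bytes_to_unicode_sketch()[0x01] == "ā"  # chr(0x101) == 257, matching gpt-oss-20b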
# Load both tokenizers and invert their vocabs (id -> token) to get comparable sizes.
tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")
vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
min_vocab_size = min(len(vocab_1), len(vocab_2))
# Walk the shared id range and compare the token string each tokenizer reports.
for i in range(min_vocab_size):
    if i == 188:
        pdb.set_trace()  # break just before id 189, the token described in the docstring
    print(i)
    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = _decode_bytes_to_str(token_str2)  # normalize the gpt-4o token to a printable string
    if token_str1 != token_str2:
        pdb.set_trace()  # inspect the mismatch interactively
        print(i, token_str1, token_str2)
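
# Cross-check sketch (not in the original script, requires `pip install tiktoken`):
# read the raw bytes of id 189 straight from tiktoken's o200k_base table, the
# encoding gpt-4o maps to. If this prints b'\x01', both vocabularies agree on the
# underlying byte and the mismatch above is purely in how the token *string* is
# rendered (raw control character vs. the GPT-2 byte-to-unicode alias "ā").
import tiktoken

o200k = tiktoken.get_encoding("o200k_base")
print("raw bytes of id 189:", o200k.decode_single_token_bytes(189))  # expected: b'\x01'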