xu-song committed
Commit abd6a65 · 1 Parent(s): e6c318a
character_util.py CHANGED
@@ -23,6 +23,8 @@ default_columns = ["digit", "zh"]
 def _to_unicode(text):
     return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
 
+def _to_unicode_decimal(text):
+    return [ord(chr) for chr in text]
 
 def _get_coding_length(tokenizer, vocab, filter=None):
     """
@@ -62,6 +64,18 @@ def _dist(token_lens):
     return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
 
 
+def _decode_bytes_to_str(token: bytes) -> str:
+    try:
+        token = token.decode("utf-8", errors="strict")
+    except UnicodeDecodeError:
+        try:
+            # for a single byte, such as b'\xa1'
+            token = token.decode("latin-1")
+        except UnicodeDecodeError:
+            logger.warning(f"token {token} decode failed")
+            token = token.decode("utf-8", errors="ignore")
+    return token
+
 def iter_vocab(
     tokenizer_name: str,
     from_cache: bool = True,
@@ -103,8 +117,8 @@ def iter_vocab(
         tags = []
         if token is None:  # some vocabularies contain empty (non-contiguous) ids
             continue
-        if isinstance(token, bytes):
-            token = token.decode("utf-8", errors="ignore")
+        if isinstance(token, bytes):  # convert bytes to str
+            token = _decode_bytes_to_str(token)
 
         if hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
             if tokenizer.sp_model.is_byte(token_id):
@@ -128,8 +142,11 @@ def iter_vocab(
                 "id": token_id,
                 "token": token,
                 "token_decode": decode_str,
-                "token_dumps": json.dumps(token),
+                "token_dumps": json.dumps(token),  # unicode:
+                # https://en.wikipedia.org/wiki/List_of_Unicode_characters
                 "token_unicode": _to_unicode(token),
+                "token_unicode_decimal": _to_unicode_decimal(token),  # decimal code points
+                # "token_utf8_bytes": "",
                 "token_len": len(decode_str),
             },
             ensure_ascii=False) + "\n")
@@ -212,5 +229,9 @@ def get_character_table(
 
 if __name__ == "__main__":
     # aa = get_character_table(tokenizer_filter="baichuan")
-    df = get_character_table()
-    logger.info(f"\n{df.to_markdown(index=False)}")
+
+    # iter_vocab("openai/gpt-4o", from_cache=False)
+    iter_vocab("openai/gpt-oss-20b", from_cache=False)
+
+    # df = get_character_table()
+    # logger.info(f"\n{df.to_markdown(index=False)}")
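Note: the new `_decode_bytes_to_str` helper tries strict UTF-8 first and falls back to latin-1 for stray single bytes. Below is a minimal standalone sketch of that fallback (illustrative names, not the repo module itself):

```python
# Minimal sketch of the decode fallback introduced above: strict UTF-8 first,
# then latin-1 for lone bytes such as b'\xa1' that are not valid UTF-8.
import logging

logger = logging.getLogger(__name__)


def decode_bytes_to_str(token: bytes) -> str:
    try:
        return token.decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        try:
            # latin-1 maps every byte 0x00-0xFF to a code point, so this cannot fail
            return token.decode("latin-1")
        except UnicodeDecodeError:
            logger.warning("token %r decode failed", token)
            return token.decode("utf-8", errors="ignore")


if __name__ == "__main__":
    print(decode_bytes_to_str("空格".encode("utf-8")))  # valid UTF-8 -> '空格'
    print(decode_bytes_to_str(b"\xa1"))                  # lone byte   -> '¡' via latin-1
```

Because latin-1 accepts any byte, the final `errors="ignore"` branch is only a last-resort guard.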
playground_app.py CHANGED
@@ -4,7 +4,7 @@
 
 import gradio as gr
 from vocab import tokenizer_factory
-from playground_examples import example_types, example_fn
+from config import example_types, example_fn
 from playground_util import (tokenize,
                              tokenize_pair, basic_count,
                              get_overlap_token_size, on_load)
playground_examples.py DELETED
@@ -1,138 +0,0 @@
-"""
-
-## characters
-
-- alphanumeric characters
-- numeric characters
-- special characters: A special character is a character that is not an alphabetic or numeric character.
-- ASCII control characters
-- punctuation marks
-- accent marks
-- mathematical symbols
-- whitespace:
-  - https://en.wikipedia.org/wiki/Whitespace_character
-  - https://emptycharacter.com/
-
-
-https://www.computerhope.com/jargon/s/specchar.htm
-"""
-
-import random
-from datasets import load_dataset
-
-default_user_input = """\
-Replace this text in the input field to see how tokenization works.
-Buenos días!
-Tokenizer 是自然语言处理(NLP)中的一个关键组件,它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
-ラグビーワールドカップ2023フランス"""
-# default_tokenizer_name_1 = "Meta/llama3"
-# default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
-default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
-default_tokenizer_name_2 = "openai/gpt-4o"
-
-
-def get_sample_input():
-    default_inputs = {
-        "en": "Replace this text in the input field to see how tokenization works.",
-        "zh-Hans": "",
-        "es": "",
-        "de": "",
-    }
-    random.seed(10)  # For reproducibility
-    lines = []
-    for lang in default_inputs.keys():
-        dataset = load_dataset("eson/cc100-samples", lang, split="train")
-        print(dataset)
-        print(1)
-    return default_inputs
-
-
-examples = {
-    "en": [
-        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
-        [
-            "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
-            "huggyllama/llama-7b",
-            "google-bert/bert-base-cased",
-        ],  # chatglm has blank_n tokens; bert drops the whitespace
-        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        [
-            'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
-            "google/gemma-7b",
-            "huggyllama/llama-7b",
-        ],  # the llama vocab is rather small
-        [
-            "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
-            "baichuan-inc/Baichuan-7B",
-            "huggyllama/llama-7b",
-        ],
-        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
-    ],
-    "zh": [
-        [
-            "空格测试: 2个空格 8个空格",
-            "llama",
-            "chatglm2_6b",
-        ],  # chatglm has blank_n tokens
-        ["标点测试:,。!?;", "baichuan_7b", "llama"],
-        [
-            "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
-            "baichuan_7b",
-            "llama",
-        ],
-        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
-        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
-    ],
-}
-
-
-more_examples = [
-    # bert family
-    (
-        "google-bert/bert-base-cased",
-        "google-bert/bert-base-uncased",
-        "",
-        "",
-    ),  # # clue VS kplug, bert VS clue
-    ("bert-base-cased", "clue", "", "增加了[]()"),
-    ("roberta-chinese-clue", "kplug", "", ""),
-    # llama family (sentencepiece-based)
-    (
-        "baichuan",
-        "baichuan2",
-        "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
-    ),
-    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
-    ("llama", "chinese-llama-2-7b", ""),
-    ("llama", "llama3", "扩充词典"),
-    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
-    # glm family (sentencepiece-based)
-    ("glm", "chatglm1", ""),
-    ("chatglm1", "chatglm2", ""),
-    # gpt2 family
-    ("gpt2", "moss", ""),
-    ("", "", ""),
-    # openai family (tiktoken)
-    ("qwen", "gpt_35_turbo", ""),
-]
-
-lang = "en"
-
-example_types = [t[0].split(":")[0] for t in examples[lang]]
-
-
-def example_fn(example_idx):
-    return examples[lang][example_idx]
-
-
-def get_more_example():
-    import urllib.parse
-
-    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
-    for tokenizer1, tokenizer2, text, comment in more_examples:
-        full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
-        print(full_url)
-
-
-if __name__ == "__main__":
-    get_more_example()
playground_util.py CHANGED
@@ -6,7 +6,7 @@ from vocab import tokenizer_factory
 from character_util import iter_vocab
 from utils.log_util import logger
 from utils.i18n_util import get_lang
-from playground_examples import (
+from config import (
     default_tokenizer_name_1,
     default_tokenizer_name_2,
     default_user_input,
@@ -137,10 +137,11 @@ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
 
     vocab_set_1 = tokenizer1.get_vocab().keys()
     vocab_set_2 = tokenizer2.get_vocab().keys()
+    # v1 = {v:k for k, v in vocab1.items()}
 
     token1 = next(iter(vocab_set_1))
    token2 = next(iter(vocab_set_2))
-    if type(token1) != type(token2):  # bytes vs str
+    if type(token1) != type(token2):  # convert str to bytes so both vocabularies are comparable
        if isinstance(token1, str):
            vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
        if isinstance(token2, str):
@@ -209,9 +210,10 @@ def test_coding():
 if __name__ == "__main__":
 
     # print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
-    print(
-        get_overlap_token_size(
-            "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
-        )
-    )
+    print(get_overlap_token_size("openai/gpt-oss-20b", "openai/gpt-4o"))
+    # print(
+    #     get_overlap_token_size(
+    #         "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
+    #     )
+    # )
     # print(basic_count("internlm_chat_7b"))
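For reference, a standalone sketch of the overlap logic this hunk touches, assuming toy vocabularies instead of `tokenizer_factory`: when one tokenizer exposes its vocab as `str` and the other as `bytes`, both sides are normalized to bytes before intersecting.

```python
# Sketch of the vocab-overlap computation (toy data; the real code pulls vocabs
# from tokenizer_factory). Mixed str/bytes vocabs are normalized to bytes first.
def overlap_token_size(vocab_1, vocab_2) -> int:
    set_1, set_2 = set(vocab_1), set(vocab_2)
    token1 = next(iter(set_1))
    token2 = next(iter(set_2))
    if type(token1) != type(token2):  # convert str to bytes so both sides compare equal
        if isinstance(token1, str):
            set_1 = {token.encode("utf-8") for token in set_1}
        if isinstance(token2, str):
            set_2 = {token.encode("utf-8") for token in set_2}
    return len(set_1 & set_2)


if __name__ == "__main__":
    print(overlap_token_size(["hello", "world"], [b"hello", b"bytes"]))  # -> 1
```

Normalizing toward bytes rather than str avoids having to guess an encoding for byte-level vocab entries that are not valid UTF-8.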
test/gpt4/gpt4_demo.py ADDED
@@ -0,0 +1,38 @@
+"""
+
+## What causes the unicode error in the tiktoken-based version?
+- gpt-4o:      {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1}    (wrong)
+- gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1}    (correct)
+"""
+
+import sys
+import pdb
+
+sys.path.append("../../")
+
+from vocab import tokenizer_factory
+from character_util import _decode_bytes_to_str
+
+
+
+tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
+tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")
+
+
+vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
+vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
+
+min_vocab_size = min(len(vocab_1), len(vocab_2))
+
+for i in range(min_vocab_size):
+    if i == 188:
+        import pdb; pdb.set_trace()
+        print(i)
+
+    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
+    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
+    token_str2 = _decode_bytes_to_str(token_str2)
+    if token_str1 != token_str2:
+        pdb.set_trace()
+        print(i, token_str1, token_str2)
+
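For context, a sketch of the GPT-2-style `bytes_to_unicode` table that byte-level BPE tokenizers use for their vocab strings (illustrative; the repo may obtain it differently). Non-printable bytes are shifted into the U+0100+ range, so raw byte 0x01 is stored as 'ā' (U+0101, decimal 257), matching the gpt-oss-20b row in the docstring above; a path that skips this mapping reports the raw control character "\u0001" instead, consistent with the gpt-4o row.

```python
# Sketch of the GPT-2-style byte-to-unicode mapping used by byte-level BPE vocabs.
def bytes_to_unicode() -> dict[int, str]:
    # printable byte ranges map to themselves
    bs = list(range(ord("!"), ord("~") + 1)) + \
         list(range(ord("¡"), ord("¬") + 1)) + \
         list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:      # control/whitespace bytes get shifted above U+0100
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))


if __name__ == "__main__":
    table = bytes_to_unicode()
    print(table[0x01], ord(table[0x01]))  # -> ā 257
```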