app.py CHANGED
@@ -11,7 +11,7 @@
 - theme toggle: light/dark
 - token_id/tokens/bytes toggle
 - add hover_text via javascript
--
+- i18n
 
 
 
@@ -36,9 +36,6 @@ import gradio as gr
 from vocab import all_tokenizers
 from util import *
 
-example_text = """Replace this text in the input field to see how tokenization works
-华为智能音箱发布:华为Sound X"""
-
 # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
 examples = [
     # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n,
@@ -53,6 +50,20 @@ def example_fn(example_idx):
     return examples[example_idx]
 
 
+"""Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为发布mate60 pro手机"""
+
+default_user_input = """Replace this text in the input field to see how tokenization works
+华为发布mate60 pro手机"""
+default_tokenizer_type_1 = "llama"
+default_tokenizer_type_2 = "internlm_chat_7b"
+default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
+default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
+default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
+default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
+default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
+
+
 
 with gr.Blocks(css="style.css") as demo:
     gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
@@ -76,7 +87,7 @@ with gr.Blocks(css="style.css") as demo:
     )
 
     user_input = gr.Textbox(
-        value=example_text,
+        value=default_user_input,
         label="Input Text",
         lines=5,
         show_label=False,
@@ -94,7 +105,7 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Group():
         tokenizer_type_1 = gr.Dropdown(
             all_tokenizers,
-            value="llama",
+            value=default_tokenizer_type_1,
             label="Tokenizer 1",
         )
     with gr.Group():
@@ -103,17 +114,19 @@ with gr.Blocks(css="style.css") as demo:
     """
     with gr.Row():
         stats_vocab_size_1 = gr.TextArea(
+            value=default_stats_vocab_size_1,
             label="VocabSize",
             lines=1,
             elem_classes="statistics"
         )
         stats_zh_token_size_1 = gr.TextArea(
-
+            value=default_stats_zh_token_size_1,
             label="ZH char/word",
             lines=1,
             elem_classes="statistics"
         )
         stats_overlap_token_size_1 = gr.TextArea(
+            value=default_stats_overlap_token_size,
             label="Overlap Tokens",
             lines=1,
             elem_classes="statistics"
@@ -137,12 +150,13 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Group():
         with gr.Row():
             stats_vocab_size_2 = gr.TextArea(
+                value=default_stats_vocab_size_2,
                 label="VocabSize",
                 lines=1,
                 elem_classes="statistics"
             )
             stats_zh_token_size_2 = gr.TextArea(  # count of Chinese chars/words,
-
+                value=default_stats_zh_token_size_2,
                 label="ZH char/word",
                 lines=1,
                 elem_classes="statistics"
@@ -153,6 +167,7 @@ with gr.Blocks(css="style.css") as demo:
             # elem_classes="statistics"
             # )
             stats_overlap_token_size_2 = gr.TextArea(
+                value=default_stats_overlap_token_size,
                 label="Overlap Tokens",
                 lines=1,
                 elem_classes="statistics"
@@ -162,12 +177,14 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Row():
         with gr.Column():
             output_text_1 = gr.Highlightedtext(
+                value=default_output_text_1,
                 label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
         with gr.Column():
             output_text_2 = gr.Highlightedtext(
+                value=default_output_text_2,
                 label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
@@ -175,11 +192,13 @@ with gr.Blocks(css="style.css") as demo:
 
     with gr.Row():
         output_table_1 = gr.Dataframe(
+            value=default_output_table_1,
            headers=["TokenID", "Byte", "Text"],
            datatype=["str", "str", "str"],
            # elem_classes="space-show",  # this css has no effect on the whole Dataframe, so cell-wrap is modified directly
         )
         output_table_2 = gr.Dataframe(
+            value=default_output_table_2,
            headers=["TokenID", "Token", "Text"],
            datatype=["str", "str", "str"],
         )
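The pattern in this app.py change: compute the demo's default values once at import time, then seed every component through its value= argument, so the Space renders fully populated on first paint instead of waiting for a load event to fire. Below is a minimal sketch of that pattern, assuming a stand-in whitespace tokenizer in place of the repo's basic_count/get_overlap_token_size/tokenize helpers; the i % 5 label cycling only mirrors the spirit of the real color_num=5 parameter.

import gradio as gr

def fake_tokenize(text):
    # Stand-in for util.tokenize(text, tokenizer_type, update=False):
    # returns (token, label) pairs that HighlightedText can render.
    return [(tok, str(i % 5)) for i, tok in enumerate(text.split())]

default_input = "Replace this text in the input field to see how tokenization works"
default_tokens = fake_tokenize(default_input)  # computed once, at import time

with gr.Blocks() as demo:
    user_input = gr.Textbox(value=default_input, lines=2, show_label=False)
    tokens_out = gr.HighlightedText(value=default_tokens, label="Tokens")
    # The handler recomputes on every edit; value= above covers the first paint.
    user_input.change(fake_tokenize, [user_input], [tokens_out])

if __name__ == "__main__":
    demo.launch()

The trade-off is that this work now runs during Space startup, but it avoids a flash of empty components and an extra round trip when the page opens.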
util.py CHANGED
@@ -9,7 +9,7 @@ from utils.zh_util import iter_vocab
 
 
 
-def tokenize(text, tokenizer_type, color_num=5):
+def tokenize(text, tokenizer_type, color_num=5, update=True):
     """
     TODO: cache tokenizer
     """
@@ -57,11 +57,14 @@ def tokenize(text, tokenizer_type, color_num=5):
     print(f"Tokenization[{tokenizer_type}]: {table}")
     # print(table_df)
 
-    return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
+    if update:
+        return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
+    else:
+        return pos_tokens, table_df
 
 
 def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
-    pos_tokens_1, table_df_1
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
     pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
     return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
 
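The update flag makes tokenize usable in two roles. As an event handler it returns gr.update(...), which lets the app refresh the output component's label with the token count, not just its value; at import time, however, app.py needs plain data to pass as a component's initial value=, where a gr.update object would not work, hence update=False. A simplified sketch of this dual-use pattern, assuming a single return value instead of the real (pos_tokens, table_df) pair and a stand-in whitespace tokenizer:

import gradio as gr

def tokenize(text, update=True):
    tokens = text.split()  # stand-in for the real tokenizer
    pos_tokens = [(tok, str(i % 5)) for i, tok in enumerate(tokens)]
    if update:
        # Handler path: gr.update can refresh the label along with the value.
        return gr.update(value=pos_tokens, label=f"Tokens: {len(tokens)}")
    # Import-time path: plain data, usable as HighlightedText(value=...).
    return pos_tokens

with gr.Blocks() as demo:
    box = gr.Textbox(value="hello tokenizer arena")
    out = gr.HighlightedText(value=tokenize("hello tokenizer arena", update=False))
    box.change(tokenize, [box], [out])  # update defaults to True here

if __name__ == "__main__":
    demo.launch()

tokenize_pair keeps the default update=True, which is consistent with it being wired up only as an event handler.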