app.py CHANGED
@@ -11,7 +11,7 @@
 - theme toggle: light/dark
 - token_id/tokens/bytes toggle
 - add hover_text via javascript
--
+- i18n
 
 
 
@@ -36,9 +36,6 @@ import gradio as gr
 from vocab import all_tokenizers
 from util import *
 
-example_text = """Replace this text in the input field to see how tokenization works
-华为智能音箱发布:华为Sound X"""
-
 # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
 examples = [
     # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n,
@@ -53,6 +50,20 @@ def example_fn(example_idx):
     return examples[example_idx]
 
 
+"""Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为发布mate60 pro手机"""
+
+default_user_input = """Replace this text in the input field to see how tokenization works
+华为发布mate60 pro手机"""
+default_tokenizer_type_1 = "llama"
+default_tokenizer_type_2 = "internlm_chat_7b"
+default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
+default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
+default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
+default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
+default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
+
+
 
 with gr.Blocks(css="style.css") as demo:
     gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
@@ -76,7 +87,7 @@ with gr.Blocks(css="style.css") as demo:
     )
 
     user_input = gr.Textbox(
-        value=example_text,
+        value=default_user_input,
         label="Input Text",
         lines=5,
         show_label=False,
@@ -94,7 +105,7 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Group():
         tokenizer_type_1 = gr.Dropdown(
             all_tokenizers,
-            value="llama",
+            value=default_tokenizer_type_1,
             label="Tokenizer 1",
         )
     with gr.Group():
@@ -103,17 +114,19 @@ with gr.Blocks(css="style.css") as demo:
     """
     with gr.Row():
         stats_vocab_size_1 = gr.TextArea(
+            value=default_stats_vocab_size_1,
             label="VocabSize",
             lines=1,
             elem_classes="statistics"
         )
         stats_zh_token_size_1 = gr.TextArea(
-
+            value=default_stats_zh_token_size_1,
             label="ZH char/word",
             lines=1,
             elem_classes="statistics"
         )
         stats_overlap_token_size_1 = gr.TextArea(
+            value=default_stats_overlap_token_size,
             label="Overlap Tokens",
             lines=1,
             elem_classes="statistics"
@@ -137,12 +150,13 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Group():
         with gr.Row():
             stats_vocab_size_2 = gr.TextArea(
+                value=default_stats_vocab_size_2,
                 label="VocabSize",
                 lines=1,
                 elem_classes="statistics"
             )
             stats_zh_token_size_2 = gr.TextArea(  # count of Chinese chars/words,
-
+                value=default_stats_zh_token_size_2,
                 label="ZH char/word",
                 lines=1,
                 elem_classes="statistics"
@@ -153,6 +167,7 @@ with gr.Blocks(css="style.css") as demo:
             # elem_classes="statistics"
             # )
             stats_overlap_token_size_2 = gr.TextArea(
+                value=default_stats_overlap_token_size,
                 label="Overlap Tokens",
                 lines=1,
                 elem_classes="statistics"
@@ -162,12 +177,14 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Row():
         with gr.Column():
             output_text_1 = gr.Highlightedtext(
+                value=default_output_text_1,
                 label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
         with gr.Column():
             output_text_2 = gr.Highlightedtext(
+                value=default_output_text_2,
                 label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
@@ -175,11 +192,13 @@ with gr.Blocks(css="style.css") as demo:
 
     with gr.Row():
         output_table_1 = gr.Dataframe(
+            value=default_output_table_1,
            headers=["TokenID", "Byte", "Text"],
            datatype=["str", "str", "str"],
            # elem_classes="space-show",  # this css has no effect on the whole Dataframe, so cell-wrap is modified directly
         )
         output_table_2 = gr.Dataframe(
+            value=default_output_table_2,
            headers=["TokenID", "Token", "Text"],
            datatype=["str", "str", "str"],
         )
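The pattern in this app.py change: compute the demo's default values once at import time, then seed every component through its value= argument, so the Space renders fully populated on first paint instead of waiting for a load event to fire. Below is a minimal sketch of that pattern, assuming a stand-in whitespace tokenizer in place of the repo's basic_count/get_overlap_token_size/tokenize helpers; the i % 5 label cycling only mirrors the spirit of the real color_num=5 parameter.

import gradio as gr

def fake_tokenize(text):
    # Stand-in for util.tokenize(text, tokenizer_type, update=False):
    # returns (token, label) pairs that HighlightedText can render.
    return [(tok, str(i % 5)) for i, tok in enumerate(text.split())]

default_input = "Replace this text in the input field to see how tokenization works"
default_tokens = fake_tokenize(default_input)  # computed once, at import time

with gr.Blocks() as demo:
    user_input = gr.Textbox(value=default_input, lines=2, show_label=False)
    tokens_out = gr.HighlightedText(value=default_tokens, label="Tokens")
    # The handler recomputes on every edit; value= above covers the first paint.
    user_input.change(fake_tokenize, [user_input], [tokens_out])

if __name__ == "__main__":
    demo.launch()

The trade-off is that this work now runs during Space startup, but it avoids a flash of empty components and an extra round trip when the page opens.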
util.py CHANGED
@@ -9,7 +9,7 @@ from utils.zh_util import iter_vocab
 
 
 
-def tokenize(text, tokenizer_type, color_num=5):
+def tokenize(text, tokenizer_type, color_num=5, update=True):
     """
     TODO: cache tokenizer
     """
@@ -57,11 +57,14 @@ def tokenize(text, tokenizer_type, color_num=5):
     print(f"Tokenization[{tokenizer_type}]: {table}")
     # print(table_df)
 
-    return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
+    if update:
+        return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
+    else:
+        return pos_tokens, table_df
 
 
 def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
-    pos_tokens_1, table_df_1
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
     pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
     return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
 
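The update flag makes tokenize usable in two roles. As an event handler it returns gr.update(...), which lets the app refresh the output component's label with the token count, not just its value; at import time, however, app.py needs plain data to pass as a component's initial value=, where a gr.update object would not work, hence update=False. A simplified sketch of this dual-use pattern, assuming a single return value instead of the real (pos_tokens, table_df) pair and a stand-in whitespace tokenizer:

import gradio as gr

def tokenize(text, update=True):
    tokens = text.split()  # stand-in for the real tokenizer
    pos_tokens = [(tok, str(i % 5)) for i, tok in enumerate(tokens)]
    if update:
        # Handler path: gr.update can refresh the label along with the value.
        return gr.update(value=pos_tokens, label=f"Tokens: {len(tokens)}")
    # Import-time path: plain data, usable as HighlightedText(value=...).
    return pos_tokens

with gr.Blocks() as demo:
    box = gr.Textbox(value="hello tokenizer arena")
    out = gr.HighlightedText(value=tokenize("hello tokenizer arena", update=False))
    box.change(tokenize, [box], [out])  # update defaults to True here

if __name__ == "__main__":
    demo.launch()

tokenize_pair keeps the default update=True, which is consistent with it being wired up only as an event handler.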