import gradio as gr
from transformers import AutoTokenizer
import random
import colorsys
import html

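# Generate n visually distinct colors by spacing hues evenly around the HSV wheel.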
def get_distinct_colors(n):
    colors = []
    for i in range(n):
        h = i / n
        s = 0.6
        v = 0.7
        r, g, b = colorsys.hsv_to_rgb(h, s, v)
        color = "#{:02x}{:02x}{:02x}".format(int(r*255), int(g*255), int(b*255))
        colors.append(color)
    return colors

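# Tokenize the input text with the given model's tokenizer and wrap each token in a colored HTML span.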
def tokenize_text(hf_model_id, text, token=None):
    try:
        # from_pretrained expects the access token via `token=`; pass None when the field is left empty.
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=token or None)
        tokens = tokenizer.tokenize(text)
        token_count = len(tokens)
        colors = get_distinct_colors(token_count)
        colored_tokens = []
        for i, tok in enumerate(tokens):  # `tok` avoids shadowing the access-token parameter
            # Make word-boundary markers visible: 'Ġ' (byte-level BPE) and '▁' (SentencePiece),
            # then escape so the marker text renders literally inside the HTML span.
            display_token = tok.replace('Ġ', '<space>').replace('▁', '<space>')
            display_token = html.escape(display_token)
            colored_tokens.append(f'<span style="background-color: {colors[i]}; color: white; padding: 2px 4px; border-radius: 3px; margin: 2px; display: inline-block;">{display_token}</span>')
        tokenized_text = "".join(colored_tokens)
        return token_count, tokenized_text
    except Exception as e:
        # Return a numeric count so the gr.Number output stays valid, and surface the error in the HTML output.
        return 0, f"Error: {html.escape(str(e))}"

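# Gradio UI: model ID, text, and optional access token in; token count and colored token spans out.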
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Hugging Face Model ID", placeholder="unsloth/gemma-3-27b-it", value="unsloth/gemma-3-27b-it"),
        gr.Textbox(label="Text to Tokenize", lines=5, placeholder="Enter your text here..."),
        gr.Textbox(label="HuggingFace Token (optional)", placeholder="hf_...", lines=1)
    ],
    outputs=[
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokens", container=True, show_label=True)
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized. Provide a huggingface token if the model is gated.",
    allow_flagging="never"
)

demo.launch()