"""Gradio app that shows how a Hugging Face tokenizer splits a piece of text."""

import colorsys
import html
import random  # unused in this file; kept so existing imports are not removed
from typing import Optional

import gradio as gr
from transformers import AutoTokenizer


def get_distinct_colors(n: int) -> list[str]:
    """Return ``n`` hex color strings whose hues are evenly spaced.

    Hue steps through ``i / n`` for ``i`` in ``range(n)`` while saturation
    (0.6) and value (0.7) stay fixed. ``n == 0`` yields an empty list (the
    division is never reached).

    Args:
        n: Number of colors to generate.

    Returns:
        A list of ``"#rrggbb"`` strings of length ``n``.
    """
    saturation = 0.6
    value = 0.7
    colors = []
    for i in range(n):
        r, g, b = colorsys.hsv_to_rgb(i / n, saturation, value)
        colors.append(
            "#{:02x}{:02x}{:02x}".format(int(r * 255), int(g * 255), int(b * 255))
        )
    return colors


def _render_token(raw_token: str, color: str) -> str:
    """Return one token as an HTML-escaped ``<span>`` with a colored background."""
    # 'Ġ' is stripped from the token before display, as in the original code.
    display_token = html.escape(raw_token.replace('Ġ', ''))
    return (
        f'<span style="background-color: {color}; color: white; '
        f'padding: 0 2px; margin: 0 1px; border-radius: 3px;">'
        f'{display_token}</span>'
    )


def tokenize_text(hf_model_id: str, text: str, token: Optional[str] = None):
    """Tokenize ``text`` with the tokenizer of ``hf_model_id`` and render it.

    Args:
        hf_model_id: Model ID passed to ``AutoTokenizer.from_pretrained``.
        text: The text to tokenize.
        token: Optional Hugging Face access token for gated models. An empty
            string (what an empty textbox produces) is treated as "no token".

    Returns:
        A ``(token_count, html)`` tuple. On success ``html`` contains one
        colored ``<span>`` per token. On failure the count is ``0`` and
        ``html`` holds the escaped ``"Error: ..."`` message, so the numeric
        output never receives a non-numeric string.
    """
    try:
        # ``token=`` is the credential keyword of ``from_pretrained``; the
        # previous ``access_token=`` keyword was not used for authentication.
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=token or None)
        tokens = tokenizer.tokenize(text)
    except Exception as e:
        # UI boundary: report any loading/tokenizing failure to the user
        # instead of crashing. Escape it because the message is shown as HTML
        # and may echo user input such as the model ID.
        return 0, html.escape(f"Error: {e}")

    token_count = len(tokens)
    colors = get_distinct_colors(token_count)
    tokenized_text = "".join(
        _render_token(raw_token, color) for raw_token, color in zip(tokens, colors)
    )
    return token_count, tokenized_text


demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(
            label="Hugging Face Model ID",
            placeholder="unsloth/gemma-3-27b-it",
            value="unsloth/gemma-3-27b-it",
        ),
        gr.Textbox(
            label="Text to Tokenize",
            lines=5,
            placeholder="Enter your text here...",
        ),
        gr.Textbox(
            label="HuggingFace Token (optional)",
            placeholder="hf_...",
            lines=1,
        ),
    ],
    outputs=[
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokens", container=True, show_label=True),
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized. Provide a huggingface token if the model is gated.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # Only start the server when run as a script, so importing this module
    # (e.g. to reuse ``tokenize_text`` or ``demo``) has no side effects.
    demo.launch()