File size: 1,908 Bytes
e2d65f6 72f967c e2d65f6 72f967c e2d65f6 8eaaa82 4b7a78a b1f0cdb e2d65f6 8eaaa82 e2d65f6 b1f0cdb e2d65f6 9299dbb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import gradio as gr
from transformers import AutoTokenizer
import random
import colorsys
import html
def get_distinct_colors(n, saturation=0.6, value=0.7):
    """Return ``n`` hex color strings whose hues are evenly spaced.

    The hue of color ``i`` is ``i / n``; every color shares the same HSV
    saturation and value. The defaults reproduce the values that were
    previously hard-coded, so ``get_distinct_colors(n)`` is unchanged.

    Args:
        n: Number of colors to generate. ``0`` (or a negative number)
            yields an empty list.
        saturation: HSV saturation in ``[0, 1]`` applied to every color.
        value: HSV value (brightness) in ``[0, 1]`` applied to every color.

    Returns:
        A list of ``n`` strings of the form ``"#rrggbb"`` (lowercase hex).
    """
    colors = []
    for i in range(n):
        # range(n) is empty when n <= 0, so the division below never sees n == 0.
        r, g, b = colorsys.hsv_to_rgb(i / n, saturation, value)
        # Channels are floats in [0, 1]; scale to 0-255 and truncate to int.
        colors.append(f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}")
    return colors
def tokenize_text(hf_model_id, text, token=None):
    """Tokenize ``text`` with a Hugging Face tokenizer and render it as HTML.

    Args:
        hf_model_id: Model repository ID (or path) handed to
            ``AutoTokenizer.from_pretrained``.
        text: The text to tokenize.
        token: Optional Hugging Face access token for gated models. An
            empty or whitespace-only string is treated as "no token".

    Returns:
        A ``(token_count, html)`` tuple. ``html`` contains one colored
        ``<span>`` per token. If anything fails, the tuple is
        ``(0, html)`` where ``html`` is an escaped error message, so the
        numeric output always receives a number.
    """
    # A text field left blank submits "" rather than None; normalize it so
    # no bogus empty credential is sent.
    auth_token = token.strip() if token else None
    try:
        # `token` is the keyword from_pretrained uses for authentication.
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=auth_token or None)
        pieces = tokenizer.tokenize(text)
        colors = get_distinct_colors(len(pieces))

        spans = []
        for color, piece in zip(colors, pieces):
            # Make whitespace markers visible: 'Ġ' is used by byte-level BPE
            # vocabularies, '▁' (U+2581) by SentencePiece vocabularies.
            shown = piece.replace('Ġ', '<space>').replace('▁', '<space>')
            # Escape after substitution so "<space>" is displayed literally
            # and token text can never inject markup.
            shown = html.escape(shown)
            spans.append(f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; margin: 2px; display: inline-block;">{shown}</span>')

        return len(pieces), "".join(spans)
    except Exception as e:
        # UI boundary: report the failure in the HTML output instead of
        # returning a string where a number is expected.
        return 0, f'<span style="color: red;">Error: {html.escape(str(e))}</span>'
# Gradio UI: three text inputs (model ID, text, optional access token) feed
# tokenize_text, whose two return values fill the token count and the
# colored-token HTML view, in that order.
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Hugging Face Model ID", placeholder="unsloth/gemma-3-27b-it", value="unsloth/gemma-3-27b-it"),
        gr.Textbox(label="Text to Tokenize", lines=5, placeholder="Enter your text here..."),
        gr.Textbox(label="HuggingFace Token (optional)", placeholder="hf_...", lines=1),
    ],
    outputs=[
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokens", container=True, show_label=True),
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized. Provide a huggingface token if the model is gated.",
    allow_flagging="never",
)

# Start the web server for the interface.
demo.launch()