import gradio as gr
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
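# The tokenizer above is loaded once at import time so every request reuses it;
# AutoTokenizer.from_pretrained fetches the tokenizer files from the Hugging
# Face Hub on first run, so startup needs network access (or a local cache).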
def tokenize_dialogue(dialogue_data):
    """
    Tokenize the dialogue using the GPT-OSS tokenizer.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")

    # Map the Dialogue component's {"speaker", "text"} entries onto the
    # {"role", "content"} format expected by apply_chat_template; entries with
    # an unknown speaker are skipped.
    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        content = message.get("text", "")
        if role in ("system", "user", "assistant"):
            messages.append({"role": role, "content": content})

    # Apply the model's chat template and keep the raw token ids.
    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np",
    )
    token_ids = formatted_input[0].tolist()

    # Decode each token individually and give each distinct token id a color
    # so the tokens are visually separable in the HighlightedText output.
    decoded_text = []
    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
    color_map = {}
    for i, token_id in enumerate(token_ids):
        color = colors[i % len(colors)]
        if str(token_id) not in color_map:
            color_map[str(token_id)] = color
        decoded_text.append((tokenizer.decode([token_id]), str(token_id)))

    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)
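# For reference (illustrative, not authoritative): GPT-OSS uses the Harmony
# chat format, so apply_chat_template is expected to render each turn roughly
# as <|start|>{role}<|message|>{content}<|end|>, plus a trailing assistant
# generation prompt. The exact special tokens come from the tokenizer's chat
# template, so treat the HighlightedText output as the source of truth rather
# than this sketch.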
def create_sample_dialogue():
    """
    Create a sample dialogue for demonstration.
    """
    return [
        {"speaker": "system", "text": "You are a helpful assistant."},
        {"speaker": "user", "text": "Hello! How are you today?"},
        {"speaker": "assistant", "text": "I'm doing well, thank you for asking! How can I help you today?"},
        {"speaker": "user", "text": "Can you explain what MXFP4 quantization is?"},
    ]
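# The list returned by create_sample_dialogue already matches gr.Dialogue's
# value format ({"speaker": ..., "text": ...}), so sample_btn.click below can
# feed it straight into dialogue_input. A quick check outside the UI
# (illustrative; the exact count depends on the tokenizer version):
#   _, n_tokens = tokenize_dialogue(create_sample_dialogue())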
with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")
            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )
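            # Note: gr.Dialogue (and options such as ui_mode and
            # show_submit_button) is a recent addition, so this layout assumes
            # an up-to-date Gradio release.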
            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")
            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False,
            )
            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count",
            )
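    # highlighted_output above is a placeholder: each submit replaces it with
    # the HighlightedText instance returned by tokenize_dialogue, which carries
    # the per-token color_map.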
    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Use the dialogue component's submit button to tokenize the conversation
        4. **View results**: See the tokenization details in the output area

        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```

        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)
    def process_dialogue(dialogue):
        # Return one value per output component: (highlighted_output, token_count).
        if not dialogue:
            return gr.HighlightedText(value=[("Please enter some dialogue first.", None)]), "Total Tokens: 0"
        result_text, token_count_val = tokenize_dialogue(dialogue)
        return result_text, f"Total Tokens: {token_count_val}"
    def clear_dialogue():
        return None, [], "Total Tokens: 0"

    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input],
    )
    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count],
    )
    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count],
    )
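    # dialogue_input.submit fires when the user submits the dialogue (e.g. via
    # the submit button enabled above with show_submit_button=True).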
if __name__ == "__main__":
    demo.launch()