"""Gradio demo that shows how the GPT-OSS tokenizer splits a chat dialogue."""

import gradio as gr
from transformers import AutoTokenizer

# Loaded once at import time so every request reuses the same tokenizer.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

# Speakers that are forwarded to the chat template; anything else is skipped.
_CHAT_ROLES = ("system", "user", "assistant")

# Palette cycled over token positions when colouring the highlighted output.
_TOKEN_COLORS = ("#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7")


def tokenize_dialogue(dialogue_data):
    """
    Tokenize the dialogue using the GPT-OSS tokenizer.

    Args:
        dialogue_data: Iterable of dicts. The "speaker" key (default "user")
            selects the chat role and the "text" key (default "") supplies
            the message content. Entries whose speaker is not one of
            system/user/assistant are ignored.

    Returns:
        A ``(highlighted, n_tokens)`` tuple: a ``gr.HighlightedText`` whose
        value is a list of ``(decoded_token, token_id_as_str)`` pairs with a
        matching colour map, and the total number of token ids.

    Raises:
        ValueError: If the module-level ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")

    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        if role in _CHAT_ROLES:
            messages.append({"role": role, "content": message.get("text", "")})

    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np",
    )
    # Index 0 selects the single conversation from the returned batch.
    token_ids = formatted_input[0].tolist()

    decoded_text = []
    color_map = {}
    for i, token_id in enumerate(token_ids):
        label = str(token_id)
        # Keys are the *string* ids (they double as highlight categories), so
        # the membership test must use the string too. setdefault keeps the
        # colour chosen at a token id's first occurrence.
        color_map.setdefault(label, _TOKEN_COLORS[i % len(_TOKEN_COLORS)])
        decoded_text.append((tokenizer.decode([token_id]), label))

    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)


def create_sample_dialogue():
    """
    Create a sample dialogue for demonstration.

    Returns:
        A list of ``{"speaker": ..., "text": ...}`` dicts in the same shape
        that ``tokenize_dialogue`` consumes.
    """
    return [
        {"speaker": "system", "text": "You are a helpful assistant."},
        {"speaker": "user", "text": "Hello! How are you today?"},
        {
            "speaker": "assistant",
            "text": "I'm doing well, thank you for asking! How can I help you today?",
        },
        {"speaker": "user", "text": "Can you explain what MXFP4 quantization is?"},
    ]


with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown(
        "Enter a dialogue and see how the GPT-OSS tokenizer processes it. "
        "Use the format `speaker: message` in the dialogue component."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")

            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )

            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")

            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False,
            )

            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count",
            )

    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Click the submit button in the dialogue component to process the conversation
        4. **View results**: See the tokenization details in the output area

        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```

        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)

    def process_dialogue(dialogue):
        """Tokenize the submitted dialogue for the two result components.

        Always returns exactly two values (highlighted output, token-count
        label) because the submit event below is wired to two outputs.
        """
        if not dialogue:
            # Show the hint as a single unlabelled span in the highlight box.
            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"

        result_text, token_count_val = tokenize_dialogue(dialogue)
        return result_text, f"Total Tokens: {token_count_val}"

    def clear_dialogue():
        """Reset the dialogue input, the highlighted output and the counter."""
        return None, [], "Total Tokens: 0"

    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input],
    )

    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count],
    )

    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count],
    )

if __name__ == "__main__":
    demo.launch()