import gradio as gr
from transformers import AutoTokenizer
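
# NOTE: gr.Dialogue is a relatively new Gradio component; this script
# assumes a Gradio release that ships it.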

# Load the tokenizer once at startup. Guard the download so the app can
# surface a clear error in the UI instead of crashing on import if the
# model files are unavailable.
try:
    tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
except Exception:
    tokenizer = None


def tokenize_dialogue(dialogue_data):
    """
    Tokenize the dialogue using the GPT-OSS tokenizer
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")
    
    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        content = message.get("text", "")
        
        # Map the Dialogue component's speaker names directly onto chat
        # roles; entries with unknown speakers are skipped.
        if role in ("system", "user", "assistant"):
            messages.append({"role": role, "content": content})
    
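    # apply_chat_template renders the conversation in gpt-oss's chat
    # format (the "harmony" response format) and returns the token ids
    # as a NumPy array of shape (1, sequence_length).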
    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np"
    )
    
    token_ids = formatted_input[0].tolist()
    decoded_text = []
    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
    color_map = {}
    
    # Give each unique token id a stable color. HighlightedText expects
    # string category keys, so ids are stringified both here and in the
    # (text, category) pairs below.
    for i, token_id in enumerate(token_ids):
        key = str(token_id)
        if key not in color_map:
            color_map[key] = colors[i % len(colors)]
        decoded_text.append((tokenizer.decode([token_id]), key))
    
    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)

def create_sample_dialogue():
    """
    Create a sample dialogue for demonstration
    """
    return [
        {"speaker": "system", "text": "You are a helpful assistant."},
        {"speaker": "user", "text": "Hello! How are you today?"},
        {"speaker": "assistant", "text": "I'm doing well, thank you for asking! How can I help you today?"},
        {"speaker": "user", "text": "Can you explain what MXFP4 quantization is?"}
    ]

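# Build the UI: dialogue input on the left, tokenization results on the
# right, and usage notes in a collapsible accordion underneath.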
with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")
            
            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )
            
            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")
        
        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")
            
            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False
            )
            
            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count"
            )
    
    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Press the dialogue component's submit button to tokenize the conversation
        4. **View results**: See the tokenization details in the output area
        
        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```
        
        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)
    
    def process_dialogue(dialogue):
        # This handler feeds two outputs (highlighted_output, token_count),
        # so every branch must return exactly two values.
        if not dialogue:
            return None, "Total Tokens: 0"
        
        result_text, token_count_val = tokenize_dialogue(dialogue)
        
        return result_text, f"Total Tokens: {token_count_val}"
    
    def clear_dialogue():
        return None, [], "Total Tokens: 0"
    
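    # Wire the controls: the sample button fills the dialogue box, the
    # clear button resets all three components, and submitting the
    # dialogue runs the tokenizer.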
    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input]
    )
    
    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count]
    )
    
    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count]
    )

if __name__ == "__main__":
    demo.launch()