freddyaboulton HF Staff committed on
Commit
4a104a5
·
verified ·
1 Parent(s): 52971db

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+
4
# Loaded once at import time; tokenize_dialogue() reads this module-level
# object on every call instead of re-creating it.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
5
+
6
+
7
def tokenize_dialogue(dialogue_data):
    """
    Tokenize the dialogue using the GPT-OSS tokenizer.

    Args:
        dialogue_data: Iterable of dict-like messages. Each entry is read via
            ``.get("speaker", "user")`` and ``.get("text", "")``. Entries whose
            speaker is not ``system``, ``user`` or ``assistant`` are skipped.

    Returns:
        A 2-tuple ``(highlighted, n_tokens)`` where ``highlighted`` is a
        ``gr.HighlightedText`` whose value is a list of
        ``(decoded_token, token_id_as_str)`` pairs plus a matching
        ``color_map``, and ``n_tokens`` is the number of token ids produced
        by the chat template.

    Raises:
        ValueError: If the module-level ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")

    known_roles = ("system", "user", "assistant")
    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        # Unknown speakers are dropped rather than forwarded to the template.
        if role in known_roles:
            messages.append({"role": role, "content": message.get("text", "")})

    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np",
    )

    # The first row of the returned array holds the ids for this conversation.
    token_ids = formatted_input[0].tolist()

    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
    decoded_text = []
    color_map = {}

    for i, token_id in enumerate(token_ids):
        label = str(token_id)
        # color_map is keyed by the *string* label, so the membership test
        # must use the same key. Testing the int id never matched, which let
        # every repeat of a token overwrite the colour chosen for it earlier.
        if label not in color_map:
            color_map[label] = colors[i % len(colors)]
        decoded_text.append((tokenizer.decode([token_id]), label))

    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)
46
+
47
def create_sample_dialogue():
    """
    Build the demonstration conversation.

    Returns:
        A new list of four ``{"speaker": ..., "text": ...}`` dicts: one system
        turn, then user / assistant / user turns.
    """
    turns = (
        ("system", "You are a helpful assistant."),
        ("user", "Hello! How are you today?"),
        ("assistant", "I'm doing well, thank you for asking! How can I help you today?"),
        ("user", "Can you explain what MXFP4 quantization is?"),
    )
    return [{"speaker": who, "text": said} for who, said in turns]
57
+
58
# UI definition for the tokenizer explorer: a dialogue input on the left,
# tokenization results on the right, and a collapsible help section.
# NOTE(review): the source this was recovered from had its indentation
# flattened, so the nesting of rows/columns/accordion below is reconstructed
# from the order of the statements — confirm against the running app.
with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")

            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )

            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")

            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False,
            )

            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count",
            )

    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Click the dialogue's submit button to process the conversation
        4. **View results**: See the tokenization details in the output area

        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```

        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)

    def process_dialogue(dialogue):
        """
        Submit handler: tokenize the dialogue and format the token count.

        Returns exactly two values, one per output component wired to the
        submit event (highlighted text, token-count label).
        """
        if not dialogue:
            # This branch used to return three values for two outputs, which
            # does not match the event's output list. The prompt is shown as
            # unlabelled text in the highlighted output instead.
            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"

        result_text, token_count_val = tokenize_dialogue(dialogue)

        return result_text, f"Total Tokens: {token_count_val}"

    def clear_dialogue():
        """Reset the dialogue input, the highlighted output and the counter."""
        return None, [], "Total Tokens: 0"

    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input],
    )

    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count],
    )

    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count],
    )
139
+
140
# Start the app only when this file is executed as a script, so importing
# the module does not launch it.
if __name__ == "__main__":
    demo.launch()