import gradio as gr
from transformers import AutoTokenizer
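
# Load the GPT-OSS tokenizer once at startup so every request reuses it.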
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")


def tokenize_dialogue(dialogue_data):
    """
    Tokenize the dialogue using the GPT-OSS tokenizer.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")
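
    # Convert the Dialogue component's {"speaker", "text"} entries into the
    # {"role", "content"} messages expected by apply_chat_template.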
    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        content = message.get("text", "")
        if role == "system":
            messages.append({"role": "system", "content": content})
        elif role == "user":
            messages.append({"role": "user", "content": content})
        elif role == "assistant":
            messages.append({"role": "assistant", "content": content})

    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np",
    )
    token_ids = formatted_input[0].tolist()
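
    # Decode each token individually and pair it with its id so the
    # HighlightedText component can show token boundaries in color.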
    decoded_text = []
    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
    color_map = {}
    for i, token_id in enumerate(token_ids):
        color = colors[i % len(colors)]
        # Keys are stringified token ids, matching the categories in decoded_text.
        if str(token_id) not in color_map:
            color_map[str(token_id)] = color
        decoded_text.append((tokenizer.decode([token_id]), str(token_id)))

    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)


def create_sample_dialogue():
    """
    Create a sample dialogue for demonstration.
    """
    return [
        {"speaker": "system", "text": "You are a helpful assistant."},
        {"speaker": "user", "text": "Hello! How are you today?"},
        {"speaker": "assistant", "text": "I'm doing well, thank you for asking! How can I help you today?"},
        {"speaker": "user", "text": "Can you explain what MXFP4 quantization is?"},
    ]
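

# Build the UI: dialogue input on the left, tokenization results on the right.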
with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown(
        "Enter a dialogue and see how the GPT-OSS tokenizer processes it. "
        "Use the format `speaker: message` in the dialogue component."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")
            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )
            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")
            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False,
            )
            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count",
            )

    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Press the dialogue component's submit button to tokenize the conversation
        4. **View results**: See the tokenization details in the output area

        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```

        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)

    # Event handlers for the dialogue submit and clear actions.
    def process_dialogue(dialogue):
        if not dialogue:
            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"
        result_text, token_count_val = tokenize_dialogue(dialogue)
        return result_text, f"Total Tokens: {token_count_val}"

    def clear_dialogue():
        return None, [], "Total Tokens: 0"
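
    # Wire the buttons and the dialogue submit action to their handlers.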
    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input],
    )
    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count],
    )
    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count],
    )

if __name__ == "__main__":
    demo.launch()