import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import spaces
import re


MODELS = {
    "20B": "openai/gpt-oss-20b",
    "120B": "openai/gpt-oss-120b",
}
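

# `@spaces.GPU` runs the decorated function on an on-demand ZeroGPU worker, so
# all CUDA work (including the model load) must happen inside it rather than
# at import time.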
@spaces.GPU
def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
    """Generate a response using ZeroGPU - all CUDA operations happen here."""
    print(f"🚀 Loading {model_id}...")
    start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # Many causal LMs ship without a pad token; fall back to EOS so padding works.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    load_time = time.time() - start_time
    print(f"✅ Model loaded in {load_time:.2f}s")

    messages = []
    system_prompt = (
        "You are GPT, a helpful, harmless, and honest AI assistant. "
        "You provide clear, accurate, and concise responses to user questions. "
        "You are knowledgeable across many domains and always aim to be respectful and helpful."
    )
    messages.append({"role": "system", "content": system_prompt})

    # Replay the prior turns, then append the new user message.
    for msg in conversation:
        messages.append(msg)
    messages.append({"role": "user", "content": user_message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
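
    # The rendered prompt (for gpt-oss, the tokenizer's chat template applies
    # the "harmony" format) is tokenized and moved to whichever device the
    # model was placed on by device_map="auto".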
    inputs = tokenizer(prompt, return_tensors="pt")
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    generation_start = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    generation_time = time.time() - generation_start
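    # `generate` returns prompt + completion; slice off the prompt tokens so
    # only the newly generated text is decoded.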
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[-1]:],
        skip_special_tokens=True
    ).strip()
    print(f"Generation time: {generation_time:.2f}s")
    return response, load_time, generation_time
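

# Example of a direct call (hypothetical; in the app this is invoked from
# chat_submit, and @spaces.GPU attaches a GPU for the duration of the call):
#   text, t_load, t_gen = generate_response(MODELS["20B"], [], "Hello!", 128, 0.7)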


def format_response_with_thinking(response):
    """Render a <think>...</think> span in the response as a collapsible block."""
    if '<think>' in response and '</think>' in response:
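        # Split the text into: everything before <think>, the reasoning inside
        # it, and the visible answer after it (re.DOTALL lets parts span newlines).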
        pattern = r'(.*?)(<think>(.*?)</think>)(.*)'
        match = re.search(pattern, response, re.DOTALL)
        if match:
            before_thinking = match.group(1).strip()
            thinking_content = match.group(3).strip()
            after_thinking = match.group(4).strip()

            html = f"{before_thinking}\n"
            html += '<div class="thinking-container">'
            html += ('<button class="thinking-toggle" '
                     'onclick="this.nextElementSibling.classList.toggle(\'hidden\'); '
                     'this.textContent = this.textContent === \'Show reasoning\' '
                     '? \'Hide reasoning\' : \'Show reasoning\'">Show reasoning</button>')
            html += f'<div class="thinking-content hidden">{thinking_content}</div>'
            html += '</div>\n'
            html += after_thinking
            return html

    # No parseable think block: return the response unchanged.
    return response
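

# Example: format_response_with_thinking("<think>Recall the capital.</think>Paris.")
# returns "Paris." preceded by a "Show reasoning" toggle whose hidden panel
# contains "Recall the capital.".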


def chat_submit(message, history, conversation_state, model_name, max_length, temperature):
    """Process a new message and update the chat history."""
    if not message.strip():
        return "", history, conversation_state

    # Fall back to the 20B model if the dropdown value is unrecognized.
    model_id = MODELS.get(model_name, MODELS["20B"])
    try:
        print(f"Processing message: {message}")
        print(f"Selected model: {model_name} ({model_id})")

        response, load_time, generation_time = generate_response(
            model_id, conversation_state, message, max_length, temperature
        )

        conversation_state.append({"role": "user", "content": message})
        conversation_state.append({"role": "assistant", "content": response})
        formatted_response = format_response_with_thinking(response)
        history.append((message, formatted_response))
        print(f"Response added to history. Current length: {len(history)}")

        return "", history, conversation_state
    except Exception as e:
        import traceback
        print(f"Error in chat_submit: {str(e)}")
        print(traceback.format_exc())
        error_message = f"Error: {str(e)}"
        history.append((message, error_message))
        return "", history, conversation_state


css = """
.message {
    padding: 10px;
    margin: 5px;
    border-radius: 10px;
}
.thinking-container {
    margin: 10px 0;
}
.thinking-toggle {
    background-color: #f1f1f1;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 5px 10px;
    cursor: pointer;
    font-size: 0.9em;
    margin-bottom: 5px;
    color: #555;
}
.thinking-content {
    background-color: #f9f9f9;
    border-left: 3px solid #ccc;
    padding: 10px;
    margin-top: 5px;
    font-size: 0.95em;
    color: #555;
    font-family: monospace;
    white-space: pre-wrap;
    overflow-x: auto;
}
.hidden {
    display: none;
}
"""


with gr.Blocks(title="GPT-OSS Playground Chat", css=css) as demo:
    gr.Markdown("# 🚀 GPT-OSS Playground Chat")
    gr.Markdown("*Powered by Hugging Face ZeroGPU*")

    # Raw message dicts for the model, kept separate from the chatbot's
    # formatted display history.
    conversation_state = gr.State([])

    chatbot = gr.Chatbot(height=500, label="Athena", render_markdown=True)

    with gr.Row():
        user_input = gr.Textbox(label="Your message", scale=8, autofocus=True, placeholder="Type your message here...")
        send_btn = gr.Button(value="Send", scale=1, variant="primary")

    clear_btn = gr.Button("Clear Conversation")

    gr.Markdown("### ⚙️ Model & Generation Settings")
    with gr.Row():
        model_choice = gr.Dropdown(
            label="📱 Model",
            choices=list(MODELS.keys()),
            value="20B",
            info="Select which Athena model to use"
        )
        max_length = gr.Slider(
            32, 8192, value=512,
            label="📏 Max Tokens",
            info="Maximum number of tokens to generate"
        )
        temperature = gr.Slider(
            0.1, 2.0, value=0.7,
            label="🎨 Creativity",
            info="Higher values = more creative responses"
        )

    def clear_conversation():
        """Clear both the visible chat and the raw conversation state."""
        return [], []

    # Wire the same handler to Enter in the textbox and to the Send button.
    user_input.submit(
        chat_submit,
        inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
        outputs=[user_input, chatbot, conversation_state]
    )
    send_btn.click(
        chat_submit,
        inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
        outputs=[user_input, chatbot, conversation_state]
    )
    clear_btn.click(clear_conversation, outputs=[chatbot, conversation_state])

    gr.Examples(
        examples=[
            "What is artificial intelligence?",
            "Can you explain quantum computing?",
            "Write a short poem about technology",
            "What are some ethical concerns about AI?"
        ],
        inputs=[user_input]
    )


if __name__ == "__main__":
    demo.launch(debug=True)