Spaces:

merterbak
/

gpt-oss-20b-demo

Running on Zero

App Files Files Community

gpt-oss-20b-demo / app.py

merterbak

Update app.py

dc6c5d6 verified 14 days ago

raw

history blame

3.49 kB

	from transformers import pipeline, TextIteratorStreamer
	import torch
	from threading import Thread
	import gradio as gr
	import spaces
	import re

	# Hugging Face Hub identifier of the checkpoint this demo serves.
	model_id = "openai/gpt-oss-20b"

	# Single shared text-generation pipeline, created once at import time and
	# reused by every request. Dtype and device placement are both left on
	# "auto" so the library chooses them.
	pipe = pipeline("text-generation", model=model_id, torch_dtype="auto", device_map="auto")

	def format_conversation_history(chat_history):
	    """Convert chat history items into plain ``role``/``content`` messages.

	    Args:
	        chat_history: Iterable of mappings that each provide ``"role"`` and
	            ``"content"`` keys. ``content`` may be any non-list value, which
	            is passed through unchanged, or a list of content parts.

	    Returns:
	        A new list of ``{"role": ..., "content": ...}`` dicts; any other
	        keys present on the history items are not copied.

	    Raises:
	        KeyError: If an item lacks ``"role"`` or ``"content"``.
	    """
	    messages = []
	    for item in chat_history:
	        role = item["role"]
	        content = item["content"]
	        if isinstance(content, list):
	            # Only dict parts carrying a string under "text" contribute.
	            # Checking the part type first avoids treating a bare string
	            # part as a mapping (substring test followed by a TypeError).
	            texts = [
	                part["text"]
	                for part in content
	                if isinstance(part, dict) and isinstance(part.get("text"), str)
	            ]
	            # Keep every text part rather than only the first one; when
	            # there is no text at all, fall back to the list's repr.
	            content = "\n".join(texts) if texts else str(content)
	        messages.append({"role": role, "content": content})
	    return messages

	@spaces.GPU(duration=60)
	def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
	    """Stream the model's reply to ``input_data`` as a growing string.

	    The conversation (optional system prompt, prior history, new user
	    message) is handed to the module-level ``pipe`` on a background thread
	    while this generator relays text from a ``TextIteratorStreamer``.

	    Args:
	        input_data: Content of the new user message.
	        chat_history: Prior turns; normalised with
	            ``format_conversation_history``.
	        max_new_tokens: Forwarded to the pipeline as ``max_new_tokens``.
	        system_prompt: Prepended as a ``system`` message when truthy.
	        temperature: Forwarded to the pipeline as ``temperature``.
	        top_p: Forwarded to the pipeline as ``top_p``.
	        top_k: Forwarded to the pipeline as ``top_k``.
	        repetition_penalty: Forwarded to the pipeline as
	            ``repetition_penalty``.

	    Yields:
	        The complete response accumulated so far (not a delta). Text is
	        released only up to the last whitespace character received, so a
	        partially streamed word is held back until it is complete; any
	        remainder is flushed when the stream ends.

	    Raises:
	        Exception: Whatever the pipeline call raised on the worker thread,
	            re-raised here after the text streamed so far has been yielded.
	    """
	    messages = []
	    if system_prompt:
	        messages.append({"role": "system", "content": system_prompt})
	    messages.extend(format_conversation_history(chat_history))
	    messages.append({"role": "user", "content": input_data})

	    streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
	    generation_kwargs = {
	        "max_new_tokens": max_new_tokens,
	        "do_sample": True,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": top_k,
	        "repetition_penalty": repetition_penalty,
	        "streamer": streamer,
	    }

	    worker_errors = []

	    def run_pipeline():
	        # The streamer is created without a timeout, so if the pipeline
	        # call fails nothing would ever signal end-of-stream and the
	        # consumer loop below would wait forever. Record the error and
	        # close the stream explicitly instead.
	        try:
	            pipe(messages, **generation_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    thread = Thread(target=run_pipeline)
	    thread.start()

	    # Matches the run of non-whitespace characters at the very end of the
	    # buffer (possibly empty); its start is where the unfinished word begins.
	    trailing_word = re.compile(r"\S*\Z")
	    full_response = ""
	    buffer = ""
	    for chunk in streamer:
	        buffer += chunk
	        cut = trailing_word.search(buffer).start()
	        if cut:
	            # Release everything up to and including the last whitespace;
	            # keep the incomplete trailing word for the next chunk.
	            full_response += buffer[:cut]
	            buffer = buffer[cut:]
	            yield full_response
	    if buffer:
	        full_response += buffer
	        yield full_response

	    # The stream has ended, so the worker is finishing; wait for it so any
	    # failure it recorded is surfaced rather than silently dropped.
	    thread.join()
	    if worker_errors:
	        raise worker_errors[0]

	# Chat UI wired to generate_response. The additional inputs are passed to
	# the callback positionally after the message and history, so their order
	# here must match the callback's parameters: max_new_tokens, system_prompt,
	# temperature, top_p, top_k, repetition_penalty.
	demo = gr.ChatInterface(
	    fn=generate_response,
	    additional_inputs=[
	        gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
	        gr.Textbox(
	            label="System Prompt",
	            value="You are a helpful assistant. Reasoning: medium",
	            lines=4,
	            placeholder="Change system prompt",
	        ),
	        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
	        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
	        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
	        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0),
	    ],
	    # Each example supplies only the message; the additional inputs keep
	    # their default values.
	    examples=[
	        [{"text": "Explain Newton laws clearly and concisely"}],
	        [{"text": "Write a Python function to calculate the Fibonacci sequence"}],
	        [{"text": "What are the benefits of open weight AI models"}],
	    ],
	    cache_examples=False,
	    type="messages",
	    description="""
	# gpt-oss-20b
	Wait a couple of seconds initially. You can adjust the reasoning level in the system prompt, e.g. "Reasoning: high".
	""",
	    fill_height=True,
	    textbox=gr.Textbox(
	        label="Query Input",
	        placeholder="Type your prompt",
	    ),
	    stop_btn="Stop Generation",
	    multimodal=False,
	    theme=gr.themes.Soft(),
	)

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch(share=True)