TEST

Sleeping

App Files Files Community

TEST / app.py

Reality123b

Update app.py

691f69e verified 8 months ago

raw

history blame

3.75 kB

	import random
	import re
	import time

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Model configuration
	# The UI title, description and system prompt in this file all present the
	# assistant as "Qwen 2.5 14B", so the checkpoint must be the Qwen2.5 14B
	# instruct model. The previous id ("Qwen/Qwen2-14B-Instruct") does not match
	# that family naming and is not a published 14B checkpoint.
	MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"

	# Initialize model and tokenizer once at import time so every chat request
	# reuses the same loaded weights.
	print("Loading model and tokenizer...")
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_NAME,
	    torch_dtype="auto",   # let transformers pick the dtype for the checkpoint
	    device_map="auto",    # let transformers decide device placement
	)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	print("Model and tokenizer loaded!")

	def simulate_typing(text, min_chars_per_sec=15, max_chars_per_sec=40):
	    """Yield growing prefixes of ``text`` one word at a time, with random pauses.

	    The text is stripped of leading/trailing whitespace, then revealed word by
	    word. Each yielded value is a prefix of the stripped text, so the original
	    inner whitespace (newlines, indentation, multiple spaces) is preserved
	    instead of being collapsed to single spaces. Every prefix except the last
	    ends with the whitespace that follows its final word.

	    Args:
	        text: The full string to reveal.
	        min_chars_per_sec: Lower bound of the random rate used for the pause.
	        max_chars_per_sec: Upper bound of the random rate used for the pause.

	    Yields:
	        str: The text revealed so far. Nothing is yielded when ``text`` is
	        empty or whitespace only.
	    """
	    text = text.strip()
	    # Each match is one word plus the whitespace run that follows it; slicing
	    # the source text up to the match end keeps that whitespace untouched.
	    for match in re.finditer(r"\S+\s*", text):
	        # One pause per word, drawn uniformly between the two rate bounds.
	        rate = random.uniform(min_chars_per_sec, max_chars_per_sec)
	        if rate > 0:  # a non-positive rate would divide by zero / sleep < 0
	            time.sleep(1 / rate)
	        yield text[:match.end()]

	def generate_response(
	    message,
	    history: list[tuple[str, str]],
	    system_message,
	    max_tokens=512,
	    temperature=0.7,
	    top_p=0.95
	):
	    """Generate a reply to ``message`` and stream it with a typing animation.

	    Args:
	        message: The new user message.
	        history: Previous turns as ``(user_msg, assistant_msg)`` pairs; falsy
	            entries in a pair are skipped.
	        system_message: Content of the leading system message.
	        max_tokens: Maximum number of new tokens to generate.
	        temperature: Sampling temperature passed to ``model.generate``.
	        top_p: Nucleus-sampling threshold passed to ``model.generate``.

	    Yields:
	        str: Progressively longer versions of the reply; the last value
	        yielded is always the complete decoded reply (whitespace-stripped).
	    """
	    # Prepare conversation history
	    messages = [{"role": "system", "content": system_message}]
	    for user_msg, assistant_msg in history or []:
	        if user_msg:
	            messages.append({"role": "user", "content": user_msg})
	        if assistant_msg:
	            messages.append({"role": "assistant", "content": assistant_msg})

	    messages.append({"role": "user", "content": message})

	    # Convert messages to model input format
	    text = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True
	    )

	    # Generate response
	    with torch.inference_mode():
	        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
	        generated_ids = model.generate(
	            **model_inputs,
	            # UI sliders may deliver a float; the token budget must be an int.
	            max_new_tokens=int(max_tokens),
	            temperature=temperature,
	            top_p=top_p,
	            do_sample=True,
	            pad_token_id=tokenizer.eos_token_id
	        )
	        # Drop the prompt tokens so only the newly generated part is decoded.
	        prompt_length = len(model_inputs.input_ids[0])
	        generated_ids = generated_ids[0, prompt_length:]
	        response = tokenizer.decode(generated_ids, skip_special_tokens=True)

	    response = response.strip()

	    # Return response with typing animation
	    last_partial = None
	    for last_partial in simulate_typing(response):
	        yield last_partial

	    # Guarantee the final streamed value is the real reply: covers an empty
	    # reply (animation yields nothing) and any whitespace the animation
	    # may have altered along the way.
	    if last_partial != response:
	        yield response

	# Custom CSS: loads the Inter font for the whole app and defines a blinking
	# "|" cursor for elements carrying the `typing-cursor` class.
	# NOTE(review): nothing in the visible code assigns `typing-cursor` to an
	# element — confirm whether the cursor rule is actually used.
	# The cursor glyph is written as a plain '|' (no backslash): a backslash
	# before '|' is an invalid escape sequence in a non-raw Python string, and
	# CSS renders '|' identically without it.
	custom_css = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
	body, .gradio-container {
	    font-family: 'Inter', sans-serif;
	}
	.typing-cursor::after {
	    content: '|';
	    animation: blink 1s step-start infinite;
	}
	@keyframes blink {
	    50% { opacity: 0; }
	}
	"""

	# Default system prompt. It is used as the value of the hidden Textbox in the
	# chat interface's additional inputs, which supplies the `system_message`
	# argument of generate_response.
	system_message = """You are Qwen 2.5 14B, an advanced AI assistant created by Alibaba Cloud.
	You are knowledgeable, helpful, and strive to provide accurate and comprehensive responses."""

	# Chat UI: generate_response handles each message. The additional inputs are
	# passed to it, in list order, after (message, history): the hidden system
	# prompt, then the max-new-tokens, temperature and top-p sliders.
	demo = gr.ChatInterface(
	    fn=generate_response,
	    title="Qwen 2.5 14B Chat",
	    description="An advanced AI assistant powered by Qwen 2.5 14B",
	    css=custom_css,
	    additional_inputs=[
	        gr.Textbox(value=system_message, visible=False),
	        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
	        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
	        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
	    ],
	)

	# Entry point: enable the request queue (at most 40 waiting requests), then
	# start the server with up to 40 worker threads.
	if __name__ == "__main__":
	    demo.queue(max_size=40).launch(max_threads=40)