# Gradio app: code-generation chat for Qwen2.5-Coder-14B-Instruct, intended to run
# as a Hugging Face Space with ZeroGPU.
import spaces  # must be imported before torch so ZeroGPU can patch CUDA initialization
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os

# Runtime environment tweaks. Assign through os.environ (rather than os.putenv) so the
# values are also visible to Python-level readers such as huggingface_hub.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ['SAFETENSORS_FAST_GPU'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# Favor full-precision matmuls: disable TF32 and reduced-precision reductions.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

# preferred_blas_library / preferred_linalg_library are functions, not attributes;
# assigning a string to them rebinds the name without changing the backend, so call them.
torch.backends.cuda.preferred_blas_library(backend="cublas")
torch.backends.cuda.preferred_linalg_library(backend="cusolver")

torch.set_float32_matmul_precision("highest")

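# Load the model once at startup and keep it resident on the GPU in bfloat16;
# the tokenizer is loaded alongside it.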
model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    trust_remote_code=True,
).to('cuda', torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)

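# generate_code is the app's single inference entry point; on Spaces, the
# @spaces.GPU decorator below requests a GPU for the duration of each call.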
@spaces.GPU(required=True)
def generate_code(prompt):
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Render the chat template to plain text and append the assistant generation prompt.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            low_memory=False,
            do_sample=True,
        )
    # Drop the prompt tokens so only the newly generated completion is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

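# Minimal Gradio UI: a single "Code Chat" tab with a prompt box and a result box,
# wired to generate_code through a Run button.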
with gr.Blocks(title="Qwen 14b") as demo:
    with gr.Tab("Code Chat"):
        run_button = gr.Button("Run", scale=0)
        prompt = gr.Text(
            label="Prompt",
            show_label=False,
            max_lines=1,
            placeholder="Enter your prompt",
            container=False,
        )
        result = gr.Text(
            label="Result",
            show_label=False,
            max_lines=100,
            container=False,
        )
        gr.on(
            triggers=[
                run_button.click,
            ],
            fn=generate_code,
            inputs=[
                prompt,
            ],
            outputs=[result],
        )

demo.launch(share=False)