# Gradio app: code-generation chat for Qwen2.5-Coder-14B-Instruct, intended to run
# as a Hugging Face Space with ZeroGPU.
import spaces  # must be imported before torch so ZeroGPU can patch CUDA initialization
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os

# Runtime environment tweaks. Assign through os.environ (rather than os.putenv) so the
# values are also visible to Python-level readers such as huggingface_hub.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ['SAFETENSORS_FAST_GPU'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# Favor full-precision matmuls: disable TF32 and reduced-precision reductions.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

# preferred_blas_library / preferred_linalg_library are functions, not attributes;
# assigning a string to them rebinds the name without changing the backend, so call them.
torch.backends.cuda.preferred_blas_library(backend="cublas")
torch.backends.cuda.preferred_linalg_library(backend="cusolver")

torch.set_float32_matmul_precision("highest")

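# Load the model once at startup and keep it resident on the GPU in bfloat16;
# the tokenizer is loaded alongside it.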
model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    trust_remote_code=True,
).to('cuda', torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)

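# generate_code is the app's single inference entry point; on Spaces, the
# @spaces.GPU decorator below requests a GPU for the duration of each call.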
@spaces.GPU(required=True)
def generate_code(prompt):
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Render the chat template to plain text and append the assistant generation prompt.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            low_memory=False,
            do_sample=True,
        )
    # Drop the prompt tokens so only the newly generated completion is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

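# Minimal Gradio UI: a single "Code Chat" tab with a prompt box and a result box,
# wired to generate_code through a Run button.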
with gr.Blocks(title="Qwen 14b") as demo:
    with gr.Tab("Code Chat"):
        run_button = gr.Button("Run", scale=0)
        prompt = gr.Text(
            label="Prompt",
            show_label=False,
            max_lines=1,
            placeholder="Enter your prompt",
            container=False,
        )
        result = gr.Text(
            label="Result",
            show_label=False,
            max_lines=100,
            container=False,
        )
        gr.on(
            triggers=[
                run_button.click,
            ],
            fn=generate_code,
            inputs=[
                prompt,
            ],
            outputs=[result],
        )

demo.launch(share=False)