import time

import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load the OpenVINO-optimized model and its tokenizer.
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # explicitly target CPU
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text-generation pipeline wrapping model + tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


def respond(message, chat_history):
    """Generate a reply for *message* and append it to *chat_history*.

    Returns a ``(textbox_value, updated_history)`` pair so a single submit
    event can both clear the input box (empty string) and refresh the
    chatbot display.
    """
    try:
        start_time = time.time()

        # Prompt scaffold steering the model toward concise Traditional
        # Chinese answers (runtime string — kept as authored).
        instruction = (
            "請用簡單、繁體中文、準確的語言回答問題,避免冗長和重複內容。\n"
            "User: " + message + "\n"
            "Assistant: "
        )

        response = pipe(
            instruction,
            max_new_tokens=1024,   # bound *generated* tokens; max_length would count the prompt too
            truncation=True,
            num_return_sequences=1,
            do_sample=True,        # required for temperature/top_p to take effect
            temperature=0.3,
            top_p=0.8,
            repetition_penalty=1.5,
        )

        generated_text = response[0]["generated_text"].strip()

        # Keep only the text after the last "Assistant:" marker; the pipeline
        # echoes the prompt in its output.
        if "Assistant:" in generated_text:
            reply = generated_text.split("Assistant:")[-1].strip()
        else:
            reply = generated_text

        inference_time = time.time() - start_time
        print(f"Inference time: {inference_time:.4f} seconds")

        # Append to (rather than overwrite) the conversation history.
        return "", chat_history + [(message, reply)]
    except Exception as e:  # UI boundary: log and show a friendly message
        print(f"Error: {e}")
        return "", chat_history + [(message, "Sorry, something went wrong. Please try again.")]


# Gradio chat interface.
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino Chat")
    gr.Markdown("Chat with DeepSeek-R1-Distill-Qwen-1.5B-openvino model.")
    chatbot = gr.Chatbot()
    # NOTE: gr.Textbox has no `clear_on_submit` parameter; the box is cleared
    # by respond() returning "" into this component.
    msg = gr.Textbox(label="Your Message")
    # Feed current history in and write (cleared textbox, new history) back out.
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

if __name__ == "__main__":
    demo.launch(share=True)