import gradio as gr
import time

from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load the model and tokenizer
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # Explicitly target the CPU device
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


def respond(message):
    try:
        # Record the start time
        start_time = time.time()

        # Prompt tweak: add an instruction so the model outputs only the
        # final answer without showing its intermediate reasoning.
        # The instruction string reads: "Please output only the final answer;
        # do not show any intermediate reasoning."
        instruction = "請只輸出最終答案,不要展示任何中間推理過程。"
        input_text = f"User: {message}\nAssistant: {instruction}\nAssistant:"

        # Generate response
        response = pipe(
            input_text,
            max_length=1024,
            truncation=True,
            num_return_sequences=1,
            do_sample=True,    # Required for temperature/top_p to take effect
            temperature=0.2,   # Low temperature for less random output
            top_p=0.1,         # Tight nucleus sampling to keep generations focused
        )
        generated_text = response[0]["generated_text"].strip()

        # Extract the text after the last occurrence of "Assistant:" in the
        # model output (assumes the model follows the prompt format).
        # Adjust this parsing if the model's output format differs.
        if "Assistant:" in generated_text:
            reply = generated_text.split("Assistant:")[-1].strip()
        else:
            reply = generated_text

        # Calculate inference time
        inference_time = time.time() - start_time
        print(f"Inference time: {inference_time:.4f} seconds")

        # Return as a list of (user message, bot reply) tuples for gr.Chatbot
        return [(message, reply)]
    except Exception as e:
        print(f"Error: {e}")
        return [(message, "Sorry, something went wrong. Please try again.")]


# Set up Gradio chat interface
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino Chat")
    gr.Markdown("Chat with the DeepSeek-R1-Distill-Qwen-1.5B-openvino model.")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    msg.submit(respond, msg, chatbot)

if __name__ == "__main__":
    demo.launch(share=True)