import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

# Load the model and tokenizer
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"

print("Loading model...")
model = OVModelForCausalLM.from_pretrained(model_id, device_map="auto")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)


def respond(prompt, history):
    # Build the chat messages; the system prompt (in Chinese) asks the model to
    # answer the user directly in Chinese and stay concise, within 1024 tokens.
    messages = [
        {"role": "system", "content": "使用中文,直接回答用戶的問題,盡量簡潔在1024 token內。"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    print("Chat template text:", text)

    # Convert the text into model inputs
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    print("Model inputs:", model_inputs)

    # Generate the response
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=2048,
        temperature=0.7,  # reduce randomness
        top_p=0.9,        # limit sampling diversity
        do_sample=True    # enable sampling
    )
    print("Generated IDs:", generated_ids)

    # Keep only the newly generated token IDs (strip the prompt tokens)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print("Decoded response:", response)

    # Replace the model's <think>/</think> reasoning delimiters with a visible marker
    response = response.replace("<think>", "**THINK**").replace("</think>", "**THINK**").strip()

    # Return the response
    return response


# Set up the Gradio chat interface
demo = gr.ChatInterface(
    fn=respond,
    title="DeepSeek-R1-Distill-Qwen-1.5B-openvino",
    description="DeepSeek-R1-Distill-Qwen-1.5B-openvino"
)

if __name__ == "__main__":
    print("Launching Gradio app...")
    demo.launch(server_name="0.0.0.0", server_port=7860)
    # demo.launch()
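
# Optional local sanity check: a minimal sketch (not part of the Space itself) for
# exercising respond() directly in a Python session without starting the Gradio UI.
# The prompt text below is only an illustrative placeholder.
#
#     >>> print(respond("用一句話介紹 OpenVINO。", history=[]))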