import gradio as gr
import time
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load the model and tokenizer
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # explicitly target the CPU device
tokenizer = AutoTokenizer.from_pretrained(model_id)
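# Note: this checkpoint is already an OpenVINO export. A sketch for loading a
# plain Transformers checkpoint instead, assuming optimum-intel's on-the-fly
# conversion: OVModelForCausalLM.from_pretrained(model_id, export=True)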

# Create generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
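# A quick sanity check before wiring up the UI (a sketch; the prompt string is
# illustrative only):
#   print(pipe("User: Hello\nAssistant:", max_new_tokens=20)[0]["generated_text"])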

def respond(message, history):
    try:
        # Record the start time
        start_time = time.time()
        
        # Build the prompt: add an instruction asking the model to output only
        # the final answer without showing any intermediate reasoning
        instruction = "Please output only the final answer; do not show any intermediate reasoning."
        input_text = f"User: {message}\nAssistant: {instruction}\nAssistant:"
        
        # Generate response
        response = pipe(
            input_text,
            max_length=1024,
            truncation=True,
            num_return_sequences=1,
            do_sample=True,   # sampling must be enabled for temperature/top_p to take effect
            temperature=0.2,  # low temperature keeps the output focused
            top_p=0.1,        # tight nucleus sampling for higher-quality output
        )
        generated_text = response[0]['generated_text'].strip()
        
        # Take the text after the last "Assistant:" marker (assuming the model
        # follows the prompt format); adjust this if the output format differs
        if "Assistant:" in generated_text:
            reply = generated_text.split("Assistant:")[-1].strip()
        else:
            reply = generated_text
        
        # Calculate inference time
        inference_time = time.time() - start_time
        print(f"Inference time: {inference_time:.4f} seconds")
        
        # Append the (user message, bot reply) pair to the chat history
        return (history or []) + [(message, reply)]
    
    except Exception as e:
        print(f"Error: {e}")
        return (history or []) + [(message, "Sorry, something went wrong. Please try again.")]
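
# Example of invoking the handler directly (hypothetical smoke test, no UI):
#   print(respond("What is 2 + 2?", []))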

# Set up Gradio chat interface
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino Chat")
    gr.Markdown("Chat with DeepSeek-R1-Distill-Qwen-1.5B-openvino model.")
    
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    
    # Pass the current chat history in as well, so each turn appends instead
    # of overwriting the conversation
    msg.submit(respond, [msg, chatbot], chatbot)

if __name__ == "__main__":
    demo.launch(share=True)