# app.py — Gradio chat demo for hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino
# (Hugging Face Space; model runs on CPU via OpenVINO.)
import gradio as gr
import time
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline
# --- One-time model setup (runs at import; downloads from the Hub on first run) ---
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # explicitly pin inference to CPU
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Text-generation pipeline shared by every request handled by respond().
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
def respond(message):
    """Generate a single reply to *message* and return it as one chat turn.

    Args:
        message: The user's input text.

    Returns:
        A one-element list ``[(message, reply)]`` in the (user, bot) pair
        format expected by ``gr.Chatbot``.
        NOTE(review): returning a fresh one-element list replaces any prior
        history shown in the Chatbot; thread the history through the handler
        if a multi-turn transcript is desired.
    """
    try:
        # Record the start time so we can log inference latency.
        start_time = time.time()
        # Prompt instruction (kept verbatim): ask the model to emit only the
        # final answer, with no intermediate reasoning.
        instruction = "請只輸出最終答案,不要展示任何中間推理過程。"
        input_text = f"User: {message}\nAssistant: {instruction}\nAssistant:"
        response = pipe(
            input_text,
            # Bound *generated* tokens; the previous max_length also counted
            # the prompt, so long prompts silently shrank the reply budget.
            max_new_tokens=1024,
            truncation=True,
            num_return_sequences=1,
            # Required for temperature/top_p to take effect: without
            # do_sample=True, transformers warns and decodes greedily,
            # silently ignoring both knobs.
            do_sample=True,
            temperature=0.2,  # low temperature: keep output focused
            top_p=0.1,        # tight nucleus: keep output high-probability
        )
        generated_text = response[0]['generated_text'].strip()
        # The pipeline echoes the prompt; keep only the text after the last
        # "Assistant:" marker (assumes the model follows the prompt format).
        if "Assistant:" in generated_text:
            reply = generated_text.split("Assistant:")[-1].strip()
        else:
            reply = generated_text
        # Log inference latency to stdout (visible in the Space logs).
        inference_time = time.time() - start_time
        print(f"Inference time: {inference_time:.4f} seconds")
        # Return as a (user message, bot reply) pair for gr.Chatbot.
        return [(message, reply)]
    except Exception as e:
        # Top-level UI boundary: log the error and show a friendly fallback
        # instead of crashing the Gradio handler.
        print(f"Error: {e}")
        return [(message, "Sorry, something went wrong. Please try again.")]
# Set up Gradio chat interface
with gr.Blocks() as demo:
gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino Chat")
gr.Markdown("Chat with DeepSeek-R1-Distill-Qwen-1.5B-openvino model.")
chatbot = gr.Chatbot()
msg = gr.Textbox(label="Your Message")
msg.submit(respond, msg, chatbot)
if __name__ == "__main__":
demo.launch(share=True)