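# Gradio chat demo for the OpenVINO-optimized DeepSeek-R1-Distill-Qwen-1.5B model.
# Loads the model with optimum-intel, runs text generation on CPU, and serves a simple chat UI.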
import gradio as gr
import time
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline
# Load the model and tokenizer
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")  # explicitly specify the device
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Create generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
def respond(message):
    try:
        # Record the start time
        start_time = time.time()
        # Only use the current message as input (no history)
        input_text = f"User: {message}"
        # Generate response
        response = pipe(
            input_text,
            max_length=1024,
            truncation=True,
            num_return_sequences=1,
            do_sample=True,          # enable sampling so temperature/top_p take effect
            temperature=0.7,         # controls generation diversity
            top_p=0.9,               # nucleus sampling, controls generation quality
            return_full_text=False,  # return only the newly generated text, not the prompt
        )
        reply = response[0]['generated_text'].strip()
        # Calculate inference time
        inference_time = time.time() - start_time
        print(f"Inference time: {inference_time:.4f} seconds")
        # Return as a tuple (user message, bot reply)
        return [(message, reply)]
    except Exception as e:
        print(f"Error: {e}")
        return [(message, "Sorry, something went wrong. Please try again.")]
# Set up Gradio chat interface
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino Chat")
    gr.Markdown("Chat with the DeepSeek-R1-Distill-Qwen-1.5B-openvino model.")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    msg.submit(respond, msg, chatbot)

if __name__ == "__main__":
    demo.launch(share=True)
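
# Assumed local setup (package names inferred from the imports above; versions not pinned here):
#   pip install gradio transformers optimum[openvino]
#   python app.py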