import time

import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline
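
# Assumes the OpenVINO extras for Optimum are installed, e.g.:
#   pip install optimum[openvino] gradio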

model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"

# Load the OpenVINO-exported model on the CPU device and its tokenizer.
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
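# The pipeline accepts the OpenVINO model directly because OVModelForCausalLM
# exposes the same generate() interface as a regular transformers causal LM.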


def respond(message, history):
    # The Chatbot value can be None before the first exchange.
    history = history or []
    try:
        start_time = time.time()

        # Build a single-turn prompt: a brief system instruction (translated
        # here from the original Chinese) followed by the user's message.
        instruction = (
            "Answer the question in simple, accurate language, "
            "and avoid long-winded or repetitive content.\n"
            "User: " + message + "\n"
            "Assistant: "
        )
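
        # The plain "User:/Assistant:" format is a simplification; the model
        # also ships a chat template that could be applied via
        # tokenizer.apply_chat_template(). R1-style distills may additionally
        # emit a <think>...</think> reasoning trace, which this demo leaves
        # in the reply.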

        # Sampling must be enabled for temperature/top_p to take effect;
        # max_length caps prompt plus completion tokens.
        response = pipe(
            instruction,
            max_length=4096,
            truncation=True,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.3,
            top_p=0.8,
            repetition_penalty=1.5,
        )
        generated_text = response[0]["generated_text"].strip()

        # The pipeline output includes the prompt, so keep only the text
        # after the last "Assistant:" marker.
        if "Assistant:" in generated_text:
            reply = generated_text.split("Assistant:")[-1].strip()
        else:
            reply = generated_text

        inference_time = time.time() - start_time
        print(f"Inference time: {inference_time:.4f} seconds")

        # Append the new exchange so earlier turns stay in the chat window.
        history.append((message, reply))
        return history

    except Exception as e:
        print(f"Error: {e}")
        history.append((message, "Sorry, something went wrong. Please try again."))
        return history
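

# Minimal Blocks UI: a chat window plus a textbox; pressing Enter in the
# textbox sends the message to respond().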
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino Chat")
    gr.Markdown("Chat with the DeepSeek-R1-Distill-Qwen-1.5B-openvino model.")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")

    # Pass the current history in along with the message so respond() can
    # extend it instead of replacing it.
    msg.submit(respond, [msg, chatbot], chatbot)

if __name__ == "__main__":
    # share=True also exposes the demo through a temporary public Gradio link.
    demo.launch(share=True)