# qwen3_test / app.py
import time

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai
# Download the INT4-quantized OpenVINO export of Qwen3-0.6B from the Hugging Face Hub.
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"
hf_hub.snapshot_download(model_id, local_dir=model_path)
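# snapshot_download is idempotent: files already present and up to date in
# local_dir are reused, so restarting the app does not re-download the model.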
# Build the inference pipeline on CPU.
device = "CPU"
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)  # re-apply the model's bundled chat template
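# Optional tuning (a sketch; the values below are illustrative, not from the
# original app): generation can also be driven by an ov_genai.GenerationConfig
# instead of per-call keyword arguments:
#
#   config = ov_genai.GenerationConfig()
#   config.max_new_tokens = 1024
#   config.do_sample = True
#   config.temperature = 0.7
#   pipe.generate("Hello", config)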
def generate_response(prompt, history=None):
    # Avoid the mutable-default-argument pitfall.
    if history is None:
        history = []
    full_response = ""
    token_count = 0
    start_time = time.time()

    # Streaming callback: accumulate subwords and count emitted tokens.
    def streamer(subword):
        nonlocal full_response, token_count
        full_response += subword
        token_count += 1
        return ov_genai.StreamingStatus.RUNNING

    # LLMPipeline.start_chat() takes an optional system message, not a history
    # list, so earlier turns are tracked only in the Gradio state below.
    pipe.start_chat()
    pipe.generate(prompt, streamer=streamer, max_new_tokens=1024)
    pipe.finish_chat()

    elapsed_time = time.time() - start_time
    tokenpersec = f"{token_count / elapsed_time:.2f}" if elapsed_time > 0 else "0.00"
    history.append((prompt, full_response))  # record this turn for the chat UI
    return tokenpersec, history
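# Possible alternative (a sketch, assuming one global pipeline serves a single
# conversation): keep the chat session open across turns so the model itself
# retains context, instead of opening and closing it on every call:
#
#   pipe.start_chat()            # once, when the conversation begins
#   pipe.generate(turn_1, ...)   # per user turn; no start/finish per call
#   pipe.generate(turn_2, ...)
#   pipe.finish_chat()           # once, when the conversation ends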
# Build the Gradio interface.
with gr.Blocks(title="Qwen3-0.6B-int4-ov Streaming Demo") as demo:
    gr.Markdown(
        "## Qwen3-0.6B-int4-ov Streaming Demo\n"
        "An inference app based on Qwen3-0.6B-int4-ov with streaming output and a GUI."
    )
    chatbot = gr.Chatbot()
    state = gr.State([])
    msg = gr.Textbox(label="Prompt")
    tps = gr.Textbox(label="tokens/sec")

    def respond(message, chat_history):
        tokenpersec, chat_history = generate_response(message, chat_history)
        return "", chat_history, chat_history, tokenpersec

    msg.submit(respond, [msg, state], [msg, chatbot, state, tps])
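# Note: the streamer only accumulates tokens server-side, so the UI receives
# the full reply at once. Incremental display would require respond() to be a
# generator yielding partial chat_history updates (a sketch, not implemented here).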
if __name__ == "__main__":
demo.launch()