# qwen3_test / app.py
import atexit
import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"
# Download the INT4 OpenVINO model from the Hugging Face Hub.
# (local_dir_use_symlinks is deprecated in recent huggingface_hub releases,
# so it is no longer passed here.)
hf_hub.snapshot_download(model_id, local_dir=model_path)
pipe = ov_genai.LLMPipeline(model_path, "CPU")
pipe.start_chat()  # start a chat session; the pipeline keeps the conversation state internally
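
# For reference, a one-shot (non-streaming) call looks like the line below;
# keyword arguments such as max_new_tokens are forwarded to the generation
# config. Kept commented out so it does not run as part of the app -- a
# minimal sketch, not part of the original flow:
#
#     print(pipe.generate("Hello", max_new_tokens=16))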
def generate(prompt, history):
    """
    Send the prompt to the LLM and stream the reply back via `yield`.

    Gradio passes `history` in, but because `pipe.start_chat()` keeps the
    conversation state inside the pipeline itself, past turns are not re-sent
    here. More elaborate prompt handling (e.g. a system prompt, or rebuilding
    the full prompt from `history`) could be added in this function.
    """
    tokens = queue.Queue()

    def streamer(subword):
        # Called by openvino_genai once per generated subword. A streamer
        # callback must not `yield` (that would turn it into a generator that
        # never executes), so each subword is handed to the consumer loop
        # below through a queue instead.
        tokens.put(subword)
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
        tokens.put(None)  # sentinel: generation finished

    threading.Thread(target=worker, daemon=True).start()

    full_response = ""
    while True:
        subword = tokens.get()
        if subword is None:
            break
        full_response += subword
        yield full_response  # stream the growing response to Gradio
    # After generation ends, logic such as logging the conversation or
    # updating state could be added here.
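
# Returning ov_genai.StreamingStatus.STOP from the streamer callback aborts
# generation early. A hedged sketch of a stop-on-marker variant (the "###"
# marker is purely hypothetical):
#
#     def streamer(subword):
#         tokens.put(subword)
#         if "###" in subword:
#             return ov_genai.StreamingStatus.STOP
#         return ov_genai.StreamingStatus.RUNNING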
def on_close():
    pipe.finish_chat()  # release the chat state when the app shuts down
    print("Chat finished and pipeline closed.")
if __name__ == "__main__":
    demo = gr.ChatInterface(
        generate,
        chatbot=gr.Chatbot(height=300),
        textbox=gr.Textbox(placeholder="Type your message...", container=False, scale=7),
        title="LLM Streaming Output Example (OpenVINO)",
        description="This example shows how to stream OpenVINO GenAI responses through Gradio.",
        theme="soft",
        examples=["Hello", "Please introduce yourself", "What's the weather like today?"],
    )
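    # gr.ChatInterface streams generator output over Gradio's request queue.
    # Recent Gradio releases enable the queue by default; on older versions it
    # may need to be switched on explicitly before launching (an assumption
    # about the installed Gradio version):
    #
    #     demo.queue()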
    # gr.Blocks.close() stops a running server rather than registering a
    # shutdown hook, so register the cleanup with atexit instead.
    atexit.register(on_close)
    demo.launch()