import atexit
import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai

model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"

# Fetch the pre-quantized (int4) OpenVINO model from the Hugging Face Hub.
hf_hub.snapshot_download(model_id, local_dir=model_path)


pipe = ov_genai.LLMPipeline(model_path, "CPU")
pipe.start_chat()  # Initialize chat state; the pipeline tracks conversation history internally
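
# Generation parameters can also be grouped into a GenerationConfig object
# instead of being passed as keyword arguments to pipe.generate(); a minimal
# sketch (equivalent to the max_new_tokens=100 call used below):
#
#   config = ov_genai.GenerationConfig()
#   config.max_new_tokens = 100
#   pipe.generate(prompt, config, streamer=streamer)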

def generate(prompt, history):
    """
    Send the prompt to the LLM and stream the reply back to Gradio via `yield`.
    """
    tokens = queue.Queue()

    def streamer(subword):
        # Called by openvino_genai for every decoded subword; hand it to the
        # queue. (The original version yielded from this callback, which turns
        # it into a generator that never runs; a plain callable is required.)
        tokens.put(subword)
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        # pipe.generate() blocks until generation ends, so run it in a thread
        # and let the streamer feed partial results back through the queue.
        pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
        tokens.put(None)  # sentinel: generation finished

    threading.Thread(target=worker, daemon=True).start()

    # Gradio passes in `history`, which could be used to build richer prompts
    # (e.g. a system prompt, or the full past conversation). Here start_chat()
    # already keeps the history inside the pipeline; see the sketch after this
    # function for a manual alternative.
    full_response = ""
    while True:
        subword = tokens.get()
        if subword is None:
            break
        full_response += subword
        yield full_response  # Gradio re-renders the chat bubble with the partial text

    # After generation finishes you could add logic here, e.g. logging the
    # conversation or updating state.
    # ...
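
# A minimal sketch of the manual prompt construction mentioned above. It is
# only needed if you skip start_chat() and manage the conversation yourself,
# and it assumes Gradio's tuple-style history (a list of (user, assistant)
# pairs); the flat "User:/Assistant:" format is an illustrative assumption,
# not the model's actual chat template.
def build_prompt(prompt, history):
    parts = []
    for user_msg, bot_msg in history:
        parts.append(f"User: {user_msg}")
        parts.append(f"Assistant: {bot_msg}")
    parts.append(f"User: {prompt}")
    return "\n".join(parts)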

def on_close():
    pipe.finish_chat()  # Release the chat state when the app shuts down
    print("Chat finished and pipeline closed.")


if __name__ == "__main__":
    demo = gr.ChatInterface(
        generate,
        chatbot=gr.Chatbot(height=300),
        textbox=gr.Textbox(placeholder="Type your message...", container=False, scale=7),
        title="LLM Streaming Demo (OpenVINO)",
        description="This demo shows how to stream OpenVINO GenAI responses through Gradio.",
        theme="soft",
        examples=["Hello", "Please introduce yourself", "How is the weather today?"],
    )
    # demo.close() does not take a callback, so register the cleanup to run
    # when the process exits instead.
    atexit.register(on_close)
    demo.launch()
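
# Usage: run this script with `python <script>.py`; Gradio serves the UI at
# http://127.0.0.1:7860 by default.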