import gradio as gr
import openvino_genai as ov_genai
import huggingface_hub as hf_hub
import queue
import threading

# Initialize the OpenVINO model
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"
hf_hub.snapshot_download(model_id, local_dir=model_path)

pipe = ov_genai.LLMPipeline(model_path, "CPU")
#pipe.start_chat()

# Inference function: feed tokens through a streamer and yield partial results
def generate_stream(prompt):
    q = queue.Queue()

    def streamer(subword):
        print(subword, end='', flush=True)
        q.put(subword)
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        # Run inference in a background thread
        pipe.generate([prompt], streamer=streamer, max_new_tokens=4096)
        q.put(None)  # end-of-stream sentinel

    threading.Thread(target=worker).start()

    result = ""
    while True:
        token = q.get()
        if token is None:
            break
        result += token
        yield result  # push the partial result to the output textbox

with gr.Blocks(css="""
#scrollbox textarea {
    overflow-y: auto !important;
    height: 300px !important;
    resize: none;
    white-space: pre-wrap;
}
""") as demo:
    gr.Markdown("## 🧠 OpenVINO Streaming Demo with Gradio Textbox")
    textbox_input = gr.Textbox(label="Prompt", lines=1, placeholder="Enter prompt here...")
    textbox_output = gr.Textbox(label="Output", elem_id="scrollbox", lines=10)

    # Button that triggers inference
    button = gr.Button("Submit")

    # When the button is clicked, call generate_stream and update textbox_output
    button.click(fn=generate_stream, inputs=textbox_input, outputs=textbox_output)

demo.launch()
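
# --- Hardening notes (a sketch, not exercised by the demo above) ---
# 1. If pipe.generate() raises inside worker(), the None sentinel is never
#    enqueued and the while-loop in generate_stream() blocks on q.get()
#    forever. Wrapping the call in try/finally guarantees the sentinel:
#
#    def worker():
#        try:
#            pipe.generate([prompt], streamer=streamer, max_new_tokens=4096)
#        finally:
#            q.put(None)  # always signal end-of-stream, even on error
#
# 2. Assumption about the Gradio version in use: on Gradio 3.x, generator
#    (yield-based) event handlers only stream if the queue is enabled, e.g.
#    demo.queue().launch(); Gradio 4+ enables queuing by default.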