File size: 1,872 Bytes
6e96eae
 
e838cdf
6e96eae
 
 
 
 
6771aca
6e96eae
 
 
 
 
 
 
f372999
1e44aa5
f372999
1e44aa5
7c5f0ef
04d3fa9
f372999
 
 
04d3fa9
8b31668
f372999
1e44aa5
7c5f0ef
1e44aa5
 
f372999
7c5f0ef
f372999
15a68f9
1e44aa5
f372999
 
1e44aa5
f372999
1e44aa5
f372999
7c5f0ef
1e44aa5
f372999
1e44aa5
f372999
 
7d7759a
6e96eae
f372999
6e96eae
f372999
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai

# Download the model snapshot from the Hugging Face Hub into a local directory.
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"

# NOTE: the original call passed local_dir_use_symlinks=False, which is
# deprecated and ignored by recent huggingface_hub releases (files are always
# copied into local_dir now), so the argument has been dropped.
hf_hub.snapshot_download(model_id, local_dir=model_path)

# Build the inference pipeline on CPU.
device = "CPU"
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
# Re-apply the model's bundled chat template so chat-mode generation formats
# prompts the way the model expects.
tokenizer.set_chat_template(tokenizer.chat_template)

# Streaming generate_response: yields the partial response as tokens arrive.
def generate_response(prompt):
    """Generate a chat response for *prompt*, yielding the growing text.

    Yields the accumulated response after every streamed subword so Gradio can
    update the textbox live, then a final message with throughput stats (or an
    error message if generation failed).
    """
    # BUG FIX: the original streamer contained `yield`, which turned it into a
    # generator function — calling it returned a generator object and its body
    # (including `return ov_genai.StreamingStatus.RUNNING`) never executed, so
    # nothing was streamed. The streamer must be a plain callable; we bridge it
    # to this generator through a queue fed from a worker thread.
    token_queue = queue.Queue()  # subwords from the streamer; None = done
    outcome = {}                 # worker -> main thread: "generated" or "error"

    def streamer(subword):
        token_queue.put(subword)
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        try:
            pipe.start_chat()
            try:
                outcome["generated"] = pipe.generate(
                    [prompt], streamer=streamer, max_length=1024
                )
            finally:
                # Always close the chat session, even if generate() raises.
                pipe.finish_chat()
        except Exception as e:
            outcome["error"] = e
        finally:
            token_queue.put(None)  # sentinel: unblock the consumer loop

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    response = ""
    while True:
        subword = token_queue.get()
        if subword is None:
            break
        response += subword
        yield response  # push the partial text to Gradio immediately
    thread.join()

    if "error" in outcome:
        yield f"生成過程中發生錯誤:{outcome['error']}"
    else:
        generated = outcome["generated"]
        token_per_sec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
        yield f"生成完成:每秒生成 {token_per_sec} tokens。\n\n最終回應:{response}"

# Gradio UI: one prompt textbox in, one incrementally-updated textbox out.
prompt_box = gr.Textbox(lines=5, label="輸入提示 (Prompt)")
reply_box = gr.Textbox(label="流式處理的回應")  # refreshed on every yield

demo = gr.Interface(
    fn=generate_response,
    inputs=prompt_box,
    outputs=[reply_box],
    title="Qwen3-0.6B-int4-ov 流式處理",
    description="基於 Qwen3-0.6B-int4-ov 推理應用,支援實時輸出到 Gradio 介面。"
)

# Launch the Gradio server; queue() is required for generator-based streaming.
if __name__ == "__main__":
    demo.queue().launch()