import gradio as gr
import openvino_genai as ov_genai
import queue
import threading

import huggingface_hub as hf_hub

# Initialize the OpenVINO model: export Qwen3-1.7B to an INT4 OpenVINO IR with optimum-cli

import subprocess
from pathlib import Path

# Skip the export if the "ov" directory already exists; re-quantizing the model
# on every launch is slow.
if not Path("ov").exists():
    subprocess.run([
        "optimum-cli", "export", "openvino",
        "--model", "Qwen/Qwen3-1.7B",
        "--task", "text-generation-with-past",
        "--weight-format", "int4",
        "--quant-mode", "int4_f8e4m3",
        "ov"
    ], check=True)
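
# "--task text-generation-with-past" exports the model with KV-cache (past key/value)
# support for faster autoregressive decoding; "--weight-format int4" compresses the
# weights to 4 bits to cut the memory footprint.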

# Alternative: download a pre-converted INT4 model from the Hugging Face Hub
# instead of exporting locally.
'''
model_id = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
model_path = "ov"

hf_hub.snapshot_download(model_id, local_dir=model_path)
'''

# Generation settings used by generate_stream() below
config = ov_genai.GenerationConfig()
config.max_new_tokens = 4096
config.top_p = 0.9
config.top_k = 30

model_path = "ov"
pipe = ov_genai.LLMPipeline(model_path, "CPU")
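
# start_chat()/finish_chat() toggle the pipeline's chat mode: within a session,
# conversation history is preserved across generate() calls.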

def start_chat():
    pipe.start_chat()
    return "✅ Chat started!"

def finish_chat():
    pipe.finish_chat()
    return "🛑 Chat ended!"


# Inference function: streams tokens through a callback and yields partial results
def generate_stream(prompt):
    # Optionally append " /no_think" to suppress Qwen3's reasoning, plus an
    # instruction such as "Keep the answer short and clear":
    # prompt = prompt + " /no_think" + " Keep the answer short and clear"

    q = queue.Queue()
    tps_result = ""
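    # Streamer callback: openvino_genai calls this once per generated subword.
    # Returning StreamingStatus.RUNNING continues generation; StreamingStatus.STOP
    # would end it early.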
    def streamer(subword):
        print(subword, end='', flush=True)
        q.put(subword)
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        # Run inference in a background thread so this generator can keep yielding
        nonlocal tps_result
        gen_result = pipe.generate([prompt], streamer=streamer, generation_config=config)
        tps = gen_result.perf_metrics.get_throughput().mean
        tps_result = f"{tps:.2f} tokens/s"
        q.put(None)  # sentinel: generation finished

    threading.Thread(target=worker, daemon=True).start()  # daemon so the process can exit mid-generation

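    # Consume tokens on the caller's thread: q.get() blocks until the worker
    # pushes the next subword or the None sentinel.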
    result = ""
    while True:
        token = q.get()
        if token is None:
            break
        result += token
        yield result, ""  # push the partial result to the output textbox
    yield result, tps_result  # final yield also reports the measured throughput

with gr.Blocks(css="""
#scrollbox textarea {
    overflow-y: auto !important;
    height: 300px !important;
    resize: none;
    white-space: pre-wrap;
}
""") as demo:
    gr.Markdown("## 🧠 OpenVINO Streaming Demo with Gradio Textbox")
    
    with gr.Row():
        start_btn = gr.Button("Start Chat")
        end_btn = gr.Button("End Chat")
        status_box = gr.Textbox(label="Status", interactive=False)
        TPS_box = gr.Textbox(label="TPS", interactive=False)
    
    start_btn.click(fn=start_chat, outputs=status_box)
    end_btn.click(fn=finish_chat, outputs=status_box)

    textbox_input = gr.Textbox(label="Prompt", lines=1, placeholder="Enter prompt here...")
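    # elem_id="scrollbox" on the output textbox below hooks into the CSS defined
    # above, giving it a fixed height with scrolling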
    textbox_output = gr.Textbox(label="Output", elem_id="scrollbox", lines=10)

    # The Submit button triggers inference
    button = gr.Button("Submit")

    # On click, call generate_stream; each yielded value updates textbox_output and TPS_box
    button.click(fn=generate_stream, inputs=textbox_input, outputs=[textbox_output, TPS_box])

demo.launch()
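
# By default demo.launch() serves on http://127.0.0.1:7860; pass
# server_name="0.0.0.0" to expose the UI on the local network.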