import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai

# Initialize the OpenVINO model: download the pre-quantized INT4 weights
# from the Hugging Face Hub into a local directory.
model_id = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
model_path = "ov"
hf_hub.snapshot_download(model_id, local_dir=model_path)
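
# Alternative (a hedged sketch, assuming optimum-intel and nncf are installed):
# instead of downloading pre-quantized weights, export and 4-bit-quantize the
# original model locally. The full-precision model id below is an assumption.
# from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
# q_config = OVWeightQuantizationConfig(bits=4)
# model = OVModelForCausalLM.from_pretrained(
#     "Qwen/Qwen2.5-7B-Instruct", export=True, quantization_config=q_config
# )
# model.save_pretrained(model_path)  # LLMPipeline also needs converted tokenizers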

# Sampling configuration shared by every generate() call.
config = ov_genai.GenerationConfig()
config.max_new_tokens = 4096
config.top_p = 0.9
config.top_k = 30

# Build the generation pipeline on the CPU device.
pipe = ov_genai.LLMPipeline(model_path, "CPU")
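# "CPU" can be swapped for another OpenVINO device string such as "GPU",
# or "AUTO" to let the runtime choose, when the matching plugin is available.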

def start_chat():
    # Begin a stateful chat session: subsequent generate() calls keep history.
    pipe.start_chat()
    return "✅ Chat started!"

def finish_chat():
    # End the session and discard the accumulated conversation state.
    pipe.finish_chat()
    return "🛑 Chat ended!"

# Inference function: stream tokens via a callback and yield incremental results.
def generate_stream(prompt):
    # Optional: prompt += " /no_think" + " Keep the answer short and clear"

    q = queue.Queue()
    tps_result = ""

    # Streamer callback: invoked by the pipeline for each decoded subword;
    # push it onto the queue and tell the pipeline to keep generating.
    def streamer(subword):
        print(subword, end='', flush=True)
        q.put(subword)
        return ov_genai.StreamingStatus.RUNNING
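    # Note: returning ov_genai.StreamingStatus.STOP from the callback instead
    # would end generation early (useful for implementing a cancel control).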

    def worker():
        # Run inference in a background thread so this generator can drain
        # the queue and push partial results to the UI as they arrive.
        nonlocal tps_result
        gen_result = pipe.generate([prompt], streamer=streamer, config=config)
        tps = gen_result.perf_metrics.get_throughput().mean
        tps_result = f"{tps:.2f} tokens/s"
        q.put(None)  # Sentinel marking the end of the stream.

    threading.Thread(target=worker).start()

    result = ""
    while True:
        token = q.get()
        if token is None:
            break
        result += token
        yield result,""  # 把逐步結果傳給 output textbox
    yield result, tps_result
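
# Minimal smoke test (a sketch, not part of the UI): drive the generator
# directly to verify streaming before wiring it into Gradio.
# for partial, tps in generate_stream("Hello!"):
#     pass
# print("final text:", partial, "| throughput:", tps)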

with gr.Blocks() as demo:
    gr.Markdown("## 🧠 OpenVINO Streaming Demo with Gradio Textbox")

    with gr.Row():
        start_btn = gr.Button("Start chat")
        end_btn = gr.Button("End chat")
        status_box = gr.Textbox(label="Status", interactive=False)
        TPS_box = gr.Textbox(label="TPS", interactive=False)

    start_btn.click(fn=start_chat, outputs=status_box)
    end_btn.click(fn=finish_chat, outputs=status_box)

    textbox_input = gr.Textbox(label="Prompt", lines=1, placeholder="Enter prompt here...")
    # Plain-text alternative: gr.Textbox(label="Output", elem_id="scrollbox", lines=10)
    textbox_output = gr.Markdown(min_height=30, label="Robot answer:")

    # The Submit button triggers inference.
    button = gr.Button("Submit")

    # On click, call generate_stream and stream its yields into
    # textbox_output and TPS_box.
    button.click(fn=generate_stream, inputs=textbox_input, outputs=[textbox_output, TPS_box])

demo.launch()
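
# To serve beyond localhost (an assumed deployment choice), Gradio's launch
# options can be used instead, e.g.:
# demo.launch(server_name="0.0.0.0", server_port=7860)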