import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai
# Download the pre-quantized INT4 model and build the OpenVINO GenAI pipeline
model_id = "OpenVINO/Qwen2.5-7B-Instruct-int4-ov"
model_path = "ov"
hf_hub.snapshot_download(model_id, local_dir=model_path)

# Sampling configuration shared by all requests
config = ov_genai.GenerationConfig()
config.max_new_tokens = 4096
config.top_p = 0.9
config.top_k = 30

pipe = ov_genai.LLMPipeline(model_path, "CPU")
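# Note: "CPU" can be swapped for another OpenVINO device string such as "GPU"
# when a supported accelerator is available.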
def start_chat():
    # Enter chat mode so the pipeline keeps the conversation history
    pipe.start_chat()
    return "✅ Chat started!"

def finish_chat():
    # Leave chat mode and discard the accumulated history
    pipe.finish_chat()
    return "🛑 Chat finished!"
# Inference function: stream subwords through a queue and yield growing results
def generate_stream(prompt):
    # A suffix such as " /no_think" or "keep the answer short and clear"
    # could be appended to the prompt here to steer the model (disabled)
    q = queue.Queue()
    tps_result = ""

    def streamer(subword):
        # Called by the pipeline for each generated subword: echo it to the
        # console and hand it to the main thread via the queue
        print(subword, end='', flush=True)
        q.put(subword)
        return ov_genai.StreamingStatus.RUNNING
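    # Returning ov_genai.StreamingStatus.STOP from the streamer instead would
    # abort generation early, e.g. to back a cancel button.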
    def worker():
        # Run inference in a background thread so the UI stays responsive
        nonlocal tps_result
        gen_result = pipe.generate([prompt], streamer=streamer, config=config)
        tps = gen_result.perf_metrics.get_throughput().mean
        tps_result = f"{tps:.2f} tokens/s"
        q.put(None)  # sentinel: generation finished

    threading.Thread(target=worker).start()
    result = ""
    while True:
        token = q.get()
        if token is None:
            break
        result += token
        yield result, ""  # push the partial answer to the output textbox
    yield result, tps_result  # final answer plus measured throughput
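# Standalone usage sketch (assumes no Gradio UI): drain the generator and keep
# the last pair, which holds the full answer and the measured throughput.
#
#   for answer, tps in generate_stream("What is OpenVINO?"):
#       pass
#   print(answer, tps)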
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 OpenVINO Streaming Demo with Gradio Textbox")
    with gr.Row():
        start_btn = gr.Button("Start chat")
        end_btn = gr.Button("Finish chat")
    status_box = gr.Textbox(label="Status", interactive=False)
    TPS_box = gr.Textbox(label="TPS", interactive=False)
    start_btn.click(fn=start_chat, outputs=status_box)
    end_btn.click(fn=finish_chat, outputs=status_box)

    textbox_input = gr.Textbox(label="Prompt", lines=1, placeholder="Enter prompt here...")
    # A scrollable Textbox could be used for the output instead of Markdown:
    # textbox_output = gr.Textbox(label="Output", elem_id="scrollbox", lines=10)
    textbox_output = gr.Markdown(min_height=30, label="robot answer:")

    # Clicking Submit calls generate_stream; since it is a generator, Gradio
    # re-renders textbox_output and TPS_box as each partial result is yielded
    button = gr.Button("Submit")
    button.click(fn=generate_stream, inputs=textbox_input, outputs=[textbox_output, TPS_box])

demo.launch()