# Hugging Face Space app source (status when captured: Running; file size: 1,872 bytes).
import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai
# Download the model snapshot from the Hugging Face Hub into a local directory.
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"
hf_hub.snapshot_download(model_id, local_dir=model_path, local_dir_use_symlinks=False)
# Build the inference pipeline from the downloaded directory on the chosen device.
device = "CPU"
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
# Re-apply the tokenizer's own chat template to the tokenizer.
tokenizer.set_chat_template(tokenizer.chat_template)
def generate_response(prompt):
    """Stream the model's reply to *prompt* as a generator for Gradio.

    Generation runs in a background thread. The pipeline's streamer callback
    pushes each decoded chunk onto a queue, and this generator drains the
    queue, yielding the accumulated text after every chunk so the UI updates
    live.

    Args:
        prompt: The user's prompt text.

    Yields:
        The response accumulated so far after each streamed chunk, followed by
        one final message containing the throughput and the complete response.
        If generation fails, the final message is the error text instead.
    """
    chunks = queue.Queue()
    finished = object()  # sentinel: the worker has nothing more to send
    stop_requested = threading.Event()
    outcome = {}

    def streamer(subword):
        # Must be a plain function: a `yield` here would turn the callback
        # into a generator whose body never runs when the pipeline calls it.
        chunks.put(subword)
        if stop_requested.is_set():
            return ov_genai.StreamingStatus.STOP
        return ov_genai.StreamingStatus.RUNNING

    def run_generation():
        try:
            pipe.start_chat()
            try:
                outcome["generated"] = pipe.generate(
                    [prompt], streamer=streamer, max_length=1024
                )
            finally:
                # Always close the chat session, even when generate() raises.
                pipe.finish_chat()
        except Exception as exc:
            # Hand the failure to the consumer side instead of losing it
            # inside the worker thread.
            outcome["error"] = exc
        finally:
            chunks.put(finished)

    worker = threading.Thread(target=run_generation, daemon=True)
    worker.start()

    response = ""
    try:
        while True:
            piece = chunks.get()
            if piece is finished:
                break
            response += piece
            yield response
    finally:
        # Also reached when the consumer abandons the stream early: ask the
        # pipeline to stop and wait until it has released the chat session.
        stop_requested.set()
        worker.join()

    try:
        if "error" in outcome:
            raise outcome["error"]
        generated = outcome["generated"]
        token_per_sec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    except Exception as e:
        yield f"生成過程中發生錯誤:{e}"
        return
    yield f"生成完成:每秒生成 {token_per_sec} tokens。\n\n最終回應:{response}"
# Gradio interface wired to the streaming handler: one multi-line prompt box
# in, one text box out that is refreshed as the handler yields new text.
prompt_box = gr.Textbox(lines=5, label="輸入提示 (Prompt)")
response_box = gr.Textbox(label="流式處理的回應")
demo = gr.Interface(
    fn=generate_response,
    inputs=prompt_box,
    outputs=[response_box],
    title="Qwen3-0.6B-int4-ov 流式處理",
    description="基於 Qwen3-0.6B-int4-ov 推理應用,支援實時輸出到 Gradio 介面。",
)
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    # Enable the request queue, then launch the app.
    demo.queue().launch()