import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr
import re

# Download the INT4-quantized model from the Hugging Face Hub
# (files already present in model_path are reused)
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"
hf_hub.snapshot_download(model_id, local_dir=model_path)

# Build the OpenVINO GenAI inference pipeline
device = "CPU"  # other OpenVINO device names such as "GPU" or "AUTO" also work
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)  # apply the model's bundled chat template
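
# Optional sanity check: verify the pipeline works before wiring up the UI.
# (Left commented out so the Space starts straight into Gradio.)
# print(pipe.generate("Hello", max_new_tokens=20))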

def generate_response(prompt):
    full_response = ""  # accumulates the full streamed response

    # Streamer callback: called with each decoded subword as it is produced.
    # It must return a StreamingStatus (yielding would turn it into a
    # generator that never runs).
    def streamer(subword):
        nonlocal full_response
        full_response += subword
        return ov_genai.StreamingStatus.RUNNING  # keep generating (STOP would cancel)
    try:
        # Streamed generation: the callback fires per token while generate() blocks
        generated = pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
        tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'  # mean tokens/sec
        # Qwen3 wraps its reasoning in <think>...</think>; strip it from the visible answer
        answer = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL).strip()
        return tokenpersec, answer
    except Exception as e:
        return "Error", f"Error while generating the response: {e}"

# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="Prompt"),
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3-0.6B-int4-ov",
    description="Inference app based on Qwen3-0.6B-int4-ov, with thinking-process separation and a GUI.",
)
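
# launch() serves the app locally (Gradio's default is http://127.0.0.1:7860);
# on Hugging Face Spaces the platform hosts and exposes the interface automatically.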
if __name__ == "__main__":
    demo.launch()