import huggingface_hub as hf_hub
import re
import openvino_genai as ov_genai
import gradio as gr
# Download the INT4 OpenVINO export of Qwen3-0.6B into ./ov
hf_hub.snapshot_download(repo_id="OpenVINO/Qwen3-0.6B-int4-ov", local_dir="ov")
# Initialize the model
device = "CPU"  # or "GPU" if an Intel GPU device is available
InUsed_model_name = "ov"
model_path = f"./{InUsed_model_name}"  # prepend the directory path
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
config = ov_genai.GenerationConfig(max_new_tokens=4096)
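# GenerationConfig also exposes sampling knobs; a minimal sketch of optional
# settings (greedy decoding is the default when do_sample is left False):
# config.do_sample = True
# config.temperature = 0.7
# config.top_p = 0.9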
output_buffer = ""  # accumulates streamed subwords for the current request

# Streaming callback: called once per decoded subword during generation
def streamer(subword):
    global output_buffer
    output_buffer += subword
    print(subword, end='', flush=True)
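# Note: in openvino_genai the streamer's return value controls generation;
# returning True (or ov_genai.StreamingStatus.STOP in newer releases) stops
# decoding early, while returning nothing, as above, lets it run to the end.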
# Inference function (Gradio passes the single Prompt textbox as `prompt`)
def generate_response(prompt):
    global output_buffer
    output_buffer = ""  # reset the stream buffer for this request
    # Reuse the pipeline loaded above rather than reloading it on every call
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    print(f"\nModel:{InUsed_model_name} TPS:{tokenpersec}\n")
    return tokenpersec, generated.texts[0]
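# The app description below mentions separating the model's thinking process.
# Qwen3 wraps its reasoning in <think>...</think> tags, so a hypothetical
# helper like split_thinking (an assumption, not part of the original app)
# could split that reasoning from the final answer:
def split_thinking(text):
    # Capture the first <think>...</think> block, if present
    match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if match:
        thinking = match.group(1).strip()
        answer = text[match.end():].strip()
        return thinking, answer
    return "", text.strip()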
# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt")
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="A Qwen3 inference app with thinking-process separation and a GUI."
)
if __name__ == "__main__":
    demo.launch()
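# Usage note: run `python app.py`; by default Gradio serves the UI at
# http://127.0.0.1:7860 (pass share=True to demo.launch() for a public link).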