import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr
# Download the INT4 OpenVINO export of Qwen3-0.6B into ./ov
hf_hub.snapshot_download(repo_id="OpenVINO/Qwen3-0.6B-int4-ov", local_dir="ov", local_dir_use_symlinks=False)
# Initialize the model
device = "CPU"
model_name = "ov"
model_path = f"./{model_name}"  # prepend the directory path
pipe = ov_genai.LLMPipeline(model_path, device)
config = ov_genai.GenerationConfig(max_new_tokens=4096)
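# Streaming callback passed to pipe.generate(): prints each new subword to
# stdout as it is produced. Returning False tells the pipeline to keep
# generating (minimal sketch following the openvino_genai streamer pattern).
def streamer(subword):
    print(subword, end="", flush=True)
    return False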
# Inference function
def generate_response(prompt):
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f"{generated.perf_metrics.get_throughput().mean:.2f}"
    print(f"\nModel: {model_name} TPS: {tokenpersec}\n")
    return tokenpersec, generated.texts[0]
# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt"),
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="A Qwen3-based inference app with a GUI and support for separating the model's thinking process.",
)
if __name__ == "__main__":
    demo.launch()