# qwen3_test / app.py
import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr

# Download the INT4 OpenVINO export of Qwen3-0.6B into ./ov (reused on later runs).
hf_hub.snapshot_download(repo_id="OpenVINO/Qwen3-0.6B-int4-ov", local_dir="ov")
# Initialize the model.
device = "CPU"  # OpenVINO device name; "GPU" or "NPU" also work where available.
model_name = "Qwen3-0.6B-int4-ov"
model_path = "./ov"  # local directory created by snapshot_download above
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
config = ov_genai.GenerationConfig(max_new_tokens=4096)
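# Other sampling parameters can be set on the config the same way; the field
# names below follow the openvino_genai.GenerationConfig API (a sketch, not
# something this app requires):
#   config.do_sample = True
#   config.temperature = 0.7
#   config.top_p = 0.9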
output_buffer = ""

def streamer(subword):
    """Streaming callback: accumulate subwords and echo them to stdout."""
    global output_buffer
    output_buffer += subword
    print(subword, end='', flush=True)
# Inference function. Gradio supplies only the prompt, so the function takes a
# single argument; the pipeline created at startup is reused across requests.
def generate_response(prompt):
    global output_buffer
    output_buffer = ""  # reset the stream buffer for this request
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    print(f"\nModel: {model_name}  TPS: {tokenpersec}\n")
    # generate() on a list of prompts returns DecodedResults; take the first text.
    return tokenpersec, generated.texts[0]
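# perf_metrics exposes further aggregates besides throughput, e.g. get_ttft()
# (time to first token) and get_generate_duration(); names taken from the
# openvino_genai PerfMetrics API.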
# Build the Gradio interface.
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt")
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="A Qwen3-based inference app with a GUI; supports separating out the model's thinking process.",
)
if __name__ == "__main__":
    demo.launch()
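# Usage sketch (package names assumed): install the dependencies with
#   pip install huggingface_hub openvino-genai gradio
# then run `python app.py`; Gradio serves the UI at http://127.0.0.1:7860 by default.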