import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr
import re


# Download the INT4-quantized OpenVINO export of Qwen3-0.6B into ./ov
hf_hub.snapshot_download(repo_id="OpenVINO/Qwen3-0.6B-int4-ov", local_dir="ov", local_dir_use_symlinks=False)
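# Note: local_dir_use_symlinks is deprecated in recent huggingface_hub releases
# and is ignored there (files are copied directly into local_dir); it is kept
# above only for compatibility with older versions.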

# Initialize the model
device = "CPU"
InUsed_model_name = "ov"
model_path = f"./{InUsed_model_name}"  # prepend the directory path
pipe = ov_genai.LLMPipeline(model_path, device)
config = ov_genai.GenerationConfig(max_new_tokens=4096)
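
# The generate call below passes a `streamer` that the original script never
# defined. A minimal sketch following openvino_genai's callback convention:
# the callable receives each decoded subword as it is produced, and returning
# False tells the pipeline to continue generating.
def streamer(subword):
    print(subword, end="", flush=True)
    return False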

# Inference function
def generate_response(prompt):
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    print(f"\nModel:{InUsed_model_name} TPS:{tokenpersec}\n")
    # Qwen3 wraps its chain of thought in <think>...</think> tags; strip that
    # block so the UI shows only the final answer (the "thinking-process
    # separation" mentioned in the interface description).
    answer = re.sub(r"<think>.*?</think>", "", generated.texts[0], flags=re.DOTALL).strip()
    return tokenpersec, answer
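
# Example (hypothetical prompt), calling the function directly without the GUI:
#   tps, answer = generate_response("Why is the sky blue?")
#   print(tps, answer)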

# Build the Gradio interface

demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt")
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="A Qwen3-based inference app with thinking-process separation and a GUI."
)
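
# By default launch() serves on localhost only; Gradio also supports
# demo.launch(share=True) to expose a temporary public URL if needed.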

if __name__ == "__main__":
    demo.launch()