import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr
import gc

# Download the models

model_ids = [
    "OpenVINO/Qwen3-0.6B-int4-ov",
    "OpenVINO/Qwen3-1.7B-int4-ov",
    #"OpenVINO/Qwen3-4B-int4-ov",#不可用
    "OpenVINO/Qwen3-8B-int4-ov",
    "OpenVINO/Qwen3-14B-int4-ov",
]

model_name_to_full_id = {model_id.split("/")[-1]: model_id for model_id in model_ids}  # Map short model name -> full repo id

for model_id in model_ids:
    model_path = model_id.split("/")[-1]  # Extract model name
    try:
        hf_hub.snapshot_download(model_id, local_dir=model_path, local_dir_use_symlinks=False)
        print(f"Successfully downloaded {model_id} to {model_path}")  # Print confirmation
    except Exception as e:
        print(f"Error downloading {model_id}: {e}")  # Handle download errors gracefully


#hf_hub.snapshot_download("hsuwill000/Llama-3.1-TAIDE-LX-8B-Chat_int4_ov", local_dir="ov", local_dir_use_symlinks=False)

# Build the inference pipeline (initialize with a default model first)
device = "CPU"
InUsed_model_name = "Qwen3-0.6B-int4-ov"  # Choose a default model
pipe = ov_genai.LLMPipeline(InUsed_model_name, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)
config = ov_genai.GenerationConfig()
config.max_new_tokens = 4096

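# Streaming callback for openvino_genai: it is called with each newly decoded text chunk.
# Returning False keeps generation running; returning True would stop it early.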
def streamer(subword):
    print(subword, end='', flush=True)
    return False
    
def generate_response(prompt, model_name):
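    """Generate a response for `prompt` with the selected model, reloading the
    OpenVINO pipeline if the model choice has changed.

    Returns a (tokens-per-second string, generated text) tuple for the two Gradio outputs.
    """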
    global pipe, tokenizer, InUsed_model_name
    if InUsed_model_name != model_name:
        model_path = model_name
        # Release the current pipeline before loading the new one so that
        # two models are not held in memory at the same time.
        del pipe
        gc.collect()
        print(f"Switching to model: {model_name}")
        pipe = ov_genai.LLMPipeline(model_path, device)
        tokenizer = pipe.get_tokenizer()
        tokenizer.set_chat_template(tokenizer.chat_template)
        InUsed_model_name = model_name

    try:
        generated = pipe.generate([prompt], config, streamer)
        tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
        print(f"\nModel: {model_name} TPS: {tokenpersec}\n")
        return tokenpersec, generated.texts[0]
    except Exception as e:
        return "Error", f"An error occurred while generating the response: {e}"
    

# Build the Gradio interface
model_choices = list(model_name_to_full_id.keys())

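# generate_response returns (tokens/sec, response text); the two outputs below receive them in that order.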
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt"),
        gr.Dropdown(choices=model_choices, value=InUsed_model_name, label="Model")  # Model selection dropdown
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="A Qwen3-based inference application with selectable OpenVINO int4 models and a simple GUI."
)

if __name__ == "__main__":
    demo.launch()