import huggingface_hub as hf_hub
import time
import openvino_genai as ov_genai
import numpy as np
import gradio as gr
import re
import gc
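# Gradio app: run INT4-quantized Qwen3 models on CPU with OpenVINO GenAI and report tokens/sec.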
# Download the models
model_ids = [
    "OpenVINO/Qwen3-0.6B-int4-ov",
    "OpenVINO/Qwen3-1.7B-int4-ov",
    #"OpenVINO/Qwen3-4B-int4-ov",  # not usable
    "OpenVINO/Qwen3-8B-int4-ov",
    "OpenVINO/Qwen3-14B-int4-ov",
]
model_name_to_full_id = {model_id.split("/")[-1]: model_id for model_id in model_ids}  # Map short model name -> full repo id
for model_id in model_ids:
    model_path = model_id.split("/")[-1]  # Extract model name
    try:
        hf_hub.snapshot_download(model_id, local_dir=model_path, local_dir_use_symlinks=False)
        print(f"Successfully downloaded {model_id} to {model_path}")  # Optional: print confirmation
    except Exception as e:
        print(f"Error downloading {model_id}: {e}")  # Handle download errors gracefully
#hf_hub.snapshot_download("hsuwill000/Llama-3.1-TAIDE-LX-8B-Chat_int4_ov", local_dir="ov", local_dir_use_symlinks=False)
# Build the inference pipeline (initialize with a default model first)
device = "CPU"
InUsed_model_name = "Qwen3-0.6B-int4-ov" # Choose a default model
pipe = ov_genai.LLMPipeline(InUsed_model_name, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)
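# Generation settings: cap each response at 4096 newly generated tokens.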
config = ov_genai.GenerationConfig()
config.max_new_tokens = 4096
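# Streaming callback: print each new subword as it arrives; returning False tells the pipeline to keep generating.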
def streamer(subword):
    print(subword, end='', flush=True)
    return False
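# Generate a reply for the given prompt; if a different model was selected, drop the current pipeline,
# free its memory, and load the requested model before generating.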
def generate_response(prompt, model_name):
    global pipe, tokenizer, InUsed_model_name
    if InUsed_model_name != model_name:
        model_path = model_name
        del pipe
        gc.collect()
        print(f"Switching to model: {model_name}")
        pipe = ov_genai.LLMPipeline(model_path, device)
        tokenizer = pipe.get_tokenizer()
        tokenizer.set_chat_template(tokenizer.chat_template)
        InUsed_model_name = model_name
    try:
        generated = pipe.generate([prompt], config, streamer)
        tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
        print(f"\nModel:{model_name} TPS:{tokenpersec}\n")
        return tokenpersec, generated
    except Exception as e:
        return "Error", f"An error occurred while generating the response: {e}"
# Build the Gradio interface
model_choices = list(model_name_to_full_id.keys())
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt"),
        gr.Dropdown(choices=model_choices, value=InUsed_model_name, label="Select model")  # Added dropdown
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="Qwen3-based inference app with a GUI; supports separating the thinking process from the answer."
)
if __name__ == "__main__":
    demo.launch()