# Spaces: hsuwill000 / qwen3_test (Running)
# File size: 1,883 Bytes

import huggingface_hub as hf_hub
import time
import openvino_genai as ov_genai
import numpy as np
import gradio as gr
import re

# Download the model snapshot from the Hugging Face Hub into a local directory.
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"

hf_hub.snapshot_download(model_id, local_dir=model_path, local_dir_use_symlinks=False)

# Build the inference pipeline from the downloaded model directory.
device = "CPU"
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
# NOTE(review): this hands the tokenizer's own chat_template back to it, which
# looks like a no-op — confirm whether the call is actually needed.
tokenizer.set_chat_template(tokenizer.chat_template)


def generate_response(prompt, history=None):
    """Generate a reply to *prompt* and record the exchange in *history*.

    Args:
        prompt: User text passed to ``pipe.generate``.
        history: List of ``(prompt, response)`` tuples. It is appended to in
            place, so a caller holding a reference sees the new turn. When
            omitted, a fresh list is created for this call (a mutable default
            would be shared between unrelated calls).

    Returns:
        A ``(tokenpersec, history)`` tuple. ``tokenpersec`` is the number of
        streamer callbacks per second formatted with two decimals ("0.00" if
        no time elapsed); ``history`` is the list including the new turn.
    """
    if history is None:
        history = []

    # Collect streamed pieces in a list; joined once at the end.
    chunks = []
    # perf_counter is monotonic, so wall-clock adjustments cannot skew the rate.
    start_time = time.perf_counter()

    def streamer(subword):
        # Called by the pipeline for each streamed piece of text.
        chunks.append(subword)
        return ov_genai.StreamingStatus.RUNNING

    # NOTE(review): the OpenVINO GenAI docs describe start_chat's argument as a
    # system-message string; confirm that passing the history list is accepted
    # by the installed version.
    pipe.start_chat(history)
    try:
        pipe.generate(prompt, streamer=streamer, max_new_tokens=1024)  # Adjust max_new_tokens as needed
    finally:
        # Always close the chat session, even if generation fails.
        pipe.finish_chat()

    elapsed_time = time.perf_counter() - start_time
    # Each streamer callback is counted as one token, as in the original code.
    token_count = len(chunks)
    tokenpersec = f'{token_count / elapsed_time:.2f}' if elapsed_time > 0 else "0.00"

    full_response = "".join(chunks)
    history.append((prompt, full_response))  # Store the conversation history
    return tokenpersec, history

# Build the Gradio interface.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    # Conversation history as a list of (prompt, response) tuples.
    state = gr.State([])
    msg = gr.Textbox(label="輸入提示 (Prompt)")
    # Named so the event wiring below can reference it explicitly.
    speed_box = gr.Textbox(label="tokens/sec")

    def respond(message, chat_history):
        """Handle a submitted prompt.

        Returns the cleared input text, the updated history for the chatbot,
        the same history for the state component, and the tokens/sec string.
        """
        tokenpersec, chat_history = generate_response(message, chat_history)
        return "", chat_history, chat_history, tokenpersec

    # `state` is listed as an output so the updated history is stored
    # explicitly instead of relying on in-place mutation of the input list.
    msg.submit(respond, [msg, state], [msg, chatbot, state, speed_box])

    demo.title = "Qwen3-0.6B-int4-ov Streaming Demo"
    # NOTE(review): confirm gr.Blocks actually displays a `description`
    # attribute; it may be ignored.
    demo.description = "基於 Qwen3-0.6B-int4-ov 推理應用，支援流式輸出與 GUI。"

if __name__ == "__main__":
    demo.launch()