# Hugging Face Space: hsuwill000/qwen3_test
# Status: Running
# File size: 2,524 Bytes

import os
import queue
import threading

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai

# OpenVINO Setup
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"  # Hugging Face Hub repository to download from
model_path = "Qwen3-0.6B-int4-ov"         # Local directory for the model

# Download the model if it doesn't exist locally.
# NOTE: only the existence of the directory is checked, so an incomplete
# download is not detected; remove the directory to force a fresh download.
if not os.path.exists(model_path):
    try:
        hf_hub.snapshot_download(model_id, local_dir=model_path, local_dir_use_symlinks=False)
    except Exception as e:
        print(f"Error downloading model: {e}")
        print("Please ensure you have huggingface_hub installed and are authenticated if required.")
        # Terminate with a non-zero status so the failure is visible to whatever
        # launched the script; a bare exit() would report success (status 0) and
        # depends on the interactive helper installed by the `site` module.
        raise SystemExit(1) from e

# Build the text-generation pipeline on CPU from the local model directory.
pipe = ov_genai.LLMPipeline(model_path, "CPU")
tokenizer = pipe.get_tokenizer()
# Re-applies the tokenizer's own chat template.
# NOTE(review): this looks like a no-op -- confirm whether it is still needed.
tokenizer.set_chat_template(tokenizer.chat_template)
# Open the chat session once, right after the pipeline is initialized.
# NOTE(review): presumably every request then shares this one session -- confirm.
pipe.start_chat()


# Gradio Chatbot UI
def user(user_message, history: list):
    """Record the submitted text as a new user turn.

    Args:
        user_message: Text entered by the user.
        history: Existing chat messages; it is not modified.

    Returns:
        A tuple of an empty string and a new list consisting of ``history``
        followed by a ``{"role": "user", "content": user_message}`` message.
    """
    new_turn = {"role": "user", "content": user_message}
    extended_history = history + [new_turn]
    return "", extended_history


def bot(history: list):
    """Stream the assistant's reply to the latest user message.

    Appends an empty assistant message to ``history`` and then yields
    ``history`` every time another piece of generated text has been added to
    that message, so the caller can display the reply while it is produced.

    Args:
        history: Chat messages as ``{"role": ..., "content": ...}`` dicts. The
            last entry is read as the user's message. The list is mutated in
            place.

    Yields:
        The same ``history`` list, with its trailing assistant message extended.

    Raises:
        Exception: Whatever ``pipe.generate`` raised, re-raised once streaming
            has ended.
    """
    # Get the user's last message from the history
    user_message = history[-1]["content"]

    # Initialize the bot message in history and show it right away
    history.append({"role": "assistant", "content": ""})
    yield history

    subwords = queue.Queue()
    finished = object()               # Marks the end of the stream in the queue
    stop_requested = threading.Event()
    errors = []

    def streamer(subword):
        # Must be a plain function (no ``yield``): the pipeline calls it for
        # each piece of text and uses the returned status to decide whether to
        # continue. The text is handed to the consuming generator via the queue.
        subwords.put(subword)
        if stop_requested.is_set():
            return ov_genai.StreamingStatus.STOP
        return ov_genai.StreamingStatus.RUNNING

    def run_generation():
        try:
            pipe.generate(user_message, streamer=streamer, max_new_tokens=100)
        except Exception as exc:  # Handed back to the consumer and re-raised there
            errors.append(exc)
        finally:
            subwords.put(finished)

    # generate() blocks until the whole reply is done, so it runs in a worker
    # thread while this generator forwards the pieces as they arrive.
    worker = threading.Thread(target=run_generation, daemon=True)
    worker.start()
    try:
        for subword in iter(subwords.get, finished):
            history[-1]["content"] += subword
            yield history
    finally:
        # Also reached when the consumer closes the generator early: ask the
        # streamer to stop and wait for the worker to finish.
        stop_requested.set()
        worker.join()

    if errors:
        raise errors[0]


# Assemble the chat UI: a message list, a text input and a "Clear" button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    # On submit, first record the user's turn (unqueued), then run the bot step
    # with the chatbot value as both its input and its output.
    msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).then(fn=bot, inputs=chatbot, outputs=chatbot)

    # The "Clear" button sends None to the chatbot component.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

# Start the app (with queuing enabled) only when run as a script.
if __name__ == "__main__":
    demo.queue().launch()