import os
import queue
import threading

import gradio as gr
import openvino_genai as ov_genai
import huggingface_hub as hf_hub
# OpenVINO setup
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"  # Or your chosen model
model_path = "Qwen3-0.6B-int4-ov"  # Local directory for the model

# Download the model if it doesn't exist locally
try:
    # Checking for the directory is a quick-and-dirty existence test; adjust as needed.
    if not os.path.exists(model_path):
        hf_hub.snapshot_download(model_id, local_dir=model_path)
except Exception as e:
    print(f"Error downloading model: {e}")
    print("Please ensure huggingface_hub is installed and that you are authenticated if required.")
    exit()  # Or handle the error more gracefully
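
# Optional (an addition, not part of the original Space): if the model repo is
# gated, huggingface_hub provides a login() helper. This sketch assumes a token
# supplied via an HF_TOKEN environment variable; the variable name is an
# assumption, adjust to your setup.
if os.environ.get("HF_TOKEN"):
    hf_hub.login(token=os.environ["HF_TOKEN"])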
# Build the OpenVINO GenAI pipeline on CPU and keep a chat session open
pipe = ov_genai.LLMPipeline(model_path, "CPU")
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)
pipe.start_chat()  # Start the chat session after the pipeline is initialized
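
# Optional sketch (an assumption, not part of the original Space): generation
# parameters can also be grouped in an openvino_genai GenerationConfig object
# and passed to generate(), e.g. pipe.generate(prompt, gen_config, streamer=streamer),
# instead of individual keyword arguments. The values below are illustrative.
gen_config = ov_genai.GenerationConfig()
gen_config.max_new_tokens = 100
gen_config.temperature = 0.7
gen_config.do_sample = True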
# Gradio chatbot UI
def user(user_message, history: list):
    return "", history + [{"role": "user", "content": user_message}]
def bot(history: list):
    # The user's latest message is the last entry in the history at this point
    user_message = history[-1]["content"]

    # Add an empty assistant message that will be filled in as tokens stream back
    history.append({"role": "assistant", "content": ""})

    # pipe.generate() blocks until generation finishes, so run it in a worker
    # thread and push each new subword into a queue that this generator drains.
    token_queue = queue.Queue()

    def streamer(subword):
        token_queue.put(subword)
        return ov_genai.StreamingStatus.RUNNING  # Keep generation going

    def worker():
        pipe.generate(user_message, streamer=streamer, max_new_tokens=100)
        token_queue.put(None)  # Sentinel: generation is finished

    threading.Thread(target=worker, daemon=True).start()

    # Yield the updated chat history every time a new subword arrives
    while True:
        subword = token_queue.get()
        if subword is None:
            break
        history[-1]["content"] += subword
        yield history

    # Alternatively, without token-by-token updates (this skips the streaming):
    # history[-1]["content"] = pipe.generate(user_message, max_new_tokens=100)
    # yield history
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
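
    # Note (an addition, not in the original Space): clearing the UI above does
    # not reset the pipeline's internal chat history, so the model still "remembers"
    # the previous turns. If a full reset is desired, LLMPipeline's
    # finish_chat()/start_chat() can be attached to the same button, for example:
    def reset_chat():
        pipe.finish_chat()
        pipe.start_chat()

    clear.click(reset_chat, None, None, queue=False)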
if __name__ == "__main__":
    demo.queue().launch()