import os
from threading import Thread
from typing import List, Tuple

import torch
import openvino as ov
import gradio as gr
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM

from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS

# Define the model configuration
model_language = "English"  # For example, set the model language to English
model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID

# Load the model configuration
model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
pt_model_id = model_configuration["model_id"]
int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")

# Load the OpenVINO model and tokenizer
device = "CPU"  # Or "GPU" if available
core = ov.Core()

tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

# Load the OpenVINO model
ov_model = OVModelForCausalLM.from_pretrained(
    int4_model_dir,
    device=device,
    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
    trust_remote_code=True,
)


def convert_history_to_token(history: List[Tuple[str, str]]):
    """
    Converts the conversation history to input tokens.

    This simplified version tokenizes only the last user message;
    a fuller variant that applies the model's chat template to the
    whole history is sketched at the end of this file.
    """
    input_ids = tok.encode(history[-1][0])
    return torch.LongTensor([input_ids])


def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Generates the next assistant reply and streams it back to the UI.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so the streamer can be
    # consumed while tokens are still being produced.
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream partial output and update the conversation history.
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history


def request_cancel():
    ov_model.request.cancel()


# Build and launch the Gradio UI
demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)
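
# --- Sketch: full-history tokenization ------------------------------------
# convert_history_to_token above encodes only the last user turn, so the
# model sees no prior context. A minimal sketch of a multi-turn variant,
# assuming the tokenizer ships a chat template (true for the
# qwen2.5-*-instruct checkpoints); the function name is hypothetical and
# not part of the original script.
def convert_history_to_token_with_template(history: List[Tuple[str, str]]):
    # Rebuild the conversation as role/content messages.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # apply_chat_template renders the messages in the model's prompt format;
    # add_generation_prompt=True appends the assistant turn marker, and
    # return_tensors="pt" yields input IDs of shape (1, seq_len).
    return tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
    )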