import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define available models
model_options = {
    "VLM-1-K3": "VortexIntelligence/VLM-1-K3",
    "VLM-1-K2": "VortexIntelligence/VLM-1-K2",
    "VLM-1-K1": "VortexIntelligence/VLM-1-K1",
}

# Load every model and tokenizer up front so switching models is instant
models = {}
tokenizers = {}
for name, model_id in model_options.items():
    print(f"Loading {name}...")
    tokenizers[name] = AutoTokenizer.from_pretrained(model_id)
    models[name] = AutoModelForCausalLM.from_pretrained(model_id)
    print(f"{name} loaded successfully!")


def generate_response(message, history, model_choice):
    tokenizer = tokenizers[model_choice]
    model = models[model_choice]

    input_ids = tokenizer(message, return_tensors="pt").input_ids
    input_ids = input_ids[:, -1024:]  # Truncate to the last 1024 tokens if needed

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the prompt
    new_tokens = output[0][input_ids.shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()


# Create the Gradio interface. The dropdown is passed to ChatInterface as an
# additional input so the currently selected model is forwarded to
# generate_response on every turn (reading model_choice.value in a lambda
# would only ever return the initial value, never the user's selection).
model_choice = gr.Dropdown(
    choices=list(model_options.keys()),
    label="Select Model",
    value="VLM-1-K3",
)

demo = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[model_choice],
    theme="soft",
    # With additional_inputs, each example is [message, model_choice]
    examples=[
        ["Hello, who are you?", "VLM-1-K3"],
        ["What can you do?", "VLM-1-K3"],
        ["Tell me a short story", "VLM-1-K3"],
    ],
)

if __name__ == "__main__":
    demo.launch()