import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define available models
model_options = {
    "VLM-1-K3": "VortexIntelligence/VLM-1-K3",
    "VLM-1-K2": "VortexIntelligence/VLM-1-K2",
    "VLM-1-K1": "VortexIntelligence/VLM-1-K1",
}

# Load every model and tokenizer up front so switching models is instant
models = {}
tokenizers = {}
for name, model_id in model_options.items():
    print(f"Loading {name}...")
    tokenizers[name] = AutoTokenizer.from_pretrained(model_id)
    models[name] = AutoModelForCausalLM.from_pretrained(model_id)
    print(f"{name} loaded successfully!")


def generate_response(message, history, model_choice):
    tokenizer = tokenizers[model_choice]
    model = models[model_choice]

    input_ids = tokenizer(message, return_tensors="pt").input_ids
    input_ids = input_ids[:, -1024:]  # Truncate to the last 1024 tokens if needed

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the prompt
    new_tokens = output[0][input_ids.shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()


# Create the Gradio interface. The dropdown is passed to ChatInterface as an
# additional input so the currently selected model is forwarded to
# generate_response on every turn (reading model_choice.value in a lambda
# would only ever return the initial value, never the user's selection).
model_choice = gr.Dropdown(
    choices=list(model_options.keys()),
    label="Select Model",
    value="VLM-1-K3",
)

demo = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[model_choice],
    theme="soft",
    # With additional_inputs, each example is [message, model_choice]
    examples=[
        ["Hello, who are you?", "VLM-1-K3"],
        ["What can you do?", "VLM-1-K3"],
        ["Tell me a short story", "VLM-1-K3"],
    ],
)

if __name__ == "__main__":
    demo.launch()