import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
model_id = "suayptalha/FastLlama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Explicitly set padding token (the tokenizer has no dedicated pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# System prompt
system_prompt = "You are a friendly assistant named FastLlama."

def format_prompt(message: str, history: list):
    # history arrives from gr.ChatInterface as (user, assistant) message pairs
    prompt = f"<|system|>\n{system_prompt}\n"
    for user_msg, bot_msg in history:
        prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n"
    prompt += f"<|user|>\n{message}\n<|assistant|>\n"
    return prompt

def respond(message: str, history: list):
    # Format the prompt with chat history
    full_prompt = format_prompt(message, history)

    # Tokenize input with attention mask
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate response with attention mask
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the newly generated tokens, skipping special tokens
    response = tokenizer.decode(
        output[0][inputs.input_ids.shape[-1]:],
        skip_special_tokens=True
    )
    return response

# Create chat interface
chat = gr.ChatInterface(
    fn=respond,
    title="FastLlama-3.2-3B Chat",
    description="Chat with FastLlama-3.2-3B-Instruct AI assistant",
    examples=[
        ["Explain quantum computing in simple terms"],
        ["Write a poem about artificial intelligence"],
        ["What's the meaning of life?"]
    ],
    cache_examples=False
)

if __name__ == "__main__":
    chat.launch(server_name="0.0.0.0")
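
# Note: format_prompt() above uses hand-rolled <|system|>/<|user|>/<|assistant|>
# tags. If the model's tokenizer ships a chat template (Llama-3.2-Instruct
# derivatives typically do), a hedged alternative sketch is to let the tokenizer
# build the prompt instead, e.g.:
#
#     messages = [{"role": "system", "content": system_prompt}]
#     for user_msg, bot_msg in history:
#         messages += [{"role": "user", "content": user_msg},
#                      {"role": "assistant", "content": bot_msg}]
#     messages.append({"role": "user", "content": message})
#     input_ids = tokenizer.apply_chat_template(
#         messages, add_generation_prompt=True, return_tensors="pt"
#     ).to(model.device)
#
# The rest of respond() would then pass input_ids to model.generate() as above.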