import gradio as gr
import spaces
from transformers import pipeline
import torch
from typing import List, Dict, Optional

# Global cache for loaded pipelines
model_cache = {}

# Available models
AVAILABLE_MODELS = {
    "Nous-1-4B": "apexion-ai/Nous-1-4B",
    "Nous-1-8B": "apexion-ai/Nous-1-8B",
    "Nous-1-2B": "apexion-ai/Nous-1-2B",
}


@spaces.GPU
def initialize_model(model_name):
    global model_cache

    if model_name not in AVAILABLE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models")

    model_id = AVAILABLE_MODELS[model_name]

    # Check if the model is already cached
    if model_id not in model_cache:
        try:
            model_cache[model_id] = pipeline(
                "text-generation",
                model=model_id,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception:
            # Fall back to CPU if GPU loading fails
            model_cache[model_id] = pipeline(
                "text-generation",
                model=model_id,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True
            )

    return model_cache[model_id]


@spaces.GPU
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
    """Generate a response using the selected model."""
    # Initialize the model inside the GPU-decorated function
    try:
        model_pipe = initialize_model(model_name)
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"

    # Format the conversation history
    messages = []

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    messages.append({"role": "user", "content": message})

    # Generate the response
    try:
        # Some models may not support the messages format, so try different approaches
        try:
            # Try the chat (messages) format first
            response = model_pipe(
                messages,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=model_pipe.tokenizer.eos_token_id,
                return_full_text=False
            )
        except Exception:
            # Fall back to a plain-text prompt
            conversation_text = ""
            for msg in messages:
                if msg["role"] == "user":
                    conversation_text += f"User: {msg['content']}\n"
                else:
                    conversation_text += f"Assistant: {msg['content']}\n"

            conversation_text += "Assistant:"

            response = model_pipe(
                conversation_text,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=model_pipe.tokenizer.eos_token_id,
                return_full_text=False
            )

        # Extract the generated text
        if isinstance(response, list) and len(response) > 0:
            generated_text = response[0]['generated_text']
        else:
            generated_text = str(response)

        # Clean up the response
        if isinstance(generated_text, list):
            assistant_response = generated_text[-1]['content']
        else:
            # Remove the prompt and extract the assistant response
            assistant_response = str(generated_text).strip()
            if "Assistant:" in assistant_response:
                assistant_response = assistant_response.split("Assistant:")[-1].strip()

        return assistant_response

    except Exception as e:
        return f"Error generating response: {str(e)}"


@spaces.GPU
def generate(
    model: str,
    user_input: str,
    history: Optional[str] = "",
    temperature: float = 0.7,
    system_prompt: Optional[str] = "",
    max_tokens: int = 512
):
    """
    API endpoint for LLM generation

    Args:
        model: Model name to use (Nous-1-2B, Nous-1-4B, or Nous-1-8B)
        user_input: Current user message/input
        history: JSON string of conversation history in format
            [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
        temperature: Temperature for generation (0.1-2.0)
        system_prompt: System prompt to guide the model
        max_tokens: Maximum tokens to generate (1-8192)

    Returns:
        Generated response from the model
    """
    # Validate the model name
    if model not in AVAILABLE_MODELS:
        return f"Error: Model {model} not available. Available models: {list(AVAILABLE_MODELS.keys())}"

    # Initialize the model
    try:
        model_pipe = initialize_model(model)
    except Exception as e:
        return f"Error loading model {model}: {str(e)}"

    # Parse the history if provided and convert it to Gradio's pair format
    gradio_history = []
    if history and history.strip():
        try:
            import json
            history_list = json.loads(history)
            current_pair = [None, None]

            for msg in history_list:
                if isinstance(msg, dict) and "role" in msg and "content" in msg:
                    if msg["role"] == "user":
                        if current_pair[0] is not None:
                            gradio_history.append([current_pair[0], current_pair[1]])
                        current_pair = [msg["content"], None]
                    elif msg["role"] == "assistant":
                        current_pair[1] = msg["content"]

            if current_pair[0] is not None:
                gradio_history.append([current_pair[0], current_pair[1]])
        except Exception:
            # If history parsing fails, continue without history
            pass

    # Prepend the system prompt to the user input if provided
    final_user_input = user_input
    if system_prompt and system_prompt.strip():
        final_user_input = f"System: {system_prompt}\n\nUser: {user_input}"

    # Reuse the chat generation function
    return generate_response(final_user_input, gradio_history, model, max_tokens, temperature, 0.9)


# Create the Gradio interface
def create_interface():
    with gr.Blocks(title="Multi-Model Chat", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🚀 Nous-1 Model Chat Interface

        Chat with the Nous-1 models by Apexion AI.

        **Available Models:**
        - Nous-1-4B (4 billion parameters)
        - Nous-1-8B (8 billion parameters)
        - Nous-1-2B (2 billion parameters)
        """)

        with gr.Row():
            model_selector = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value="Nous-1-4B",
                label="Select Model",
                info="Choose which model to use for generation"
            )

        chatbot = gr.Chatbot(
            height=400,
            placeholder="Select a model and start chatting...",
            label="Chat"
        )

        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Message",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")

        with gr.Accordion("Advanced Settings", open=False):
            max_length = gr.Slider(
                minimum=200,
                maximum=8192,
                value=2048,
                step=50,
                label="Max Length",
                info="Maximum length of generated response"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness in generation"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P",
                info="Controls diversity via nucleus sampling"
            )

        # Event handlers
        def user_message(message, history):
            return "", history + [[message, None]]

        def bot_response(history, model_name, max_len, temp, top_p):
            if history:
                user_message = history[-1][0]
                bot_message = generate_response(
                    user_message,
                    history[:-1],
                    model_name,
                    max_len,
                    temp,
                    top_p
                )
                history[-1][1] = bot_message
            return history

        def model_changed(model_name):
            return gr.update(placeholder=f"Chat with {model_name}...")

        # Wire up the events
        msg.submit(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response,
            [chatbot, model_selector, max_length, temperature, top_p],
            chatbot
        )

        submit_btn.click(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response,
            [chatbot, model_selector, max_length, temperature, top_p],
            chatbot
        )

        clear_btn.click(lambda: None, None, chatbot, queue=False)

        model_selector.change(model_changed, model_selector, chatbot)

        gr.Markdown("""
        ---
        ### About the Nous-1 Models

        **Nous-1-2B**: 2 billion parameter model by Apexion AI, designed for fast inference

        **Nous-1-4B**: 4 billion parameter model by Apexion AI, optimised for efficient conversation and text generation

        **Nous-1-8B**: 8 billion parameter model by Apexion AI, offering enhanced capabilities and better performance on complex tasks

        All models are designed for conversational AI and support various text generation tasks. The 8B model provides more sophisticated responses but requires more computational resources.

        This Space uses ZeroGPU for efficient GPU allocation across all model sizes.
        """)

    return demo


# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    # Enable API and launch
    demo.launch(share=True)
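
# --- Usage sketch (illustrative, kept commented so nothing runs at import time) ---
# A minimal example of calling the `generate` helper above, showing the JSON
# `history` format described in its docstring. The messages and parameter
# values below are hypothetical.
#
# import json
# example_history = json.dumps([
#     {"role": "user", "content": "Hello!"},
#     {"role": "assistant", "content": "Hi! How can I help?"},
# ])
# reply = generate(
#     model="Nous-1-4B",
#     user_input="Summarise our conversation so far.",
#     history=example_history,
#     temperature=0.7,
#     system_prompt="You are a concise assistant.",
#     max_tokens=256,
# )
# print(reply)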