import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
import torch
from PIL import Image
import os

# Limit CPU thread usage to reduce memory pressure (set before torch does any heavy work)
os.environ["OMP_NUM_THREADS"] = "4"

# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


# Load model, tokenizer, and image processor with optimizations for CPU deployment
def load_model():
    print("Loading model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(
        "sagar007/Lava_phi",
        torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
        low_cpu_mem_usage=True,
    )
    model = model.to(device)

    tokenizer = AutoTokenizer.from_pretrained("sagar007/Lava_phi")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    print("Model and tokenizer loaded successfully!")
    return model, tokenizer, processor


# Load models once at startup
model, tokenizer, processor = load_model()


# Text-only generation
def generate_text(prompt, max_length=128):
    try:
        inputs = tokenizer(f"human: {prompt}\ngpt:", return_tensors="pt").to(device)

        # Generate with low-memory-footprint settings
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only the model's response after the "gpt:" marker
        if "gpt:" in generated_text:
            generated_text = generated_text.split("gpt:", 1)[1].strip()

        return generated_text
    except Exception as e:
        return f"Error generating text: {str(e)}"


# Image + text processing
def process_image_and_prompt(image, prompt, max_length=128):
    try:
        if image is None:
            return "No image provided. Please upload an image."

        # Preprocess the image with the CLIP processor
        image_tensor = processor(images=image, return_tensors="pt").pixel_values.to(device)

        # Tokenize the prompt with the <image> placeholder token
        inputs = tokenizer(f"human: <image>\n{prompt}\ngpt:", return_tensors="pt").to(device)

        # Generate with memory optimizations
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                images=image_tensor,
                max_new_tokens=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only the model's response after the "gpt:" marker
        if "gpt:" in generated_text:
            generated_text = generated_text.split("gpt:", 1)[1].strip()

        return generated_text
    except Exception as e:
        return f"Error processing image: {str(e)}"


# Build the Gradio interface
with gr.Blocks(title="LLaVA-Phi: Vision-Language Model") as demo:
    gr.Markdown("# LLaVA-Phi: Vision-Language Model")
    gr.Markdown("This model can generate text responses from text prompts or analyze images with text prompts.")

    with gr.Tab("Text Generation"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Enter your prompt",
                    lines=3,
                    placeholder="What is artificial intelligence?",
                )
                text_max_length = gr.Slider(
                    minimum=16, maximum=512, value=128, step=8,
                    label="Maximum response length",
                )
                text_button = gr.Button("Generate")
                text_output = gr.Textbox(label="Generated response", lines=8)

        text_button.click(
            fn=generate_text,
            inputs=[text_input, text_max_length],
            outputs=text_output,
        )

    with gr.Tab("Image + Text Analysis"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload an image")
                image_text_input = gr.Textbox(
                    label="Enter your prompt about the image",
                    lines=2,
                    placeholder="Describe this image in detail.",
                )
                image_max_length = gr.Slider(
                    minimum=16, maximum=512, value=128, step=8,
                    label="Maximum response length",
                )
                image_button = gr.Button("Analyze")
                image_output = gr.Textbox(label="Model response", lines=8)

        image_button.click(
            fn=process_image_and_prompt,
            inputs=[image_input, image_text_input, image_max_length],
            outputs=image_output,
        )

    # Example prompts for the text tab
    gr.Examples(
        examples=[
            "What is the advantage of vision-language models?",
            "Explain how multimodal AI models work.",
            "Tell me a short story about robots.",
        ],
        inputs=text_input,
    )

    # Add examples for the image tab if example images are available, e.g.:
    # gr.Examples(
    #     examples=[["example1.jpg", "What's in this image?"]],
    #     inputs=[image_input, image_text_input],
    # )

# Launch the app with memory optimizations
if __name__ == "__main__":
    # Free any cached GPU memory before launch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Launch with minimal resource usage
    # Note: enable_queue is a Gradio 3.x launch argument; on Gradio 4+, call demo.queue() before launch instead
    demo.launch(
        share=True,  # Set to False in production
        enable_queue=True,
        max_threads=4,
        show_error=True,
    )
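
# Optional local smoke tests for the two generation paths, shown here only as a
# commented-out sketch: "example1.jpg" is a hypothetical file path, and these lines
# are left disabled so the script's behavior is unchanged.
#
#     print(generate_text("What is a vision-language model?", max_length=64))
#
#     if os.path.exists("example1.jpg"):
#         sample = Image.open("example1.jpg")
#         print(process_image_and_prompt(sample, "What's in this image?", max_length=64))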