import os
import sys

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from PIL import Image
import random
import numpy as np
import spaces
import gc

# Import for Stable Diffusion XL
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
from compel import Compel, ReturnedEmbeddingsType

# Import for Wan2.2
import wan
from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
from wan.utils.utils import cache_video

# --- Global Setup ---
print("Starting Integrated Text-to-Image-to-Video App...")

# --- 1. Setup Text-to-Image Model (SDXL) ---
print("Loading Stable Diffusion XL model...")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize SDXL pipeline
sdxl_pipe = StableDiffusionXLPipeline.from_pretrained(
    "votepurchase/pornmasterPro_noobV3VAE",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
)
sdxl_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sdxl_pipe.scheduler.config)
sdxl_pipe.to(device)

# Force all components to use the same dtype
sdxl_pipe.text_encoder.to(torch.float16)
sdxl_pipe.text_encoder_2.to(torch.float16)
sdxl_pipe.vae.to(torch.float16)
sdxl_pipe.unet.to(torch.float16)

# Initialize Compel for long prompt processing
compel = Compel(
    tokenizer=[sdxl_pipe.tokenizer, sdxl_pipe.tokenizer_2],
    text_encoder=[sdxl_pipe.text_encoder, sdxl_pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
    truncate_long_prompts=False
)

# --- 2. Setup Image-to-Video Model (Wan2.2) ---
print("Loading Wan 2.2 TI2V-5B model...")

# Download model snapshots
repo_id = "Wan-AI/Wan2.2-TI2V-5B"
print(f"Downloading/loading checkpoints for {repo_id}...")
ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
print(f"Using checkpoints from {ckpt_dir}")

# Load the model configuration
TASK_NAME = 'ti2v-5B'
cfg = WAN_CONFIGS[TASK_NAME]
FIXED_FPS = 24
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 121

# Instantiate the pipeline
device_id = 0 if torch.cuda.is_available() else -1
wan_pipeline = wan.WanTI2V(
    config=cfg,
    checkpoint_dir=ckpt_dir,
    device_id=device_id,
    rank=0,
    t5_fsdp=False,
    dit_fsdp=False,
    use_sp=False,
    t5_cpu=False,
    init_on_cpu=False,
    convert_model_dtype=True,
)
print("All models loaded and ready.")

# --- Constants ---
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1216


# --- Helper Functions ---
def clear_gpu_memory():
    """Clear GPU memory more thoroughly"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()


def process_long_prompt(prompt, negative_prompt=""):
    """Simple long prompt processing using Compel"""
    try:
        conditioning, pooled = compel([prompt, negative_prompt])
        return conditioning, pooled
    except Exception as e:
        print(f"Long prompt processing failed: {e}, falling back to standard processing")
        return None, None


def select_best_size_for_image(image, available_sizes):
    """Select the size option with aspect ratio closest to the input image."""
    if image is None:
        return available_sizes[0]

    img_width, img_height = image.size
    img_aspect_ratio = img_height / img_width

    best_size = available_sizes[0]
    best_diff = float('inf')

    for size_str in available_sizes:
        height, width = map(int, size_str.split('*'))
        size_aspect_ratio = height / width
        diff = abs(img_aspect_ratio - size_aspect_ratio)
        if diff < best_diff:
            best_diff = diff
            best_size = size_str

    return best_size
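
# Worked example for select_best_size_for_image (illustrative; the exact supported list
# comes from SUPPORTED_SIZES[TASK_NAME], and "1280*704" is assumed here for contrast):
#   image 1024x768  -> aspect ratio 768 / 1024 = 0.75
#   "704*1280"      -> 704 / 1280 = 0.55   (difference 0.20)  <- selected
#   "1280*704"      -> 1280 / 704 = 1.82   (difference 1.07)
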
def validate_video_inputs(image, prompt, duration_seconds):
    """Validate user inputs for video generation"""
    errors = []

    if not prompt or len(prompt.strip()) < 5:
        errors.append("Prompt must be at least 5 characters long.")

    if image is not None:
        if isinstance(image, np.ndarray):
            img = Image.fromarray(image)
        else:
            img = image
        if img.size[0] * img.size[1] > 4096 * 4096:
            errors.append("Image size is too large (maximum 4096x4096).")

    if duration_seconds > 5.0 and image is None:
        errors.append("Videos longer than 5 seconds require an input image.")

    return errors


# --- Text-to-Image Generation Function ---
@spaces.GPU(duration=30)
def generate_image(
    prompt,
    negative_prompt,
    seed,
    randomize_seed,
    width,
    height,
    guidance_scale,
    num_inference_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate image from text prompt"""
    progress(0, desc="Initializing image generation...")

    use_long_prompt = len(prompt.split()) > 60 or len(prompt) > 300

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator(device=device).manual_seed(seed)

    try:
        progress(0.3, desc="Processing prompt...")
        if use_long_prompt:
            print("Using long prompt processing...")
            conditioning, pooled = process_long_prompt(prompt, negative_prompt)
            if conditioning is not None:
                progress(0.5, desc="Generating image...")
                output_image = sdxl_pipe(
                    prompt_embeds=conditioning[0:1],
                    pooled_prompt_embeds=pooled[0:1],
                    negative_prompt_embeds=conditioning[1:2],
                    negative_pooled_prompt_embeds=pooled[1:2],
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps,
                    width=width,
                    height=height,
                    generator=generator
                ).images[0]
                progress(1.0, desc="Complete!")
                return output_image, seed

        # Fall back to standard processing
        progress(0.5, desc="Generating image...")
        output_image = sdxl_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            width=width,
            height=height,
            generator=generator
        ).images[0]
        progress(1.0, desc="Complete!")
        return output_image, seed
    except RuntimeError as e:
        print(f"Error during generation: {e}")
        error_img = Image.new('RGB', (width, height), color=(0, 0, 0))
        return error_img, seed
    finally:
        clear_gpu_memory()


# --- Image-to-Video Generation Function ---
@spaces.GPU(duration=120)
def generate_video(
    image,
    prompt,
    size,
    duration_seconds,
    sampling_steps,
    guide_scale,
    shift,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate video from image and prompt"""
    errors = validate_video_inputs(image, prompt, duration_seconds)
    if errors:
        raise gr.Error("\n".join(errors))

    progress(0, desc="Setting up video generation...")

    if seed == -1:
        seed = random.randint(0, sys.maxsize)

    progress(0.1, desc="Processing image...")

    input_image = None
    if image is not None:
        if isinstance(image, np.ndarray):
            input_image = Image.fromarray(image).convert("RGB")
        else:
            input_image = image.convert("RGB")
        # Resize image to match selected size
        target_height, target_width = map(int, size.split('*'))
        input_image = input_image.resize((target_width, target_height))

    # Calculate number of frames based on duration
    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
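    # Worked example: at FIXED_FPS = 24, a 2.0 s request maps to round(2.0 * 24) = 48
    # frames; np.clip above keeps any request inside [MIN_FRAMES_MODEL, MAX_FRAMES_MODEL]
    # = [8, 121], which is why the duration sliders span roughly 0.3 s to 5.0 s.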
    progress(0.2, desc="Generating video...")

    try:
        video_tensor = wan_pipeline.generate(
            input_prompt=prompt,
            img=input_image,
            size=SIZE_CONFIGS[size],
            max_area=MAX_AREA_CONFIGS[size],
            frame_num=num_frames,
            shift=shift,
            sample_solver='unipc',
            sampling_steps=int(sampling_steps),
            guide_scale=guide_scale,
            seed=seed,
            offload_model=True
        )

        progress(0.9, desc="Saving video...")
        video_path = cache_video(
            tensor=video_tensor[None],
            save_file=None,
            fps=cfg.sample_fps,
            normalize=True,
            value_range=(-1, 1)
        )
        progress(1.0, desc="Complete!")
    except torch.cuda.OutOfMemoryError:
        clear_gpu_memory()
        raise gr.Error("GPU out of memory. Please try with lower settings.")
    except Exception as e:
        raise gr.Error(f"Video generation failed: {str(e)}")
    finally:
        if 'video_tensor' in locals():
            del video_tensor
        clear_gpu_memory()

    return video_path


# --- Combined Generation Function ---
def generate_image_to_video(
    img_prompt,
    img_negative_prompt,
    img_seed,
    img_randomize_seed,
    img_width,
    img_height,
    img_guidance_scale,
    img_num_inference_steps,
    video_prompt,
    video_size,
    video_duration,
    video_sampling_steps,
    video_guide_scale,
    video_shift,
    video_seed
):
    """Generate image from text, then use it to generate video"""
    # First generate image
    generated_image, used_seed = generate_image(
        img_prompt,
        img_negative_prompt,
        img_seed,
        img_randomize_seed,
        img_width,
        img_height,
        img_guidance_scale,
        img_num_inference_steps
    )

    # Update the best video size based on generated image
    available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
    best_size = select_best_size_for_image(generated_image, available_sizes)

    # Then generate video using the generated image
    video_path = generate_video(
        generated_image,
        video_prompt,
        best_size,  # Use auto-selected size
        video_duration,
        video_sampling_steps,
        video_guide_scale,
        video_shift,
        video_seed
    )

    return generated_image, video_path, used_seed, best_size


# --- Gradio Interface ---
css = """
.gradio-container {max-width: 1400px !important; margin: 0 auto}
#output_video {height: 500px;}
#input_image {height: 400px;}
#generated_image {height: 400px;}
.tab-nav button {font-size: 18px !important; padding: 10px 20px !important;}
"""

# Prompt templates
video_templates = {
    "Cinematic": "cinematic shot of {subject}, professional lighting, smooth camera movement, 4k quality",
    "Animation": "animated style {subject}, vibrant colors, fluid motion, dynamic movement",
    "Nature": "nature documentary footage of {subject}, wildlife photography, natural movement",
    "Slow Motion": "slow motion capture of {subject}, high speed camera, detailed motion",
    "Action": "dynamic action shot of {subject}, fast paced movement, energetic motion"
}


def apply_template(template, current_prompt):
    """Apply prompt template"""
    if "{subject}" in template:
        subject = current_prompt.split(",")[0] if "," in current_prompt else current_prompt
        return template.replace("{subject}", subject)
    return template + " " + current_prompt


with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎨 Integrated Text-to-Image-to-Video Generator

    Generate images from text and convert them to high-quality videos using:
    - **Stable Diffusion XL** for Text-to-Image generation
    - **Wan 2.2 5B** for Image-to-Video generation

    ### ✨ Features:
    - 📝 **Text-to-Image**: Generate images from text descriptions
    - 🎬 **Image-to-Video**: Convert images (uploaded or generated) to videos
    - 🔄 **Text-to-Image-to-Video**: Complete pipeline from text to video
    """)

    # Badge section
    gr.HTML("""
    """)

    with gr.Tabs() as tabs:
        # Tab 1: Text-to-Image
        with gr.Tab("Text to Image", id="t2i_tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2i_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Describe the image you want to generate...",
                        lines=3
                    )
                    t2i_negative_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn",
                        lines=2
                    )
                    with gr.Row():
                        t2i_width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                        t2i_height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                    with gr.Accordion("Advanced Settings", open=False):
                        t2i_seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                        t2i_randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                        t2i_guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=20.0, step=0.1, value=7)
                        t2i_num_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)
                    t2i_generate_btn = gr.Button("Generate Image", variant="primary", size="lg")

                with gr.Column(scale=1):
                    t2i_output = gr.Image(label="Generated Image", elem_id="generated_image")
                    t2i_seed_output = gr.Number(label="Used Seed", interactive=False)

        # Tab 2: Image-to-Video
        with gr.Tab("Image to Video", id="i2v_tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    i2v_image = gr.Image(type="numpy", label="Input Image", elem_id="input_image")
                    i2v_prompt = gr.Textbox(
                        label="Video Prompt",
                        value="Generate a video with smooth and natural movement. Objects should have visible motion while maintaining fluid transitions.",
                        lines=3
                    )
                    with gr.Accordion("Prompt Templates", open=False):
                        gr.Markdown("Click a template to apply it to your prompt:")
                        template_buttons = {}
                        for name, template in video_templates.items():
                            btn = gr.Button(name, size="sm")
                            template_buttons[name] = (btn, template)
                    i2v_duration = gr.Slider(
                        label="Duration (seconds)",
                        minimum=round(MIN_FRAMES_MODEL / FIXED_FPS, 1),
                        maximum=round(MAX_FRAMES_MODEL / FIXED_FPS, 1),
                        step=0.1,
                        value=2.0
                    )
                    i2v_size = gr.Dropdown(
                        label="Output Resolution",
                        choices=list(SUPPORTED_SIZES[TASK_NAME]),
                        value="704*1280"
                    )
                    with gr.Accordion("Advanced Settings", open=False):
                        i2v_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
                        i2v_guide_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
                        i2v_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                        i2v_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
                    i2v_generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

                with gr.Column(scale=1):
                    i2v_output = gr.Video(label="Generated Video", elem_id="output_video")
        # Tab 3: Text-to-Image-to-Video
        with gr.Tab("Text to Image to Video", id="t2i2v_tab"):
            gr.Markdown("### 🎯 Complete Pipeline: Generate an image from text, then convert it to video")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### Step 1: Image Generation Settings")
                    t2i2v_img_prompt = gr.Textbox(
                        label="Image Prompt",
                        placeholder="Describe the image to generate...",
                        lines=3
                    )
                    t2i2v_img_negative = gr.Textbox(
                        label="Negative Prompt",
                        value="nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn",
                        lines=2
                    )
                    with gr.Row():
                        t2i2v_img_width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                        t2i2v_img_height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                    with gr.Accordion("Image Advanced Settings", open=False):
                        t2i2v_img_seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                        t2i2v_img_randomize = gr.Checkbox(label="Randomize seed", value=True)
                        t2i2v_img_guidance = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=20.0, step=0.1, value=7)
                        t2i2v_img_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)

                    gr.Markdown("#### Step 2: Video Generation Settings")
                    t2i2v_video_prompt = gr.Textbox(
                        label="Video Prompt",
                        value="Generate a video with smooth and natural movement. Objects should have visible motion while maintaining fluid transitions.",
                        lines=3
                    )
                    t2i2v_video_duration = gr.Slider(
                        label="Duration (seconds)",
                        minimum=round(MIN_FRAMES_MODEL / FIXED_FPS, 1),
                        maximum=round(MAX_FRAMES_MODEL / FIXED_FPS, 1),
                        step=0.1,
                        value=2.0
                    )
                    # Video size dropdown; the final value is auto-selected from the
                    # generated image's aspect ratio
                    t2i2v_video_size = gr.Dropdown(
                        label="Video Output Resolution",
                        choices=list(SUPPORTED_SIZES[TASK_NAME]),
                        value="704*1280",
                        info="This will be auto-adjusted based on generated image aspect ratio"
                    )
                    with gr.Accordion("Video Advanced Settings", open=False):
                        t2i2v_video_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
                        t2i2v_video_guide = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
                        t2i2v_video_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                        t2i2v_video_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
                    t2i2v_generate_btn = gr.Button("Generate Image → Video", variant="primary", size="lg")

                with gr.Column(scale=1):
                    gr.Markdown("#### Results")
                    t2i2v_image_output = gr.Image(label="Generated Image", elem_id="generated_image")
                    t2i2v_video_output = gr.Video(label="Generated Video", elem_id="output_video")
                    with gr.Row():
                        t2i2v_seed_output = gr.Number(label="Image Seed Used", interactive=False)
                        t2i2v_size_output = gr.Textbox(label="Video Size Used", interactive=False)

    # Event handlers
    # Tab 1: Text-to-Image
    t2i_generate_btn.click(
        fn=generate_image,
        inputs=[
            t2i_prompt, t2i_negative_prompt, t2i_seed, t2i_randomize_seed,
            t2i_width, t2i_height, t2i_guidance_scale, t2i_num_steps
        ],
        outputs=[t2i_output, t2i_seed_output]
    )

    # Tab 2: Image-to-Video
    # Connect template buttons; bind each template as a default argument so every button
    # keeps its own template, while the current prompt arrives as the call argument
    for name, (btn, template) in template_buttons.items():
        btn.click(
            fn=lambda current_prompt, template=template: apply_template(template, current_prompt),
            inputs=[i2v_prompt],
            outputs=i2v_prompt
        )

    # Auto-select best size when image is uploaded
    def handle_image_upload(image):
        if image is None:
            return gr.update()
        pil_image = Image.fromarray(image).convert("RGB")
        available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
        best_size = select_best_size_for_image(pil_image, available_sizes)
        return gr.update(value=best_size)

    i2v_image.upload(
        fn=handle_image_upload,
        inputs=[i2v_image],
        outputs=[i2v_size]
    )

    i2v_generate_btn.click(
        fn=generate_video,
        inputs=[
            i2v_image, i2v_prompt, i2v_size, i2v_duration,
            i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
        ],
        outputs=i2v_output
    )

    # Tab 3: Text-to-Image-to-Video
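    # Note: generate_image_to_video receives t2i2v_video_size but overrides it with the
    # resolution chosen by select_best_size_for_image for the generated image; the final
    # choice is reported back through t2i2v_size_output.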
    t2i2v_generate_btn.click(
        fn=generate_image_to_video,
        inputs=[
            t2i2v_img_prompt, t2i2v_img_negative, t2i2v_img_seed, t2i2v_img_randomize,
            t2i2v_img_width, t2i2v_img_height, t2i2v_img_guidance, t2i2v_img_steps,
            t2i2v_video_prompt, t2i2v_video_size, t2i2v_video_duration, t2i2v_video_steps,
            t2i2v_video_guide, t2i2v_video_shift, t2i2v_video_seed
        ],
        outputs=[t2i2v_image_output, t2i2v_video_output, t2i2v_seed_output, t2i2v_size_output]
    )

    # Examples
    gr.Examples(
        examples=[
            ["A majestic lion sitting on a rock at sunset, golden hour lighting, photorealistic",
             "Generate a video with the lion slowly turning its head and mane flowing in the wind"],
            ["A futuristic cyberpunk city with neon lights and flying cars",
             "Cinematic shot with smooth camera movement through the city streets"],
            ["A serene Japanese garden with cherry blossoms and a koi pond",
             "Gentle breeze causing cherry blossoms to fall, ripples in the pond"],
        ],
        inputs=[t2i2v_img_prompt, t2i2v_video_prompt],
        label="Example Prompts"
    )

if __name__ == "__main__":
    demo.launch()
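
# Deployment note (assumption): this script appears written for a GPU-backed Hugging Face
# Space; the @spaces.GPU decorators request GPU time per call (30 s for image generation,
# 120 s for video generation). Running it locally requires a CUDA device with enough memory
# to hold both SDXL and the Wan 2.2 TI2V-5B checkpoints.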