Spaces:

TheAwakenOne
/

Cosmos-Predict2-2B-Text2Image

Running on Zero

File size: 12,801 Bytes

#!/usr/bin/env python3
"""
Cosmos-Predict2 for Hugging Face Spaces ZeroGPU
Optimized for H200 with 70GB VRAM - much simpler than RTX 5080 version!
"""

import os
import gradio as gr
import torch
import spaces
from diffusers import DiffusionPipeline
import gc
from typing import Optional
import warnings

# Keep the Space logs readable: silence the two warning categories that the
# ML libraries emit most often. Order matters only in that it mirrors the
# order in which the filters are registered.
for _ignored_category in (UserWarning, FutureWarning):
    warnings.filterwarnings(action="ignore", category=_ignored_category)

class CosmosZeroGPUApp:
    """Own the Cosmos-Predict2 pipeline and provide the Gradio callbacks.

    The instance keeps the loaded pipeline in ``self.pipe`` and mirrors its
    availability in ``self.model_loaded``. Every public method returns plain
    status strings (plus an image for generation) so it can be wired directly
    to Gradio outputs.

    NOTE(review): ``load_model`` and ``generate_image`` are wrapped in
    ``spaces.GPU``. If ZeroGPU executes decorated callables in a separate
    worker process, attributes assigned inside ``load_model`` would not be
    visible to later calls -- confirm against the ZeroGPU documentation.
    """

    # Hub repository the pipeline is loaded from.
    MODEL_ID = "nvidia/Cosmos-Predict2-2B-Text2Image"
    # Upper bound on width * height; larger requests are scaled down.
    MAX_PIXELS = 2048 * 2048
    # Scaled-down dimensions are floored to a multiple of this value.
    SIZE_MULTIPLE = 64

    def __init__(self) -> None:
        """Start with no pipeline loaded."""
        self.pipe = None
        self.model_loaded = False
        print("🌌 Cosmos-Predict2 ZeroGPU App Starting...")

    def get_memory_info(self) -> str:
        """Return a one-line, human-readable GPU status string.

        Reports the memory currently allocated by torch on device 0 when CUDA
        is available, otherwise a placeholder message.
        """
        if torch.cuda.is_available():
            vram_used = torch.cuda.memory_allocated(0) / 1024**3
            return f"GPU Memory Used: {vram_used:.1f}GB (H200 - 70GB Available)"
        return "GPU: Not allocated (ZeroGPU will assign when needed)"

    @spaces.GPU(duration=300)  # 5 minutes for model loading
    def load_model(self, progress=gr.Progress()):
        """Load the pipeline, move it to the GPU and apply optional speed-ups.

        Args:
            progress: Gradio progress tracker used to report loading stages.

        Returns:
            A ``(status_message, memory_info)`` tuple. Failures are reported
            in the status message instead of being raised.
        """
        if self.model_loaded:
            return "✅ Model already loaded!", self.get_memory_info()

        try:
            progress(0.1, desc="🔄 Initializing ZeroGPU...")

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f"🎮 Using device: {device}")

            progress(0.3, desc="📥 Loading Cosmos-Predict2 model...")

            # device_map is deliberately not passed: the pipeline is placed
            # explicitly with .to(device) below, and diffusers rejects explicit
            # placement on a pipeline that was loaded with a device map.
            pipe = DiffusionPipeline.from_pretrained(
                self.MODEL_ID,
                torch_dtype=torch.bfloat16,
                use_safetensors=True,
                trust_remote_code=True,
            )

            progress(0.7, desc="⚡ Optimizing for H200...")

            if torch.cuda.is_available():
                pipe = pipe.to(device)

            # Best-effort tweaks: each one is optional and is skipped (with the
            # reason logged) when the pipeline does not support it. Attention
            # slicing lowers peak memory; it is not a speed optimization.
            try:
                pipe.enable_attention_slicing()
                print("✅ Attention slicing enabled")
            except Exception as exc:
                print(f"📝 Attention slicing not available (optional): {exc}")

            try:
                pipe.enable_xformers_memory_efficient_attention()
                print("✅ xformers enabled")
            except Exception as exc:
                print(f"📝 xformers not available (optional): {exc}")

            try:
                if hasattr(pipe, 'unet'):
                    pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
                    print("✅ Model compiled for faster inference")
            except Exception as exc:
                print(f"📝 Model compilation not available (optional): {exc}")

            progress(0.9, desc="🏁 Finalizing...")

            # Publish the pipeline only once it is fully set up, so a failure
            # above never leaves a half-initialized object on the instance.
            self.pipe = pipe
            self.model_loaded = True
            torch.cuda.empty_cache()

            progress(1.0, desc="✅ Ready!")
            return "✅ Model loaded successfully on ZeroGPU H200!", self.get_memory_info()

        except Exception as e:
            self.pipe = None
            self.model_loaded = False
            error_msg = str(e)
            if "401" in error_msg or "restricted" in error_msg:
                return "❌ Access denied. Please ensure the model is publicly accessible.", self.get_memory_info()
            return f"❌ Error loading model: {error_msg}", self.get_memory_info()

    def unload_model(self):
        """Drop the pipeline and release cached GPU memory.

        Returns:
            A ``(status_message, memory_info)`` tuple.
        """
        self.pipe = None
        self.model_loaded = False

        # Collect first so unreachable tensors are freed before the CUDA
        # caching allocator is asked to give its cached blocks back.
        gc.collect()
        torch.cuda.empty_cache()

        return "✅ Model unloaded!", self.get_memory_info()

    @classmethod
    def _fit_resolution(cls, width, height):
        """Clamp a requested size to ``MAX_PIXELS``, keeping the aspect ratio.

        Returns ``(width, height, was_scaled)``. Sizes within the budget are
        returned unchanged (as ints). Oversized requests are scaled down and
        floored to a multiple of ``SIZE_MULTIPLE``, never below one multiple.
        """
        width, height = int(width), int(height)
        requested_pixels = width * height
        if requested_pixels <= cls.MAX_PIXELS:
            return width, height, False

        scale = (cls.MAX_PIXELS / requested_pixels) ** 0.5
        step = cls.SIZE_MULTIPLE
        width = max(step, (int(width * scale) // step) * step)
        height = max(step, (int(height * scale) // step) * step)
        return width, height, True

    @staticmethod
    def _make_generator(seed):
        """Return a seeded ``torch.Generator``, or ``None`` for a random seed.

        ``None`` (cleared input field) and ``-1`` both mean "random".
        """
        if seed is None or int(seed) == -1:
            return None
        # Same CUDA-or-CPU fallback as load_model, so seeding also works when
        # no GPU is attached.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.Generator(device=device).manual_seed(int(seed))

    @spaces.GPU(duration=120)  # 2 minutes for generation
    def generate_image(self, prompt, negative_prompt="", num_steps=25, guidance_scale=7.5, 
                      seed=-1, width=1024, height=1024, progress=gr.Progress()):
        """Generate one image from a text prompt.

        Args:
            prompt: Text description of the image.
            negative_prompt: Optional text to steer away from; an empty value
                is forwarded to the pipeline as ``None``.
            num_steps: Number of inference steps.
            guidance_scale: Guidance scale forwarded to the pipeline.
            seed: Seed for reproducibility; ``-1`` or ``None`` means random.
            width: Requested width in pixels.
            height: Requested height in pixels.
            progress: Gradio progress tracker.

        Returns:
            An ``(image, status_message, memory_info)`` tuple; ``image`` is
            ``None`` when the model is not loaded or generation fails.
        """
        if not self.model_loaded or self.pipe is None:
            return None, "❌ Please load the model first!", self.get_memory_info()

        try:
            progress(0.1, desc="🎨 Preparing generation...")

            width, height, was_scaled = self._fit_resolution(width, height)
            if was_scaled:
                size_msg = f"📉 Scaled to {width}x{height} for optimal performance"
            else:
                size_msg = f"📈 Generating at {width}x{height}"

            # UI widgets may deliver numeric values as floats.
            num_steps = int(num_steps)
            generator = self._make_generator(seed)

            progress(0.3, desc=f"🎨 Generating {width}x{height} image...")

            print(f"🎨 Generating: {width}x{height}, {num_steps} steps, guidance: {guidance_scale}")

            with torch.inference_mode():
                result = self.pipe(
                    prompt=prompt,
                    negative_prompt=negative_prompt if negative_prompt else None,
                    num_inference_steps=num_steps,
                    guidance_scale=guidance_scale,
                    height=height,
                    width=width,
                    generator=generator,
                    output_type="pil",
                )

            progress(0.9, desc="🏁 Finalizing...")

            # The pipeline output may be an object with .images, a list, or
            # the image itself.
            if hasattr(result, 'images'):
                image = result.images[0]
            elif isinstance(result, list):
                image = result[0]
            else:
                image = result

            del result
            torch.cuda.empty_cache()

            progress(1.0, desc="✅ Complete!")
            return image, f"✅ Generated successfully! {size_msg}", self.get_memory_info()

        except Exception as e:
            torch.cuda.empty_cache()
            return None, f"❌ Generation failed: {str(e)}", self.get_memory_info()

# Module-level application instance. create_interface() wires its bound
# methods (load_model, unload_model, generate_image, get_memory_info) to the
# Gradio events, so it must exist before the UI is built.
app = CosmosZeroGPUApp()

# Build the Gradio UI
def create_interface():
    """Assemble the Gradio Blocks UI and wire it to the module-level ``app``.

    Layout: a header, a GPU status box, a two-column row (controls on the
    left, output on the right), example prompts and usage tips.

    Returns:
        The constructed ``gr.Blocks`` object; the caller is responsible for
        launching it.
    """
    # Static copy is defined up front so the layout code below stays compact.
    header_md = """
        # 🌌 Cosmos-Predict2 on ZeroGPU
        **Powered by NVIDIA H200 with 70GB VRAM • High-resolution generation • Fast inference**
        
        This Space uses ZeroGPU for efficient GPU allocation. The GPU is assigned when you load the model or generate images.
        """

    features_md = """
                ### 💡 ZeroGPU Features:
                - **70GB VRAM**: Generate high-resolution images up to 2048x2048
                - **Dynamic allocation**: GPU assigned only when needed
                - **H200 powered**: Latest NVIDIA architecture for fast inference
                - **Free to use**: Available to all users (PRO users get higher priority)
                - **Auto-optimization**: Model compilation and memory efficiency
                """

    tips_md = """
        ### 🚀 Usage Tips:
        1. **First time**: Click "Load Model" to download and initialize Cosmos-Predict2
        2. **High-res**: Try resolutions up to 2048x2048 with the powerful H200 GPU
        3. **Quality**: Use 25-30 steps for high quality, 15-20 for faster generation
        4. **Prompts**: Be descriptive and specific for best results
        5. **Negative prompts**: Help avoid unwanted elements in your images
        """

    example_prompts = (
        "A detailed cyberpunk cityscape at night with neon signs, flying cars, and holographic advertisements, highly detailed, 8k resolution",
        "A majestic dragon soaring through storm clouds with lightning, fantasy art, dramatic lighting, ultra detailed",
        "A futuristic space station orbiting Earth, with solar panels and docking bays, sci-fi concept art, cinematic",
        "A serene Japanese garden with cherry blossoms, koi pond, and traditional architecture, peaceful atmosphere, masterpiece",
        "A steampunk mechanical owl with brass gears and copper pipes, intricate details, vintage engineering",
        "An underwater city with bioluminescent coral and glass domes, marine life swimming around, fantasy architecture",
    )
    # gr.Examples expects one row per example; each row holds the single prompt.
    example_rows = [[text] for text in example_prompts]

    with gr.Blocks(title="Cosmos-Predict2 ZeroGPU", theme=gr.themes.Soft()) as demo:
        gr.Markdown(header_md)

        # Read-only GPU status line, refreshed by the callbacks and the timer.
        gpu_status = gr.Textbox(
            label="📊 GPU Status",
            value=app.get_memory_info(),
            interactive=False,
        )

        with gr.Row():
            # Left column: model controls and generation parameters.
            with gr.Column():
                gr.Markdown("### 🎮 Model Management")

                with gr.Row():
                    load_button = gr.Button("🔄 Load Model", variant="primary", size="lg")
                    unload_button = gr.Button("🗑️ Unload", variant="secondary")

                load_status = gr.Textbox(label="Model Status", interactive=False)

                gr.Markdown("### 🎨 Generation Settings")

                prompt_box = gr.Textbox(
                    label="Prompt",
                    placeholder="A futuristic robot in a high-tech laboratory with holographic displays...",
                    lines=3,
                )

                negative_box = gr.Textbox(
                    label="Negative Prompt (Optional)",
                    placeholder="blurry, low quality, distorted, ugly, deformed...",
                    lines=2,
                )

                with gr.Row():
                    steps_slider = gr.Slider(minimum=10, maximum=50, value=25, step=5, label="Inference Steps")
                    guidance_slider = gr.Slider(minimum=1, maximum=15, value=7.5, step=0.5, label="Guidance Scale")

                with gr.Row():
                    width_slider = gr.Slider(minimum=512, maximum=2048, value=1024, step=64, label="Width")
                    height_slider = gr.Slider(minimum=512, maximum=2048, value=1024, step=64, label="Height")

                seed_input = gr.Number(label="Seed (-1 = random)", value=-1, precision=0)

                generate_button = gr.Button("🎨 Generate Image", variant="primary", size="lg")

            # Right column: result image, its status line and feature notes.
            with gr.Column():
                result_image = gr.Image(label="Generated Image", height=600)
                result_status = gr.Textbox(label="Generation Status", interactive=False)

                gr.Markdown(features_md)

        # Event wiring. Load and unload report to the same two outputs.
        model_outputs = [load_status, gpu_status]
        load_button.click(fn=app.load_model, outputs=model_outputs)
        unload_button.click(fn=app.unload_model, outputs=model_outputs)

        generate_button.click(
            fn=app.generate_image,
            inputs=[
                prompt_box,
                negative_box,
                steps_slider,
                guidance_slider,
                seed_input,
                width_slider,
                height_slider,
            ],
            outputs=[result_image, result_status, gpu_status],
        )

        def refresh_memory():
            """Return the current GPU status string for the periodic refresh."""
            return app.get_memory_info()

        # Poll the GPU status every 10 seconds.
        memory_timer = gr.Timer(value=10)
        memory_timer.tick(fn=refresh_memory, outputs=[gpu_status])

        gr.Examples(
            examples=example_rows,
            inputs=[prompt_box],
            label="🎨 Example Prompts (optimized for high-resolution generation)",
        )

        gr.Markdown(tips_md)

    return demo

if __name__ == "__main__":
    # Script entry point: announce start-up, then build the UI and serve it.
    print("🚀 Starting Cosmos-Predict2 ZeroGPU Space...")
    create_interface().launch()