Spaces:

TheAwakenOne
/

Cosmos-Predict2-2B-Text2Image

Running on Zero

File size: 13,419 Bytes

#!/usr/bin/env python3
"""
Cosmos-Predict2 for Hugging Face Spaces ZeroGPU
"""

import subprocess
import os

# Install flash-attn for better performance
subprocess.run(
    "pip install flash-attn --no-build-isolation", 
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, 
    shell=True
)

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, SiglipProcessor
import random
import gc
import warnings

# Try to import Cosmos-specific pipeline, fall back to generic if not available
try:
    from diffusers import Cosmos2TextToImagePipeline
    COSMOS_PIPELINE_AVAILABLE = True
    print("✅ Cosmos2TextToImagePipeline available")
except ImportError:
    from diffusers import DiffusionPipeline
    COSMOS_PIPELINE_AVAILABLE = False
    print("⚠️ Cosmos2TextToImagePipeline not available, using DiffusionPipeline with trust_remote_code")

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Add flash_attention_2 to the safeguard model for better performance
def patch_from_pretrained(cls):
    orig_method = cls.from_pretrained
    def new_from_pretrained(*args, **kwargs):
        kwargs.setdefault("attn_implementation", "flash_attention_2")
        kwargs.setdefault("torch_dtype", torch.bfloat16) 
        return orig_method(*args, **kwargs)
    cls.from_pretrained = new_from_pretrained

patch_from_pretrained(AutoModelForCausalLM)

# Add a `use_fast` to the safeguard image processor
def patch_processor_fast(cls):
    orig_method = cls.from_pretrained
    def new_from_pretrained(*args, **kwargs):
        kwargs.setdefault("use_fast", True)
        return orig_method(*args, **kwargs)
    cls.from_pretrained = new_from_pretrained

patch_processor_fast(SiglipProcessor)

print("🌌 Loading Cosmos-Predict2 model...")

# Handle authentication for gated model
try:
    from huggingface_hub import login
    import os
    
    # Try to login with token from environment variable
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        print("✅ Authenticated with Hugging Face")
    else:
        print("⚠️ No HF_TOKEN found, trying without authentication...")
except Exception as e:
    print(f"⚠️ Authentication failed: {e}")

# Load the model at startup
model_id = "nvidia/Cosmos-Predict2-2B-Text2Image"

try:
    if COSMOS_PIPELINE_AVAILABLE:
        print("🔄 Loading with Cosmos2TextToImagePipeline...")
        try:
            # Try loading with safety checker first
            pipe = Cosmos2TextToImagePipeline.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
                use_auth_token=True  # Use authentication token
            )
        except ImportError as e:
            if "cosmos_guardrail" in str(e):
                print("⚠️ cosmos_guardrail not available, trying without safety checker...")
                # Try loading without safety checker
                pipe = Cosmos2TextToImagePipeline.from_pretrained(
                    model_id,
                    torch_dtype=torch.bfloat16,
                    use_auth_token=True,
                    safety_checker=None,
                    requires_safety_checker=False
                )
            else:
                raise e
    else:
        print("🔄 Loading with DiffusionPipeline (trust_remote_code=True)...")
        pipe = DiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            use_auth_token=True  # Use authentication token
        )

    pipe.to("cuda")
    print("✅ Cosmos-Predict2 model loaded successfully!")
    
except Exception as e:
    print(f"❌ Failed to load Cosmos model: {e}")
    print("🔄 This is likely due to the model being gated/restricted or missing dependencies")
    print("📝 Please check the Setup Guide for authentication instructions")
    
    # For demo purposes, we could fall back to a different model
    # But for now, let's just exit gracefully
    raise e

# Default negative prompt for better quality
DEFAULT_NEGATIVE_PROMPT = "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality."

def get_memory_info():
    """Get current memory usage"""
    if torch.cuda.is_available():
        vram_used = torch.cuda.memory_allocated(0) / 1024**3
        return f"GPU Memory Used: {vram_used:.1f}GB (H200 - 70GB Available)"
    else:
        return "GPU: Not allocated (ZeroGPU will assign when needed)"

@spaces.GPU(duration=120)  # 2 minutes for generation
def generate_image(prompt, negative_prompt="", num_steps=25, guidance_scale=7.5, 
                  seed=-1, width=1024, height=1024, randomize_seed=True, 
                  progress=gr.Progress(track_tqdm=True)):
    """Generate image with ZeroGPU H200"""
    
    try:
        # Handle seed
        if randomize_seed or seed == -1:
            actual_seed = random.randint(0, 1000000)
        else:
            actual_seed = seed
        
        generator = torch.Generator().manual_seed(actual_seed)
        
        # Use default negative prompt if none provided
        if not negative_prompt.strip():
            negative_prompt = DEFAULT_NEGATIVE_PROMPT
        
        # With 70GB VRAM, we can use much larger resolutions!
        max_pixels = 2048 * 2048  # 4MP max for reasonable generation times
        current_pixels = width * height
        
        if current_pixels > max_pixels:
            # Scale down proportionally
            scale = (max_pixels / current_pixels) ** 0.5
            width = int(width * scale)
            height = int(height * scale)
            # Round to nearest 64 for compatibility
            width = (width // 64) * 64
            height = (height // 64) * 64
            size_msg = f"📉 Scaled to {width}x{height} for optimal performance"
        else:
            size_msg = f"📈 Generating at {width}x{height}"
        
        print(f"🎨 Generating: {width}x{height}, {num_steps} steps, guidance: {guidance_scale}, seed: {actual_seed}")
        
        # Generate with the powerful H200!
        with torch.inference_mode():
            result = pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_inference_steps=num_steps,
                guidance_scale=guidance_scale,
                height=height,
                width=width,
                generator=generator
            )
        
        # Extract image
        if hasattr(result, 'images'):
            image = result.images[0]
        elif isinstance(result, list):
            image = result[0]
        else:
            image = result
        
        # Cleanup
        del result
        torch.cuda.empty_cache()
        
        return image, f"✅ Generated successfully! {size_msg} (Seed: {actual_seed})", get_memory_info(), actual_seed
        
    except Exception as e:
        torch.cuda.empty_cache()
        return None, f"❌ Generation failed: {str(e)}", get_memory_info(), seed

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Cosmos-Predict2 ZeroGPU", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🌌 Cosmos-Predict2 on ZeroGPU
        **High-resolution generation • Fast inference**
        
        This Space uses ZeroGPU for efficient GPU allocation. The model is pre-loaded and ready to generate!
        """)
        
        # Memory status
        memory_display = gr.Textbox(
            label="📊 GPU Status", 
            value=get_memory_info(), 
            interactive=False
        )
        
        with gr.Row():
            with gr.Column():
                # Generation settings
                gr.Markdown("### 🎨 Generate High-Quality Images")
                
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="A futuristic robot in a high-tech laboratory with holographic displays...",
                    lines=4,
                    value="A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface."
                )
                
                negative_prompt = gr.Textbox(
                    label="Negative Prompt (Optional - has smart default)",
                    placeholder="Leave empty to use optimized default negative prompt...",
                    lines=2
                )
                
                with gr.Row():
                    steps = gr.Slider(10, 50, value=25, step=5, label="Inference Steps")
                    guidance = gr.Slider(1, 15, value=7.5, step=0.5, label="Guidance Scale")
                
                with gr.Row():
                    width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                    height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
                
                with gr.Row():
                    randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                    seed = gr.Number(label="Seed", value=42, precision=0)
                
                generate_btn = gr.Button("🎨 Generate Image", variant="primary", size="lg")
                
            with gr.Column():
                # Output
                output_image = gr.Image(label="Generated Image", height=600)
                generation_status = gr.Textbox(label="Generation Status", interactive=False)
                seed_output = gr.Number(label="Used Seed", interactive=False)
                
                # ZeroGPU info
                gr.Markdown("""
                ### 💡 ZeroGPU Features:
                - **70GB VRAM**: Generate high-resolution images up to 2048x2048
                - **Pre-loaded Model**: No waiting for model loading
                - **H200 powered**: Latest NVIDIA architecture for fast inference
                - **Smart defaults**: Optimized negative prompt included
                - **Flash Attention**: Enhanced performance optimizations
                """)
        
        # Event handlers
        generate_btn.click(
            generate_image,
            inputs=[prompt, negative_prompt, steps, guidance, seed, width, height, randomize_seed],
            outputs=[output_image, generation_status, memory_display, seed_output]
        )
        
        # Auto-refresh memory status
        def refresh_memory():
            return get_memory_info()
        
        # Update memory display every 10 seconds
        gr.Timer(value=10).tick(refresh_memory, outputs=[memory_display])
        
        # Examples optimized for high-resolution
        gr.Examples(
            examples=[
                ["A detailed cyberpunk cityscape at night with neon signs, flying cars, and holographic advertisements, highly detailed, 8k resolution"],
                ["A majestic dragon soaring through storm clouds with lightning, fantasy art, dramatic lighting, ultra detailed"],
                ["A futuristic space station orbiting Earth, with solar panels and docking bays, sci-fi concept art, cinematic"],
                ["A serene Japanese garden with cherry blossoms, koi pond, and traditional architecture, peaceful atmosphere, masterpiece"],
                ["A steampunk mechanical owl with brass gears and copper pipes, intricate details, vintage engineering"],
                ["A well-worn broom sweeps across a dusty wooden floor, its bristles gathering crumbs and flecks of debris in swift, rhythmic strokes"],
                ["A robotic arm tightens a bolt beneath the hood of a car, its tool head rotating with practiced torque, precision engineering"],
                ["A nighttime city bus terminal gradually shifts from stillness to subtle movement, urban night scene with illuminated signage"]
            ],
            inputs=[prompt],
            label="🎨 Example Prompts (optimized for high-resolution generation)"
        )
        
        # Usage tips
        gr.Markdown("""
        ### 🚀 Usage Tips:
        1. **Ready to go**: Model is pre-loaded, just click generate!
        2. **High-res**: Try resolutions up to 2048x2048 with the powerful H200 GPU
        3. **Quality**: Use 25-30 steps for high quality, 15-20 for faster generation
        4. **Prompts**: Be descriptive and specific for best results
        5. **Negative prompts**: Leave empty to use optimized defaults, or customize as needed
        6. **Seeds**: Use randomize for variety, or set specific seed for reproducible results
        """)
    
    return interface

if __name__ == "__main__":
    print("🚀 Starting Cosmos-Predict2 ZeroGPU Space...")
    
    interface = create_interface()
    interface.launch()