#!/usr/bin/env python3 """ Cosmos-Predict2 for Hugging Face Spaces ZeroGPU """ import os import gradio as gr import torch import spaces from diffusers import DiffusionPipeline import gc from typing import Optional import warnings # Suppress warnings for cleaner output warnings.filterwarnings("ignore", category=UserWarning) warnings.filterwarnings("ignore", category=FutureWarning) class CosmosZeroGPUApp: def __init__(self): self.pipe = None self.model_loaded = False print("🌌 Cosmos-Predict2 ZeroGPU App Starting...") def get_memory_info(self): """Get current memory usage - simplified for ZeroGPU""" if torch.cuda.is_available(): vram_used = torch.cuda.memory_allocated(0) / 1024**3 return f"GPU Memory Used: {vram_used:.1f}GB (H200 - 70GB Available)" else: return "GPU: Not allocated (ZeroGPU will assign when needed)" @spaces.GPU(duration=300) # 5 minutes for model loading def load_model(self, progress=gr.Progress()): """Load model with ZeroGPU""" if self.model_loaded: return "✅ Model already loaded!", self.get_memory_info() try: progress(0.1, desc="🔄 Initializing ZeroGPU...") # ZeroGPU automatically handles device allocation device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"🎮 Using device: {device}") progress(0.3, desc="📥 Loading Cosmos-Predict2 model...") model_id = "nvidia/Cosmos-Predict2-2B-Text2Image" # Load model - much simpler with 70GB VRAM! self.pipe = DiffusionPipeline.from_pretrained( model_id, torch_dtype=torch.bfloat16, # Use bfloat16 for better performance device_map="auto", use_safetensors=True, trust_remote_code=True ) progress(0.7, desc="⚡ Optimizing for H200...") # Move to GPU if torch.cuda.is_available(): self.pipe = self.pipe.to(device) # Enable optimizations (optional with 70GB VRAM, but still good for speed) try: self.pipe.enable_attention_slicing() print("✅ Attention slicing enabled") except: pass try: self.pipe.enable_xformers_memory_efficient_attention() print("✅ xformers enabled") except: print("📝 xformers not available (optional)") # Compile model for faster inference (optional) try: if hasattr(self.pipe, 'unet'): self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True) print("✅ Model compiled for faster inference") except: print("📝 Model compilation not available (optional)") progress(0.9, desc="🏁 Finalizing...") self.model_loaded = True torch.cuda.empty_cache() progress(1.0, desc="✅ Ready!") return "✅ Model loaded successfully on ZeroGPU H200!", self.get_memory_info() except Exception as e: self.model_loaded = False error_msg = str(e) if "401" in error_msg or "restricted" in error_msg: return "❌ Access denied. Please ensure the model is publicly accessible.", self.get_memory_info() return f"❌ Error loading model: {error_msg}", self.get_memory_info() def unload_model(self): """Unload model""" if self.pipe is not None: del self.pipe self.pipe = None self.model_loaded = False torch.cuda.empty_cache() gc.collect() return "✅ Model unloaded!", self.get_memory_info() @spaces.GPU(duration=120) # 2 minutes for generation def generate_image(self, prompt, negative_prompt="", num_steps=25, guidance_scale=7.5, seed=-1, width=1024, height=1024, progress=gr.Progress()): """Generate image with ZeroGPU H200""" if not self.model_loaded or self.pipe is None: return None, "❌ Please load the model first!", self.get_memory_info() try: progress(0.1, desc="🎨 Preparing generation...") # With 70GB VRAM, we can use much larger resolutions! max_pixels = 2048 * 2048 # 4MP max for reasonable generation times current_pixels = width * height if current_pixels > max_pixels: # Scale down proportionally scale = (max_pixels / current_pixels) ** 0.5 width = int(width * scale) height = int(height * scale) # Round to nearest 64 for compatibility width = (width // 64) * 64 height = (height // 64) * 64 size_msg = f"📉 Scaled to {width}x{height} for optimal performance" else: size_msg = f"📈 Generating at {width}x{height}" # Set seed for reproducibility generator = None if seed != -1: generator = torch.Generator(device="cuda").manual_seed(seed) progress(0.3, desc=f"🎨 Generating {width}x{height} image...") print(f"🎨 Generating: {width}x{height}, {num_steps} steps, guidance: {guidance_scale}") # Generate with the powerful H200! with torch.inference_mode(): result = self.pipe( prompt=prompt, negative_prompt=negative_prompt if negative_prompt else None, num_inference_steps=num_steps, guidance_scale=guidance_scale, height=height, width=width, generator=generator, output_type="pil" ) progress(0.9, desc="🏁 Finalizing...") # Extract image if hasattr(result, 'images'): image = result.images[0] elif isinstance(result, list): image = result[0] else: image = result # Cleanup del result torch.cuda.empty_cache() progress(1.0, desc="✅ Complete!") return image, f"✅ Generated successfully! {size_msg}", self.get_memory_info() except Exception as e: torch.cuda.empty_cache() return None, f"❌ Generation failed: {str(e)}", self.get_memory_info() # Initialize app app = CosmosZeroGPUApp() # Create Gradio interface def create_interface(): with gr.Blocks(title="Cosmos-Predict2 ZeroGPU", theme=gr.themes.Soft()) as interface: gr.Markdown(""" # 🌌 Cosmos-Predict2 on ZeroGPU **Powered by ZeroGPU • High-resolution generation • Fast inference** This Space uses ZeroGPU for efficient GPU allocation. The GPU is assigned when you load the model or generate images. """) # Memory status memory_display = gr.Textbox( label="📊 GPU Status", value=app.get_memory_info(), interactive=False ) with gr.Row(): with gr.Column(): # Model management gr.Markdown("### 🎮 Model Management") with gr.Row(): load_btn = gr.Button("🔄 Load Model", variant="primary", size="lg") unload_btn = gr.Button("🗑️ Unload", variant="secondary") model_status = gr.Textbox(label="Model Status", interactive=False) # Generation settings gr.Markdown("### 🎨 Generation Settings") prompt = gr.Textbox( label="Prompt", placeholder="A futuristic robot in a high-tech laboratory with holographic displays...", lines=3 ) negative_prompt = gr.Textbox( label="Negative Prompt (Optional)", placeholder="blurry, low quality, distorted, ugly, deformed...", lines=2 ) with gr.Row(): steps = gr.Slider(10, 50, value=25, step=5, label="Inference Steps") guidance = gr.Slider(1, 15, value=7.5, step=0.5, label="Guidance Scale") with gr.Row(): width = gr.Slider(512, 2048, value=1024, step=64, label="Width") height = gr.Slider(512, 2048, value=1024, step=64, label="Height") seed = gr.Number(label="Seed (-1 = random)", value=-1, precision=0) generate_btn = gr.Button("🎨 Generate Image", variant="primary", size="lg") with gr.Column(): # Output output_image = gr.Image(label="Generated Image", height=600) generation_status = gr.Textbox(label="Generation Status", interactive=False) # ZeroGPU info gr.Markdown(""" ### 💡 ZeroGPU Features: - **70GB VRAM**: Generate high-resolution images up to 2048x2048 - **Dynamic allocation**: GPU assigned only when needed - **H200 powered**: Latest NVIDIA architecture for fast inference - **Free to use**: Available to all users (PRO users get higher priority) - **Auto-optimization**: Model compilation and memory efficiency """) # Event handlers load_btn.click( app.load_model, outputs=[model_status, memory_display] ) unload_btn.click( app.unload_model, outputs=[model_status, memory_display] ) generate_btn.click( app.generate_image, inputs=[prompt, negative_prompt, steps, guidance, seed, width, height], outputs=[output_image, generation_status, memory_display] ) # Auto-refresh memory status def refresh_memory(): return app.get_memory_info() # Update memory display every 10 seconds gr.Timer(value=10).tick(refresh_memory, outputs=[memory_display]) # Examples optimized for high-resolution gr.Examples( examples=[ ["A detailed cyberpunk cityscape at night with neon signs, flying cars, and holographic advertisements, highly detailed, 8k resolution"], ["A majestic dragon soaring through storm clouds with lightning, fantasy art, dramatic lighting, ultra detailed"], ["A futuristic space station orbiting Earth, with solar panels and docking bays, sci-fi concept art, cinematic"], ["A serene Japanese garden with cherry blossoms, koi pond, and traditional architecture, peaceful atmosphere, masterpiece"], ["A steampunk mechanical owl with brass gears and copper pipes, intricate details, vintage engineering"], ["An underwater city with bioluminescent coral and glass domes, marine life swimming around, fantasy architecture"] ], inputs=[prompt], label="🎨 Example Prompts (optimized for high-resolution generation)" ) # Usage tips gr.Markdown(""" ### 🚀 Usage Tips: 1. **First time**: Click "Load Model" to download and initialize Cosmos-Predict2 2. **High-res**: Try resolutions up to 2048x2048 with the powerful H200 GPU 3. **Quality**: Use 25-30 steps for high quality, 15-20 for faster generation 4. **Prompts**: Be descriptive and specific for best results 5. **Negative prompts**: Help avoid unwanted elements in your images """) return interface if __name__ == "__main__": print("🚀 Starting Cosmos-Predict2 ZeroGPU Space...") interface = create_interface() interface.launch()