""" Curated HuggingFace Diffusers optimization knowledge base Manually extracted and organized for reliable prompt injection """ OPTIMIZATION_GUIDE = """ # DIFFUSERS OPTIMIZATION TECHNIQUES ## Memory Optimization Techniques ### 1. Model CPU Offloading Use `enable_model_cpu_offload()` to move models between GPU and CPU automatically: ```python pipe.enable_model_cpu_offload() ``` - Saves significant VRAM by keeping only active models on GPU - Automatic management, no manual intervention needed - Compatible with all pipelines ### 2. Sequential CPU Offloading Use `enable_sequential_cpu_offload()` for more aggressive memory saving: ```python pipe.enable_sequential_cpu_offload() ``` - More memory efficient than model offloading - Moves models to CPU after each forward pass - Best for very limited VRAM scenarios ### 3. Attention Slicing Use `enable_attention_slicing()` to reduce memory during attention computation: ```python pipe.enable_attention_slicing() # or specify slice size pipe.enable_attention_slicing("max") # maximum slicing pipe.enable_attention_slicing(1) # slice_size = 1 ``` - Trades compute time for memory - Most effective for high-resolution images - Can be combined with other techniques ### 4. VAE Slicing Use `enable_vae_slicing()` for large batch processing: ```python pipe.enable_vae_slicing() ``` - Decodes images one at a time instead of all at once - Essential for batch sizes > 4 - Minimal performance impact on single images ### 5. VAE Tiling Use `enable_vae_tiling()` for high-resolution image generation: ```python pipe.enable_vae_tiling() ``` - Enables 4K+ image generation on 8GB VRAM - Splits images into overlapping tiles - Automatically disabled for 512x512 or smaller images ### 6. Memory Efficient Attention (xFormers) Use `enable_xformers_memory_efficient_attention()` if xFormers is installed: ```python pipe.enable_xformers_memory_efficient_attention() ``` - Significantly reduces memory usage and improves speed - Requires xformers library installation - Compatible with most models ## Performance Optimization Techniques ### 1. Half Precision (FP16/BF16) Use lower precision for better memory and speed: ```python # FP16 (widely supported) pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) # BF16 (better numerical stability, newer hardware) pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) ``` - FP16: Halves memory usage, widely supported - BF16: Better numerical stability, requires newer GPUs - Essential for most optimization scenarios ### 2. Torch Compile (PyTorch 2.0+) Use `torch.compile()` for significant speed improvements: ```python pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) # For some models, compile VAE too: pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead", fullgraph=True) ``` - 5-50% speed improvement - Requires PyTorch 2.0+ - First run is slower due to compilation ### 3. 
### 3. Fast Schedulers
Use faster schedulers for fewer steps:
```python
from diffusers import LMSDiscreteScheduler, UniPCMultistepScheduler

# LMS Scheduler (good quality, fast)
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)

# UniPC Scheduler (fastest)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
```

## Hardware-Specific Optimizations

### NVIDIA GPU Optimizations
```python
# Let cuDNN benchmark and select the fastest convolution algorithms
torch.backends.cudnn.benchmark = True

# Optimal data type for NVIDIA
torch_dtype = torch.float16  # or torch.bfloat16 for RTX 30/40 series
```

### Apple Silicon (MPS) Optimizations
```python
# Use the MPS device when available
device = "mps" if torch.backends.mps.is_available() else "cpu"
pipe = pipe.to(device)

# Recommended dtype for Apple Silicon
torch_dtype = torch.bfloat16  # better than float16 on Apple Silicon

# Attention slicing often helps on MPS
pipe.enable_attention_slicing()
```

### CPU Optimizations
```python
# Use float32 for CPU
torch_dtype = torch.float32

# Reduce attention memory pressure
pipe.enable_attention_slicing()
```

## Model-Specific Guidelines

### FLUX Models
- Do NOT use the guidance_scale parameter (not needed for the distilled FLUX.1-schnell)
- Use 4-8 inference steps maximum
- BF16 dtype recommended
- Enable attention slicing for memory optimization

### Stable Diffusion XL
- Enable attention slicing for high resolutions
- Use the refiner model sparingly to save memory
- Consider VAE tiling for >1024px images

### Stable Diffusion 1.5/2.1
- Very memory-efficient base models
- Can often run without optimizations on 8GB+ VRAM
- Enable VAE slicing for batch processing

## Memory Usage Estimation
- FLUX.1: ~24GB for full precision, ~12GB for FP16
- SDXL: ~7GB for FP16, ~14GB for FP32
- SD 1.5: ~2GB for FP16, ~4GB for FP32

## Optimization Combinations by VRAM

### 24GB+ VRAM (High-end)
```python
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
```

### 12-24GB VRAM (Mid-range)
```python
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
# No .to("cuda") here: enable_model_cpu_offload() manages device placement itself
pipe.enable_model_cpu_offload()
pipe.enable_xformers_memory_efficient_attention()
```

### 8-12GB VRAM (Entry-level)
```python
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.enable_sequential_cpu_offload()
pipe.enable_attention_slicing()
pipe.enable_vae_slicing()
pipe.enable_xformers_memory_efficient_attention()
```

### <8GB VRAM (Low-end)
```python
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.enable_sequential_cpu_offload()
pipe.enable_attention_slicing("max")
pipe.enable_vae_slicing()
pipe.enable_vae_tiling()
```
"""


def get_optimization_guide():
    """Return the curated optimization guide."""
    return OPTIMIZATION_GUIDE


if __name__ == "__main__":
    print("Optimization guide loaded successfully!")
    print(f"Guide length: {len(OPTIMIZATION_GUIDE)} characters")
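

# A hedged sketch of the "prompt injection" use mentioned in the module
# docstring; not part of the original module. `build_optimization_prompt` is a
# hypothetical helper that prepends the guide to a user's question so a
# downstream LLM answering diffusers questions has this context available.
def build_optimization_prompt(user_request: str) -> str:
    """Prepend the curated guide to a user request before sending it to an LLM."""
    return f"{get_optimization_guide()}\n\n# USER REQUEST\n{user_request}"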