diff --git "a/auto_diffusers.log" "b/auto_diffusers.log" --- "a/auto_diffusers.log" +++ "b/auto_diffusers.log" @@ -15442,3 +15442,1781 @@ If you are trying to access a private or gated repo, make sure you are authentic 2025-05-30 12:50:48,350 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB 2025-05-30 12:50:48,351 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell 2025-05-30 12:50:48,351 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:53,641 - __main__ - INFO - Initializing GradioAutodiffusers +2025-05-30 12:54:53,641 - __main__ - DEBUG - API key found, length: 39 +2025-05-30 12:54:53,641 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator +2025-05-30 12:54:53,641 - auto_diffusers - DEBUG - API key length: 39 +2025-05-30 12:54:53,641 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools +2025-05-30 12:54:53,641 - hardware_detector - INFO - Initializing HardwareDetector +2025-05-30 12:54:53,641 - hardware_detector - DEBUG - Starting system hardware detection +2025-05-30 12:54:53,641 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64 +2025-05-30 12:54:53,641 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11 +2025-05-30 12:54:53,641 - hardware_detector - DEBUG - Attempting GPU detection via nvidia-smi +2025-05-30 12:54:53,645 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected +2025-05-30 12:54:53,645 - hardware_detector - DEBUG - Checking PyTorch availability +2025-05-30 12:54:54,110 - hardware_detector - INFO - PyTorch 2.7.0 detected +2025-05-30 12:54:54,110 - hardware_detector - DEBUG - CUDA available: False, MPS available: True +2025-05-30 12:54:54,110 - hardware_detector - INFO - Hardware detection completed successfully +2025-05-30 12:54:54,110 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'} +2025-05-30 12:54:54,110 - auto_diffusers - INFO - Hardware detector initialized successfully +2025-05-30 12:54:54,110 - __main__ - INFO - AutoDiffusersGenerator initialized successfully +2025-05-30 12:54:54,110 - simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator +2025-05-30 12:54:54,110 - simple_memory_calculator - DEBUG - HuggingFace API initialized +2025-05-30 12:54:54,110 - simple_memory_calculator - DEBUG - Known models in database: 4 +2025-05-30 12:54:54,110 - __main__ - INFO - SimpleMemoryCalculator initialized successfully +2025-05-30 12:54:54,110 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7 +2025-05-30 12:54:54,112 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 12:54:54,125 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None +2025-05-30 12:54:54,133 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 12:54:54,212 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 12:54:54,245 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 12:54:54,246 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:54:54,246 - httpcore.http11 - 
DEBUG - send_request_headers.started request= +2025-05-30 12:54:54,246 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:54:54,246 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 03:54:54 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 12:54:54,247 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:54:54,247 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:54:54,247 - httpcore.connection - DEBUG - close.started +2025-05-30 12:54:54,247 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:54:54,248 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 12:54:54,248 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:54:54,248 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:54:54,248 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:54:54,248 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:54:54,249 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:54:54,249 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:54:54,255 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 03:54:54 GMT'), (b'server', b'uvicorn'), (b'content-length', b'104594'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 12:54:54,255 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 12:54:54,255 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:54:54,255 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:54:54,255 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:54:54,255 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:54:54,256 - httpcore.connection - DEBUG - close.started +2025-05-30 12:54:54,256 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:54:54,267 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 12:54:54,288 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:54:54,288 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 12:54:54,404 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:54:54,404 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 12:54:54,408 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 
12:54:54,561 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 12:54:54,561 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:54:54,561 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:54:54,561 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:54:54,561 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:54:54,561 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:54:54,682 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 12:54:54,682 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:54:54,682 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:54:54,682 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:54:54,682 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:54:54,683 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:54:54,699 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 03:54:54 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 12:54:54,699 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 12:54:54,699 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:54:54,699 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:54:54,700 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:54:54,700 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:54:54,700 - httpcore.connection - DEBUG - close.started +2025-05-30 12:54:54,700 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:54:54,822 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 03:54:54 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), (b'Content-Encoding', b'gzip')]) +2025-05-30 12:54:54,822 - httpx - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 12:54:54,823 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:54:54,823 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:54:54,823 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:54:54,823 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:54:54,824 - httpcore.connection - DEBUG - close.started +2025-05-30 12:54:54,824 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:54:55,469 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 12:54:55,694 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0 +2025-05-30 12:54:57,037 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:57,038 - simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:57,038 - 
simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 12:54:57,038 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 12:54:57,038 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:57,038 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:57,038 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 12:54:57,039 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:57,039 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,172 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,172 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,172 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 12:54:59,172 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,172 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,173 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 12:54:59,173 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,173 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,173 - auto_diffusers - INFO - Starting code generation for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:54:59,173 - auto_diffusers - DEBUG - Parameters: prompt='A cat holding a sign that says hello world...', size=(768, 1360), steps=4 +2025-05-30 12:54:59,173 - auto_diffusers - DEBUG - Manual specs: True, Memory analysis provided: True +2025-05-30 12:54:59,173 - auto_diffusers - INFO - Using manual hardware specifications +2025-05-30 12:54:59,173 - auto_diffusers - DEBUG - Manual specs: {'platform': 'Linux', 'architecture': 'manual_input', 'cpu_count': 8, 'python_version': '3.11', 'cuda_available': False, 'mps_available': False, 'torch_version': '2.0+', 'manual_input': True, 'ram_gb': 16, 'user_dtype': None, 'gpu_info': [{'name': 'Custom GPU', 'memory_mb': 8192}]} +2025-05-30 12:54:59,173 - auto_diffusers - DEBUG - GPU detected with 8.0 GB VRAM +2025-05-30 12:54:59,173 - auto_diffusers - INFO - Selected optimization profile: balanced +2025-05-30 12:54:59,173 - auto_diffusers - DEBUG - Creating generation prompt for Gemini API +2025-05-30 12:54:59,173 - auto_diffusers - DEBUG - Prompt length: 7598 characters +2025-05-30 12:54:59,173 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:54:59,173 - auto_diffusers - INFO - PROMPT SENT TO GEMINI API: +2025-05-30 12:54:59,173 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:54:59,174 - auto_diffusers - INFO - +You are an expert in optimizing diffusers library code for different hardware 
configurations. + +NOTE: This system includes curated optimization knowledge from HuggingFace documentation. + +TASK: Generate optimized Python code for running a diffusion model with the following specifications: +- Model: black-forest-labs/FLUX.1-schnell +- Prompt: "A cat holding a sign that says hello world" +- Image size: 768x1360 +- Inference steps: 4 + +HARDWARE SPECIFICATIONS: +- Platform: Linux (manual_input) +- CPU Cores: 8 +- CUDA Available: False +- MPS Available: False +- Optimization Profile: balanced +- GPU: Custom GPU (8.0 GB VRAM) + +MEMORY ANALYSIS: +- Model Memory Requirements: 36.0 GB (FP16 inference) +- Model Weights Size: 24.0 GB (FP16) +- Memory Recommendation: 🔄 Requires sequential CPU offloading +- Recommended Precision: float16 +- Attention Slicing Recommended: True +- VAE Slicing Recommended: True + +OPTIMIZATION KNOWLEDGE BASE: + +# DIFFUSERS OPTIMIZATION TECHNIQUES + +## Memory Optimization Techniques + +### 1. Model CPU Offloading +Use `enable_model_cpu_offload()` to move models between GPU and CPU automatically: +```python +pipe.enable_model_cpu_offload() +``` +- Saves significant VRAM by keeping only active models on GPU +- Automatic management, no manual intervention needed +- Compatible with all pipelines + +### 2. Sequential CPU Offloading +Use `enable_sequential_cpu_offload()` for more aggressive memory saving: +```python +pipe.enable_sequential_cpu_offload() +``` +- More memory efficient than model offloading +- Moves models to CPU after each forward pass +- Best for very limited VRAM scenarios + +### 3. Attention Slicing +Use `enable_attention_slicing()` to reduce memory during attention computation: +```python +pipe.enable_attention_slicing() +# or specify slice size +pipe.enable_attention_slicing("max") # maximum slicing +pipe.enable_attention_slicing(1) # slice_size = 1 +``` +- Trades compute time for memory +- Most effective for high-resolution images +- Can be combined with other techniques + +### 4. VAE Slicing +Use `enable_vae_slicing()` for large batch processing: +```python +pipe.enable_vae_slicing() +``` +- Decodes images one at a time instead of all at once +- Essential for batch sizes > 4 +- Minimal performance impact on single images + +### 5. VAE Tiling +Use `enable_vae_tiling()` for high-resolution image generation: +```python +pipe.enable_vae_tiling() +``` +- Enables 4K+ image generation on 8GB VRAM +- Splits images into overlapping tiles +- Automatically disabled for 512x512 or smaller images + +### 6. Memory Efficient Attention (xFormers) +Use `enable_xformers_memory_efficient_attention()` if xFormers is installed: +```python +pipe.enable_xformers_memory_efficient_attention() +``` +- Significantly reduces memory usage and improves speed +- Requires xformers library installation +- Compatible with most models + +## Performance Optimization Techniques + +### 1. Half Precision (FP16/BF16) +Use lower precision for better memory and speed: +```python +# FP16 (widely supported) +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) + +# BF16 (better numerical stability, newer hardware) +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) +``` +- FP16: Halves memory usage, widely supported +- BF16: Better numerical stability, requires newer GPUs +- Essential for most optimization scenarios + +### 2. 
Torch Compile (PyTorch 2.0+) +Use `torch.compile()` for significant speed improvements: +```python +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +# For some models, compile VAE too: +pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead", fullgraph=True) +``` +- 5-50% speed improvement +- Requires PyTorch 2.0+ +- First run is slower due to compilation + +### 3. Fast Schedulers +Use faster schedulers for fewer steps: +```python +from diffusers import LMSDiscreteScheduler, UniPCMultistepScheduler + +# LMS Scheduler (good quality, fast) +pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) + +# UniPC Scheduler (fastest) +pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) +``` + +## Hardware-Specific Optimizations + +### NVIDIA GPU Optimizations +```python +# Enable Tensor Cores +torch.backends.cudnn.benchmark = True + +# Optimal data type for NVIDIA +torch_dtype = torch.float16 # or torch.bfloat16 for RTX 30/40 series +``` + +### Apple Silicon (MPS) Optimizations +```python +# Use MPS device +device = "mps" if torch.backends.mps.is_available() else "cpu" +pipe = pipe.to(device) + +# Recommended dtype for Apple Silicon +torch_dtype = torch.bfloat16 # Better than float16 on Apple Silicon + +# Attention slicing often helps on MPS +pipe.enable_attention_slicing() +``` + +### CPU Optimizations +```python +# Use float32 for CPU +torch_dtype = torch.float32 + +# Enable optimized attention +pipe.enable_attention_slicing() +``` + +## Model-Specific Guidelines + +### FLUX Models +- Do NOT use guidance_scale parameter (not needed for FLUX) +- Use 4-8 inference steps maximum +- BF16 dtype recommended +- Enable attention slicing for memory optimization + +### Stable Diffusion XL +- Enable attention slicing for high resolutions +- Use refiner model sparingly to save memory +- Consider VAE tiling for >1024px images + +### Stable Diffusion 1.5/2.1 +- Very memory efficient base models +- Can often run without optimizations on 8GB+ VRAM +- Enable VAE slicing for batch processing + +## Memory Usage Estimation +- FLUX.1: ~24GB for full precision, ~12GB for FP16 +- SDXL: ~7GB for FP16, ~14GB for FP32 +- SD 1.5: ~2GB for FP16, ~4GB for FP32 + +## Optimization Combinations by VRAM + +### 24GB+ VRAM (High-end) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) +pipe = pipe.to("cuda") +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +``` + +### 12-24GB VRAM (Mid-range) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") +pipe.enable_model_cpu_offload() +pipe.enable_xformers_memory_efficient_attention() +``` + +### 8-12GB VRAM (Entry-level) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing() +pipe.enable_vae_slicing() +pipe.enable_xformers_memory_efficient_attention() +``` + +### <8GB VRAM (Low-end) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing("max") +pipe.enable_vae_slicing() +pipe.enable_vae_tiling() +``` + + +IMPORTANT: For FLUX.1-schnell models, do NOT include guidance_scale parameter as it's not needed. + +Using the OPTIMIZATION KNOWLEDGE BASE above, generate Python code that: + +1. **Selects the best optimization techniques** for the specific hardware profile +2. 
**Applies appropriate memory optimizations** based on available VRAM +3. **Uses optimal data types** for the target hardware: + - User specified dtype (if provided): Use exactly as specified + - Apple Silicon (MPS): prefer torch.bfloat16 + - NVIDIA GPUs: prefer torch.float16 or torch.bfloat16 + - CPU only: use torch.float32 +4. **Implements hardware-specific optimizations** (CUDA, MPS, CPU) +5. **Follows model-specific guidelines** (e.g., FLUX guidance_scale handling) + +IMPORTANT GUIDELINES: +- Reference the OPTIMIZATION KNOWLEDGE BASE to select appropriate techniques +- Include all necessary imports +- Add brief comments explaining optimization choices +- Generate compact, production-ready code +- Inline values where possible for concise code +- Generate ONLY the Python code, no explanations before or after the code block + +2025-05-30 12:54:59,174 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:54:59,174 - auto_diffusers - INFO - Sending request to Gemini API +2025-05-30 12:55:22,590 - auto_diffusers - INFO - Successfully received response from Gemini API (no tools used) +2025-05-30 12:55:22,590 - auto_diffusers - DEBUG - Response length: 2545 characters +2025-05-30 12:56:22,011 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,011 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,011 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 12:56:22,012 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,012 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,012 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 12:56:22,012 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,012 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,012 - auto_diffusers - INFO - Starting code generation for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:56:22,012 - auto_diffusers - DEBUG - Parameters: prompt='A cat holding a sign that says hello world...', size=(768, 1360), steps=4 +2025-05-30 12:56:22,012 - auto_diffusers - DEBUG - Manual specs: True, Memory analysis provided: True +2025-05-30 12:56:22,012 - auto_diffusers - INFO - Using manual hardware specifications +2025-05-30 12:56:22,012 - auto_diffusers - DEBUG - Manual specs: {'platform': 'Darwin', 'architecture': 'manual_input', 'cpu_count': 8, 'python_version': '3.11', 'cuda_available': False, 'mps_available': False, 'torch_version': '2.0+', 'manual_input': True, 'ram_gb': 16, 'user_dtype': None, 'gpu_info': [{'name': 'Custom GPU', 'memory_mb': 8192}]} +2025-05-30 12:56:22,012 - auto_diffusers - DEBUG - GPU detected with 8.0 GB VRAM +2025-05-30 12:56:22,012 - auto_diffusers - INFO - Selected optimization profile: balanced +2025-05-30 12:56:22,012 - auto_diffusers - DEBUG - Creating generation prompt for Gemini API +2025-05-30 12:56:22,012 - auto_diffusers - DEBUG - Prompt length: 7599 characters +2025-05-30 12:56:22,013 - auto_diffusers - INFO - 
================================================================================ +2025-05-30 12:56:22,013 - auto_diffusers - INFO - PROMPT SENT TO GEMINI API: +2025-05-30 12:56:22,013 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:56:22,013 - auto_diffusers - INFO - +You are an expert in optimizing diffusers library code for different hardware configurations. + +NOTE: This system includes curated optimization knowledge from HuggingFace documentation. + +TASK: Generate optimized Python code for running a diffusion model with the following specifications: +- Model: black-forest-labs/FLUX.1-schnell +- Prompt: "A cat holding a sign that says hello world" +- Image size: 768x1360 +- Inference steps: 4 + +HARDWARE SPECIFICATIONS: +- Platform: Darwin (manual_input) +- CPU Cores: 8 +- CUDA Available: False +- MPS Available: False +- Optimization Profile: balanced +- GPU: Custom GPU (8.0 GB VRAM) + +MEMORY ANALYSIS: +- Model Memory Requirements: 36.0 GB (FP16 inference) +- Model Weights Size: 24.0 GB (FP16) +- Memory Recommendation: 🔄 Requires sequential CPU offloading +- Recommended Precision: float16 +- Attention Slicing Recommended: True +- VAE Slicing Recommended: True + +OPTIMIZATION KNOWLEDGE BASE: + +# DIFFUSERS OPTIMIZATION TECHNIQUES + +## Memory Optimization Techniques + +### 1. Model CPU Offloading +Use `enable_model_cpu_offload()` to move models between GPU and CPU automatically: +```python +pipe.enable_model_cpu_offload() +``` +- Saves significant VRAM by keeping only active models on GPU +- Automatic management, no manual intervention needed +- Compatible with all pipelines + +### 2. Sequential CPU Offloading +Use `enable_sequential_cpu_offload()` for more aggressive memory saving: +```python +pipe.enable_sequential_cpu_offload() +``` +- More memory efficient than model offloading +- Moves models to CPU after each forward pass +- Best for very limited VRAM scenarios + +### 3. Attention Slicing +Use `enable_attention_slicing()` to reduce memory during attention computation: +```python +pipe.enable_attention_slicing() +# or specify slice size +pipe.enable_attention_slicing("max") # maximum slicing +pipe.enable_attention_slicing(1) # slice_size = 1 +``` +- Trades compute time for memory +- Most effective for high-resolution images +- Can be combined with other techniques + +### 4. VAE Slicing +Use `enable_vae_slicing()` for large batch processing: +```python +pipe.enable_vae_slicing() +``` +- Decodes images one at a time instead of all at once +- Essential for batch sizes > 4 +- Minimal performance impact on single images + +### 5. VAE Tiling +Use `enable_vae_tiling()` for high-resolution image generation: +```python +pipe.enable_vae_tiling() +``` +- Enables 4K+ image generation on 8GB VRAM +- Splits images into overlapping tiles +- Automatically disabled for 512x512 or smaller images + +### 6. Memory Efficient Attention (xFormers) +Use `enable_xformers_memory_efficient_attention()` if xFormers is installed: +```python +pipe.enable_xformers_memory_efficient_attention() +``` +- Significantly reduces memory usage and improves speed +- Requires xformers library installation +- Compatible with most models + +## Performance Optimization Techniques + +### 1. 
Half Precision (FP16/BF16) +Use lower precision for better memory and speed: +```python +# FP16 (widely supported) +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) + +# BF16 (better numerical stability, newer hardware) +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) +``` +- FP16: Halves memory usage, widely supported +- BF16: Better numerical stability, requires newer GPUs +- Essential for most optimization scenarios + +### 2. Torch Compile (PyTorch 2.0+) +Use `torch.compile()` for significant speed improvements: +```python +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +# For some models, compile VAE too: +pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead", fullgraph=True) +``` +- 5-50% speed improvement +- Requires PyTorch 2.0+ +- First run is slower due to compilation + +### 3. Fast Schedulers +Use faster schedulers for fewer steps: +```python +from diffusers import LMSDiscreteScheduler, UniPCMultistepScheduler + +# LMS Scheduler (good quality, fast) +pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) + +# UniPC Scheduler (fastest) +pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) +``` + +## Hardware-Specific Optimizations + +### NVIDIA GPU Optimizations +```python +# Enable Tensor Cores +torch.backends.cudnn.benchmark = True + +# Optimal data type for NVIDIA +torch_dtype = torch.float16 # or torch.bfloat16 for RTX 30/40 series +``` + +### Apple Silicon (MPS) Optimizations +```python +# Use MPS device +device = "mps" if torch.backends.mps.is_available() else "cpu" +pipe = pipe.to(device) + +# Recommended dtype for Apple Silicon +torch_dtype = torch.bfloat16 # Better than float16 on Apple Silicon + +# Attention slicing often helps on MPS +pipe.enable_attention_slicing() +``` + +### CPU Optimizations +```python +# Use float32 for CPU +torch_dtype = torch.float32 + +# Enable optimized attention +pipe.enable_attention_slicing() +``` + +## Model-Specific Guidelines + +### FLUX Models +- Do NOT use guidance_scale parameter (not needed for FLUX) +- Use 4-8 inference steps maximum +- BF16 dtype recommended +- Enable attention slicing for memory optimization + +### Stable Diffusion XL +- Enable attention slicing for high resolutions +- Use refiner model sparingly to save memory +- Consider VAE tiling for >1024px images + +### Stable Diffusion 1.5/2.1 +- Very memory efficient base models +- Can often run without optimizations on 8GB+ VRAM +- Enable VAE slicing for batch processing + +## Memory Usage Estimation +- FLUX.1: ~24GB for full precision, ~12GB for FP16 +- SDXL: ~7GB for FP16, ~14GB for FP32 +- SD 1.5: ~2GB for FP16, ~4GB for FP32 + +## Optimization Combinations by VRAM + +### 24GB+ VRAM (High-end) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) +pipe = pipe.to("cuda") +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +``` + +### 12-24GB VRAM (Mid-range) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") +pipe.enable_model_cpu_offload() +pipe.enable_xformers_memory_efficient_attention() +``` + +### 8-12GB VRAM (Entry-level) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing() +pipe.enable_vae_slicing() +pipe.enable_xformers_memory_efficient_attention() +``` + +### <8GB VRAM (Low-end) 
+```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing("max") +pipe.enable_vae_slicing() +pipe.enable_vae_tiling() +``` + + +IMPORTANT: For FLUX.1-schnell models, do NOT include guidance_scale parameter as it's not needed. + +Using the OPTIMIZATION KNOWLEDGE BASE above, generate Python code that: + +1. **Selects the best optimization techniques** for the specific hardware profile +2. **Applies appropriate memory optimizations** based on available VRAM +3. **Uses optimal data types** for the target hardware: + - User specified dtype (if provided): Use exactly as specified + - Apple Silicon (MPS): prefer torch.bfloat16 + - NVIDIA GPUs: prefer torch.float16 or torch.bfloat16 + - CPU only: use torch.float32 +4. **Implements hardware-specific optimizations** (CUDA, MPS, CPU) +5. **Follows model-specific guidelines** (e.g., FLUX guidance_scale handling) + +IMPORTANT GUIDELINES: +- Reference the OPTIMIZATION KNOWLEDGE BASE to select appropriate techniques +- Include all necessary imports +- Add brief comments explaining optimization choices +- Generate compact, production-ready code +- Inline values where possible for concise code +- Generate ONLY the Python code, no explanations before or after the code block + +2025-05-30 12:56:22,013 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:56:22,013 - auto_diffusers - INFO - Sending request to Gemini API +2025-05-30 12:56:53,133 - auto_diffusers - INFO - Successfully received response from Gemini API (no tools used) +2025-05-30 12:56:53,133 - auto_diffusers - DEBUG - Response length: 3054 characters +2025-05-30 12:59:28,018 - __main__ - INFO - Initializing GradioAutodiffusers +2025-05-30 12:59:28,018 - __main__ - DEBUG - API key found, length: 39 +2025-05-30 12:59:28,018 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator +2025-05-30 12:59:28,018 - auto_diffusers - DEBUG - API key length: 39 +2025-05-30 12:59:28,018 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools +2025-05-30 12:59:28,018 - hardware_detector - INFO - Initializing HardwareDetector +2025-05-30 12:59:28,018 - hardware_detector - DEBUG - Starting system hardware detection +2025-05-30 12:59:28,018 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64 +2025-05-30 12:59:28,018 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11 +2025-05-30 12:59:28,018 - hardware_detector - DEBUG - Attempting GPU detection via nvidia-smi +2025-05-30 12:59:28,022 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected +2025-05-30 12:59:28,022 - hardware_detector - DEBUG - Checking PyTorch availability +2025-05-30 12:59:28,486 - hardware_detector - INFO - PyTorch 2.7.0 detected +2025-05-30 12:59:28,487 - hardware_detector - DEBUG - CUDA available: False, MPS available: True +2025-05-30 12:59:28,487 - hardware_detector - INFO - Hardware detection completed successfully +2025-05-30 12:59:28,487 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'} +2025-05-30 12:59:28,487 - auto_diffusers - INFO - Hardware detector initialized successfully +2025-05-30 12:59:28,487 - __main__ - INFO - AutoDiffusersGenerator initialized successfully +2025-05-30 12:59:28,487 - 
simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator +2025-05-30 12:59:28,487 - simple_memory_calculator - DEBUG - HuggingFace API initialized +2025-05-30 12:59:28,487 - simple_memory_calculator - DEBUG - Known models in database: 4 +2025-05-30 12:59:28,487 - __main__ - INFO - SimpleMemoryCalculator initialized successfully +2025-05-30 12:59:28,487 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7 +2025-05-30 12:59:28,489 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 12:59:28,502 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None +2025-05-30 12:59:28,510 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 12:59:28,608 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 12:59:28,639 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 12:59:28,640 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:59:28,640 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:59:28,640 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:59:28,640 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:59:28,640 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:59:28,641 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:59:28,641 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 03:59:28 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 12:59:28,641 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 12:59:28,641 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:59:28,641 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:59:28,641 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:59:28,641 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:59:28,641 - httpcore.connection - DEBUG - close.started +2025-05-30 12:59:28,641 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:59:28,642 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 12:59:28,642 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:59:28,642 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:59:28,642 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:59:28,642 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:59:28,642 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:59:28,642 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:59:28,649 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 03:59:28 GMT'), (b'server', b'uvicorn'), (b'content-length', b'105022'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 12:59:28,649 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 12:59:28,649 - httpcore.http11 - DEBUG - 
receive_response_body.started request= +2025-05-30 12:59:28,649 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:59:28,649 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:59:28,649 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:59:28,649 - httpcore.connection - DEBUG - close.started +2025-05-30 12:59:28,649 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:59:28,660 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 12:59:28,674 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:59:28,674 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 12:59:28,799 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 12:59:28,799 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 12:59:28,858 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 12:59:28,960 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 12:59:28,960 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:59:28,960 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:59:28,960 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:59:28,960 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:59:28,960 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:59:29,071 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 12:59:29,071 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 12:59:29,072 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 12:59:29,072 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 12:59:29,072 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 12:59:29,072 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 12:59:29,108 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 03:59:29 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 12:59:29,108 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 12:59:29,108 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:59:29,108 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:59:29,108 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:59:29,109 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:59:29,109 - httpcore.connection - DEBUG - close.started +2025-05-30 12:59:29,109 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:59:29,211 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 03:59:29 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), 
(b'Content-Encoding', b'gzip')]) +2025-05-30 12:59:29,212 - httpx - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 12:59:29,212 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 12:59:29,212 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 12:59:29,213 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 12:59:29,213 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 12:59:29,213 - httpcore.connection - DEBUG - close.started +2025-05-30 12:59:29,213 - httpcore.connection - DEBUG - close.complete +2025-05-30 12:59:29,858 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 12:59:30,074 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0 +2025-05-30 12:59:30,112 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:30,112 - simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:30,112 - simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 12:59:30,112 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 12:59:30,112 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:30,112 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:30,112 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 12:59:30,112 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:30,112 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,998 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,998 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,998 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 12:59:31,999 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,999 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,999 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 12:59:31,999 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,999 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,999 - auto_diffusers - INFO - Starting code generation for model: black-forest-labs/FLUX.1-schnell +2025-05-30 12:59:31,999 - auto_diffusers - DEBUG - Parameters: prompt='A cat holding a sign that says hello world...', size=(768, 1360), steps=4 +2025-05-30 12:59:31,999 - auto_diffusers - DEBUG - Manual specs: True, Memory analysis provided: True +2025-05-30 12:59:31,999 - auto_diffusers - INFO - Using manual hardware specifications +2025-05-30 
12:59:31,999 - auto_diffusers - DEBUG - Manual specs: {'platform': 'Linux', 'architecture': 'manual_input', 'cpu_count': 8, 'python_version': '3.11', 'cuda_available': False, 'mps_available': False, 'torch_version': '2.0+', 'manual_input': True, 'ram_gb': 16, 'user_dtype': None, 'gpu_info': [{'name': 'Custom GPU', 'memory_mb': 8192}]} +2025-05-30 12:59:31,999 - auto_diffusers - DEBUG - GPU detected with 8.0 GB VRAM +2025-05-30 12:59:31,999 - auto_diffusers - INFO - Selected optimization profile: balanced +2025-05-30 12:59:31,999 - auto_diffusers - DEBUG - Creating generation prompt for Gemini API +2025-05-30 12:59:31,999 - auto_diffusers - DEBUG - Prompt length: 7598 characters +2025-05-30 12:59:31,999 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:59:32,000 - auto_diffusers - INFO - PROMPT SENT TO GEMINI API: +2025-05-30 12:59:32,000 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:59:32,000 - auto_diffusers - INFO - +You are an expert in optimizing diffusers library code for different hardware configurations. + +NOTE: This system includes curated optimization knowledge from HuggingFace documentation. + +TASK: Generate optimized Python code for running a diffusion model with the following specifications: +- Model: black-forest-labs/FLUX.1-schnell +- Prompt: "A cat holding a sign that says hello world" +- Image size: 768x1360 +- Inference steps: 4 + +HARDWARE SPECIFICATIONS: +- Platform: Linux (manual_input) +- CPU Cores: 8 +- CUDA Available: False +- MPS Available: False +- Optimization Profile: balanced +- GPU: Custom GPU (8.0 GB VRAM) + +MEMORY ANALYSIS: +- Model Memory Requirements: 36.0 GB (FP16 inference) +- Model Weights Size: 24.0 GB (FP16) +- Memory Recommendation: 🔄 Requires sequential CPU offloading +- Recommended Precision: float16 +- Attention Slicing Recommended: True +- VAE Slicing Recommended: True + +OPTIMIZATION KNOWLEDGE BASE: + +# DIFFUSERS OPTIMIZATION TECHNIQUES + +## Memory Optimization Techniques + +### 1. Model CPU Offloading +Use `enable_model_cpu_offload()` to move models between GPU and CPU automatically: +```python +pipe.enable_model_cpu_offload() +``` +- Saves significant VRAM by keeping only active models on GPU +- Automatic management, no manual intervention needed +- Compatible with all pipelines + +### 2. Sequential CPU Offloading +Use `enable_sequential_cpu_offload()` for more aggressive memory saving: +```python +pipe.enable_sequential_cpu_offload() +``` +- More memory efficient than model offloading +- Moves models to CPU after each forward pass +- Best for very limited VRAM scenarios + +### 3. Attention Slicing +Use `enable_attention_slicing()` to reduce memory during attention computation: +```python +pipe.enable_attention_slicing() +# or specify slice size +pipe.enable_attention_slicing("max") # maximum slicing +pipe.enable_attention_slicing(1) # slice_size = 1 +``` +- Trades compute time for memory +- Most effective for high-resolution images +- Can be combined with other techniques + +### 4. VAE Slicing +Use `enable_vae_slicing()` for large batch processing: +```python +pipe.enable_vae_slicing() +``` +- Decodes images one at a time instead of all at once +- Essential for batch sizes > 4 +- Minimal performance impact on single images + +### 5. 
VAE Tiling +Use `enable_vae_tiling()` for high-resolution image generation: +```python +pipe.enable_vae_tiling() +``` +- Enables 4K+ image generation on 8GB VRAM +- Splits images into overlapping tiles +- Automatically disabled for 512x512 or smaller images + +### 6. Memory Efficient Attention (xFormers) +Use `enable_xformers_memory_efficient_attention()` if xFormers is installed: +```python +pipe.enable_xformers_memory_efficient_attention() +``` +- Significantly reduces memory usage and improves speed +- Requires xformers library installation +- Compatible with most models + +## Performance Optimization Techniques + +### 1. Half Precision (FP16/BF16) +Use lower precision for better memory and speed: +```python +# FP16 (widely supported) +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) + +# BF16 (better numerical stability, newer hardware) +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) +``` +- FP16: Halves memory usage, widely supported +- BF16: Better numerical stability, requires newer GPUs +- Essential for most optimization scenarios + +### 2. Torch Compile (PyTorch 2.0+) +Use `torch.compile()` for significant speed improvements: +```python +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +# For some models, compile VAE too: +pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead", fullgraph=True) +``` +- 5-50% speed improvement +- Requires PyTorch 2.0+ +- First run is slower due to compilation + +### 3. Fast Schedulers +Use faster schedulers for fewer steps: +```python +from diffusers import LMSDiscreteScheduler, UniPCMultistepScheduler + +# LMS Scheduler (good quality, fast) +pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) + +# UniPC Scheduler (fastest) +pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) +``` + +## Hardware-Specific Optimizations + +### NVIDIA GPU Optimizations +```python +# Enable Tensor Cores +torch.backends.cudnn.benchmark = True + +# Optimal data type for NVIDIA +torch_dtype = torch.float16 # or torch.bfloat16 for RTX 30/40 series +``` + +### Apple Silicon (MPS) Optimizations +```python +# Use MPS device +device = "mps" if torch.backends.mps.is_available() else "cpu" +pipe = pipe.to(device) + +# Recommended dtype for Apple Silicon +torch_dtype = torch.bfloat16 # Better than float16 on Apple Silicon + +# Attention slicing often helps on MPS +pipe.enable_attention_slicing() +``` + +### CPU Optimizations +```python +# Use float32 for CPU +torch_dtype = torch.float32 + +# Enable optimized attention +pipe.enable_attention_slicing() +``` + +## Model-Specific Guidelines + +### FLUX Models +- Do NOT use guidance_scale parameter (not needed for FLUX) +- Use 4-8 inference steps maximum +- BF16 dtype recommended +- Enable attention slicing for memory optimization + +### Stable Diffusion XL +- Enable attention slicing for high resolutions +- Use refiner model sparingly to save memory +- Consider VAE tiling for >1024px images + +### Stable Diffusion 1.5/2.1 +- Very memory efficient base models +- Can often run without optimizations on 8GB+ VRAM +- Enable VAE slicing for batch processing + +## Memory Usage Estimation +- FLUX.1: ~24GB for full precision, ~12GB for FP16 +- SDXL: ~7GB for FP16, ~14GB for FP32 +- SD 1.5: ~2GB for FP16, ~4GB for FP32 + +## Optimization Combinations by VRAM + +### 24GB+ VRAM (High-end) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) +pipe = 
pipe.to("cuda") +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +``` + +### 12-24GB VRAM (Mid-range) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") +pipe.enable_model_cpu_offload() +pipe.enable_xformers_memory_efficient_attention() +``` + +### 8-12GB VRAM (Entry-level) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing() +pipe.enable_vae_slicing() +pipe.enable_xformers_memory_efficient_attention() +``` + +### <8GB VRAM (Low-end) +```python +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing("max") +pipe.enable_vae_slicing() +pipe.enable_vae_tiling() +``` + + +IMPORTANT: For FLUX.1-schnell models, do NOT include guidance_scale parameter as it's not needed. + +Using the OPTIMIZATION KNOWLEDGE BASE above, generate Python code that: + +1. **Selects the best optimization techniques** for the specific hardware profile +2. **Applies appropriate memory optimizations** based on available VRAM +3. **Uses optimal data types** for the target hardware: + - User specified dtype (if provided): Use exactly as specified + - Apple Silicon (MPS): prefer torch.bfloat16 + - NVIDIA GPUs: prefer torch.float16 or torch.bfloat16 + - CPU only: use torch.float32 +4. **Implements hardware-specific optimizations** (CUDA, MPS, CPU) +5. **Follows model-specific guidelines** (e.g., FLUX guidance_scale handling) + +IMPORTANT GUIDELINES: +- Reference the OPTIMIZATION KNOWLEDGE BASE to select appropriate techniques +- Include all necessary imports +- Add brief comments explaining optimization choices +- Generate compact, production-ready code +- Inline values where possible for concise code +- Generate ONLY the Python code, no explanations before or after the code block + +2025-05-30 12:59:32,000 - auto_diffusers - INFO - ================================================================================ +2025-05-30 12:59:32,000 - auto_diffusers - INFO - Sending request to Gemini API +2025-05-30 12:59:47,609 - auto_diffusers - INFO - Successfully received response from Gemini API (no tools used) +2025-05-30 12:59:47,609 - auto_diffusers - DEBUG - Response length: 2336 characters +2025-05-30 13:01:46,192 - __main__ - INFO - Initializing GradioAutodiffusers +2025-05-30 13:01:46,192 - __main__ - DEBUG - API key found, length: 39 +2025-05-30 13:01:46,192 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator +2025-05-30 13:01:46,192 - auto_diffusers - DEBUG - API key length: 39 +2025-05-30 13:01:46,192 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools +2025-05-30 13:01:46,192 - hardware_detector - INFO - Initializing HardwareDetector +2025-05-30 13:01:46,192 - hardware_detector - DEBUG - Starting system hardware detection +2025-05-30 13:01:46,193 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64 +2025-05-30 13:01:46,193 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11 +2025-05-30 13:01:46,193 - hardware_detector - DEBUG - Attempting GPU detection via nvidia-smi +2025-05-30 13:01:46,197 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected +2025-05-30 13:01:46,197 - hardware_detector - DEBUG - Checking PyTorch availability +2025-05-30 13:01:46,686 - hardware_detector - INFO - PyTorch 2.7.0 detected +2025-05-30 13:01:46,686 - 
hardware_detector - DEBUG - CUDA available: False, MPS available: True +2025-05-30 13:01:46,686 - hardware_detector - INFO - Hardware detection completed successfully +2025-05-30 13:01:46,686 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'} +2025-05-30 13:01:46,686 - auto_diffusers - INFO - Hardware detector initialized successfully +2025-05-30 13:01:46,686 - __main__ - INFO - AutoDiffusersGenerator initialized successfully +2025-05-30 13:01:46,687 - simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator +2025-05-30 13:01:46,687 - simple_memory_calculator - DEBUG - HuggingFace API initialized +2025-05-30 13:01:46,687 - simple_memory_calculator - DEBUG - Known models in database: 4 +2025-05-30 13:01:46,687 - __main__ - INFO - SimpleMemoryCalculator initialized successfully +2025-05-30 13:01:46,687 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7 +2025-05-30 13:01:46,689 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:01:46,702 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None +2025-05-30 13:01:46,710 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:01:46,789 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:01:46,822 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 13:01:46,823 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:01:46,823 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:01:46,823 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:01:46,823 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:01:46 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 13:01:46,824 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:01:46,824 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:01:46,824 - httpcore.connection - DEBUG - close.started +2025-05-30 13:01:46,824 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:01:46,825 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 13:01:46,825 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:01:46,825 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:01:46,825 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:01:46,825 - httpcore.http11 - DEBUG - 
send_request_body.started request= +2025-05-30 13:01:46,825 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:01:46,825 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:01:46,832 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:01:46 GMT'), (b'server', b'uvicorn'), (b'content-length', b'105165'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 13:01:46,832 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 13:01:46,832 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:01:46,832 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:01:46,832 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:01:46,832 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:01:46,832 - httpcore.connection - DEBUG - close.started +2025-05-30 13:01:46,832 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:01:46,843 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 13:01:47,914 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:01:47,914 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:01:47,914 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 13:01:47,914 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 13:01:48,744 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 13:01:48,881 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:01:48,882 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:01:48,882 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:01:48,883 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:01:48,883 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:01:48,883 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:01:48,883 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:01:48,883 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:01:48,884 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:01:48,884 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:01:48,884 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:01:48,884 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:01:49,012 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:01:49,012 - simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:01:49,012 - simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 13:01:49,012 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 13:01:49,012 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell 
+2025-05-30 13:01:49,013 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:01:49,013 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 13:01:49,013 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:01:49,013 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:01:49,026 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:01:48 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), (b'Content-Encoding', b'gzip')]) +2025-05-30 13:01:49,026 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:01:48 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 13:01:49,026 - httpx - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 13:01:49,026 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 13:01:49,026 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:01:49,026 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:01:49,027 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:01:49,027 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:01:49,027 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:01:49,027 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:01:49,027 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:01:49,027 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:01:49,027 - httpcore.connection - DEBUG - close.started +2025-05-30 13:01:49,027 - httpcore.connection - DEBUG - close.started +2025-05-30 13:01:49,027 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:01:49,027 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:01:49,637 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:01:49,851 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0 +2025-05-30 13:03:36,653 - __main__ - INFO - Initializing GradioAutodiffusers +2025-05-30 13:03:36,653 - __main__ - DEBUG - API key found, length: 39 +2025-05-30 13:03:36,653 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator +2025-05-30 13:03:36,653 - auto_diffusers - DEBUG - API key length: 39 +2025-05-30 13:03:36,653 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools +2025-05-30 13:03:36,653 - hardware_detector - INFO - Initializing HardwareDetector +2025-05-30 13:03:36,653 - hardware_detector - DEBUG - Starting system hardware detection +2025-05-30 13:03:36,653 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64 +2025-05-30 13:03:36,653 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11 +2025-05-30 13:03:36,653 - hardware_detector - DEBUG - Attempting GPU detection 
via nvidia-smi +2025-05-30 13:03:36,657 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected +2025-05-30 13:03:36,657 - hardware_detector - DEBUG - Checking PyTorch availability +2025-05-30 13:03:37,119 - hardware_detector - INFO - PyTorch 2.7.0 detected +2025-05-30 13:03:37,119 - hardware_detector - DEBUG - CUDA available: False, MPS available: True +2025-05-30 13:03:37,119 - hardware_detector - INFO - Hardware detection completed successfully +2025-05-30 13:03:37,119 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'} +2025-05-30 13:03:37,119 - auto_diffusers - INFO - Hardware detector initialized successfully +2025-05-30 13:03:37,119 - __main__ - INFO - AutoDiffusersGenerator initialized successfully +2025-05-30 13:03:37,119 - simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator +2025-05-30 13:03:37,119 - simple_memory_calculator - DEBUG - HuggingFace API initialized +2025-05-30 13:03:37,119 - simple_memory_calculator - DEBUG - Known models in database: 4 +2025-05-30 13:03:37,119 - __main__ - INFO - SimpleMemoryCalculator initialized successfully +2025-05-30 13:03:37,119 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7 +2025-05-30 13:03:37,121 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:03:37,135 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None +2025-05-30 13:03:37,135 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:03:37,221 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:03:37,253 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 13:03:37,253 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:03:37,254 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:03:37,254 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:03:37,254 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:03:37,254 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:03:37,254 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:03:37,254 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:03:37 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 13:03:37,255 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 13:03:37,255 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:03:37,255 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:03:37,255 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:03:37,255 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:03:37,255 - httpcore.connection - DEBUG - close.started +2025-05-30 13:03:37,255 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:03:37,255 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 13:03:37,256 - 
httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:03:37,256 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:03:37,256 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:03:37,256 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:03:37,256 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:03:37,256 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:03:37,263 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:03:37 GMT'), (b'server', b'uvicorn'), (b'content-length', b'105762'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 13:03:37,263 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 13:03:37,263 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:03:37,263 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:03:37,263 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:03:37,263 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:03:37,263 - httpcore.connection - DEBUG - close.started +2025-05-30 13:03:37,263 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:03:37,274 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 13:03:37,299 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:03:37,299 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 13:03:37,422 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:03:37,422 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 13:03:37,427 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 13:03:37,577 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:03:37,577 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:03:37,577 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:03:37,577 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:03:37,577 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:03:37,577 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:03:37,717 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:03:37 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 13:03:37,718 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 13:03:37,718 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:03:37,718 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:03:37,718 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:03:37,718 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:03:37,719 - httpcore.connection - DEBUG - close.started +2025-05-30 13:03:37,719 - httpcore.connection - DEBUG - 
start_tls.complete return_value= +2025-05-30 13:03:37,719 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:03:37,719 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:03:37,720 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:03:37,720 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:03:37,720 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:03:37,720 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:03:37,873 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:03:37 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), (b'Content-Encoding', b'gzip')]) +2025-05-30 13:03:37,873 - httpx - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 13:03:37,874 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:03:37,874 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:03:37,874 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:03:37,875 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:03:37,875 - httpcore.connection - DEBUG - close.started +2025-05-30 13:03:37,875 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:03:38,014 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:38,014 - simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:38,014 - simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 13:03:38,014 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 13:03:38,014 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:38,014 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:38,015 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 13:03:38,015 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:38,015 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:38,475 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:03:38,704 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0 +2025-05-30 13:03:44,362 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:44,362 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:44,362 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 13:03:44,362 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:03:44,362 - 
simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell
+2025-05-30 13:03:44,362 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB
+2025-05-30 13:03:44,362 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell
+2025-05-30 13:03:44,362 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell
+2025-05-30 13:03:44,365 - auto_diffusers - INFO - Starting code generation for model: black-forest-labs/FLUX.1-schnell
+2025-05-30 13:03:44,366 - auto_diffusers - DEBUG - Parameters: prompt='A cat holding a sign that says hello world...', size=(768, 1360), steps=4
+2025-05-30 13:03:44,366 - auto_diffusers - DEBUG - Manual specs: True, Memory analysis provided: True
+2025-05-30 13:03:44,366 - auto_diffusers - INFO - Using manual hardware specifications
+2025-05-30 13:03:44,366 - auto_diffusers - DEBUG - Manual specs: {'platform': 'Linux', 'architecture': 'manual_input', 'cpu_count': 8, 'python_version': '3.11', 'cuda_available': False, 'mps_available': False, 'torch_version': '2.0+', 'manual_input': True, 'ram_gb': 16, 'user_dtype': None, 'gpu_info': [{'name': 'Custom GPU', 'memory_mb': 8192}]}
+2025-05-30 13:03:44,367 - auto_diffusers - DEBUG - GPU detected with 8.0 GB VRAM
+2025-05-30 13:03:44,367 - auto_diffusers - INFO - Selected optimization profile: balanced
+2025-05-30 13:03:44,367 - auto_diffusers - DEBUG - Creating generation prompt for Gemini API
+2025-05-30 13:03:44,367 - auto_diffusers - DEBUG - Prompt length: 7598 characters
+2025-05-30 13:03:44,367 - auto_diffusers - INFO - ================================================================================
+2025-05-30 13:03:44,367 - auto_diffusers - INFO - PROMPT SENT TO GEMINI API:
+2025-05-30 13:03:44,368 - auto_diffusers - INFO - ================================================================================
+2025-05-30 13:03:44,368 - auto_diffusers - INFO - 
+You are an expert in optimizing diffusers library code for different hardware configurations.
+
+NOTE: This system includes curated optimization knowledge from HuggingFace documentation.
+
+TASK: Generate optimized Python code for running a diffusion model with the following specifications:
+- Model: black-forest-labs/FLUX.1-schnell
+- Prompt: "A cat holding a sign that says hello world"
+- Image size: 768x1360
+- Inference steps: 4
+
+HARDWARE SPECIFICATIONS:
+- Platform: Linux (manual_input)
+- CPU Cores: 8
+- CUDA Available: False
+- MPS Available: False
+- Optimization Profile: balanced
+- GPU: Custom GPU (8.0 GB VRAM)
+
+MEMORY ANALYSIS:
+- Model Memory Requirements: 36.0 GB (FP16 inference)
+- Model Weights Size: 24.0 GB (FP16)
+- Memory Recommendation: 🔄 Requires sequential CPU offloading
+- Recommended Precision: float16
+- Attention Slicing Recommended: True
+- VAE Slicing Recommended: True
+
+OPTIMIZATION KNOWLEDGE BASE:
+
+# DIFFUSERS OPTIMIZATION TECHNIQUES
+
+## Memory Optimization Techniques
+
+### 1. Model CPU Offloading
+Use `enable_model_cpu_offload()` to move models between GPU and CPU automatically:
+```python
+pipe.enable_model_cpu_offload()
+```
+- Saves significant VRAM by keeping only active models on GPU
+- Automatic management, no manual intervention needed
+- Compatible with all pipelines
+
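+As a usage sketch (model id and step count are illustrative, and a CUDA device is assumed), the effect on peak VRAM can be observed directly:
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+# Sketch: measure peak VRAM with model offloading enabled.
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+)
+pipe.enable_model_cpu_offload()  # only the active sub-model occupies the GPU
+
+torch.cuda.reset_peak_memory_stats()
+image = pipe("a photo of an astronaut", num_inference_steps=20).images[0]
+print(f"Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.1f} GB")
+```
+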
+### 2. Sequential CPU Offloading
+Use `enable_sequential_cpu_offload()` for more aggressive memory saving:
+```python
+pipe.enable_sequential_cpu_offload()
+```
+- More memory efficient than model offloading
+- Moves models to CPU after each forward pass
+- Best for very limited VRAM scenarios
+
+### 3. Attention Slicing
+Use `enable_attention_slicing()` to reduce memory during attention computation:
+```python
+pipe.enable_attention_slicing()
+# or specify slice size
+pipe.enable_attention_slicing("max") # maximum slicing
+pipe.enable_attention_slicing(1) # slice_size = 1
+```
+- Trades compute time for memory
+- Most effective for high-resolution images
+- Can be combined with other techniques
+
+### 4. VAE Slicing
+Use `enable_vae_slicing()` for large batch processing:
+```python
+pipe.enable_vae_slicing()
+```
+- Decodes images one at a time instead of all at once
+- Essential for batch sizes > 4
+- Minimal performance impact on single images
+
+### 5. VAE Tiling
+Use `enable_vae_tiling()` for high-resolution image generation:
+```python
+pipe.enable_vae_tiling()
+```
+- Enables 4K+ image generation on 8GB VRAM
+- Splits images into overlapping tiles
+- Automatically disabled for 512x512 or smaller images
+
+### 6. Memory Efficient Attention (xFormers)
+Use `enable_xformers_memory_efficient_attention()` if xFormers is installed:
+```python
+pipe.enable_xformers_memory_efficient_attention()
+```
+- Significantly reduces memory usage and improves speed
+- Requires xformers library installation
+- Compatible with most models
+
+## Performance Optimization Techniques
+
+### 1. Half Precision (FP16/BF16)
+Use lower precision to reduce memory use and improve speed:
+```python
+# FP16 (widely supported)
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+
+# BF16 (better numerical stability, newer hardware)
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+```
+- FP16: Halves memory usage, widely supported
+- BF16: Better numerical stability, requires newer GPUs
+- Essential for most optimization scenarios
+
+### 2. Torch Compile (PyTorch 2.0+)
+Use `torch.compile()` for significant speed improvements:
+```python
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+# For some models, compile VAE too:
+pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead", fullgraph=True)
+```
+- 5-50% speed improvement
+- Requires PyTorch 2.0+
+- First run is slower due to compilation
+
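+Because compilation happens on first use, a warmup call is a common pattern (a sketch assuming PyTorch 2.x on CUDA; the model id is illustrative):
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+).to("cuda")
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+# First call compiles and is slow; subsequent calls reuse the compiled graph.
+_ = pipe("warmup", num_inference_steps=2)
+image = pipe("a photo of an astronaut").images[0]
+```
+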
+### 3. Fast Schedulers
+Use faster schedulers for fewer steps:
+```python
+from diffusers import LMSDiscreteScheduler, UniPCMultistepScheduler
+
+# LMS Scheduler (good quality, fast)
+pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+
+# UniPC Scheduler (fastest)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+```
+
+## Hardware-Specific Optimizations
+
+### NVIDIA GPU Optimizations
+```python
+# Enable cuDNN autotuning (picks the fastest kernels for fixed input sizes)
+torch.backends.cudnn.benchmark = True
+
+# Optimal data type for NVIDIA
+torch_dtype = torch.float16 # or torch.bfloat16 for RTX 30/40 series
+```
+
+### Apple Silicon (MPS) Optimizations
+```python
+# Use MPS device
+device = "mps" if torch.backends.mps.is_available() else "cpu"
+pipe = pipe.to(device)
+
+# Recommended dtype for Apple Silicon
+torch_dtype = torch.bfloat16 # Better than float16 on Apple Silicon
+
+# Attention slicing often helps on MPS
+pipe.enable_attention_slicing()
+```
+
+### CPU Optimizations
+```python
+# Use float32 for CPU
+torch_dtype = torch.float32
+
+# Enable optimized attention
+pipe.enable_attention_slicing()
+```
+
+## Model-Specific Guidelines
+
+### FLUX Models
+- Do NOT use guidance_scale parameter (not needed for FLUX)
+- Use 4-8 inference steps maximum
+- BF16 dtype recommended
+- Enable attention slicing for memory optimization
+
+### Stable Diffusion XL
+- Enable attention slicing for high resolutions
+- Use refiner model sparingly to save memory
+- Consider VAE tiling for >1024px images
+
+### Stable Diffusion 1.5/2.1
+- Very memory efficient base models
+- Can often run without optimizations on 8GB+ VRAM
+- Enable VAE slicing for batch processing
+
+## Memory Usage Estimation
+- FLUX.1: ~48GB for full precision (FP32), ~24GB for FP16 (12B parameters)
+- SDXL: ~7GB for FP16, ~14GB for FP32
+- SD 1.5: ~2GB for FP16, ~4GB for FP32
+
+## Optimization Combinations by VRAM
+
+### 24GB+ VRAM (High-end)
+```python
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+pipe = pipe.to("cuda")
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+### 12-24GB VRAM (Mid-range)
+```python
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+pipe.enable_model_cpu_offload()  # handles device placement; do not call pipe.to("cuda") first
+pipe.enable_xformers_memory_efficient_attention()
+```
+
+### 8-12GB VRAM (Entry-level)
+```python
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing()
+pipe.enable_vae_slicing()
+pipe.enable_xformers_memory_efficient_attention()
+```
+
+### <8GB VRAM (Low-end)
+```python
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing("max")
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
+```
+
+
+IMPORTANT: For FLUX.1-schnell models, do NOT include guidance_scale parameter as it's not needed.
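+A minimal FLUX.1-schnell call consistent with this note might look as follows (a sketch assuming the diffusers `FluxPipeline` API; sequential offload matches the 8GB VRAM recommendation above):
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
+)
+pipe.enable_sequential_cpu_offload()  # per the memory analysis, needed for 8GB-class GPUs
+
+image = pipe(
+    "A cat holding a sign that says hello world",
+    height=768, width=1360,
+    num_inference_steps=4,  # schnell is distilled for few-step inference
+    # no guidance_scale: FLUX.1-schnell does not use classifier-free guidance
+).images[0]
+image.save("flux_schnell.png")
+```
+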
+Using the OPTIMIZATION KNOWLEDGE BASE above, generate Python code that:
+
+1. **Selects the best optimization techniques** for the specific hardware profile
+2. **Applies appropriate memory optimizations** based on available VRAM
+3. **Uses optimal data types** for the target hardware:
+   - User specified dtype (if provided): Use exactly as specified
+   - Apple Silicon (MPS): prefer torch.bfloat16
+   - NVIDIA GPUs: prefer torch.float16 or torch.bfloat16
+   - CPU only: use torch.float32
+4. **Implements hardware-specific optimizations** (CUDA, MPS, CPU)
+5. **Follows model-specific guidelines** (e.g., FLUX guidance_scale handling)
+
+IMPORTANT GUIDELINES:
+- Reference the OPTIMIZATION KNOWLEDGE BASE to select appropriate techniques
+- Include all necessary imports
+- Add brief comments explaining optimization choices
+- Generate compact, production-ready code
+- Inline values where possible for concise code
+- Generate ONLY the Python code, no explanations before or after the code block
+
+2025-05-30 13:03:44,368 - auto_diffusers - INFO - ================================================================================
+2025-05-30 13:03:44,368 - auto_diffusers - INFO - Sending request to Gemini API
+2025-05-30 13:04:03,397 - auto_diffusers - INFO - Successfully received response from Gemini API (no tools used)
+2025-05-30 13:04:03,398 - auto_diffusers - DEBUG - Response length: 2233 characters
+2025-05-30 13:05:29,939 - __main__ - INFO - Initializing GradioAutodiffusers
+2025-05-30 13:05:29,939 - __main__ - DEBUG - API key found, length: 39
+2025-05-30 13:05:29,939 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator
+2025-05-30 13:05:29,939 - auto_diffusers - DEBUG - API key length: 39
+2025-05-30 13:05:29,939 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools
+2025-05-30 13:05:29,939 - hardware_detector - INFO - Initializing HardwareDetector
+2025-05-30 13:05:29,939 - hardware_detector - DEBUG - Starting system hardware detection
+2025-05-30 13:05:29,939 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64
+2025-05-30 13:05:29,939 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11
+2025-05-30 13:05:29,939 - hardware_detector - DEBUG - Attempting GPU detection via nvidia-smi
+2025-05-30 13:05:29,943 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected
+2025-05-30 13:05:29,943 - hardware_detector - DEBUG - Checking PyTorch availability
+2025-05-30 13:05:30,408 - hardware_detector - INFO - PyTorch 2.7.0 detected
+2025-05-30 13:05:30,408 - hardware_detector - DEBUG - CUDA available: False, MPS available: True
+2025-05-30 13:05:30,408 - hardware_detector - INFO - Hardware detection completed successfully
+2025-05-30 13:05:30,408 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'}
+2025-05-30 13:05:30,409 - auto_diffusers - INFO - Hardware detector initialized successfully
+2025-05-30 13:05:30,409 - __main__ - INFO - AutoDiffusersGenerator initialized successfully
+2025-05-30 13:05:30,409 - simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator
+2025-05-30 13:05:30,409 - simple_memory_calculator - DEBUG - HuggingFace API initialized
+2025-05-30 13:05:30,409 - simple_memory_calculator - DEBUG - Known models in database: 4
+2025-05-30 13:05:30,409 - __main__ - INFO - SimpleMemoryCalculator initialized successfully
+2025-05-30 13:05:30,409 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7
+2025-05-30 13:05:30,411 - asyncio - DEBUG - Using selector: KqueueSelector
+2025-05-30 13:05:30,412 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
+2025-05-30 13:05:30,429 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None
+2025-05-30 13:05:30,516 - asyncio - DEBUG - Using selector: KqueueSelector
+2025-05-30 13:05:30,550 
- httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 13:05:30,551 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:05:30,551 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:05:30,551 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:05:30,551 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:05:30,551 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:05:30,551 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:05:30,552 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:05:30 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 13:05:30,552 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 13:05:30,552 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:05:30,552 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:05:30,552 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:05:30,552 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:05:30,552 - httpcore.connection - DEBUG - close.started +2025-05-30 13:05:30,552 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:05:30,553 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 13:05:30,553 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:05:30,553 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:05:30,553 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:05:30,553 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:05:30,553 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:05:30,553 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:05:30,561 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:05:30 GMT'), (b'server', b'uvicorn'), (b'content-length', b'106851'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 13:05:30,561 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 13:05:30,561 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:05:30,561 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:05:30,561 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:05:30,561 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:05:30,561 - httpcore.connection - DEBUG - close.started +2025-05-30 13:05:30,561 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:05:30,573 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 13:05:30,598 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:05:30,599 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 13:05:30,688 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD 
/api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 13:05:30,711 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:05:30,711 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 13:05:30,891 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:05:30,892 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:05:30,892 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:05:30,892 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:05:30,892 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:05:30,893 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:05:30,991 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:05:30,991 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:05:30,991 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:05:30,991 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:05:30,992 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:05:30,992 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:05:31,039 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:05:30 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 13:05:31,040 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 13:05:31,040 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:05:31,040 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:05:31,040 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:05:31,040 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:05:31,041 - httpcore.connection - DEBUG - close.started +2025-05-30 13:05:31,041 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:05:31,135 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:05:31 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), (b'Content-Encoding', b'gzip')]) +2025-05-30 13:05:31,135 - httpx - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 13:05:31,135 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:05:31,136 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:05:31,136 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:05:31,136 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:05:31,136 - httpcore.connection - DEBUG - close.started +2025-05-30 13:05:31,136 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:05:31,772 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:05:31,821 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:05:31,821 - 
simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:05:31,821 - simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 13:05:31,821 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 13:05:31,821 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:05:31,821 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:05:31,821 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 13:05:31,822 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:05:31,822 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:05:32,345 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0 +2025-05-30 13:08:55,159 - __main__ - INFO - Initializing GradioAutodiffusers +2025-05-30 13:08:55,159 - __main__ - DEBUG - API key found, length: 39 +2025-05-30 13:08:55,159 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator +2025-05-30 13:08:55,159 - auto_diffusers - DEBUG - API key length: 39 +2025-05-30 13:08:55,159 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools +2025-05-30 13:08:55,159 - hardware_detector - INFO - Initializing HardwareDetector +2025-05-30 13:08:55,159 - hardware_detector - DEBUG - Starting system hardware detection +2025-05-30 13:08:55,159 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64 +2025-05-30 13:08:55,159 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11 +2025-05-30 13:08:55,159 - hardware_detector - DEBUG - Attempting GPU detection via nvidia-smi +2025-05-30 13:08:55,163 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected +2025-05-30 13:08:55,163 - hardware_detector - DEBUG - Checking PyTorch availability +2025-05-30 13:08:55,637 - hardware_detector - INFO - PyTorch 2.7.0 detected +2025-05-30 13:08:55,637 - hardware_detector - DEBUG - CUDA available: False, MPS available: True +2025-05-30 13:08:55,637 - hardware_detector - INFO - Hardware detection completed successfully +2025-05-30 13:08:55,637 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'} +2025-05-30 13:08:55,637 - auto_diffusers - INFO - Hardware detector initialized successfully +2025-05-30 13:08:55,637 - __main__ - INFO - AutoDiffusersGenerator initialized successfully +2025-05-30 13:08:55,637 - simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator +2025-05-30 13:08:55,637 - simple_memory_calculator - DEBUG - HuggingFace API initialized +2025-05-30 13:08:55,637 - simple_memory_calculator - DEBUG - Known models in database: 4 +2025-05-30 13:08:55,637 - __main__ - INFO - SimpleMemoryCalculator initialized successfully +2025-05-30 13:08:55,638 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7 +2025-05-30 13:08:55,640 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:08:55,647 - urllib3.connectionpool - DEBUG - Starting new 
HTTPS connection (1): huggingface.co:443 +2025-05-30 13:08:55,654 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None +2025-05-30 13:08:55,739 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:08:55,771 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 13:08:55,772 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:08:55,772 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:08:55,772 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:08:55,772 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:08:55,772 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:08:55,773 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:08:55,773 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:08:55 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 13:08:55,773 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 13:08:55,773 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:08:55,773 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:08:55,773 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:08:55,773 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:08:55,773 - httpcore.connection - DEBUG - close.started +2025-05-30 13:08:55,773 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:08:55,774 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 13:08:55,774 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:08:55,774 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:08:55,774 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:08:55,774 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:08:55,774 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:08:55,774 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:08:55,781 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:08:55 GMT'), (b'server', b'uvicorn'), (b'content-length', b'106859'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 13:08:55,781 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 13:08:55,781 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:08:55,781 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:08:55,781 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:08:55,781 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:08:55,781 - httpcore.connection - DEBUG - close.started +2025-05-30 13:08:55,781 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:08:55,793 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 13:08:55,816 - 
httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:08:55,816 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 13:08:55,943 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 13:08:55,949 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:08:55,949 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 13:08:56,092 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:08:56,093 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:08:56,093 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:08:56,093 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:08:56,093 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:08:56,093 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:08:56,234 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:08:56 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 13:08:56,235 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 13:08:56,235 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:08:56,236 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:08:56,236 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:08:56,236 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:08:56,237 - httpcore.connection - DEBUG - close.started +2025-05-30 13:08:56,237 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:08:56,263 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:08:56,263 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:08:56,264 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:08:56,264 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:08:56,264 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:08:56,264 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:08:56,421 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:08:56 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), (b'Content-Encoding', b'gzip')]) +2025-05-30 13:08:56,422 - httpx - INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 13:08:56,422 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:08:56,422 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:08:56,422 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:08:56,422 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:08:56,422 - httpcore.connection - DEBUG - close.started +2025-05-30 13:08:56,423 - httpcore.connection - DEBUG - 
close.complete +2025-05-30 13:08:57,087 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:08:57,301 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0 +2025-05-30 13:09:01,464 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:09:01,464 - simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:09:01,465 - simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 13:09:01,465 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 13:09:01,465 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:09:01,465 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:09:01,465 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 13:09:01,465 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:09:01,465 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:26,389 - __main__ - INFO - Initializing GradioAutodiffusers +2025-05-30 13:10:26,389 - __main__ - DEBUG - API key found, length: 39 +2025-05-30 13:10:26,389 - auto_diffusers - INFO - Initializing AutoDiffusersGenerator +2025-05-30 13:10:26,389 - auto_diffusers - DEBUG - API key length: 39 +2025-05-30 13:10:26,389 - auto_diffusers - WARNING - Tool calling dependencies not available, running without tools +2025-05-30 13:10:26,389 - hardware_detector - INFO - Initializing HardwareDetector +2025-05-30 13:10:26,389 - hardware_detector - DEBUG - Starting system hardware detection +2025-05-30 13:10:26,389 - hardware_detector - DEBUG - Platform: Darwin, Architecture: arm64 +2025-05-30 13:10:26,389 - hardware_detector - DEBUG - CPU cores: 16, Python: 3.11.11 +2025-05-30 13:10:26,389 - hardware_detector - DEBUG - Attempting GPU detection via nvidia-smi +2025-05-30 13:10:26,393 - hardware_detector - DEBUG - nvidia-smi not found, no NVIDIA GPU detected +2025-05-30 13:10:26,393 - hardware_detector - DEBUG - Checking PyTorch availability +2025-05-30 13:10:26,846 - hardware_detector - INFO - PyTorch 2.7.0 detected +2025-05-30 13:10:26,846 - hardware_detector - DEBUG - CUDA available: False, MPS available: True +2025-05-30 13:10:26,846 - hardware_detector - INFO - Hardware detection completed successfully +2025-05-30 13:10:26,846 - hardware_detector - DEBUG - Detected specs: {'platform': 'Darwin', 'architecture': 'arm64', 'cpu_count': 16, 'python_version': '3.11.11', 'gpu_info': None, 'cuda_available': False, 'mps_available': True, 'torch_version': '2.7.0'} +2025-05-30 13:10:26,846 - auto_diffusers - INFO - Hardware detector initialized successfully +2025-05-30 13:10:26,846 - __main__ - INFO - AutoDiffusersGenerator initialized successfully +2025-05-30 13:10:26,846 - simple_memory_calculator - INFO - Initializing SimpleMemoryCalculator +2025-05-30 13:10:26,846 - simple_memory_calculator - DEBUG - HuggingFace API initialized +2025-05-30 13:10:26,846 - simple_memory_calculator - DEBUG - Known models in database: 4 +2025-05-30 13:10:26,847 - __main__ - INFO - 
SimpleMemoryCalculator initialized successfully +2025-05-30 13:10:26,847 - __main__ - DEBUG - Default model settings: gemini-2.5-flash-preview-05-20, temp=0.7 +2025-05-30 13:10:26,849 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:10:26,861 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None +2025-05-30 13:10:26,869 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:10:26,948 - asyncio - DEBUG - Using selector: KqueueSelector +2025-05-30 13:10:26,990 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=None socket_options=None +2025-05-30 13:10:26,991 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:10:26,991 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:10:26,991 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:10:26,991 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:10:26 GMT'), (b'server', b'uvicorn'), (b'content-length', b'4'), (b'content-type', b'application/json')]) +2025-05-30 13:10:26,992 - httpx - INFO - HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK" +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:10:26,992 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:10:26,992 - httpcore.connection - DEBUG - close.started +2025-05-30 13:10:26,992 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:10:26,993 - httpcore.connection - DEBUG - connect_tcp.started host='localhost' port=7860 local_address=None timeout=3 socket_options=None +2025-05-30 13:10:26,994 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:10:26,994 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:10:26,995 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:10:26,995 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:10:26,996 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:10:26,996 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:10:27,001 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'date', b'Fri, 30 May 2025 04:10:26 GMT'), (b'server', b'uvicorn'), (b'content-length', b'108000'), (b'content-type', b'text/html; charset=utf-8')]) +2025-05-30 13:10:27,002 - httpx - INFO - HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK" +2025-05-30 13:10:27,002 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:10:27,003 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:10:27,003 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:10:27,003 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:10:27,004 - 
httpcore.connection - DEBUG - close.started +2025-05-30 13:10:27,004 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:10:27,017 - httpcore.connection - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=30 socket_options=None +2025-05-30 13:10:27,221 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:10:27,222 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=30 +2025-05-30 13:10:27,245 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:27,245 - simple_memory_calculator - INFO - Using known memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:27,245 - simple_memory_calculator - DEBUG - Known data: {'params_billions': 12.0, 'fp16_gb': 24.0, 'inference_fp16_gb': 36.0} +2025-05-30 13:10:27,245 - simple_memory_calculator - INFO - Generating memory recommendations for black-forest-labs/FLUX.1-schnell with 8.0GB VRAM +2025-05-30 13:10:27,245 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:27,245 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:27,245 - simple_memory_calculator - DEBUG - Model memory: 24.0GB, Inference memory: 36.0GB +2025-05-30 13:10:27,245 - simple_memory_calculator - INFO - Getting memory requirements for model: black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:27,245 - simple_memory_calculator - DEBUG - Using cached memory data for black-forest-labs/FLUX.1-schnell +2025-05-30 13:10:27,252 - httpcore.connection - DEBUG - connect_tcp.complete return_value= +2025-05-30 13:10:27,252 - httpcore.connection - DEBUG - start_tls.started ssl_context= server_hostname='api.gradio.app' timeout=3 +2025-05-30 13:10:27,259 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/initiated HTTP/1.1" 200 0 +2025-05-30 13:10:27,511 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:10:27,511 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:10:27,511 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:10:27,511 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:10:27,511 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:10:27,511 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:10:27,588 - httpcore.connection - DEBUG - start_tls.complete return_value= +2025-05-30 13:10:27,588 - httpcore.http11 - DEBUG - send_request_headers.started request= +2025-05-30 13:10:27,589 - httpcore.http11 - DEBUG - send_request_headers.complete +2025-05-30 13:10:27,589 - httpcore.http11 - DEBUG - send_request_body.started request= +2025-05-30 13:10:27,589 - httpcore.http11 - DEBUG - send_request_body.complete +2025-05-30 13:10:27,589 - httpcore.http11 - DEBUG - receive_response_headers.started request= +2025-05-30 13:10:27,657 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:10:27 GMT'), (b'Content-Type', b'text/html; charset=utf-8'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'ContentType', b'application/json'), (b'Access-Control-Allow-Origin', b'*'), (b'Content-Encoding', b'gzip')]) +2025-05-30 13:10:27,657 - httpx - 
INFO - HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK" +2025-05-30 13:10:27,658 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:10:27,658 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:10:27,658 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:10:27,659 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:10:27,659 - httpcore.connection - DEBUG - close.started +2025-05-30 13:10:27,659 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:10:27,757 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 30 May 2025 04:10:27 GMT'), (b'Content-Type', b'application/json'), (b'Content-Length', b'21'), (b'Connection', b'keep-alive'), (b'Server', b'nginx/1.18.0'), (b'Access-Control-Allow-Origin', b'*')]) +2025-05-30 13:10:27,758 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2025-05-30 13:10:27,758 - httpcore.http11 - DEBUG - receive_response_body.started request= +2025-05-30 13:10:27,758 - httpcore.http11 - DEBUG - receive_response_body.complete +2025-05-30 13:10:27,758 - httpcore.http11 - DEBUG - response_closed.started +2025-05-30 13:10:27,758 - httpcore.http11 - DEBUG - response_closed.complete +2025-05-30 13:10:27,758 - httpcore.connection - DEBUG - close.started +2025-05-30 13:10:27,758 - httpcore.connection - DEBUG - close.complete +2025-05-30 13:10:28,324 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 +2025-05-30 13:10:28,548 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/1.1" 200 0