Spaces:

comrender
/

fluxhdupscaler

Running on Zero

App Files Files Community

comrender committed on 4 days ago

Commit

597b21e

verified ·

1 Parent(s): 6b42d6b

Update app.py

Browse files

Files changed (1) hide show

app.py +180 -464

app.py CHANGED Viewed

@@ -1,491 +1,207 @@
-import logging
-import random
 import warnings
-import os
 import gradio as gr
-import numpy as np
-import spaces
 import torch
-from diffusers import FluxImg2ImgPipeline
-from transformers import AutoProcessor, AutoModelForCausalLM
-from gradio_imageslider import ImageSlider
 from PIL import Image
-from huggingface_hub import snapshot_download
-import requests
-# For ESRGAN (requires pip install basicsr gfpgan)
 try:
-    from basicsr.archs.rrdbnet_arch import RRDBNet
-    from basicsr.utils import img2tensor, tensor2img
-    USE_ESRGAN = True
 except ImportError:
-    USE_ESRGAN = False
     warnings.warn("basicsr not installed; falling back to LANCZOS interpolation.")
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 800px;
-}
-.main-header {
-    text-align: center;
-    margin-bottom: 2rem;
-}
-"""
-# Device setup - Force CPU for startup in ZeroGPU
-power_device = "ZeroGPU"
-device = "cpu"
-# Get HuggingFace token
-huggingface_token = os.getenv("HF_TOKEN")
-# Download FLUX model
-print("📥 Downloading FLUX model...")
-model_path = snapshot_download(
-    repo_id="black-forest-labs/FLUX.1-dev",
-    repo_type="model",
-    ignore_patterns=["*.md", "*.gitattributes"],
-    local_dir="FLUX.1-dev",
-    token=huggingface_token,
-)
-# Load Florence-2 model for image captioning on CPU
-print("📥 Loading Florence-2 model...")
-florence_model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Florence-2-large",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True,
-    attn_implementation="eager"
-).to(device)
 florence_processor = AutoProcessor.from_pretrained(
-    "microsoft/Florence-2-large",
     trust_remote_code=True
 )
-# Load FLUX Img2Img pipeline on CPU
-print("📥 Loading FLUX Img2Img...")
-pipe = FluxImg2ImgPipeline.from_pretrained(
-    model_path,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-)
-pipe.enable_vae_tiling()
-pipe.enable_vae_slicing()
-print("✅ All models loaded successfully!")
-# Download ESRGAN model if using
-if USE_ESRGAN:
-    esrgan_path = "4x-UltraSharp.pth"
-    if not os.path.exists(esrgan_path):
-        url = "https://huggingface.co/uwg/upscaler/resolve/main/ESRGAN/4x-UltraSharp.pth"
-        with open(esrgan_path, "wb") as f:
-            f.write(requests.get(url).content)
-    esrgan_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
-    state_dict = torch.load(esrgan_path)['params_ema']
-    esrgan_model.load_state_dict(state_dict)
-    esrgan_model.eval()
-MAX_SEED = 1000000
-MAX_PIXEL_BUDGET = 8192 * 8192  # Increased for tiling support
-def generate_caption(image):
-    """Generate detailed caption using Florence-2"""
-    try:
-        task_prompt = "<MORE_DETAILED_CAPTION>"
-        prompt = task_prompt
-        inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to(florence_model.device)  # Fixed: Use model's current device instead of static 'device'
-        generated_ids = florence_model.generate(
-            input_ids=inputs["input_ids"],
-            pixel_values=inputs["pixel_values"],
-            max_new_tokens=1024,
-            num_beams=3,
-            do_sample=True,
-        )
-        generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer = florence_processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
-        caption = parsed_answer[task_prompt]
-        return caption
-    except Exception as e:
-        print(f"Caption generation failed: {e}")
-        return "a high quality detailed image"
-def process_input(input_image, upscale_factor):
-    """Process input image and handle size constraints"""
-    w, h = input_image.size
-    w_original, h_original = w, h
-    aspect_ratio = w / h
-    was_resized = False
-    if w * h * upscale_factor**2 > MAX_PIXEL_BUDGET:
-        warnings.warn(
-            f"Requested output image is too large ({w * upscale_factor}x{h * upscale_factor}). Resizing to fit budget."
-        )
-        gr.Info(
-            f"Requested output image is too large. Resizing input to fit within pixel budget."
-        )
-        target_input_pixels = MAX_PIXEL_BUDGET / (upscale_factor ** 2)
-        scale = (target_input_pixels / (w * h)) ** 0.5
-        new_w = int(w * scale) - int(w * scale) % 16  # Fixed: Use % 16 for FLUX alignment (was % 8)
-        new_h = int(h * scale) - int(h * scale) % 16  # Fixed: Use % 16 for FLUX alignment (was % 8)
-        input_image = input_image.resize((new_w, new_h), resample=Image.LANCZOS)
-        was_resized = True
-    return input_image, w_original, h_original, was_resized
-def load_image_from_url(url):
-    """Load image from URL"""
-    try:
-        response = requests.get(url, stream=True)
-        response.raise_for_status()
-        return Image.open(response.raw)
-    except Exception as e:
-        raise gr.Error(f"Failed to load image from URL: {e}")
-def esrgan_upscale(image, scale=4):
-    if not USE_ESRGAN:
-        return image.resize((image.width * scale, image.height * scale), resample=Image.LANCZOS)
-    img = img2tensor(np.array(image) / 255., bgr2rgb=False, float32=True)
-    with torch.no_grad():
-        output = esrgan_model(img.unsqueeze(0)).squeeze()
-    output_img = tensor2img(output, rgb2bgr=False, min_max=(0, 1))
-    return Image.fromarray(output_img)
-def tiled_flux_img2img(pipe, prompt, image, strength, steps, guidance, generator, tile_size=1024, overlap=32):
-    """Tiled Img2Img to mimic Ultimate SD Upscaler tiling"""
-    w, h = image.size
-    output = image.copy()  # Start with the control image
-    for x in range(0, w, tile_size - overlap):
-        for y in range(0, h, tile_size - overlap):
-            tile_w = min(tile_size, w - x)
-            tile_h = min(tile_size, h - y)
-            tile = image.crop((x, y, x + tile_w, y + tile_h))
-            # Run Flux on tile
-            gen_tile = pipe(
-                prompt=prompt,
-                image=tile,
-                strength=strength,
-                num_inference_steps=steps,
-                guidance_scale=guidance,
-                height=tile_h,
-                width=tile_w,
-                generator=generator,
-            ).images[0]
-            # Fixed: Resize generated tile back to exact tile dimensions if pipeline auto-resized for multiple-of-16 requirement
-            gen_tile = gen_tile.resize((tile_w, tile_h), resample=Image.LANCZOS)
-            # Paste with blending if overlap
-            if overlap > 0:
-                paste_box = (x, y, x + tile_w, y + tile_h)
-                if x > 0 or y > 0:
-                    # Simple linear blend on overlaps
-                    mask = Image.new('L', (tile_w, tile_h), 255)
-                    if x > 0:
-                        for i in range(overlap):
-                            for j in range(tile_h):
-                                mask.putpixel((i, j), int(255 * (i / overlap)))
-                    if y > 0:
-                        for i in range(tile_w):
-                            for j in range(overlap):
-                                mask.putpixel((i, j), int(255 * (j / overlap)))
-                    output.paste(gen_tile, paste_box, mask)
-                else:
-                    output.paste(gen_tile, paste_box)
-            else:
-                output.paste(gen_tile, (x, y))
-    return output
-@spaces.GPU(duration=120)
-def enhance_image(
-    image_input,
-    image_url,
-    seed,
-    randomize_seed,
-    num_inference_steps,
-    upscale_factor,
-    denoising_strength,
-    use_generated_caption,
-    custom_prompt,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """Main enhancement function"""
-    # Move models to GPU inside the function
-    pipe.to("cuda")
-    florence_model.to("cuda")
-    # Handle image input
-    if image_input is not None:
-        input_image = image_input
-    elif image_url:
-        input_image = load_image_from_url(image_url)
-    else:
-        raise gr.Error("Please provide an image (upload or URL)")
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    true_input_image = input_image
-    # Process input image
-    input_image, w_original, h_original, was_resized = process_input(
-        input_image, upscale_factor
     )
-    # Generate caption if requested
-    if use_generated_caption:
-        gr.Info("🔍 Generating image caption...")
-        generated_caption = generate_caption(input_image)
-        prompt = generated_caption
-    else:
-        prompt = custom_prompt if custom_prompt.strip() else ""
-    generator = torch.Generator(device="cuda").manual_seed(seed)
-    gr.Info("🚀 Upscaling image...")
-    # Initial upscale
-    if USE_ESRGAN and upscale_factor == 4:
-        esrgan_model.to("cuda")
-        control_image = esrgan_upscale(input_image, upscale_factor)
-        esrgan_model.to("cpu")
-    else:
-        w, h = input_image.size
-        control_image = input_image.resize((w * upscale_factor, h * upscale_factor), resample=Image.LANCZOS)
-    # Tiled Flux Img2Img for refinement
-    image = tiled_flux_img2img(
-        pipe,
-        prompt,
-        control_image,
-        denoising_strength,
-        num_inference_steps,
-        1.0,  # Hardcoded guidance_scale to 1
-        generator,
-        tile_size=1024,
-        overlap=32
     )
-    if was_resized:
-        gr.Info(f"📏 Resizing output to target size: {w_original * upscale_factor}x{h_original * upscale_factor}")
-        image = image.resize((w_original * upscale_factor, h_original * upscale_factor), resample=Image.LANCZOS)
-    # Resize input image to match output size for slider alignment
-    resized_input = true_input_image.resize(image.size, resample=Image.LANCZOS)
-    # Move back to CPU to release GPU
-    pipe.to("cpu")
-    florence_model.to("cpu")
-    return [resized_input, image]
-# Create Gradio interface
-with gr.Blocks(css=css, title="🎨 AI Image Upscaler - Florence-2 + FLUX") as demo:
-    gr.HTML("""
-    <div class="main-header">
-        <h1>🎨 AI Image Upscaler</h1>
-        <p>Upload an image or provide a URL to upscale it using Florence-2 captioning and FLUX upscaling</p>
-        <p>Currently running on <strong>{}</strong></p>
-    </div>
-    """.format(power_device))
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.HTML("<h3>📤 Input</h3>")
-            with gr.Tabs():
-                with gr.TabItem("📁 Upload Image"):
-                    input_image = gr.Image(
-                        label="Upload Image",
-                        type="pil",
-                        height=200  # Made smaller
-                    )
-                with gr.TabItem("🔗 Image URL"):
-                    image_url = gr.Textbox(
-                        label="Image URL",
-                        placeholder="https://example.com/image.jpg",
-                        value="https://upload.wikimedia.org/wikipedia/commons/thumb/a/a7/Example.jpg/800px-Example.jpg"
-                    )
-            gr.HTML("<h3>🎛️ Caption Settings</h3>")
-            use_generated_caption = gr.Checkbox(
-                label="Use AI-generated caption (Florence-2)",
-                value=True,
-                info="Generate detailed caption automatically"
-            )
-            custom_prompt = gr.Textbox(
-                label="Custom Prompt (optional)",
-                placeholder="Enter custom prompt or leave empty for generated caption",
-                lines=2
-            )
-            gr.HTML("<h3>⚙️ Upscaling Settings</h3>")
-            upscale_factor = gr.Slider(
-                label="Upscale Factor",
-                minimum=1,
-                maximum=4,
-                step=1,
-                value=2,
-                info="How much to upscale the image"
-            )
-            num_inference_steps = gr.Slider(
-                label="Number of Inference Steps",
-                minimum=8,
-                maximum=50,
-                step=1,
-                value=25,
-                info="More steps = better quality but slower"
-            )
-            denoising_strength = gr.Slider(
-                label="Denoising Strength",
-                minimum=0.0,
-                maximum=1.0,
-                step=0.05,
-                value=0.3,
-                info="Controls how much the image is transformed"
-            )
-            with gr.Row():
-                randomize_seed = gr.Checkbox(
-                    label="Randomize seed",
-                    value=True
-                )
-                seed = gr.Slider(
-                    label="Seed",
-                    minimum=0,
-                    maximum=MAX_SEED,
-                    step=1,
-                    value=42,
-                    interactive=True
-                )
-            enhance_btn = gr.Button(
-                "🚀 Upscale Image",
-                variant="primary",
-                size="lg"
-            )
-        with gr.Column(scale=2):  # Larger scale for results
-            gr.HTML("<h3>📊 Results</h3>")
-            result_slider = ImageSlider(
-                type="pil",
-                interactive=False,  # Disable interactivity to prevent uploads
-                height=600,  # Made larger
-                elem_id="result_slider",
-                label=None  # Remove default label
-            )
-    # Event handler
-    enhance_btn.click(
-        fn=enhance_image,
-        inputs=[
-            input_image,
-            image_url,
-            seed,
-            randomize_seed,
-            num_inference_steps,
-            upscale_factor,
-            denoising_strength,
-            use_generated_caption,
-            custom_prompt,
-        ],
-        outputs=[result_slider]
-    )
-    gr.HTML("""
-    <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 8px;">
-        <p><strong>Note:</strong> This upscaler uses the Flux dev model. Users are responsible for obtaining commercial rights if used commercially under their license.</p>
-    </div>
-    """)
-    # Custom CSS for slider
-    gr.HTML("""
-    <style>
-        #result_slider .slider {
-            width: 100% !important;
-            max-width: inherit !important;
-        }
-        #result_slider img {
-            object-fit: contain !important;
-            width: 100% !important;
-            height: auto !important;
-        }
-        #result_slider .gr-button-tool {
-            display: none !important;
-        }
-        #result_slider .gr-button-undo {
-            display: none !important;
-        }
-        #result_slider .gr-button-clear {
-            display: none !important;
-        }
-        #result_slider .badge-container .badge {
-            display: none !important;
-        }
-        #result_slider .badge-container::before {
-            content: "Before";
-            position: absolute;
-            top: 10px;
-            left: 10px;
-            background: rgba(0,0,0,0.5);
-            color: white;
-            padding: 5px;
-            border-radius: 5px;
-            z-index: 10;
-        }
-        #result_slider .badge-container::after {
-            content: "After";
-            position: absolute;
-            top: 10px;
-            right: 10px;
-            background: rgba(0,0,0,0.5);
-            color: white;
-            padding: 5px;
-            border-radius: 5px;
-            z-index: 10;
-        }
-        #result_slider .fullscreen img {
-            object-fit: contain !important;
-            width: 100vw !important;
-            height: 100vh !important;
-        }
-    </style>
-    """)
-    # JS to set slider default position to middle
-    gr.HTML("""
-    <script>
-        document.addEventListener('DOMContentLoaded', function() {
-            const sliderInput = document.querySelector('#result_slider input[type="range"]');
-            if (sliderInput) {
-                sliderInput.value = 50;
-                sliderInput.dispatchEvent(new Event('input'));
-            }
-        });
-    </script>
-    """)
-if __name__ == "__main__":
-    demo.queue().launch(share=True, server_name="0.0.0.0", server_port=7860)

 import warnings
 import gradio as gr
 import torch
 from PIL import Image
+from transformers import AutoProcessor, Florence2ForConditionalGeneration
+from diffusers import AutoPipelineForImage2Image
+import random
+import numpy as np
+import os
+import spaces
 try:
+    import basicsr
+    # Assume basicsr interpolation setup
+    interpolation = "basicsr"  # Placeholder for actual basicsr usage
 except ImportError:
     warnings.warn("basicsr not installed; falling back to LANCZOS interpolation.")
+    interpolation = Image.LANCZOS
+# Initialize models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.bfloat16
+huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+# Load FLUX img2img pipeline
+pipe = AutoPipelineForImage2Image.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=dtype,
+    token=huggingface_token
+).to(device)
+pipe.enable_vae_tiling()  # To help with memory for large images
+# Initialize Florence model with float32 to avoid dtype mismatch
+florence_model = Florence2ForConditionalGeneration.from_pretrained(
+    'microsoft/Florence-2-large',
     trust_remote_code=True,
+    torch_dtype=torch.float32
+).to(device).eval()
 florence_processor = AutoProcessor.from_pretrained(
+    'microsoft/Florence-2-large',
     trust_remote_code=True
 )
+MAX_SEED = np.iinfo(np.int32).max
+MAX_IMAGE_SIZE = 2048
+# Florence caption function
+@spaces.GPU
+def florence_caption(image):
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+    inputs = florence_processor(text="<DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
+    generated_ids = florence_model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        early_stopping=False,
+        do_sample=False,
+        num_beams=3,
     )
+    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    parsed_answer = florence_processor.post_process_generation(
+        generated_text,
+        task="<DETAILED_CAPTION>",
+        image_size=(image.width, image.height)
     )
+    return parsed_answer["<DETAILED_CAPTION>"]
+# Tiled FLUX img2img function with fix for small dimensions and overlap
+def tiled_flux_img2img(image, prompt, strength, num_inference_steps, guidance_scale, tile_size=512, overlap=64):
+    width, height = image.size
+    # Resize to multiple of 16 to avoid dimension warnings
+    width = (width // 16) * 16 if width >= 16 else 16
+    height = (height // 16) * 16 if height >= 16 else 16
+    if width != image.size[0] or height != image.size[1]:
+        image = image.resize((width, height), resample=interpolation)
+    result = Image.new('RGB', (width, height))
+    stride = tile_size - overlap
+    # For simplicity, tile in both directions, but handle small sizes
+    for y in range(0, height, stride):
+        for x in range(0, width, stride):
+            tile_left = x
+            tile_top = y
+            tile_right = min(x + tile_size, width)
+            tile_bottom = min(y + tile_size, height)
+            tile = image.crop((tile_left, tile_top, tile_right, tile_bottom))
+            # Skip if tile is too small
+            if tile.width < 16 or tile.height < 16:
+                continue
+            # Generate with img2img
+            generated_tile = pipe(
+                prompt,
+                image=tile,
+                strength=strength,
+                guidance_scale=guidance_scale,
+                num_inference_steps=num_inference_steps
+            ).images[0]
+            generated_tile = generated_tile.resize(tile.size)  # Ensure size match
+            # Paste without blend if first tile
+            if x == 0 and y == 0:
+                result.paste(generated_tile, (tile_left, tile_top))
+                continue
+            # Blend with previous if overlap
+            if y > 0:  # Vertical blend
+                effective_overlap = min(overlap, tile_bottom - tile_top, result.crop((tile_left, tile_top - overlap, tile_right, tile_top)).height)
+                if effective_overlap > 0:
+                    mask = Image.new('L', (tile_right - tile_left, effective_overlap))
+                    for i in range(mask.width):
+                        for j in range(mask.height):
+                            # Fixed: use effective_overlap for division and range
+                            mask.putpixel((i, j), int(255 * (j / (effective_overlap - 1 if effective_overlap > 1 else 1))))
+                    # Blend the top part of the tile with the bottom of the previous
+                    blend_region = Image.composite(
+                        generated_tile.crop((0, 0, mask.width, mask.height)),
+                        result.crop((tile_left, tile_top, tile_right, tile_top + mask.height)),
+                        mask
+                    )
+                    result.paste(blend_region, (tile_left, tile_top))
+                    # Paste the non-overlap part
+                    result.paste(generated_tile.crop((0, effective_overlap, generated_tile.width, generated_tile.height)), (tile_left, tile_top + effective_overlap))
+                else:
+                    result.paste(generated_tile, (tile_left, tile_top))
+            # Similar for horizontal blend (if x > 0), implement analogously
+            if x > 0:  # Horizontal blend
+                # Similar logic, but for left overlap, gradient horizontal
+                effective_overlap_h = min(overlap, tile_right - tile_left)
+                if effective_overlap_h > 0:
+                    mask_h = Image.new('L', (effective_overlap_h, tile_bottom - tile_top))
+                    for i in range(mask_h.width):
+                        for j in range(mask_h.height):
+                            mask_h.putpixel((i, j), int(255 * (i / (effective_overlap_h - 1 if effective_overlap_h > 1 else 1))))
+                    # Blend left part
+                    blend_region_h = Image.composite(
+                        generated_tile.crop((0, 0, mask_h.width, mask_h.height)),
+                        result.crop((tile_left, tile_top, tile_left + mask_h.width, tile_bottom)),
+                        mask_h
+                    )
+                    result.paste(blend_region_h, (tile_left, tile_top))
+                    # Paste non-overlap
+                    result.paste(generated_tile.crop((effective_overlap_h, 0, generated_tile.width, generated_tile.height)), (tile_left + effective_overlap_h, tile_top))
+                else:
+                    result.paste(generated_tile, (tile_left, tile_top))
+    return result
+# Main enhance function
+@spaces.GPU(duration=190)
+def enhance_image(image, text_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, strength, progress=gr.Progress(track_tqdm=True)):
+    prompt = text_prompt
+    if image is not None:
+        prompt = florence_caption(image)
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    generator = torch.Generator(device=device).manual_seed(seed)
+    # Use tiled if large, else direct
+    if image.size[0] > MAX_IMAGE_SIZE or image.size[1] > MAX_IMAGE_SIZE:
+        output_image = tiled_flux_img2img(image, prompt, strength, num_inference_steps, guidance_scale)
+    else:
+        output_image = pipe(
+            prompt,
+            image=image,
+            generator=generator,
+            num_inference_steps=num_inference_steps,
+            width=width,
+            height=height,
+            guidance_scale=guidance_scale,
+            strength=strength
+        ).images[0]
+    return output_image, prompt, seed
+# Gradio interface
+title = "<h1 align='center'>FLUX Image Enhancer with Florence-2 Captioner</h1>"
+with gr.Blocks() as demo:
+    gr.HTML(title)
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(label="Upload Image")
+            text_prompt = gr.Textbox(label="Text Prompt (if no image)")
+            strength = gr.Slider(label="Strength", minimum=0.1, maximum=1.0, value=0.8)
+            guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, value=5.0)
+            num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=20)
+            seed = gr.Number(value=42, label="Seed")
+            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+            width = gr.Slider(minimum=256, maximum=1024, step=16, value=512, label="Width")
+            height = gr.Slider(minimum=256, maximum=1024, step=16, value=512, label="Height")
+            submit = gr.Button("Enhance")
+        with gr.Column():
+            output_image = gr.Image(label="Enhanced Image")
+            output_prompt = gr.Textbox(label="Generated Prompt")
+            output_seed = gr.Number(label="Used Seed")
+    submit.click(
+        enhance_image,
+        inputs=[input_image, text_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, strength],
+        outputs=[output_image, output_prompt, output_seed]
+    )
+print("✅ All models loaded successfully!")
+demo.launch(server_port=7860, server_name="0.0.0.0")