multimodalart (HF Staff) committed
Commit 7a2b253 · verified · Parent: 4743e79

Update app.py

Rework the demo around a single editable "Full Prompt": add a Style-Driven generation mode next to the existing Subject-Driven one, make segmentation of the reference image optional, and expose the processed reference, the full diptych, and the final prompt as additional outputs.

Files changed (1)
  1. app.py +146 -64
app.py CHANGED
@@ -19,7 +19,7 @@ import random
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# --- Helper Dataclasses (Identical to diptych_prompting_inference.py) ---
+# --- Helper Dataclasses (Identical to previous version) ---
 @dataclass
 class BoundingBox:
     xmin: int
@@ -48,7 +48,7 @@ class DetectionResult:
                                    ymax=detection_dict['box']['ymax']))
 
 
-# --- Helper Functions (Identical to diptych_prompting_inference.py) ---
+# --- Helper Functions (Identical to previous version) ---
 def mask_to_polygon(mask: np.ndarray) -> List[List[int]]:
     contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     if not contours:
@@ -127,7 +127,7 @@ def make_diptych(image):
     return Image.fromarray(diptych_np)
 
 
-# --- Custom Attention Processor (EXACTLY as in diptych_prompting_inference.py) ---
+# --- Custom Attention Processor (Identical to previous version) ---
 class CustomFluxAttnProcessor2_0:
     def __init__(self, height=44, width=88, attn_enforce=1.0):
         if not hasattr(F, "scaled_dot_product_attention"):
@@ -197,7 +197,6 @@ class CustomFluxAttnProcessor2_0:
 print("--- Loading Models: This may take a few minutes and requires >40GB VRAM ---")
 controlnet = FluxControlNetModel.from_pretrained("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", torch_dtype=torch.bfloat16)
 pipe = FluxControlNetInpaintingPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16).to(device)
-# pipe.load_lora_weights(hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"))
 
 pipe.transformer.to(torch.bfloat16)
 pipe.controlnet.to(torch.bfloat16)
@@ -213,21 +212,21 @@ print("--- All models loaded successfully! ---")
 
 def get_duration(
     input_image: Image.Image,
-    subject_name: str,
-    target_prompt: str,
-    attn_enforce: float = 1.3,
-    ctrl_scale: float = 0.95,
-    width: int = 768,
-    height: int = 768,
-    pixel_offset: int = 8,
-    num_steps: int = 28,
-    guidance: float = 3.5,
-    real_guidance: float = 4.5,
-    seed: int = 42,
-    randomize_seed: bool = False,
+    do_segmentation: bool,
+    full_prompt: str,
+    attn_enforce: float,
+    ctrl_scale: float,
+    width: int,
+    height: int,
+    pixel_offset: int,
+    num_steps: int,
+    guidance: float,
+    real_guidance: float,
+    seed: int,
+    randomize_seed: bool,
     progress=gr.Progress(track_tqdm=True)
 ):
-    if width > 768 and height > 768:
+    if width > 768 or height > 768:
        return 210
     else:
        return 120
@@ -236,17 +235,18 @@ def get_duration(
 
 def run_diptych_prompting(
     input_image: Image.Image,
     subject_name: str,
-    target_prompt: str,
-    attn_enforce: float = 1.3,
-    ctrl_scale: float = 0.95,
-    width: int = 768,
-    height: int = 768,
-    pixel_offset: int = 8,
-    num_steps: int = 28,
-    guidance: float = 3.5,
-    real_guidance: float = 4.5,
-    seed: int = 42,
-    randomize_seed: bool = False,
+    do_segmentation: bool,
+    full_prompt: str,
+    attn_enforce: float,
+    ctrl_scale: float,
+    width: int,
+    height: int,
+    pixel_offset: int,
+    num_steps: int,
+    guidance: float,
+    real_guidance: float,
+    seed: int,
+    randomize_seed: bool,
     progress=gr.Progress(track_tqdm=True)
 ):
     if randomize_seed:
@@ -255,40 +255,42 @@ def run_diptych_prompting(
        actual_seed = seed
 
     if input_image is None: raise gr.Error("Please upload a reference image.")
-    if not subject_name: raise gr.Error("Please provide the subject's name (e.g., 'a red car').")
-    if not target_prompt: raise gr.Error("Please provide a target prompt.")
+    if not full_prompt: raise gr.Error("Full Prompt is empty. Please fill out the prompt fields.")
 
-    # 1. Prepare dimensions (logic from original script's main block)
+    # 1. Prepare dimensions and reference image
     padded_width = width + pixel_offset * 2
     padded_height = height + pixel_offset * 2
     diptych_size = (padded_width * 2, padded_height)
-
-    # 2. Prepare prompts and images
-    progress(0, desc="Resizing and segmenting reference image...")
-    base_prompt = f"a photo of {subject_name}"
-    diptych_text_prompt = f"A diptych with two side-by-side images of same {subject_name}. On the left, {base_prompt}. On the right, replicate this {subject_name} exactly but as {target_prompt}"
-
     reference_image = input_image.resize((padded_width, padded_height)).convert("RGB")
-    segmented_image = segment_image(reference_image, subject_name, object_detector, segmentator, segment_processor)
+
+    # 2. Process reference image based on segmentation flag
+    progress(0, desc="Preparing reference image...")
+    if do_segmentation:
+        if not subject_name:
+            raise gr.Error("Subject Name is required when 'Do Segmentation' is checked.")
+        progress(0.05, desc="Segmenting reference image...")
+        processed_image = segment_image(reference_image, subject_name, object_detector, segmentator, segment_processor)
+    else:
+        processed_image = reference_image
 
+    # 3. Create diptych and mask
     progress(0.2, desc="Creating diptych and mask...")
     mask_image = np.concatenate([np.zeros((padded_height, padded_width, 3)), np.ones((padded_height, padded_width, 3)) * 255], axis=1)
     mask_image = Image.fromarray(mask_image.astype(np.uint8))
-    diptych_image_prompt = make_diptych(segmented_image)
+    diptych_image_prompt = make_diptych(processed_image)
 
-    # 3. Setup Attention Processor (logic from original script's main block)
+    # 4. Setup Attention Processor
     progress(0.3, desc="Setting up attention processors...")
     new_attn_procs = base_attn_procs.copy()
     for k in new_attn_procs:
-        # Use full diptych dimensions for the attention processor
        new_attn_procs[k] = CustomFluxAttnProcessor2_0(height=padded_height // 16, width=padded_width * 2 // 16, attn_enforce=attn_enforce)
     pipe.transformer.set_attn_processor(new_attn_procs)
 
-    # 4. Run Inference (using parameters identical to the original script)
+    # 5. Run Inference
     progress(0.4, desc="Running diffusion process...")
     generator = torch.Generator(device="cuda").manual_seed(actual_seed)
-    result = pipe(
-        prompt=diptych_text_prompt,
+    full_diptych_result = pipe(
+        prompt=full_prompt,
        height=diptych_size[1],
        width=diptych_size[0],
        control_image=diptych_image_prompt,
@@ -301,14 +303,13 @@ def run_diptych_prompting(
        true_guidance_scale=real_guidance
     ).images[0]
 
-    # 5. Final cropping (logic from original script's main block)
+    # 6. Final cropping
     progress(0.95, desc="Finalizing image...")
-    # Crop the right panel
-    result = result.crop((padded_width, 0, padded_width * 2, padded_height))
-    # Crop the pixel offset padding
-    result = result.crop((pixel_offset, pixel_offset, padded_width - pixel_offset, padded_height - pixel_offset))
+    final_image = full_diptych_result.crop((padded_width, 0, padded_width * 2, padded_height))
+    final_image = final_image.crop((pixel_offset, pixel_offset, padded_width - pixel_offset, padded_height - pixel_offset))
 
-    return result
+    # 7. Return all outputs
+    return final_image, processed_image, full_diptych_result, full_prompt
 
 
 # --- Gradio UI Definition ---
@@ -318,18 +319,29 @@ css = '''
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     gr.Markdown(
         """
-        # Diptych Prompting: Zero-Shot Subject-Driven Image Generation
+        # Diptych Prompting: Zero-Shot Subject-Driven & Style-Driven Image Generation
         ### Gradio Demo for the paper "[Large-Scale Text-to-Image Model with Inpainting is a Zero-Shot Subject-Driven Image Generator](https://diptychprompting.github.io/)"
+        This demo implements both subject-driven generation and style transfer with advanced controls.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
-            input_image = gr.Image(type="pil", label="1. Reference Image")
-            subject_name = gr.Textbox(label="2. Subject Name", placeholder="e.g., a plush bear")
-            target_prompt = gr.Textbox(label="3. Target Prompt", placeholder="e.g., a plush bear riding a skate on the moon")
+            input_image = gr.Image(type="pil", label="Reference Image")
+
+            with gr.Group() as subject_driven_group:
+                subject_name = gr.Textbox(label="Subject Name", placeholder="e.g., a plush bear")
+
+            target_prompt = gr.Textbox(label="Target Prompt", placeholder="e.g., riding a skateboard on the moon")
+
             run_button = gr.Button("Generate Image", variant="primary")
+
             with gr.Accordion("Advanced Settings", open=False):
+                mode = gr.Radio(["Subject-Driven", "Style-Driven (unstable)"], label="Generation Mode", value="Subject-Driven")
+                with gr.Group(visible=False) as style_driven_group:
+                    original_style_description = gr.Textbox(label="Original Image Description", placeholder="e.g., in watercolor painting style")
+                do_segmentation = gr.Checkbox(label="Do Segmentation", value=True)
                 attn_enforce = gr.Slider(minimum=1.0, maximum=2.0, value=1.3, step=0.05, label="Attention Enforcement")
+                full_prompt = gr.Textbox(label="Full Prompt (Auto-generated, editable)", lines=3)
                 ctrl_scale = gr.Slider(minimum=0.5, maximum=1.0, value=0.95, step=0.01, label="ControlNet Scale")
                 num_steps = gr.Slider(minimum=20, maximum=50, value=28, step=1, label="Inference Steps")
                 guidance = gr.Slider(minimum=1.0, maximum=10.0, value=3.5, step=0.1, label="Distilled Guidance Scale")
@@ -339,9 +351,85 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
                 pixel_offset = gr.Slider(minimum=0, maximum=32, value=8, step=1, label="Padding (Pixel Offset)")
                 seed = gr.Slider(minimum=0, maximum=9223372036854775807, value=42, step=1, label="Seed")
                 randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+
         with gr.Column(scale=1):
             output_image = gr.Image(type="pil", label="Generated Image")
+            with gr.Accordion("Other Outputs", open=False) as other_outputs_accordion:
+                processed_ref_image = gr.Image(label="Processed Reference (Left Panel)")
+                full_diptych_image = gr.Image(label="Full Diptych Output")
+                final_prompt_used = gr.Textbox(label="Final Prompt Used")
 
+    # --- UI Event Handlers ---
+
+    def toggle_mode_visibility(mode_choice):
+        """Hides/shows the relevant input textboxes based on mode."""
+        if mode_choice == "Subject-Driven":
+            return gr.update(visible=True), gr.update(visible=False)
+        else:
+            return gr.update(visible=False), gr.update(visible=True)
+
+    def update_derived_fields(mode_choice, subject, style_desc, target):
+        """Updates the full prompt and segmentation checkbox based on other inputs."""
+        if mode_choice == "Subject-Driven":
+            prompt = f"A diptych with two side-by-side images of same {subject}. On the left, a photo of {subject}. On the right, replicate this {subject} exactly but as {target}"
+            return gr.update(value=prompt), gr.update(value=True)
+        else:  # Style-Driven
+            prompt = f"A diptych with two side-by-side images of same style. On the left, {style_desc}. On the right, replicate this style exactly but as {target}"
+            return gr.update(value=prompt), gr.update(value=False)
+
+    # --- UI Connections ---
+
+    # When mode changes, toggle visibility of the specific prompt fields
+    mode.change(
+        fn=toggle_mode_visibility,
+        inputs=mode,
+        outputs=[subject_driven_group, style_driven_group],
+        queue=False
+    )
+
+    # A list of all inputs that affect the full prompt or segmentation checkbox
+    prompt_component_inputs = [mode, subject_name, original_style_description, target_prompt]
+    # A list of the UI elements that are derived from the above inputs
+    derived_outputs = [full_prompt, do_segmentation]
+
+    # When any prompt component changes, update the derived fields
+    for component in prompt_component_inputs:
+        # Use .then() to chain the update after the visibility toggle for the mode radio
+        if component == mode:
+            component.change(update_derived_fields, inputs=prompt_component_inputs, outputs=derived_outputs, queue=False)
+        else:
+            component.input(update_derived_fields, inputs=prompt_component_inputs, outputs=derived_outputs, queue=False)
+
+    run_button.click(
+        fn=run_diptych_prompting,
+        inputs=[
+            input_image, subject_name, do_segmentation, full_prompt, attn_enforce,
+            ctrl_scale, width, height, pixel_offset, num_steps, guidance,
+            real_guidance, seed, randomize_seed
+        ],
+        outputs=[output_image, processed_ref_image, full_diptych_image, final_prompt_used]
+    )
+    def run_subject_driven_example(input_image, subject_name, target_prompt):
+        # Construct the full prompt for subject-driven mode
+        full_prompt = f"A diptych with two side-by-side images of same {subject_name}. On the left, a photo of {subject_name}. On the right, replicate this {subject_name} exactly but as {target_prompt}"
+
+        # Call the main function with all arguments, using defaults for subject-driven mode
+        return run_diptych_prompting(
+            input_image=input_image,
+            subject_name=subject_name,
+            do_segmentation=True,
+            full_prompt=full_prompt,
+            attn_enforce=1.3,
+            ctrl_scale=0.95,
+            width=768,
+            height=768,
+            pixel_offset=8,
+            num_steps=28,
+            guidance=3.5,
+            real_guidance=4.5,
+            seed=42,
+            randomize_seed=False,
+        )
     gr.Examples(
         examples=[
            ["./assets/cat_squished.png", "a cat toy", "a cat toy riding a skate"],
@@ -349,16 +437,10 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
            ["./assets/bear_plushie.jpg", "a bear plushie", "a bear plushie drinking bubble tea"]
        ],
        inputs=[input_image, subject_name, target_prompt],
-       outputs=output_image,
-       fn=run_diptych_prompting,
-       cache_examples="lazy",
-    )
-
-    run_button.click(
-        fn=run_diptych_prompting,
-        inputs=[input_image, subject_name, target_prompt, attn_enforce, ctrl_scale, width, height, pixel_offset, num_steps, guidance, real_guidance, seed, randomize_seed],
-        outputs=output_image
+       outputs=[output_image, processed_ref_image, full_diptych_image, final_prompt_used],
+       fn=run_subject_driven_example,
+       cache_examples="lazy"
     )
 
 if __name__ == "__main__":
-    demo.launch(share=True)
+    demo.launch(share=True, debug=True)
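
Note: the commit only wires gr.Examples through the Subject-Driven path (run_subject_driven_example). As a minimal sketch of how the reworked run_diptych_prompting signature can also be driven in the new Style-Driven mode, the snippet below builds the same prompt template that update_derived_fields() produces for that mode and calls the function directly with the UI defaults. The asset path, style description, and target text are illustrative placeholders, and the call assumes it runs inside app.py (or a module that imports it) after the models have loaded; it is not part of this commit.

    # Style-Driven usage sketch (hypothetical values; not part of the committed app).
    from PIL import Image

    reference = Image.open("./assets/cat_squished.png").convert("RGB")

    style_desc = "a watercolor painting of a cat toy"           # describes the reference image (placeholder)
    target = "a watercolor painting of a lighthouse at sunset"  # what the right panel should become (placeholder)

    # Same template that update_derived_fields() builds for Style-Driven mode
    style_prompt = (
        f"A diptych with two side-by-side images of same style. "
        f"On the left, {style_desc}. "
        f"On the right, replicate this style exactly but as {target}"
    )

    final_image, processed_ref, full_diptych, prompt_used = run_diptych_prompting(
        input_image=reference,
        subject_name="",        # unused because segmentation is disabled
        do_segmentation=False,  # Style-Driven mode keeps the whole reference image
        full_prompt=style_prompt,
        attn_enforce=1.3,
        ctrl_scale=0.95,
        width=768,
        height=768,
        pixel_offset=8,
        num_steps=28,
        guidance=3.5,
        real_guidance=4.5,
        seed=42,
        randomize_seed=False,
    )
    final_image.save("style_driven_result.png")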