Spaces:

omer11a
/

bounded-attention

Runtime error

App Files Files Community

omer11a committed on Apr 3, 2024

Commit

49a7542

1 Parent(s): f0d244c

Decreased runtime

Browse files

Files changed (1) hide show

app.py +141 -139

app.py CHANGED Viewed

@@ -35,11 +35,50 @@ COPY_LINK = """
     </a>
     Duplicate this space to generate more samples without waiting in queue
 """
 FOOTNOTE = """
     <p>The source code of this demo is based on the <a href="https://huggingface.co/spaces/gligen/demo/tree/main">GLIGEN demo</a>.</p>
 """
 def inference(
     boxes,
     prompts,
@@ -61,11 +100,7 @@ def inference(
         raise gr.Error("cuda is not available")
     device = torch.device("cuda")
-    model_path = "stabilityai/stable-diffusion-xl-base-1.0"
-    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
-    model = StableDiffusionXLPipeline.from_pretrained(model_path, scheduler=scheduler, torch_dtype=torch.float16).to(device)
-    model.unet.set_default_attn_processor()
-    model.enable_sequential_cpu_offload()
     seed_everything(seed)
     start_code = torch.randn([len(prompts), 4, 128, 128], device=device)
@@ -89,12 +124,14 @@ def inference(
         num_clusters_per_box=num_clusters_per_subject,
     )
-    regiter_attention_editor_diffusers(model, editor)
-    return model(prompts, latents=start_code, guidance_scale=classifier_free_guidance_scale).images
-@spaces.GPU(duration=500)
 def generate(
     prompt,
     subject_token_indices,
@@ -220,134 +257,99 @@ def main():
     }
     """
-    nltk.download("averaged_perceptron_tagger")
-    with gr.Blocks(
-            css=css,
-            title="Bounded Attention demo",
-    ) as demo:
-        gr.HTML(DESCRIPTION)
-        gr.HTML(COPY_LINK)
-        with gr.Column():
-            gr.HTML("Scroll down to see examples of the required input format.")
-            prompt = gr.Textbox(
-                label="Text prompt",
-            )
-            subject_token_indices = gr.Textbox(
-                label="The token indices of each subject (separate indices for the same subject with commas, and for different subjects with semicolons)",
-            )
-            filter_token_indices = gr.Textbox(
-                label="Optional: The token indices to filter, i.e. conjunctions, numbers, postional relations, etc. (if left empty, this will be automatically inferred)",
-            )
-            num_tokens = gr.Textbox(
-                label="Optional: The number of tokens in the prompt (We use this to verify your input, as sometimes rare words are split into more than one token)",
-            )
-            with gr.Row():
-                sketchpad = gr.Sketchpad(label="Sketch Pad (draw each bounding box in a different layer)")
-                layout_image = gr.Image(type="pil", label="Bounding Boxes", interactive=False)
-            with gr.Row():
-                clear_button = gr.Button(value="Clear")
-                generate_layout_button = gr.Button(value="Generate layout")
-                generate_image_button = gr.Button(value="Generate image")
-            with gr.Row():
-                out_images = gr.Gallery(type="pil", label="Generated Images", interactive=False)
-            with gr.Accordion("Advanced Options", open=False):
-                with gr.Column():
-                    description = """
-                        <div class="tooltip">Batch size &#9432
-                        <span class="tooltiptext">The number of images to generate.</span>
-                        </div>
-                        <div class="tooltip">Initial step size &#9432
-                        <span class="tooltiptext">The initial step size of the linear step size scheduler when performing guidance.</span>
-                        </div>
-                        <div class="tooltip">Final step size &#9432
-                        <span class="tooltiptext">The final step size of the linear step size scheduler when performing guidance.</span>
-                        </div>
-                        <div class="tooltip">Number of self-attention clusters per subject &#9432
-                        <span class="tooltiptext">Determines the number of clusters when clustering the self-attention maps (#clusters = #subject x #clusters_per_subject). Changing this value might improve semantics (adherence to the prompt), especially when the subjects exceed their bounding boxes.</span>
-                        </div>
-                        <div class="tooltip">Cross-attention loss scale factor &#9432
-                        <span class="tooltiptext">The scale factor of the cross-attention loss term. Increasing it will improve semantic control (adherence to the prompt), but may reduce image quality.</span>
-                        </div>
-                        <div class="tooltip">Self-attention loss scale factor &#9432
-                        <span class="tooltiptext">The scale factor of the self-attention loss term. Increasing it will improve layout control (adherence to the bounding boxes), but may reduce image quality.</span>
-                        </div>
-                        <div class="tooltip">Classifier-free guidance scale &#9432
-                        <span class="tooltiptext">The scale factor of classifier-free guidance.</span>
-                        </div>
-                        <div class="tooltip" >Number of Gradient Descent iterations per timestep &#9432
-                        <span class="tooltiptext">The number of Gradient Descent iterations for each timestep when performing guidance.</span>
-                        </div>
-                        <div class="tooltip" >Loss Threshold &#9432
-                        <span class="tooltiptext">If the loss is below the threshold, Gradient Descent stops for that timestep. </span>
-                        </div>
-                        <div class="tooltip" >Number of guidance steps &#9432
-                        <span class="tooltiptext">The number of timesteps in which to perform guidance.</span>
-                        </div>
-                    """
-                    gr.HTML(description)
-                    batch_size = gr.Slider(minimum=1, maximum=5, step=1, value=1, label="Number of samples (limited to one sample on current space)")
-                    init_step_size = gr.Slider(minimum=0, maximum=50, step=0.5, value=18, label="Initial step size")
-                    final_step_size = gr.Slider(minimum=0, maximum=20, step=0.5, value=5, label="Final step size")
-                    num_clusters_per_subject = gr.Slider(minimum=0, maximum=5, step=0.5, value=3, label="Number of clusters per subject")
-                    cross_loss_scale = gr.Slider(minimum=0, maximum=2, step=0.1, value=1, label="Cross-attention loss scale factor")
-                    self_loss_scale = gr.Slider(minimum=0, maximum=2, step=0.1, value=1, label="Self-attention loss scale factor")
-                    classifier_free_guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Classifier-free guidance Scale")
-                    num_iterations = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Number of Gradient Descent iterations")
-                    loss_threshold = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2, label="Loss threshold")
-                    num_guidance_steps = gr.Slider(minimum=10, maximum=20, step=1, value=15, label="Number of timesteps to perform guidance")
-                    seed = gr.Slider(minimum=0, maximum=1000, step=1, value=445, label="Random Seed")
-            boxes = gr.State([])
-            clear_button.click(
-                clear,
-                inputs=[batch_size],
-                outputs=[boxes, sketchpad, layout_image, out_images],
-                queue=False,
-            )
-            generate_layout_button.click(
-                draw,
-                inputs=[sketchpad],
-                outputs=[boxes, layout_image],
-                queue=False,
-            )
-            generate_image_button.click(
-                fn=generate,
-                inputs=[
-                    prompt, subject_token_indices, filter_token_indices, num_tokens,
-                    init_step_size, final_step_size, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
-                    classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
-                    seed,
-                    boxes,
-                ],
-                outputs=[out_images],
-                queue=True,
-            )
-        with gr.Column():
-            gr.Examples(
-                examples=[
-                    ["a ginger kitten and a gray puppy in a yard", "2,3;6,7", "1,4,5,8,9", "10"],
-                    ["a realistic photo of a highway with a semi trailer and a concrete mixer and a helicopter", "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17"],
-                ],
-                inputs=[prompt, subject_token_indices, filter_token_indices, num_tokens],
-            )
-        gr.HTML(FOOTNOTE)
-    demo.launch(show_api=False, show_error=True)
-if __name__ == "__main__":
-    main()

     </a>
     Duplicate this space to generate more samples without waiting in queue
 """
+ADVANCED_OPTION_DESCRIPTION = """
+    <div class="tooltip" >Number of guidance steps &#9432
+    <span class="tooltiptext">The number of timesteps in which to perform guidance. Recommended value is 15, but increasing this will also increases the runtime.</span>
+    </div>
+    <div class="tooltip">Batch size &#9432
+    <span class="tooltiptext">The number of images to generate.</span>
+    </div>
+    <div class="tooltip">Initial step size &#9432
+    <span class="tooltiptext">The initial step size of the linear step size scheduler when performing guidance.</span>
+    </div>
+    <div class="tooltip">Final step size &#9432
+    <span class="tooltiptext">The final step size of the linear step size scheduler when performing guidance.</span>
+    </div>
+    <div class="tooltip">Number of self-attention clusters per subject &#9432
+    <span class="tooltiptext">Determines the number of clusters when clustering the self-attention maps (#clusters = #subject x #clusters_per_subject). Changing this value might improve semantics (adherence to the prompt), especially when the subjects exceed their bounding boxes.</span>
+    </div>
+    <div class="tooltip">Cross-attention loss scale factor &#9432
+    <span class="tooltiptext">The scale factor of the cross-attention loss term. Increasing it will improve semantic control (adherence to the prompt), but may reduce image quality.</span>
+    </div>
+    <div class="tooltip">Self-attention loss scale factor &#9432
+    <span class="tooltiptext">The scale factor of the self-attention loss term. Increasing it will improve layout control (adherence to the bounding boxes), but may reduce image quality.</span>
+    </div>
+    <div class="tooltip" >Number of Gradient Descent iterations per timestep &#9432
+    <span class="tooltiptext">The number of Gradient Descent iterations for each timestep when performing guidance.</span>
+    </div>
+    <div class="tooltip" >Loss Threshold &#9432
+    <span class="tooltiptext">If the loss is below the threshold, Gradient Descent stops for that timestep. </span>
+    </div>
+    <div class="tooltip">Classifier-free guidance scale &#9432
+    <span class="tooltiptext">The scale factor of classifier-free guidance.</span>
+    </div>
+"""
 FOOTNOTE = """
     <p>The source code of this demo is based on the <a href="https://huggingface.co/spaces/gligen/demo/tree/main">GLIGEN demo</a>.</p>
 """
+MODEL_PATH = "stabilityai/stable-diffusion-xl-base-1.0"
+scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
+model = StableDiffusionXLPipeline.from_pretrained(MODEL_PATH, scheduler=scheduler, torch_dtype=torch.float16)
+model.unet.set_default_attn_processor()
+model.enable_sequential_cpu_offload()
 def inference(
     boxes,
     prompts,
         raise gr.Error("cuda is not available")
     device = torch.device("cuda")
+    model = model.to(device)
     seed_everything(seed)
     start_code = torch.randn([len(prompts), 4, 128, 128], device=device)
         num_clusters_per_box=num_clusters_per_subject,
     )
+    register_attention_editor_diffusers(model, editor)
+    images = model(prompts, latents=start_code, guidance_scale=classifier_free_guidance_scale).images
+    unregister_attention_editor_diffusers(model)
+    model.to(torch.device("cpu"))
+@spaces.GPU(duration=300)
 def generate(
     prompt,
     subject_token_indices,
     }
     """
+nltk.download("averaged_perceptron_tagger")
+with gr.Blocks(
+        css=css,
+        title="Bounded Attention demo",
+) as demo:
+    gr.HTML(DESCRIPTION)
+    gr.HTML(COPY_LINK)
+    with gr.Column():
+        gr.HTML("Scroll down to see examples of the required input format.")
+        prompt = gr.Textbox(
+            label="Text prompt",
+        )
+        subject_token_indices = gr.Textbox(
+            label="The token indices of each subject (separate indices for the same subject with commas, and for different subjects with semicolons)",
+        )
+        filter_token_indices = gr.Textbox(
+            label="Optional: The token indices to filter, i.e. conjunctions, numbers, postional relations, etc. (if left empty, this will be automatically inferred)",
+        )
+        num_tokens = gr.Textbox(
+            label="Optional: The number of tokens in the prompt (We use this to verify your input, as sometimes rare words are split into more than one token)",
+        )
+        with gr.Row():
+            sketchpad = gr.Sketchpad(label="Sketch Pad (draw each bounding box in a different layer)")
+            layout_image = gr.Image(type="pil", label="Bounding Boxes", interactive=False)
+        with gr.Row():
+            clear_button = gr.Button(value="Clear")
+            generate_layout_button = gr.Button(value="Generate layout")
+            generate_image_button = gr.Button(value="Generate image")
+        with gr.Row():
+            out_images = gr.Gallery(type="pil", label="Generated Images", interactive=False)
+        with gr.Accordion("Advanced Options", open=False):
+            with gr.Column():
+                gr.HTML(ADVANCED_OPTION_DESCRIPTION)
+                batch_size = gr.Slider(minimum=1, maximum=5, step=1, value=1, label="Number of samples (limited to one sample on current space)")
+                num_guidance_steps = gr.Slider(minimum=5, maximum=20, step=1, value=8, label="Number of timesteps to perform guidance")
+                init_step_size = gr.Slider(minimum=0, maximum=50, step=0.5, value=25, label="Initial step size")
+                final_step_size = gr.Slider(minimum=0, maximum=20, step=0.5, value=10, label="Final step size")
+                num_clusters_per_subject = gr.Slider(minimum=0, maximum=5, step=0.5, value=3, label="Number of clusters per subject")
+                cross_loss_scale = gr.Slider(minimum=0, maximum=2, step=0.1, value=1, label="Cross-attention loss scale factor")
+                self_loss_scale = gr.Slider(minimum=0, maximum=2, step=0.1, value=1, label="Self-attention loss scale factor")
+                num_iterations = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Number of Gradient Descent iterations")
+                loss_threshold = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2, label="Loss threshold")
+                classifier_free_guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Classifier-free guidance Scale")
+                seed = gr.Slider(minimum=0, maximum=1000, step=1, value=445, label="Random Seed")
+        boxes = gr.State([])
+        clear_button.click(
+            clear,
+            inputs=[batch_size],
+            outputs=[boxes, sketchpad, layout_image, out_images],
+            queue=False,
+        )
+        generate_layout_button.click(
+            draw,
+            inputs=[sketchpad],
+            outputs=[boxes, layout_image],
+            queue=False,
+        )
+        generate_image_button.click(
+            fn=generate,
+            inputs=[
+                prompt, subject_token_indices, filter_token_indices, num_tokens,
+                init_step_size, final_step_size, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
+                classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
+                seed,
+                boxes,
+            ],
+            outputs=[out_images],
+            queue=True,
+        )
+    with gr.Column():
+        gr.Examples(
+            examples=[
+                ["a ginger kitten and a gray puppy in a yard", "2,3;6,7", "1,4,5,8,9", "10"],
+                ["a realistic photo of a highway with a semi trailer and a concrete mixer and a helicopter", "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17"],
+            ],
+            inputs=[prompt, subject_token_indices, filter_token_indices, num_tokens],
+        )
+    gr.HTML(FOOTNOTE)
+demo.launch(show_api=False, show_error=True)