Spaces: Running on Zero
Update tqdm descriptions
Files changed:
- addit_flux_pipeline.py +159 -164
- addit_methods.py +8 -2
- app.py +7 -7
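
The change threads a new tqdm_desc keyword argument through the pipeline's call paths so each stage of the Add-it flow labels its own progress bar. A minimal usage sketch of the new argument; the checkpoint name, device, and prompts below are illustrative assumptions, not part of this commit:

import torch
from addit_flux_pipeline import AdditFluxPipeline

# Hypothetical setup: any FLUX checkpoint the pipeline supports would work here.
pipe = AdditFluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# tqdm_desc (new in this commit) becomes the label shown on the denoising progress bar.
out = pipe(
    prompt=["A photo of a cat sitting on the couch",
            "A photo of a cat wearing a blue hat sitting on the couch"],
    num_inference_steps=30,
    tqdm_desc="Running Addit: Generating Edited Image",
)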
addit_flux_pipeline.py
CHANGED
@@ -17,6 +17,7 @@
 # This work is licensed under the LICENSE file
 # located at the root directory.

+from tqdm import tqdm
 from typing import Any, Callable, Dict, List, Optional, Union
 import torch
 import numpy as np
@@ -175,6 +176,9 @@ class AdditFluxPipeline(FluxPipeline):
         is_img_src: bool = False,
         use_offset: bool = False,
         img_src_latents: Optional[List[torch.FloatTensor]] = None,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -401,51 +405,25 @@ class AdditFluxPipeline(FluxPipeline):
             img_src_latents.append((1.0 - sigma) * source_latents[0] + sigma * rand_noise)

         # 6. Denoising loop
-                    continue
-
-                    # For denoising from source image
-                    if is_img_src:
-                        latents[0] = img_src_latents[i]
-
-                    # For Structure Transfer
-                    if (source_latents is not None) and i == structure_transfer_step:
-                        sigma = self.scheduler.sigmas[i]
-                        latents[1] = (1.0 - sigma) * source_latents[0] + sigma * noise[1]
-
-                    if is_auto_extend_scale and i == auto_extended_step:
-                        def f(gamma):
-                            self.attention_store.attention_ratios[i] = {}
-                            noise_pred = self.transformer(
-                                hidden_states=latents,
-                                timestep=timestep / 1000,
-                                guidance=guidance,
-                                pooled_projections=pooled_prompt_embeds,
-                                encoder_hidden_states=prompt_embeds,
-                                txt_ids=text_ids,
-                                img_ids=latent_image_ids,
-                                joint_attention_kwargs=self.joint_attention_kwargs,
-                                return_dict=False,
-                                proccesor_kwargs={"step_index": i, "extended_scale": gamma},
-                            )[0]
-
-                            scores_per_layer = self.attention_store.get_attention_ratios(step_indices=[i], display_imgs=False)
-                            source_sum, text_sum, target_sum = scores_per_layer['transformer_blocks']
-
-                            # We want to find the gamma that makes the ratio equal to K
-                            ratio = (target_sum / source_sum)
-                            return (ratio - target_auto_ratio)
-
-                        gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+            # For denoising from source image
+            if is_img_src:
+                latents[0] = img_src_latents[i]
+
+            # For Structure Transfer
+            if (source_latents is not None) and i == structure_transfer_step:
+                sigma = self.scheduler.sigmas[i]
+                latents[1] = (1.0 - sigma) * source_latents[0] + sigma * noise[1]
+
+            if is_auto_extend_scale and i == auto_extended_step:
+                def f(gamma):
+                    self.attention_store.attention_ratios[i] = {}
                     noise_pred = self.transformer(
                         hidden_states=latents,
                         timestep=timestep / 1000,
@@ -456,47 +434,68 @@ class AdditFluxPipeline(FluxPipeline):
                         img_ids=latent_image_ids,
                         joint_attention_kwargs=self.joint_attention_kwargs,
                         return_dict=False,
-                        proccesor_kwargs={"step_index": i, "extended_scale": extended_scale},
-                    )[0]
-
-                    latents, x0 = self.scheduler.step(noise_pred, t, latents, return_dict=False, step_index=i)
-
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        proccesor_kwargs={"step_index": i, "extended_scale": gamma},
+                    )[0]
+
+                    scores_per_layer = self.attention_store.get_attention_ratios(step_indices=[i], display_imgs=False)
+                    source_sum, text_sum, target_sum = scores_per_layer['transformer_blocks']
+
+                    # We want to find the gamma that makes the ratio equal to K
+                    ratio = (target_sum / source_sum)
+                    return (ratio - target_auto_ratio)
+
+                gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)
+
+                print('Chosen gamma:', gamma_sol)
+                extended_scale = gamma_sol
+            else:
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                    proccesor_kwargs={"step_index": i, "extended_scale": extended_scale},
+                )[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents, x0 = self.scheduler.step(noise_pred, t, latents, return_dict=False, step_index=i)
+
+            if use_offset and is_img_src and (i+1 < len(img_src_latents)):
+                next_latent = img_src_latents[i+1]
+                offset = (next_latent - latents[0])
+                latents[1] = latents[1] + offset
+
+            # blend latents
+            if i in blend_steps and (subject_token is not None) and (localization_model is not None):
+                x0 = self._unpack_latents(x0, height, width, self.vae_scale_factor)
+                x0 = (x0 / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+                images = self.vae.decode(x0, return_dict=False)[0]
+                images = self.image_processor.postprocess(images, output_type="pil")
+
+                self.do_step_blend(images, latents, subject_token, localization_model, show_attention, i, blend_models)
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+            # if XLA_AVAILABLE:
+            #     xm.mark_step()

         if output_type == "latent":
             image = latents
@@ -793,6 +792,9 @@ class AdditFluxPipeline(FluxPipeline):
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -987,49 +989,44 @@ class AdditFluxPipeline(FluxPipeline):
         latent_image_ids = latent_image_ids.expand(latents.shape[0], -1, -1)

         # 6. Denoising loop
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
-
-                    # if XLA_AVAILABLE:
-                    #     xm.mark_step()
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+            noise_pred = self.transformer(
+                hidden_states=latents,
+                timestep=timestep / 1000,
+                guidance=guidance,
+                pooled_projections=pooled_prompt_embeds,
+                encoder_hidden_states=prompt_embeds,
+                txt_ids=text_ids,
+                img_ids=latent_image_ids,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+            # if XLA_AVAILABLE:
+            #     xm.mark_step()

         if output_type == "latent":
             image = latents
@@ -1126,6 +1123,9 @@ class AdditFluxPipeline(FluxPipeline):
         max_sequence_length: int = 512,

         fixed_point_iterations: int = 1,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -1328,60 +1328,55 @@ class AdditFluxPipeline(FluxPipeline):
         latents_list.append(latents)

         # 6. Denoising loop
-                        continue
-
-                    if j == 0:
-                        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                        timestep = timesteps[i].expand(latents.shape[0]).to(latents.dtype)
-                    else:
-                        timestep = timesteps_one_start[i].expand(latents.shape[0]).to(latents.dtype)
-
-                        pooled_projections=pooled_prompt_embeds,
-                        encoder_hidden_states=prompt_embeds,
-                        txt_ids=text_ids,
-                        img_ids=latent_image_ids,
-                        joint_attention_kwargs=self.joint_attention_kwargs,
-                        return_dict=False,
-                    )[0]
-
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
-
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                progress_bar.update()
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            original_latents = latents.clone()
+            for j in range(fixed_point_iterations):
+                if self.interrupt:
+                    continue
+
+                if j == 0:
+                    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                    timestep = timesteps[i].expand(latents.shape[0]).to(latents.dtype)
+                else:
+                    timestep = timesteps_one_start[i].expand(latents.shape[0]).to(latents.dtype)
+
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+
+                # noise_pred = -noise_pred
+                latents = self.scheduler.step(noise_pred, t, original_latents, return_dict=False, step_index=i)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # if XLA_AVAILABLE:
+                #     xm.mark_step()
+
+            latents_list.append(latents)

         # Offload all models
         self.maybe_free_model_hooks()
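
Aside on the auto extended-scale branch above: scipy.optimize.brentq searches the bracket [1.0, 1.2] for the gamma whose attention ratio matches target_auto_ratio. A toy sketch of the same root-finding pattern; the objective below is a made-up monotonic stand-in, not the pipeline's attention-ratio function:

from scipy.optimize import brentq

target_auto_ratio = 1.05  # illustrative target, not a value from the repo

def f(gamma):
    # Stand-in for the real objective, which runs the transformer and reads
    # source/target attention sums from the attention store.
    ratio = 0.9 + 1.5 * (gamma - 1.0)
    return ratio - target_auto_ratio

# brentq needs f(1.0) and f(1.2) to have opposite signs; it returns the root
# to within xtol, matching how the pipeline code calls it.
gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)
print("Chosen gamma:", gamma_sol)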
addit_methods.py
CHANGED
@@ -55,6 +55,9 @@ def _add_object(
         is_img_src=is_img_src,
         img_src_latents=img_src_latents,
         use_offset=use_offset,
+
+        # TQDM
+        tqdm_desc="Running Addit: Generating Edited Image",
     )

     if display_output:
@@ -90,6 +93,7 @@ def add_object_generated(
         num_inference_steps=30,
         seed=[seed_src],
         output_type="both",
+        tqdm_desc="Generating Source Image",
     )
     source_image = source_image[0]

@@ -141,7 +145,8 @@ def add_object_real(
         strength=0.1,
         guidance_scale=3.5,
         output_type="latent",
-        generator=torch.Generator(device=pipe.device).manual_seed(0)
+        generator=torch.Generator(device=pipe.device).manual_seed(0),
+        tqdm_desc="Encoding Source Image",
     ).images

     # Optional inversion step
@@ -157,7 +162,8 @@ def add_object_real(
         num_inference_steps=30,
         guidance_scale=1,
         fixed_point_iterations=2,
-        generator=torch.Generator(device=pipe.device).manual_seed(0)
+        generator=torch.Generator(device=pipe.device).manual_seed(0),
+        tqdm_desc="Inverting Source Image",
     )
     img_src_latents = [x[0] for x in latents_list][::-1]

app.py
CHANGED
@@ -216,8 +216,8 @@ def create_interface():
                 )
                 gen_prompt_target = gr.Textbox(
                     label="Target Prompt",
-                    placeholder="A photo of a cat wearing a
-                    value="A photo of a cat wearing a
+                    placeholder="A photo of a cat wearing a blue hat sitting on the couch",
+                    value="A photo of a cat wearing a blue hat sitting on the couch"
                 )
                 gen_subject_token = gr.Textbox(
                     label="Subject Token",
@@ -227,8 +227,8 @@ def create_interface():
                 )

                 with gr.Accordion("Advanced Settings", open=False):
-                    gen_seed_src = gr.Number(label="Source Seed", value=
-                    gen_seed_obj = gr.Number(label="Object Seed", value=
+                    gen_seed_src = gr.Number(label="Source Seed", value=1, precision=0)
+                    gen_seed_obj = gr.Number(label="Object Seed", value=42, precision=0)
                     gen_extended_scale = gr.Slider(
                         label="Extended Scale",
                         minimum=1.0,
@@ -283,7 +283,7 @@ def create_interface():
             gr.Examples(
                 examples=[
                     ["A photo of a man sitting on a bench", "A photo of a man sitting on a bench with a dog", "dog"],
-                    ["A photo of a cat sitting on the couch", "A photo of a cat wearing a
+                    ["A photo of a cat sitting on the couch", "A photo of a cat wearing a blue hat sitting on the couch", "hat"],
                     ["A car driving through an empty street", "A pink car driving through an empty street", "car"]
                 ],
                 inputs=[
@@ -317,8 +317,8 @@ def create_interface():
                 )

                 with gr.Accordion("Advanced Settings", open=False):
-                    real_seed_src = gr.Number(label="Source Seed", value=
-                    real_seed_obj = gr.Number(label="Object Seed", value=
+                    real_seed_src = gr.Number(label="Source Seed", value=1, precision=0)
+                    real_seed_obj = gr.Number(label="Object Seed", value=0, precision=0)
                     real_extended_scale = gr.Slider(
                         label="Extended Scale",
                         minimum=1.0,
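
Note on the seed inputs: gr.Number(..., precision=0) rounds the component's value to an integer, which keeps the seeds usable directly as generator seeds. A minimal standalone sketch; the component and function names are illustrative and not taken from app.py:

import gradio as gr

def echo_seed(seed):
    # With precision=0 the component already delivers an integer-valued number.
    return f"Seed in use: {int(seed)}"

with gr.Blocks() as demo:
    seed_box = gr.Number(label="Source Seed", value=1, precision=0)
    out_box = gr.Textbox(label="Status")
    seed_box.change(echo_seed, inputs=seed_box, outputs=out_box)

# demo.launch()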