alexnasa committed on
Commit acd710e · verified · 1 Parent(s): 39bbec4

Update pipelines/pipeline_seesr.py

Files changed (1):
  1. pipelines/pipeline_seesr.py +199 -295
pipelines/pipeline_seesr.py CHANGED
@@ -22,7 +22,6 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from torch.nn.functional import unfold, fold
 from torchvision.utils import save_image
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
@@ -96,31 +95,7 @@ EXAMPLE_DOC_STRING = """
         ... ).images[0]
         ```
 """
-def kde_grad(x0: torch.Tensor, patch_size = 16, bandwidth = 0.1):
-    # x0: (N, C, H, W) in float32
-    N, C, H, W = x0.shape
-    patches = unfold(
-        x0, kernel_size=patch_size, stride=patch_size//2
-    ) # (N, C*ps*ps, M)
-    P, M = patches.shape[1], patches.shape[2]
-    p_i = patches.unsqueeze(1) # (N,1,P,M)
-    p_j = patches.unsqueeze(0) # (1,N,P,M)
-    diff = p_j - p_i # (N,N,P,M)
-    # Gaussian weights
-    w = torch.exp((-0.5 / bandwidth**2) *
-                  (diff.square().sum(dim=2))) # (N,N,M)
-    # mean-shift numerator & normalizer
-    num = (w.unsqueeze(2) * diff).sum(dim=1) # (N,P,M)
-    denom = w.sum(dim=1, keepdim=True) + 1e-8 # (N,1,M)
-    mshift = num / denom # (N,P,M)
-    # fold back
-    grad = fold(
-        mshift / bandwidth**2,
-        output_size=(H, W),
-        kernel_size=patch_size,
-        stride=patch_size//2
-    ) # (N, C, H, W)
-    return grad
+
 
 class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
     r"""
@@ -803,6 +778,7 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
         return torch.tile(torch.tensor(weights, device=self.device), (nbatches, self.unet.config.in_channels, 1, 1))
 
     @perfcount
+    @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
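The new `@torch.no_grad()` decorator disables autograd for the entire `__call__`, which is what makes the per-block `with torch.no_grad():` wrappers (removed below) and the KDS gradient bookkeeping unnecessary. A minimal illustration of the equivalence:

```python
import torch

@torch.no_grad()
def step_decorated(x):
    return x * 2            # autograd is disabled for the whole function

def step_wrapped(x):
    with torch.no_grad():   # the per-block form this commit removes
        return x * 2

x = torch.ones(3, requires_grad=True)
print(step_decorated(x).requires_grad, step_wrapped(x).requires_grad)  # False False
```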
@@ -832,12 +808,7 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
         ram_encoder_hidden_states=None,
         latent_tiled_size=320,
         latent_tiled_overlap=4,
-        num_particles: Optional[int] = 4,
-        gamma_0: Optional[float] = 0.1, # base steering strength
-        use_KDS = True,
-        patch_size = 16,
-        bandwidth = 0.1,
-        args=None,
+        args=None
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -1025,17 +996,6 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        if use_KDS:
-            # 1) update batch_size to account for the new particles
-            batch_size = batch_size * num_particles
-
-            # 2) now repeat latents/images/prompts
-            latents = latents.repeat_interleave(num_particles, dim=0)
-            image = image.repeat_interleave(num_particles, dim=0)
-            ram_encoder_hidden_states = ram_encoder_hidden_states.repeat_interleave(num_particles, dim=0)
-            prompt_embeds = prompt_embeds.repeat_interleave(num_particles, dim=0)
-            latents.requires_grad_(True)
-
         # 8. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
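The removed block above turned each batch entry into `num_particles` ensemble members by repeating it along the batch dimension. A small illustration of what `repeat_interleave` did here (shapes are hypothetical):

```python
import torch

num_particles = 4
latents = torch.randn(2, 4, 64, 64)                          # hypothetical batch of 2 latents
particles = latents.repeat_interleave(num_particles, dim=0)  # each sample repeated 4x, in order
print(particles.shape)                                       # torch.Size([8, 4, 64, 64])
print(torch.equal(particles[0], particles[3]))               # True: rows 0..3 are copies of sample 0
```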
@@ -1048,220 +1008,184 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
                 print(f"[Tiled Latent]: the input size is {image.shape[-2]}x{image.shape[-1]}, need to tiled")
 
             for i, t in enumerate(timesteps):
-                with torch.no_grad():
-                    # pass, if the timestep is larger than start_steps
-                    if t > start_steps:
-                        print(f'pass {t} steps.')
-                        continue
-
-                    # expand the latents if we are doing classifier free guidance
-                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                    # controlnet(s) inference
-                    if guess_mode and do_classifier_free_guidance:
-                        # Infer ControlNet only for the conditional batch.
-                        controlnet_latent_model_input = latents
-                        controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
-                    else:
-                        controlnet_latent_model_input = latent_model_input
-                        controlnet_prompt_embeds = prompt_embeds
-
-                    if h*w<=tile_size*tile_size: # tiled latent input
-                        down_block_res_samples, mid_block_res_sample = [None]*10, None
-
-                        down_block_res_samples, mid_block_res_sample = self.controlnet(
-                            controlnet_latent_model_input,
-                            t,
-                            encoder_hidden_states=controlnet_prompt_embeds,
-                            controlnet_cond=image,
-                            conditioning_scale=conditioning_scale,
-                            guess_mode=guess_mode,
-                            return_dict=False,
-                            image_encoder_hidden_states = ram_encoder_hidden_states,
-                        )
-
-
-                        if guess_mode and do_classifier_free_guidance:
-                            # Infered ControlNet only for the conditional batch.
-                            # To apply the output of ControlNet to both the unconditional and conditional batches,
-                            # add 0 to the unconditional batch to keep it unchanged.
-                            down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
-                            mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
-
-                        # predict the noise residual
-                        noise_pred = self.unet(
-                            latent_model_input,
-                            t,
-                            encoder_hidden_states=prompt_embeds,
-                            cross_attention_kwargs=cross_attention_kwargs,
-                            down_block_additional_residuals=down_block_res_samples,
-                            mid_block_additional_residual=mid_block_res_sample,
-                            return_dict=False,
-                            image_encoder_hidden_states = ram_encoder_hidden_states,
-                        )[0]
-                    else:
-                        tile_weights = self._gaussian_weights(tile_size, tile_size, batch_size)
-                        tile_size = min(tile_size, min(h, w))
-                        tile_weights = self._gaussian_weights(tile_size, tile_size, batch_size)
-
-                        grid_rows = 0
-                        cur_x = 0
-                        while cur_x < latent_model_input.size(-1):
-                            cur_x = max(grid_rows * tile_size-tile_overlap * grid_rows, 0)+tile_size
-                            grid_rows += 1
-
-                        grid_cols = 0
-                        cur_y = 0
-                        while cur_y < latent_model_input.size(-2):
-                            cur_y = max(grid_cols * tile_size-tile_overlap * grid_cols, 0)+tile_size
-                            grid_cols += 1
-
-                        input_list = []
-                        cond_list = []
-                        img_list = []
-                        noise_preds = []
-                        for row in range(grid_rows):
-                            noise_preds_row = []
-                            for col in range(grid_cols):
-                                if col < grid_cols-1 or row < grid_rows-1:
-                                    # extract tile from input image
-                                    ofs_x = max(row * tile_size-tile_overlap * row, 0)
-                                    ofs_y = max(col * tile_size-tile_overlap * col, 0)
-                                    # input tile area on total image
-                                if row == grid_rows-1:
-                                    ofs_x = w - tile_size
-                                if col == grid_cols-1:
-                                    ofs_y = h - tile_size
-
-                                input_start_x = ofs_x
-                                input_end_x = ofs_x + tile_size
-                                input_start_y = ofs_y
-                                input_end_y = ofs_y + tile_size
-
-                                # input tile dimensions
-                                input_tile = latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
-                                input_list.append(input_tile)
-                                cond_tile = controlnet_latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
-                                cond_list.append(cond_tile)
-                                img_tile = image[:, :, input_start_y*8:input_end_y*8, input_start_x*8:input_end_x*8]
-                                img_list.append(img_tile)
-
-                                if len(input_list) == batch_size or col == grid_cols-1:
-                                    input_list_t = torch.cat(input_list, dim=0)
-                                    cond_list_t = torch.cat(cond_list, dim=0)
-                                    img_list_t = torch.cat(img_list, dim=0)
-                                    #print(input_list_t.shape, cond_list_t.shape, img_list_t.shape, fg_mask_list_t.shape)
-
-                                    down_block_res_samples, mid_block_res_sample = self.controlnet(
-                                        cond_list_t,
-                                        t,
-                                        encoder_hidden_states=controlnet_prompt_embeds,
-                                        controlnet_cond=img_list_t,
-                                        conditioning_scale=conditioning_scale,
-                                        guess_mode=guess_mode,
-                                        return_dict=False,
-                                        image_encoder_hidden_states = ram_encoder_hidden_states,
-                                    )
-
-                                    if guess_mode and do_classifier_free_guidance:
-                                        # Infered ControlNet only for the conditional batch.
-                                        # To apply the output of ControlNet to both the unconditional and conditional batches,
-                                        # add 0 to the unconditional batch to keep it unchanged.
-                                        down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
-                                        mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
-
-                                    # predict the noise residual
-                                    model_out = self.unet(
-                                        input_list_t,
-                                        t,
-                                        encoder_hidden_states=prompt_embeds,
-                                        cross_attention_kwargs=cross_attention_kwargs,
-                                        down_block_additional_residuals=down_block_res_samples,
-                                        mid_block_additional_residual=mid_block_res_sample,
-                                        return_dict=False,
-                                        image_encoder_hidden_states = ram_encoder_hidden_states,
-                                    )[0]
-
-                                    #for sample_i in range(model_out.size(0)):
-                                    #    noise_preds_row.append(model_out[sample_i].unsqueeze(0))
-                                    input_list = []
-                                    cond_list = []
-                                    img_list = []
-
-                                    noise_preds.append(model_out)
-
-                        # Stitch noise predictions for all tiles
-                        noise_pred = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
-                        contributors = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
-                        # Add each tile contribution to overall latents
-                        for row in range(grid_rows):
-                            for col in range(grid_cols):
-                                if col < grid_cols-1 or row < grid_rows-1:
-                                    # extract tile from input image
-                                    ofs_x = max(row * tile_size-tile_overlap * row, 0)
-                                    ofs_y = max(col * tile_size-tile_overlap * col, 0)
-                                    # input tile area on total image
-                                if row == grid_rows-1:
-                                    ofs_x = w - tile_size
-                                if col == grid_cols-1:
-                                    ofs_y = h - tile_size
-
-                                input_start_x = ofs_x
-                                input_end_x = ofs_x + tile_size
-                                input_start_y = ofs_y
-                                input_end_y = ofs_y + tile_size
-
-                                noise_pred[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += noise_preds[row*grid_cols + col] * tile_weights
-                                contributors[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += tile_weights
-                        # Average overlapping areas with more than 1 contributor
-                        noise_pred /= contributors
-
-
-                    # perform guidance
-                    if do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-
-                if use_KDS:
-
-                    # 2) Compute x₀ prediction for all particles
-                    beta_t = 1 - self.scheduler.alphas_cumprod[t]
-                    alpha_t = self.scheduler.alphas_cumprod[t].sqrt()
-                    sigma_t = beta_t.sqrt()
-                    x0_pred = (latents - sigma_t * noise_pred) / alpha_t # shape [2N, C, H, W]
-
-                    # — split into unconditional vs. conditional
-                    x0_uncond, x0_cond = x0_pred.chunk(2, dim=0) # each [N, C, H, W]
-
-                    # 3) Apply KDE steering *only* on the conditional batch
-                    m_shift_cond = kde_grad(x0_cond, bandwidth=bandwidth) # [N, C, H, W]
-                    delta_t = gamma_0 * (1 - i / (len(timesteps) - 1))
-                    x0_cond_steer = x0_cond + delta_t * m_shift_cond # steered conditional
-
-                    # 4) Recombine the latents: leave uncond untouched, use steered cond
-                    x0_steer = torch.cat([x0_uncond, x0_cond_steer], dim=0) # [2N, C, H, W]
-
-                    # 5) Recompute “noise” for DDIM step
-                    noise_pred_kds = (latents - alpha_t * x0_steer) / sigma_t
-
-                    # 6) Determine prev alphas and form next latent per DDIM
-                    if i < len(timesteps) - 1:
-                        next_t = timesteps[i + 1]
-                        alpha_prev = self.scheduler.alphas_cumprod[next_t].sqrt()
-                    else:
-                        alpha_prev = self.scheduler.final_alpha_cumprod.sqrt()
-                    sigma_prev = (1 - alpha_prev**2).sqrt()
-
-                    latents = (
-                        alpha_prev * x0_steer
-                        + sigma_prev * noise_pred_kds
-                    ).detach().requires_grad_(True)
-                else:
-
-                    # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                # pass, if the timestep is larger than start_steps
+                if t > start_steps:
+                    print(f'pass {t} steps.')
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # controlnet(s) inference
+                if guess_mode and do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    controlnet_latent_model_input = latents
+                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+
+                else:
+                    controlnet_latent_model_input = latent_model_input
+                    controlnet_prompt_embeds = prompt_embeds
+
+                if h*w<=tile_size*tile_size: # tiled latent input
+                    down_block_res_samples, mid_block_res_sample = [None]*10, None
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        controlnet_latent_model_input,
+                        t,
+                        encoder_hidden_states=controlnet_prompt_embeds,
+                        controlnet_cond=image,
+                        conditioning_scale=conditioning_scale,
+                        guess_mode=guess_mode,
+                        return_dict=False,
+                        image_encoder_hidden_states = ram_encoder_hidden_states,
+                    )
+
+
+                    if guess_mode and do_classifier_free_guidance:
+                        # Infered ControlNet only for the conditional batch.
+                        # To apply the output of ControlNet to both the unconditional and conditional batches,
+                        # add 0 to the unconditional batch to keep it unchanged.
+                        down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                        mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        down_block_additional_residuals=down_block_res_samples,
+                        mid_block_additional_residual=mid_block_res_sample,
+                        return_dict=False,
+                        image_encoder_hidden_states = ram_encoder_hidden_states,
+                    )[0]
+                else:
+                    tile_weights = self._gaussian_weights(tile_size, tile_size, 1)
+                    tile_size = min(tile_size, min(h, w))
+                    tile_weights = self._gaussian_weights(tile_size, tile_size, 1)
+
+                    grid_rows = 0
+                    cur_x = 0
+                    while cur_x < latent_model_input.size(-1):
+                        cur_x = max(grid_rows * tile_size-tile_overlap * grid_rows, 0)+tile_size
+                        grid_rows += 1
+
+                    grid_cols = 0
+                    cur_y = 0
+                    while cur_y < latent_model_input.size(-2):
+                        cur_y = max(grid_cols * tile_size-tile_overlap * grid_cols, 0)+tile_size
+                        grid_cols += 1
+
+                    input_list = []
+                    cond_list = []
+                    img_list = []
+                    noise_preds = []
+                    for row in range(grid_rows):
+                        noise_preds_row = []
+                        for col in range(grid_cols):
+                            if col < grid_cols-1 or row < grid_rows-1:
+                                # extract tile from input image
+                                ofs_x = max(row * tile_size-tile_overlap * row, 0)
+                                ofs_y = max(col * tile_size-tile_overlap * col, 0)
+                                # input tile area on total image
+                            if row == grid_rows-1:
+                                ofs_x = w - tile_size
+                            if col == grid_cols-1:
+                                ofs_y = h - tile_size
+
+                            input_start_x = ofs_x
+                            input_end_x = ofs_x + tile_size
+                            input_start_y = ofs_y
+                            input_end_y = ofs_y + tile_size
+
+                            # input tile dimensions
+                            input_tile = latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
+                            input_list.append(input_tile)
+                            cond_tile = controlnet_latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
+                            cond_list.append(cond_tile)
+                            img_tile = image[:, :, input_start_y*8:input_end_y*8, input_start_x*8:input_end_x*8]
+                            img_list.append(img_tile)
+
+                            if len(input_list) == batch_size or col == grid_cols-1:
+                                input_list_t = torch.cat(input_list, dim=0)
+                                cond_list_t = torch.cat(cond_list, dim=0)
+                                img_list_t = torch.cat(img_list, dim=0)
+                                #print(input_list_t.shape, cond_list_t.shape, img_list_t.shape, fg_mask_list_t.shape)
+
+                                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                                    cond_list_t,
+                                    t,
+                                    encoder_hidden_states=controlnet_prompt_embeds,
+                                    controlnet_cond=img_list_t,
+                                    conditioning_scale=conditioning_scale,
+                                    guess_mode=guess_mode,
+                                    return_dict=False,
+                                    image_encoder_hidden_states = ram_encoder_hidden_states,
+                                )
+
+                                if guess_mode and do_classifier_free_guidance:
+                                    # Infered ControlNet only for the conditional batch.
+                                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                                    # add 0 to the unconditional batch to keep it unchanged.
+                                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                                # predict the noise residual
+                                model_out = self.unet(
+                                    input_list_t,
+                                    t,
+                                    encoder_hidden_states=prompt_embeds,
+                                    cross_attention_kwargs=cross_attention_kwargs,
+                                    down_block_additional_residuals=down_block_res_samples,
+                                    mid_block_additional_residual=mid_block_res_sample,
+                                    return_dict=False,
+                                    image_encoder_hidden_states = ram_encoder_hidden_states,
+                                )[0]
+
+                                #for sample_i in range(model_out.size(0)):
+                                #    noise_preds_row.append(model_out[sample_i].unsqueeze(0))
+                                input_list = []
+                                cond_list = []
+                                img_list = []
+
+                                noise_preds.append(model_out)
+
+                    # Stitch noise predictions for all tiles
+                    noise_pred = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
+                    contributors = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
+                    # Add each tile contribution to overall latents
+                    for row in range(grid_rows):
+                        for col in range(grid_cols):
+                            if col < grid_cols-1 or row < grid_rows-1:
+                                # extract tile from input image
+                                ofs_x = max(row * tile_size-tile_overlap * row, 0)
+                                ofs_y = max(col * tile_size-tile_overlap * col, 0)
+                                # input tile area on total image
+                            if row == grid_rows-1:
+                                ofs_x = w - tile_size
+                            if col == grid_cols-1:
+                                ofs_y = h - tile_size
+
+                            input_start_x = ofs_x
+                            input_end_x = ofs_x + tile_size
+                            input_start_y = ofs_y
+                            input_end_y = ofs_y + tile_size
+
+                            noise_pred[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += noise_preds[row*grid_cols + col] * tile_weights
+                            contributors[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += tile_weights
+                    # Average overlapping areas with more than 1 contributor
+                    noise_pred /= contributors
+
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
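The tiled branch itself is unchanged apart from the shallower indentation and `_gaussian_weights(tile_size, tile_size, 1)` replacing the `batch_size` argument: per-tile noise predictions are blended with Gaussian weights and normalized by a `contributors` map. A self-contained sketch of that stitching scheme (the function and names are illustrative, not part of the pipeline):

```python
import torch

def stitch(tiles, offsets, tile_weights, out_shape):
    """Blend overlapping tile predictions with per-pixel weights."""
    out = torch.zeros(out_shape)
    contributors = torch.zeros(out_shape)
    for tile, (oy, ox) in zip(tiles, offsets):
        h, w = tile.shape[-2:]
        out[..., oy:oy + h, ox:ox + w] += tile * tile_weights
        contributors[..., oy:oy + h, ox:ox + w] += tile_weights
    return out / contributors  # average wherever tiles overlap

# two 4x4 tiles overlapping by two columns on a 4x6 canvas
tile = torch.ones(1, 1, 4, 4)
weights = torch.ones(4, 4)  # stand-in for the Gaussian weights from _gaussian_weights
blended = stitch([tile, 2 * tile], [(0, 0), (0, 2)], weights, (1, 1, 4, 6))
print(blended[0, 0, 0])  # -> 1.0, 1.0, 1.5, 1.5, 2.0, 2.0
```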
@@ -1269,53 +1193,33 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
 
-        with torch.no_grad():
-
-            if use_KDS:
-                # Final-latent selection (once!)
-                # latents shape: [2*N, C, H, W]
-                uncond_latents, cond_latents = latents.chunk(2, dim=0) # each [N, C, H, W]
-                # 1) ensemble mean
-                mean_cond = cond_latents.mean(dim=0, keepdim=True) # [1, C, H, W]
-                # 2) distances
-                dists = ((cond_latents - mean_cond)
-                         .view(cond_latents.size(0), -1)
-                         .pow(2)
-                         .sum(dim=1)) # [N]
-                # 3) best index
-                best_idx = dists.argmin().item()
-                # 4) select that latent (and its uncond pair)
-                best_uncond = uncond_latents[best_idx:best_idx+1]
-                best_cond = cond_latents [best_idx:best_idx+1]
-                latents = torch.cat([best_uncond, best_cond], dim=0) # [2, C, H, W]
-
-            # If we do sequential model offloading, let's offload unet and controlnet
-            # manually for max memory savings
-            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-                self.unet.to("cpu")
-                self.controlnet.to("cpu")
-                torch.cuda.empty_cache()
-
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        has_nsfw_concept = None
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]#.flip(1)
+            #image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
             has_nsfw_concept = None
-            if not output_type == "latent":
-                image = self.vae.decode(latents.detach() / self.vae.config.scaling_factor, return_dict=False)[0]#.flip(1)
-                #image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-            else:
-                image = latents.detach()
-                has_nsfw_concept = None
 
-            if has_nsfw_concept is None:
-                do_denormalize = [True] * image.shape[0]
-            else:
-                do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
 
-            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
 
-            # Offload last model to CPU
-            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-                self.final_offload_hook.offload()
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
 
-            if not return_dict:
-                return (image, has_nsfw_concept)
+        if not return_dict:
+            return (image, has_nsfw_concept)
 
-            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
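The deleted post-loop block collapsed the KDS particle ensemble by keeping the conditional latent closest, in squared L2 distance, to the ensemble mean, together with its unconditional counterpart. A compact sketch of that selection rule (the tensors here are stand-ins):

```python
import torch

uncond_latents = torch.randn(4, 4, 64, 64)  # hypothetical ensemble of 4 particles
cond_latents = torch.randn(4, 4, 64, 64)

mean_cond = cond_latents.mean(dim=0, keepdim=True)               # ensemble mean, (1, C, H, W)
dists = (cond_latents - mean_cond).flatten(1).pow(2).sum(dim=1)  # squared L2 per particle, (N,)
best_idx = dists.argmin().item()
latents = torch.cat([uncond_latents[best_idx:best_idx + 1],
                     cond_latents[best_idx:best_idx + 1]], dim=0)  # (2, C, H, W)
print(best_idx, latents.shape)
```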