alexnasa committed
Commit 454560a · verified · 1 Parent(s): 548082b

Update pipelines/pipeline_seesr.py

Files changed (1): pipelines/pipeline_seesr.py (+204, -202)
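This commit wraps each denoising step and the post-loop decode/offload code in torch.no_grad(), so the sampling loop no longer records an autograd graph during plain inference; the use_KDS branch is left outside the guard, presumably because that sampler still needs gradients. A minimal sketch of the pattern (stand-in names, not the pipeline's actual signature):

    import torch

    def denoise(unet, scheduler, latents, timesteps, prompt_embeds):
        for t in timesteps:
            # Inside no_grad(), forward activations are freed right away,
            # so per-step memory stays flat during inference.
            with torch.no_grad():
                noise_pred = unet(latents, t, encoder_hidden_states=prompt_embeds).sample
            # The scheduler step sits outside the guard here, mirroring how
            # the commit keeps the gradient-based branch free to differentiate.
            latents = scheduler.step(noise_pred, t, latents).prev_sample
        return latents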
pipelines/pipeline_seesr.py CHANGED
@@ -1047,187 +1047,188 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
     print(f"[Tiled Latent]: the input size is {image.shape[-2]}x{image.shape[-1]}, need to tiled")
 
     for i, t in enumerate(timesteps):
-        # pass, if the timestep is larger than start_steps
-        if t > start_steps:
-            print(f'pass {t} steps.')
-            continue
-
-        # expand the latents if we are doing classifier free guidance
-        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-        # controlnet(s) inference
-        if guess_mode and do_classifier_free_guidance:
-            # Infer ControlNet only for the conditional batch.
-            controlnet_latent_model_input = latents
-            controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
-            print("well unexpected")
-
-        else:
-            controlnet_latent_model_input = latent_model_input
-            controlnet_prompt_embeds = prompt_embeds
-            print("a possiblity")
-
-        if h*w<=tile_size*tile_size: # tiled latent input
-            down_block_res_samples, mid_block_res_sample = [None]*10, None
-
-            print(f"controlnet 1 started with {controlnet_latent_model_input.shape}:{ram_encoder_hidden_states.shape}")
-
-            down_block_res_samples, mid_block_res_sample = self.controlnet(
-                controlnet_latent_model_input,
-                t,
-                encoder_hidden_states=controlnet_prompt_embeds,
-                controlnet_cond=image,
-                conditioning_scale=conditioning_scale,
-                guess_mode=guess_mode,
-                return_dict=False,
-                image_encoder_hidden_states = ram_encoder_hidden_states,
-            )
-
-
-            if guess_mode and do_classifier_free_guidance:
-                # Infered ControlNet only for the conditional batch.
-                # To apply the output of ControlNet to both the unconditional and conditional batches,
-                # add 0 to the unconditional batch to keep it unchanged.
-                down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
-                mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
-
-            # predict the noise residual
-            print(f"unet started with {latent_model_input.shape}:{prompt_embeds.shape}")
-            noise_pred = self.unet(
-                latent_model_input,
-                t,
-                encoder_hidden_states=prompt_embeds,
-                cross_attention_kwargs=cross_attention_kwargs,
-                down_block_additional_residuals=down_block_res_samples,
-                mid_block_additional_residual=mid_block_res_sample,
-                return_dict=False,
-                image_encoder_hidden_states = ram_encoder_hidden_states,
-            )[0]
-        else:
-            tile_weights = self._gaussian_weights(tile_size, tile_size, batch_size)
-            tile_size = min(tile_size, min(h, w))
-            tile_weights = self._gaussian_weights(tile_size, tile_size, batch_size)
-
-            grid_rows = 0
-            cur_x = 0
-            while cur_x < latent_model_input.size(-1):
-                cur_x = max(grid_rows * tile_size-tile_overlap * grid_rows, 0)+tile_size
-                grid_rows += 1
-
-            grid_cols = 0
-            cur_y = 0
-            while cur_y < latent_model_input.size(-2):
-                cur_y = max(grid_cols * tile_size-tile_overlap * grid_cols, 0)+tile_size
-                grid_cols += 1
-
-            input_list = []
-            cond_list = []
-            img_list = []
-            noise_preds = []
-            for row in range(grid_rows):
-                noise_preds_row = []
-                for col in range(grid_cols):
-                    if col < grid_cols-1 or row < grid_rows-1:
-                        # extract tile from input image
-                        ofs_x = max(row * tile_size-tile_overlap * row, 0)
-                        ofs_y = max(col * tile_size-tile_overlap * col, 0)
-                        # input tile area on total image
-                    if row == grid_rows-1:
-                        ofs_x = w - tile_size
-                    if col == grid_cols-1:
-                        ofs_y = h - tile_size
-
-                    input_start_x = ofs_x
-                    input_end_x = ofs_x + tile_size
-                    input_start_y = ofs_y
-                    input_end_y = ofs_y + tile_size
-
-                    # input tile dimensions
-                    input_tile = latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
-                    input_list.append(input_tile)
-                    cond_tile = controlnet_latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
-                    cond_list.append(cond_tile)
-                    img_tile = image[:, :, input_start_y*8:input_end_y*8, input_start_x*8:input_end_x*8]
-                    img_list.append(img_tile)
-
-                    if len(input_list) == batch_size or col == grid_cols-1:
-                        input_list_t = torch.cat(input_list, dim=0)
-                        cond_list_t = torch.cat(cond_list, dim=0)
-                        img_list_t = torch.cat(img_list, dim=0)
-                        #print(input_list_t.shape, cond_list_t.shape, img_list_t.shape, fg_mask_list_t.shape)
-                        print(f"controlnet 2 started with {cond_list_t.shape}:{controlnet_prompt_embeds.shape}")
-
-                        down_block_res_samples, mid_block_res_sample = self.controlnet(
-                            cond_list_t,
-                            t,
-                            encoder_hidden_states=controlnet_prompt_embeds,
-                            controlnet_cond=img_list_t,
-                            conditioning_scale=conditioning_scale,
-                            guess_mode=guess_mode,
-                            return_dict=False,
-                            image_encoder_hidden_states = ram_encoder_hidden_states,
-                        )
-
-                        if guess_mode and do_classifier_free_guidance:
-                            # Infered ControlNet only for the conditional batch.
-                            # To apply the output of ControlNet to both the unconditional and conditional batches,
-                            # add 0 to the unconditional batch to keep it unchanged.
-                            down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
-                            mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
-
-                        # predict the noise residual
-                        print(f"unet started with {input_list_t.shape}:{prompt_embeds.shape}")
-                        model_out = self.unet(
-                            input_list_t,
-                            t,
-                            encoder_hidden_states=prompt_embeds,
-                            cross_attention_kwargs=cross_attention_kwargs,
-                            down_block_additional_residuals=down_block_res_samples,
-                            mid_block_additional_residual=mid_block_res_sample,
-                            return_dict=False,
-                            image_encoder_hidden_states = ram_encoder_hidden_states,
-                        )[0]
-
-                        #for sample_i in range(model_out.size(0)):
-                        # noise_preds_row.append(model_out[sample_i].unsqueeze(0))
-                        input_list = []
-                        cond_list = []
-                        img_list = []
-
-                        noise_preds.append(model_out)
-
-            # Stitch noise predictions for all tiles
-            noise_pred = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
-            contributors = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
-            # Add each tile contribution to overall latents
-            for row in range(grid_rows):
-                for col in range(grid_cols):
-                    if col < grid_cols-1 or row < grid_rows-1:
-                        # extract tile from input image
-                        ofs_x = max(row * tile_size-tile_overlap * row, 0)
-                        ofs_y = max(col * tile_size-tile_overlap * col, 0)
-                        # input tile area on total image
-                    if row == grid_rows-1:
-                        ofs_x = w - tile_size
-                    if col == grid_cols-1:
-                        ofs_y = h - tile_size
-
-                    input_start_x = ofs_x
-                    input_end_x = ofs_x + tile_size
-                    input_start_y = ofs_y
-                    input_end_y = ofs_y + tile_size
-
-                    noise_pred[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += noise_preds[row*grid_cols + col] * tile_weights
-                    contributors[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += tile_weights
-            # Average overlapping areas with more than 1 contributor
-            noise_pred /= contributors
-
-
-        # perform guidance
-        if do_classifier_free_guidance:
-            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        with torch.no_grad():
+            # pass, if the timestep is larger than start_steps
+            if t > start_steps:
+                print(f'pass {t} steps.')
+                continue
+
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # controlnet(s) inference
+            if guess_mode and do_classifier_free_guidance:
+                # Infer ControlNet only for the conditional batch.
+                controlnet_latent_model_input = latents
+                controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                print("well unexpected")
+
+            else:
+                controlnet_latent_model_input = latent_model_input
+                controlnet_prompt_embeds = prompt_embeds
+                print("a possiblity")
+
+            if h*w<=tile_size*tile_size: # tiled latent input
+                down_block_res_samples, mid_block_res_sample = [None]*10, None
+
+                print(f"controlnet 1 started with {controlnet_latent_model_input.shape}:{ram_encoder_hidden_states.shape}")
+
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    controlnet_latent_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=conditioning_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                    image_encoder_hidden_states = ram_encoder_hidden_states,
+                )
+
+
+                if guess_mode and do_classifier_free_guidance:
+                    # Infered ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                # predict the noise residual
+                print(f"unet started with {latent_model_input.shape}:{prompt_embeds.shape}")
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    return_dict=False,
+                    image_encoder_hidden_states = ram_encoder_hidden_states,
+                )[0]
+            else:
+                tile_weights = self._gaussian_weights(tile_size, tile_size, batch_size)
+                tile_size = min(tile_size, min(h, w))
+                tile_weights = self._gaussian_weights(tile_size, tile_size, batch_size)
+
+                grid_rows = 0
+                cur_x = 0
+                while cur_x < latent_model_input.size(-1):
+                    cur_x = max(grid_rows * tile_size-tile_overlap * grid_rows, 0)+tile_size
+                    grid_rows += 1
+
+                grid_cols = 0
+                cur_y = 0
+                while cur_y < latent_model_input.size(-2):
+                    cur_y = max(grid_cols * tile_size-tile_overlap * grid_cols, 0)+tile_size
+                    grid_cols += 1
+
+                input_list = []
+                cond_list = []
+                img_list = []
+                noise_preds = []
+                for row in range(grid_rows):
+                    noise_preds_row = []
+                    for col in range(grid_cols):
+                        if col < grid_cols-1 or row < grid_rows-1:
+                            # extract tile from input image
+                            ofs_x = max(row * tile_size-tile_overlap * row, 0)
+                            ofs_y = max(col * tile_size-tile_overlap * col, 0)
+                            # input tile area on total image
+                        if row == grid_rows-1:
+                            ofs_x = w - tile_size
+                        if col == grid_cols-1:
+                            ofs_y = h - tile_size
+
+                        input_start_x = ofs_x
+                        input_end_x = ofs_x + tile_size
+                        input_start_y = ofs_y
+                        input_end_y = ofs_y + tile_size
+
+                        # input tile dimensions
+                        input_tile = latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
+                        input_list.append(input_tile)
+                        cond_tile = controlnet_latent_model_input[:, :, input_start_y:input_end_y, input_start_x:input_end_x]
+                        cond_list.append(cond_tile)
+                        img_tile = image[:, :, input_start_y*8:input_end_y*8, input_start_x*8:input_end_x*8]
+                        img_list.append(img_tile)
+
+                        if len(input_list) == batch_size or col == grid_cols-1:
+                            input_list_t = torch.cat(input_list, dim=0)
+                            cond_list_t = torch.cat(cond_list, dim=0)
+                            img_list_t = torch.cat(img_list, dim=0)
+                            #print(input_list_t.shape, cond_list_t.shape, img_list_t.shape, fg_mask_list_t.shape)
+                            print(f"controlnet 2 started with {cond_list_t.shape}:{controlnet_prompt_embeds.shape}")
+
+                            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                                cond_list_t,
+                                t,
+                                encoder_hidden_states=controlnet_prompt_embeds,
+                                controlnet_cond=img_list_t,
+                                conditioning_scale=conditioning_scale,
+                                guess_mode=guess_mode,
+                                return_dict=False,
+                                image_encoder_hidden_states = ram_encoder_hidden_states,
+                            )
+
+                            if guess_mode and do_classifier_free_guidance:
+                                # Infered ControlNet only for the conditional batch.
+                                # To apply the output of ControlNet to both the unconditional and conditional batches,
+                                # add 0 to the unconditional batch to keep it unchanged.
+                                down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                                mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+
+                            # predict the noise residual
+                            print(f"unet started with {input_list_t.shape}:{prompt_embeds.shape}")
+                            model_out = self.unet(
+                                input_list_t,
+                                t,
+                                encoder_hidden_states=prompt_embeds,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                down_block_additional_residuals=down_block_res_samples,
+                                mid_block_additional_residual=mid_block_res_sample,
+                                return_dict=False,
+                                image_encoder_hidden_states = ram_encoder_hidden_states,
+                            )[0]
+
+                            #for sample_i in range(model_out.size(0)):
+                            # noise_preds_row.append(model_out[sample_i].unsqueeze(0))
+                            input_list = []
+                            cond_list = []
+                            img_list = []
+
+                            noise_preds.append(model_out)
+
+                # Stitch noise predictions for all tiles
+                noise_pred = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
+                contributors = torch.zeros(latent_model_input.shape, device=latent_model_input.device)
+                # Add each tile contribution to overall latents
+                for row in range(grid_rows):
+                    for col in range(grid_cols):
+                        if col < grid_cols-1 or row < grid_rows-1:
+                            # extract tile from input image
+                            ofs_x = max(row * tile_size-tile_overlap * row, 0)
+                            ofs_y = max(col * tile_size-tile_overlap * col, 0)
+                            # input tile area on total image
+                        if row == grid_rows-1:
+                            ofs_x = w - tile_size
+                        if col == grid_cols-1:
+                            ofs_y = h - tile_size
+
+                        input_start_x = ofs_x
+                        input_end_x = ofs_x + tile_size
+                        input_start_y = ofs_y
+                        input_end_y = ofs_y + tile_size
+
+                        noise_pred[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += noise_preds[row*grid_cols + col] * tile_weights
+                        contributors[:, :, input_start_y:input_end_y, input_start_x:input_end_x] += tile_weights
+                # Average overlapping areas with more than 1 contributor
+                noise_pred /= contributors
+
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
 
         if use_KDS:
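For the tiled branch above, the while loops size the grid by stepping a tile of tile_size with stride tile_size - tile_overlap until the latent is covered; with assumed values tile_size=64, tile_overlap=16 and a 96x96 latent, that gives 2 tiles per axis (stride 48), and the last row/column is snapped to the far edge (ofs = 96 - 64 = 32). Each tile's noise prediction is then blended with a Gaussian weight map and the accumulated weights are divided out, which is what hides the seams. A self-contained sketch, with gaussian_weights standing in for the pipeline's _gaussian_weights:

    import torch

    def gaussian_weights(tile_w, tile_h):
        # 2D bump, strongest at the tile centre, positive everywhere
        xs = torch.linspace(-1.0, 1.0, tile_w)
        ys = torch.linspace(-1.0, 1.0, tile_h)
        wx = torch.exp(-0.5 * (xs / 0.5) ** 2)
        wy = torch.exp(-0.5 * (ys / 0.5) ** 2)
        return wy[:, None] * wx[None, :]          # shape (tile_h, tile_w)

    tile, h, w = 64, 96, 96                       # assumed latent sizes
    weights = gaussian_weights(tile, tile)
    noise_pred = torch.zeros(1, 4, h, w)
    contributors = torch.zeros_like(noise_pred)
    for oy in (0, h - tile):                      # per-axis offsets: 0 and 32
        for ox in (0, w - tile):
            noise_tile = torch.randn(1, 4, tile, tile)   # stand-in UNet output
            noise_pred[..., oy:oy + tile, ox:ox + tile] += noise_tile * weights
            contributors[..., oy:oy + tile, ox:ox + tile] += weights
    noise_pred /= contributors                    # overlaps become weighted averages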
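The guidance combine at the end of the step is the standard classifier-free guidance formula: the batch holds the unconditional and text-conditioned halves, and the final prediction moves the unconditional estimate toward the conditioned one by guidance_scale. Worked in isolation with assumed shapes:

    import torch

    guidance_scale = 5.5                        # assumed value
    noise_pred = torch.randn(2, 4, 64, 64)      # [uncond, text] halves of a CFG batch
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    print(noise_pred.shape)                     # torch.Size([1, 4, 64, 64])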
@@ -1285,33 +1286,34 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
         if callback is not None and i % callback_steps == 0:
             callback(i, t, latents)
 
-    # If we do sequential model offloading, let's offload unet and controlnet
-    # manually for max memory savings
-    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-        self.unet.to("cpu")
-        self.controlnet.to("cpu")
-        torch.cuda.empty_cache()
-
-    has_nsfw_concept = None
-    if not output_type == "latent":
-        image = self.vae.decode(latents.detach() / self.vae.config.scaling_factor, return_dict=False)[0]#.flip(1)
-        #image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-    else:
-        image = latents.detach()
-        has_nsfw_concept = None
-
-    if has_nsfw_concept is None:
-        do_denormalize = [True] * image.shape[0]
-    else:
-        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-    image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-    # Offload last model to CPU
-    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-        self.final_offload_hook.offload()
-
-    if not return_dict:
-        return (image, has_nsfw_concept)
-
-    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+    with torch.no_grad():
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+
+        has_nsfw_concept = None
+        if not output_type == "latent":
+            image = self.vae.decode(latents.detach() / self.vae.config.scaling_factor, return_dict=False)[0]#.flip(1)
+            #image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents.detach()
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
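The second hunk guards the finishing steps the same way. The decode divides the latents by the VAE's scaling_factor, undoing the scaling applied when they were produced, and image_processor.postprocess maps the [-1, 1] tensor to the requested output type. A hedged sketch of that tail end, assuming an already-constructed pipe with diffusers-style vae and image_processor attributes:

    import torch

    @torch.no_grad()
    def finish(pipe, latents, output_type="pil"):
        # Undo the latent scaling applied at encode time, then decode to pixels.
        image = pipe.vae.decode(
            latents.detach() / pipe.vae.config.scaling_factor,
            return_dict=False,
        )[0]
        # Map [-1, 1] tensors to PIL images / numpy arrays as requested.
        return pipe.image_processor.postprocess(image, output_type=output_type)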
 