Commit · 82ef366
Parent(s): 991d8d3
initial commit
Browse files
- appearance_transfer_model.py +177 -0
- config.py +66 -0
- constants.py +3 -0
- demo.py +96 -0
- environment/environment.yaml +10 -0
- environment/requirements.txt +17 -0
- inputs/chocolate_cake.jpg +0 -0
- inputs/duomo.png +0 -0
- inputs/giraffe.png +0 -0
- inputs/red_velvet_cake.jpg +0 -0
- inputs/taj_mahal.jpg +0 -0
- inputs/zebra.png +0 -0
- models/__init__.py +0 -0
- models/stable_diffusion.py +240 -0
- models/unet_2d_condition.py +345 -0
- utils/__init__.py +0 -0
- utils/adain.py +45 -0
- utils/attention_utils.py +37 -0
- utils/ddpm_inversion.py +323 -0
- utils/image_utils.py +59 -0
- utils/latent_utils.py +81 -0
- utils/model_utils.py +16 -0
- utils/segmentation.py +111 -0
appearance_transfer_model.py
ADDED
@@ -0,0 +1,177 @@
from typing import List, Optional, Callable

import torch
import torch.nn.functional as F

from config import RunConfig
from constants import OUT_INDEX, STRUCT_INDEX, STYLE_INDEX
from models.stable_diffusion import CrossImageAttentionStableDiffusionPipeline
from utils import attention_utils
from utils.adain import masked_adain
from utils.model_utils import get_stable_diffusion_model
from utils.segmentation import Segmentor


class AppearanceTransferModel:

    def __init__(self, config: RunConfig, pipe: Optional[CrossImageAttentionStableDiffusionPipeline] = None):
        self.config = config
        self.pipe = get_stable_diffusion_model() if pipe is None else pipe
        self.register_attention_control()
        self.segmentor = Segmentor(prompt=config.prompt, object_nouns=[config.object_noun])
        self.latents_app, self.latents_struct = None, None
        self.zs_app, self.zs_struct = None, None
        self.image_app_mask_32, self.image_app_mask_64 = None, None
        self.image_struct_mask_32, self.image_struct_mask_64 = None, None
        self.enable_edit = False
        self.step = 0

    def set_latents(self, latents_app: torch.Tensor, latents_struct: torch.Tensor):
        self.latents_app = latents_app
        self.latents_struct = latents_struct

    def set_noise(self, zs_app: torch.Tensor, zs_struct: torch.Tensor):
        self.zs_app = zs_app
        self.zs_struct = zs_struct

    def set_masks(self, masks: List[torch.Tensor]):
        self.image_app_mask_32, self.image_struct_mask_32, self.image_app_mask_64, self.image_struct_mask_64 = masks

    def get_adain_callback(self):

        def callback(st: int, timestep: int, latents: torch.FloatTensor) -> Callable:
            self.step = st
            # Compute the masks using prompt-mixing self-segmentation and use the masks for the AdaIN operation
            if self.step == self.config.adain_range.start:
                masks = self.segmentor.get_object_masks()
                self.set_masks(masks)
            # Apply the AdaIN operation using the computed masks
            if self.config.adain_range.start <= self.step < self.config.adain_range.end:
                latents[0] = masked_adain(latents[0], latents[1], self.image_struct_mask_64, self.image_app_mask_64)

        return callback

    def register_attention_control(self):

        model_self = self

        class AttentionProcessor:

            def __init__(self, place_in_unet: str):
                self.place_in_unet = place_in_unet
                if not hasattr(F, "scaled_dot_product_attention"):
                    raise ImportError("AttnProcessor2_0 requires torch 2.0. To use it, please upgrade torch to 2.0.")

            def __call__(self,
                         attn,
                         hidden_states: torch.Tensor,
                         encoder_hidden_states: Optional[torch.Tensor] = None,
                         attention_mask=None,
                         temb=None,
                         perform_swap: bool = False):

                residual = hidden_states

                if attn.spatial_norm is not None:
                    hidden_states = attn.spatial_norm(hidden_states, temb)

                input_ndim = hidden_states.ndim

                if input_ndim == 4:
                    batch_size, channel, height, width = hidden_states.shape
                    hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

                batch_size, sequence_length, _ = (
                    hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
                )

                if attention_mask is not None:
                    attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
                    attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

                if attn.group_norm is not None:
                    hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

                query = attn.to_q(hidden_states)

                is_cross = encoder_hidden_states is not None
                if not is_cross:
                    encoder_hidden_states = hidden_states
                elif attn.norm_cross:
                    encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

                key = attn.to_k(encoder_hidden_states)
                value = attn.to_v(encoder_hidden_states)

                inner_dim = key.shape[-1]
                head_dim = inner_dim // attn.heads
                should_mix = False

                # Potentially apply our cross-image attention operation.
                # To do so, we need to be in a self-attention layer in the decoder part of the denoising network.
                if perform_swap and not is_cross and "up" in self.place_in_unet and model_self.enable_edit:
                    if attention_utils.should_mix_keys_and_values(model_self, hidden_states):
                        should_mix = True
                        if model_self.step % 5 == 0 and model_self.step < 40:
                            # Inject the structure's keys and values
                            key[OUT_INDEX] = key[STRUCT_INDEX]
                            value[OUT_INDEX] = value[STRUCT_INDEX]
                        else:
                            # Inject the appearance's keys and values
                            key[OUT_INDEX] = key[STYLE_INDEX]
                            value[OUT_INDEX] = value[STYLE_INDEX]

                query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
                key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
                value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

                # Compute the cross attention and apply our contrasting operation
                hidden_states, attn_weight = attention_utils.compute_scaled_dot_product_attention(
                    query, key, value,
                    edit_map=perform_swap and model_self.enable_edit and should_mix,
                    is_cross=is_cross,
                    contrast_strength=model_self.config.contrast_strength,
                )

                # Update the attention map used for segmentation
                if model_self.config.use_masked_adain and model_self.step == model_self.config.adain_range.start - 1:
                    model_self.segmentor.update_attention(attn_weight, is_cross)

                hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
                hidden_states = hidden_states.to(query[OUT_INDEX].dtype)

                # linear proj
                hidden_states = attn.to_out[0](hidden_states)
                # dropout
                hidden_states = attn.to_out[1](hidden_states)

                if input_ndim == 4:
                    hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

                if attn.residual_connection:
                    hidden_states = hidden_states + residual

                hidden_states = hidden_states / attn.rescale_output_factor

                return hidden_states

        def register_recr(net_, count, place_in_unet):
            if net_.__class__.__name__ == 'ResnetBlock2D':
                pass
            if net_.__class__.__name__ == 'Attention':
                net_.set_processor(AttentionProcessor(place_in_unet + f"_{count + 1}"))
                return count + 1
            elif hasattr(net_, 'children'):
                for net__ in net_.children():
                    count = register_recr(net__, count, place_in_unet)
            return count

        cross_att_count = 0
        sub_nets = self.pipe.unet.named_children()
        for net in sub_nets:
            if "down" in net[0]:
                cross_att_count += register_recr(net[1], 0, "down")
            elif "up" in net[0]:
                cross_att_count += register_recr(net[1], 0, "up")
            elif "mid" in net[0]:
                cross_att_count += register_recr(net[1], 0, "mid")
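To summarize what the processor above does during a swap step: inside the decoder self-attention layers, the output image's queries attend to keys and values taken from the appearance image (or, on every fifth step before step 40, from the structure image) instead of its own. In our notation (batch indices follow constants.py, d is the per-head dimension), the swapped attention for the output branch is roughly:

\mathrm{Attn}^{\text{out}} = \mathrm{softmax}\!\left(\frac{Q^{\text{out}}\,(K^{\text{app}})^{\top}}{\sqrt{d}}\right) V^{\text{app}}

with the softmax map additionally passed through the contrasting operation from utils/attention_utils.py before it is applied to the values.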
config.py
ADDED
@@ -0,0 +1,66 @@
from dataclasses import dataclass
from pathlib import Path
from typing import NamedTuple, Optional


class Range(NamedTuple):
    start: int
    end: int


@dataclass
class RunConfig:
    # Appearance image path
    app_image_path: Path
    # Struct image path
    struct_image_path: Path
    # Domain name (e.g., buildings, animals)
    domain_name: Optional[str] = None
    # Output path
    output_path: Path = Path('./output')
    # Random seed
    seed: int = 42
    # Input prompt for inversion (will use the domain name as default)
    prompt: Optional[str] = None
    # Number of timesteps
    num_timesteps: int = 100
    # Whether to use a binary mask for performing AdaIN
    use_masked_adain: bool = True
    # Timesteps to apply cross-attention on 64x64 layers
    cross_attn_64_range: Range = Range(start=10, end=90)
    # Timesteps to apply cross-attention on 32x32 layers
    cross_attn_32_range: Range = Range(start=10, end=70)
    # Timesteps to apply AdaIN
    adain_range: Range = Range(start=20, end=100)
    # Guidance scale
    guidance_scale: float = 7.5
    # Swap guidance scale
    swap_guidance_scale: float = 3.5
    # Attention contrasting strength
    contrast_strength: float = 1.67
    # Object nouns to use for self-segmentation (will use the domain name as default)
    object_noun: Optional[str] = None
    # Whether to load previously saved inverted latent codes
    load_latents: bool = True
    # Number of steps to skip in the denoising process (value used in the original edit-friendly DDPM paper)
    skip_steps: int = 32

    def __post_init__(self):
        self.output_path = self.output_path / self.domain_name
        self.output_path.mkdir(parents=True, exist_ok=True)

        # Handle the domain name, prompt, and object nouns used for masking, etc.
        if self.use_masked_adain and self.domain_name is None:
            raise ValueError("Must provide --domain_name and --prompt when using masked AdaIN")
        if not self.use_masked_adain and self.domain_name is None:
            self.domain_name = "object"
        if self.prompt is None:
            self.prompt = f"A photo of a {self.domain_name}"
        if self.object_noun is None:
            self.object_noun = self.domain_name

        # Define the paths to store the inverted latents to
        self.latents_path = Path(self.output_path) / "latents"
        self.latents_path.mkdir(parents=True, exist_ok=True)
        self.app_latent_save_path = self.latents_path / f"{self.app_image_path.stem}.pt"
        self.struct_latent_save_path = self.latents_path / f"{self.struct_image_path.stem}.pt"
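As a quick illustration (a minimal sketch, not part of the commit itself), the dataclass can be instantiated directly with two of the sample images added under inputs/; __post_init__ then fills in the default prompt and the latent save paths:

# Sketch only: constructs a RunConfig the same way demo.py does, using files from this commit.
from pathlib import Path

from config import RunConfig

cfg = RunConfig(
    app_image_path=Path("inputs/zebra.png"),       # appearance source
    struct_image_path=Path("inputs/giraffe.png"),  # structure source
    domain_name="animal",
    load_latents=False,
)
# cfg.prompt is now f"A photo of a {cfg.domain_name}" and cfg.latents_path points to ./output/animal/latents.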
constants.py
ADDED
@@ -0,0 +1,3 @@
OUT_INDEX = 0
STYLE_INDEX = 1
STRUCT_INDEX = 2
demo.py
ADDED
@@ -0,0 +1,96 @@
import sys
from pathlib import Path
from typing import Optional

import gradio as gr
from PIL import Image

from appearance_transfer_model import AppearanceTransferModel
from run import run_appearance_transfer
from utils.latent_utils import load_latents_or_invert_images
from utils.model_utils import get_stable_diffusion_model

sys.path.append(".")
sys.path.append("..")

from config import RunConfig

DESCRIPTION = '''
<h1 style="text-align: center;"> Cross-Image Attention for Zero-Shot Appearance Transfer </h1>
<p style="text-align: center;">
This is a demo for our <a href="https://arxiv.org/abs/2311.03335">paper</a>:
''Cross-Image Attention for Zero-Shot Appearance Transfer''.
<br>
Given two images depicting a source structure and a target appearance, our method generates an image merging
the structure of one image with the appearance of the other.
<br>
We do so in a zero-shot manner, with no optimization or model training required, while supporting appearance
transfer across images that may differ in size and shape.
</p>
'''

pipe = get_stable_diffusion_model()


def main_pipeline(app_image_path: str,
                  struct_image_path: str,
                  domain_name: str,
                  seed: int,
                  prompt: Optional[str] = None) -> Image.Image:
    if prompt == "":
        prompt = None
    config = RunConfig(
        app_image_path=Path(app_image_path),
        struct_image_path=Path(struct_image_path),
        domain_name=domain_name,
        prompt=prompt,
        seed=seed,
        load_latents=False
    )
    model = AppearanceTransferModel(config=config, pipe=pipe)
    latents_app, latents_struct, noise_app, noise_struct = load_latents_or_invert_images(model=model, cfg=config)
    model.set_latents(latents_app, latents_struct)
    model.set_noise(noise_app, noise_struct)
    print("Running appearance transfer...")
    images = run_appearance_transfer(model=model, cfg=config)
    print("Done.")
    return [images[0]]


with gr.Blocks(css='style.css') as demo:
    gr.Markdown(DESCRIPTION)

    gr.HTML('''<a href="https://huggingface.co/spaces/yuvalalaluf/cross-image-attention?duplicate=true"><img src="https://bit.ly/3gLdBN6"
            alt="Duplicate Space"></a>''')

    with gr.Row():
        with gr.Column():
            app_image_path = gr.Image(label="Upload appearance image", type="filepath")
            struct_image_path = gr.Image(label="Upload structure image", type="filepath")
            domain_name = gr.Text(label="Domain name", max_lines=1,
                                  info="Specifies the domain the objects are coming from (e.g., 'animal', 'building', etc).")
            prompt = gr.Text(label="Prompt to use for inversion.", value='',
                             info='If this is kept empty, we will use the domain name to define '
                                  'the prompt as "A photo of a <domain_name>".')
            random_seed = gr.Number(value=42, label="Random seed", precision=0)
            run_button = gr.Button('Generate')

        with gr.Column():
            result = gr.Gallery(label='Result')
            inputs = [app_image_path, struct_image_path, domain_name, random_seed, prompt]
            outputs = [result]
            run_button.click(fn=main_pipeline, inputs=inputs, outputs=outputs)

    with gr.Row():
        examples = [
            ['inputs/zebra.png', 'inputs/giraffe.png', 'animal', 20, None],
            ['inputs/taj_mahal.jpg', 'inputs/duomo.png', 'building', 42, None],
            ['inputs/red_velvet_cake.jpg', 'inputs/chocolate_cake.jpg', 'cake', 42, 'A photo of cake'],
        ]
        gr.Examples(examples=examples,
                    inputs=[app_image_path, struct_image_path, domain_name, random_seed, prompt],
                    outputs=[result],
                    fn=main_pipeline,
                    cache_examples=True)

demo.launch(share=False, server_name="127.0.0.1", server_port=8888)
environment/environment.yaml
ADDED
@@ -0,0 +1,10 @@
name: cross_image
channels:
  - pytorch
  - defaults
dependencies:
  - python=3.8.5
  - pip=20.3
  - cudatoolkit=11.3
  - pip:
    - -r requirements.txt
environment/requirements.txt
ADDED
@@ -0,0 +1,17 @@
matplotlib==3.6.3
matplotlib-inline==0.1.6
jupyter==1.0.0
numpy==1.24.1
pyrallis==0.3.1
torch==2.0.1
torchvision==0.15.2
diffusers==0.19.3
transformers==4.30.2
accelerate==0.20.3
huggingface-hub==0.16.4
xformers==0.0.21
tokenizers==0.13.3
nltk==3.8.1
Pillow==10.1.0
scikit_learn==1.3.0
tqdm==4.64.1
inputs/chocolate_cake.jpg
ADDED
inputs/duomo.png
ADDED
inputs/giraffe.png
ADDED
inputs/red_velvet_cake.jpg
ADDED
inputs/taj_mahal.jpg
ADDED
inputs/zebra.png
ADDED
models/__init__.py
ADDED
File without changes
models/stable_diffusion.py
ADDED
@@ -0,0 +1,240 @@
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
from diffusers import StableDiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg
from diffusers.schedulers import KarrasDiffusionSchedulers
from tqdm import tqdm
from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor

from config import Range
from models.unet_2d_condition import FreeUUNet2DConditionModel


class CrossImageAttentionStableDiffusionPipeline(StableDiffusionPipeline):
    """ A modification of the standard StableDiffusionPipeline to incorporate our cross-image attention."""

    def __init__(self, vae: AutoencoderKL,
                 text_encoder: CLIPTextModel,
                 tokenizer: CLIPTokenizer,
                 unet: FreeUUNet2DConditionModel,
                 scheduler: KarrasDiffusionSchedulers,
                 safety_checker: StableDiffusionSafetyChecker,
                 feature_extractor: CLIPImageProcessor,
                 requires_safety_checker: bool = True):
        super().__init__(
            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
        )

    @torch.no_grad()
    def __call__(
            self,
            prompt: Union[str, List[str]] = None,
            height: Optional[int] = None,
            width: Optional[int] = None,
            num_inference_steps: int = 50,
            guidance_scale: float = 7.5,
            negative_prompt: Optional[Union[str, List[str]]] = None,
            num_images_per_prompt: Optional[int] = 1,
            eta: float = 0.0,
            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
            latents: Optional[torch.FloatTensor] = None,
            prompt_embeds: Optional[torch.FloatTensor] = None,
            negative_prompt_embeds: Optional[torch.FloatTensor] = None,
            output_type: Optional[str] = "pil",
            return_dict: bool = True,
            callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
            callback_steps: int = 1,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            guidance_rescale: float = 0.0,
            swap_guidance_scale: float = 1.0,
            cross_image_attention_range: Range = Range(10, 90),
            # DDPM addition
            zs: Optional[List[torch.Tensor]] = None
    ):

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise an error if they are not correct
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier-free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        )
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        t_to_idx = {int(v): k for k, v in enumerate(timesteps[-zs[0].shape[0]:])}
        timesteps = timesteps[-zs[0].shape[0]:]

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

        op = tqdm(timesteps[-zs[0].shape[0]:])
        n_timesteps = len(timesteps[-zs[0].shape[0]:])

        count = 0
        for t in op:
            i = t_to_idx[int(t)]

            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            noise_pred_swap = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs={'perform_swap': True},
                return_dict=False,
            )[0]
            noise_pred_no_swap = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs={'perform_swap': False},
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                _, noise_swap_pred_text = noise_pred_swap.chunk(2)
                noise_no_swap_pred_uncond, _ = noise_pred_no_swap.chunk(2)
                noise_pred = noise_no_swap_pred_uncond + guidance_scale * (
                        noise_swap_pred_text - noise_no_swap_pred_uncond)
            else:
                is_cross_image_step = cross_image_attention_range.start <= i <= cross_image_attention_range.end
                if swap_guidance_scale > 1.0 and is_cross_image_step:
                    swapping_strengths = np.linspace(swap_guidance_scale,
                                                     max(swap_guidance_scale / 2, 1.0),
                                                     n_timesteps)
                    swapping_strength = swapping_strengths[count]
                    noise_pred = noise_pred_no_swap + swapping_strength * (noise_pred_swap - noise_pred_no_swap)
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_swap, guidance_rescale=guidance_rescale)
                else:
                    noise_pred = noise_pred_swap

            latents = torch.stack([
                self.perform_ddpm_step(t_to_idx, zs[latent_idx], latents[latent_idx], t, noise_pred[latent_idx], eta)
                for latent_idx in range(latents.shape[0])
            ])

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                # progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

            count += 1

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    def perform_ddpm_step(self, t_to_idx, zs, latents, t, noise_pred, eta):
        idx = t_to_idx[int(t)]
        z = zs[idx] if zs is not None else None
        # 1. get previous step value (=t-1)
        prev_timestep = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
        # 2. compute alphas, betas
        alpha_prod_t = self.scheduler.alphas_cumprod[t]
        alpha_prod_t_prev = self.scheduler.alphas_cumprod[
            prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        # 3. compute the predicted original sample from the predicted noise, also called
        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
        pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
        # 5. compute variance: "sigma_t(eta)" -> see formula (16)
        # sigma_t = sqrt((1 - alpha_{t-1}) / (1 - alpha_t)) * sqrt(1 - alpha_t / alpha_{t-1})
        # variance = self.scheduler._get_variance(timestep, prev_timestep)
        variance = self.get_variance(t)
        std_dev_t = eta * variance ** (0.5)
        # Take care of the asymmetric reverse process (asyrp)
        model_output_direction = noise_pred
        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
        # pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output_direction
        pred_sample_direction = (1 - alpha_prod_t_prev - eta * variance) ** (0.5) * model_output_direction
        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
        # 8. Add noise if eta > 0
        if eta > 0:
            if z is None:
                z = torch.randn(noise_pred.shape, device=self.device)
            sigma_z = eta * variance ** (0.5) * z
            prev_sample = prev_sample + sigma_z
        return prev_sample

    def get_variance(self, timestep):
        prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.scheduler.alphas_cumprod[
            prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev
        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
        return variance
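perform_ddpm_step follows formula (12) of the DDIM paper that its comments cite, with the random term taken from the precomputed edit-friendly noise maps zs rather than sampled fresh. Taking eta = 1 for concreteness (our transcription, with alpha_prod_t written as \alpha_t), the update implemented above is:

x_{t-1} = \sqrt{\alpha_{t-1}}\;\underbrace{\frac{x_t - \sqrt{1-\alpha_t}\,\epsilon_\theta(x_t)}{\sqrt{\alpha_t}}}_{\text{pred\_original\_sample}}
\;+\; \sqrt{1-\alpha_{t-1}-\sigma_t^2}\;\epsilon_\theta(x_t) \;+\; \sigma_t z_t,
\qquad
\sigma_t^2 = \frac{1-\alpha_{t-1}}{1-\alpha_t}\left(1-\frac{\alpha_t}{\alpha_{t-1}}\right)

where z_t is the stored noise map for this timestep and get_variance computes \sigma_t^2.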
models/unet_2d_condition.py
ADDED
@@ -0,0 +1,345 @@
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from diffusers import UNet2DConditionModel
from diffusers.models.unet_2d_condition import UNet2DConditionOutput
from diffusers.utils import logging
from torch.fft import fftn, ifftn, fftshift, ifftshift

"""
This is a small extension of the standard UNet2DConditionModel with the small addition of the
Free-U trick (https://github.com/ChenyangSi/FreeU).
"""

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def Fourier_filter(x, threshold, scale):
    # FFT
    x_freq = fftn(x, dim=(-2, -1))
    x_freq = fftshift(x_freq, dim=(-2, -1))

    B, C, H, W = x_freq.shape
    mask = torch.ones((B, C, H, W)).cuda()  # the mask is allocated on CUDA

    crow, ccol = H // 2, W // 2
    mask[..., crow - threshold:crow + threshold, ccol - threshold:ccol + threshold] = scale
    x_freq = x_freq * mask

    # IFFT
    x_freq = ifftshift(x_freq, dim=(-2, -1))
    x_filtered = ifftn(x_freq, dim=(-2, -1)).real

    return x_filtered


class FreeUUNet2DConditionModel(UNet2DConditionModel):

    def forward(
            self,
            sample: torch.FloatTensor,
            timestep: Union[torch.Tensor, float, int],
            encoder_hidden_states: torch.Tensor,
            class_labels: Optional[torch.Tensor] = None,
            timestep_cond: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
            down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
            mid_block_additional_residual: Optional[torch.Tensor] = None,
            encoder_attention_mask: Optional[torch.Tensor] = None,
            return_dict: bool = True,
    ) -> Union[UNet2DConditionOutput, Tuple]:
        r"""
        The [`UNet2DConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            encoder_attention_mask (`torch.Tensor`):
                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                `True` the mask is kept, otherwise if `False` it is discarded. The mask will be converted into a bias,
                which adds large negative values to the attention scores corresponding to "discard" tokens.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
            added_cond_kwargs: (`dict`, *optional*):
                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
                are passed along to the UNet blocks.

        Returns:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
                a `tuple` is returned where the first element is the sample tensor.
        """
        # By default samples have to be at least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2 ** self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
        # expects mask of shape:
        #   [batch, key_tokens]
        # adds singleton query_tokens dimension:
        #   [batch, 1, key_tokens]
        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
        if attention_mask is not None:
            # assume that mask is expressed as:
            #   (1 = keep, 0 = discard)
            # convert mask into a bias that can be added to attention scores:
            #   (keep = +0, discard = -10000.0)
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None:
            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)
        aug_emb = None

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

                # `Timesteps` does not contain any weights and will always return f32 tensors
                # there might be better ways to encapsulate this.
                class_labels = class_labels.to(dtype=sample.dtype)

            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)

            if self.config.class_embeddings_concat:
                emb = torch.cat([emb, class_emb], dim=-1)
            else:
                emb = emb + class_emb

        if self.config.addition_embed_type == "text":
            aug_emb = self.add_embedding(encoder_hidden_states)
        elif self.config.addition_embed_type == "text_image":
            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )

            image_embs = added_cond_kwargs.get("image_embeds")
            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
            aug_emb = self.add_embedding(text_embs, image_embs)
        elif self.config.addition_embed_type == "text_time":
            # SDXL - style
            if "text_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
                )
            text_embeds = added_cond_kwargs.get("text_embeds")
            if "time_ids" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
                )
            time_ids = added_cond_kwargs.get("time_ids")
            time_embeds = self.add_time_proj(time_ids.flatten())
            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))

            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
            add_embeds = add_embeds.to(emb.dtype)
            aug_emb = self.add_embedding(add_embeds)
        elif self.config.addition_embed_type == "image":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )
            image_embs = added_cond_kwargs.get("image_embeds")
            aug_emb = self.add_embedding(image_embs)
        elif self.config.addition_embed_type == "image_hint":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
                )
            image_embs = added_cond_kwargs.get("image_embeds")
            hint = added_cond_kwargs.get("hint")
            aug_emb, hint = self.add_embedding(image_embs, hint)
            sample = torch.cat([sample, hint], dim=1)

        emb = emb + aug_emb if aug_emb is not None else emb

        if self.time_embed_act is not None:
            emb = self.time_embed_act(emb)

        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
                )

            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
                )
            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
        # 2. pre-process
        sample = self.conv_in(sample)

        # 3. down

        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
        is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None

        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                # For t2i-adapter CrossAttnDownBlock2D
                additional_residuals = {}
                if is_adapter and len(down_block_additional_residuals) > 0:
                    additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)

                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                    encoder_attention_mask=encoder_attention_mask,
                    **additional_residuals,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

                if is_adapter and len(down_block_additional_residuals) > 0:
                    sample += down_block_additional_residuals.pop(0)

            down_block_res_samples += res_samples

        if is_controlnet:
            new_down_block_res_samples = ()

            for down_block_res_sample, down_block_additional_residual in zip(
                    down_block_res_samples, down_block_additional_residuals
            ):
                down_block_res_sample = down_block_res_sample + down_block_additional_residual
                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
            )

        if is_controlnet:
            sample = sample + mid_block_additional_residual

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # Add the Free-U trick here!
            # Fourier Filter
            if sample.shape[1] == 1280:
                sample[:, :640] *= 1.2  # 1.1 # For SD2.1
                sample = Fourier_filter(sample, threshold=1, scale=0.9)

            if sample.shape[1] == 640:
                sample[:, :320] *= 1.4  # 1.2 # For SD2.1
                sample = Fourier_filter(sample, threshold=1, scale=0.2)

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    upsample_size=upsample_size
                )

        # 6. post-process
        if self.conv_norm_out:
            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        if not return_dict:
            return (sample,)

        return UNet2DConditionOutput(sample=sample)
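For readers unfamiliar with the FreeU trick referenced in the module docstring: in the two largest decoder stages, the first half of the channels is amplified (the 1.2 and 1.4 factors above) and the features are then passed through Fourier_filter, which scales only the low-frequency band of the shifted 2D spectrum. In our notation, with threshold tau and scale s as in the code:

x' = \mathcal{F}^{-1}\!\Big(\mathrm{ifftshift}\big(m \odot \mathrm{fftshift}(\mathcal{F}(x))\big)\Big),
\qquad
m(u, v) = \begin{cases} s, & \text{in the } 2\tau \times 2\tau \text{ block centred on the zero frequency} \\ 1, & \text{otherwise} \end{cases}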
utils/__init__.py
ADDED
File without changes
utils/adain.py
ADDED
@@ -0,0 +1,45 @@
def masked_adain(content_feat, style_feat, content_mask, style_mask):
    assert (content_feat.size()[:2] == style_feat.size()[:2])
    size = content_feat.size()
    style_mean, style_std = calc_mean_std(style_feat, mask=style_mask)
    content_mean, content_std = calc_mean_std(content_feat, mask=content_mask)
    normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
    style_normalized_feat = normalized_feat * style_std.expand(size) + style_mean.expand(size)
    return content_feat * (1 - content_mask) + style_normalized_feat * content_mask


def calc_mean_std(feat, eps=1e-5, mask=None):
    # eps is a small value added to the variance to avoid divide-by-zero.
    size = feat.size()
    if len(size) == 2:
        return calc_mean_std_2d(feat, eps, mask)

    assert (len(size) == 3)
    C = size[0]
    if mask is not None:
        feat_var = feat.view(C, -1)[:, mask.view(-1) == 1].var(dim=1) + eps
        feat_std = feat_var.sqrt().view(C, 1, 1)
        feat_mean = feat.view(C, -1)[:, mask.view(-1) == 1].mean(dim=1).view(C, 1, 1)
    else:
        feat_var = feat.view(C, -1).var(dim=1) + eps
        feat_std = feat_var.sqrt().view(C, 1, 1)
        feat_mean = feat.view(C, -1).mean(dim=1).view(C, 1, 1)

    return feat_mean, feat_std


def calc_mean_std_2d(feat, eps=1e-5, mask=None):
    # eps is a small value added to the variance to avoid divide-by-zero.
    size = feat.size()
    assert (len(size) == 2)
    C = size[0]
    if mask is not None:
        feat_var = feat.view(C, -1)[:, mask.view(-1) == 1].var(dim=1) + eps
        feat_std = feat_var.sqrt().view(C, 1)
        feat_mean = feat.view(C, -1)[:, mask.view(-1) == 1].mean(dim=1).view(C, 1)
    else:
        feat_var = feat.view(C, -1).var(dim=1) + eps
        feat_std = feat_var.sqrt().view(C, 1)
        feat_mean = feat.view(C, -1).mean(dim=1).view(C, 1)

    return feat_mean, feat_std
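masked_adain above is the standard AdaIN operation restricted by binary masks: the per-channel statistics are computed only over pixels where each mask equals 1, and the re-normalized features are written back only inside the content mask. In the usual notation:

\mathrm{AdaIN}(x, y) = \sigma_M(y)\,\frac{x - \mu_M(x)}{\sigma_M(x)} + \mu_M(y),
\qquad
\mathrm{out} = (1 - m_x)\odot x + m_x \odot \mathrm{AdaIN}(x, y)

where \mu_M and \sigma_M denote the masked per-channel mean and standard deviation and m_x is the content mask.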
utils/attention_utils.py
ADDED
@@ -0,0 +1,37 @@
import math
import torch

from constants import OUT_INDEX


def should_mix_keys_and_values(model, hidden_states: torch.Tensor) -> bool:
    """ Verify whether we should perform the mixing in the current timestep. """
    is_in_32_timestep_range = (
        model.config.cross_attn_32_range.start <= model.step < model.config.cross_attn_32_range.end
    )
    is_in_64_timestep_range = (
        model.config.cross_attn_64_range.start <= model.step < model.config.cross_attn_64_range.end
    )
    is_hidden_states_32_square = (hidden_states.shape[1] == 32 ** 2)
    is_hidden_states_64_square = (hidden_states.shape[1] == 64 ** 2)
    should_mix = (is_in_32_timestep_range and is_hidden_states_32_square) or \
                 (is_in_64_timestep_range and is_hidden_states_64_square)
    return should_mix


def compute_scaled_dot_product_attention(Q, K, V, edit_map=False, is_cross=False, contrast_strength=1.0):
    """ Compute the scaled dot product attention, potentially with our contrasting operation. """
    attn_weight = torch.softmax((Q @ K.transpose(-2, -1) / math.sqrt(Q.size(-1))), dim=-1)
    if edit_map and not is_cross:
        attn_weight[OUT_INDEX] = torch.stack([
            torch.clip(enhance_tensor(attn_weight[OUT_INDEX][head_idx], contrast_factor=contrast_strength),
                       min=0.0, max=1.0)
            for head_idx in range(attn_weight.shape[1])
        ])
    return attn_weight @ V, attn_weight


def enhance_tensor(tensor: torch.Tensor, contrast_factor: float = 1.67) -> torch.Tensor:
    """ Compute the attention map contrasting. """
    adjusted_tensor = (tensor - tensor.mean(dim=-1)) * contrast_factor + tensor.mean(dim=-1)
    return adjusted_tensor
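A quick sanity-check sketch for the attention helper above; the tensor shapes (3 images, 8 heads, 16 tokens, head dimension 40) are illustrative assumptions:

import torch

from utils.attention_utils import compute_scaled_dot_product_attention

# Q, K, V laid out as (batch, heads, tokens, head_dim); the contrast branch only runs
# when edit_map=True and is_cross=False, so this call is plain softmax attention.
Q = torch.randn(3, 8, 16, 40)
K = torch.randn(3, 8, 16, 40)
V = torch.randn(3, 8, 16, 40)

out, attn_weight = compute_scaled_dot_product_attention(Q, K, V, edit_map=False)
print(out.shape, attn_weight.shape)  # torch.Size([3, 8, 16, 40]) torch.Size([3, 8, 16, 16])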
utils/ddpm_inversion.py
ADDED
@@ -0,0 +1,323 @@
import abc

import torch
from torch import inference_mode
from tqdm import tqdm

"""
Inversion code taken from:
1. The official implementation of Edit-Friendly DDPM Inversion: https://github.com/inbarhub/DDPM_inversion
2. The LEDITS demo: https://huggingface.co/spaces/editing-images/ledits/tree/main
"""

LOW_RESOURCE = True


def invert(x0, pipe, prompt_src="", num_diffusion_steps=100, cfg_scale_src=3.5, eta=1):
    # inverts a real image according to Algorithm 1 in https://arxiv.org/pdf/2304.06140.pdf,
    # based on the code in https://github.com/inbarhub/DDPM_inversion
    # returns wt, zs, wts:
    # wt - inverted latent
    # wts - intermediate inverted latents
    # zs - noise maps
    pipe.scheduler.set_timesteps(num_diffusion_steps)
    with inference_mode():
        w0 = (pipe.vae.encode(x0).latent_dist.mode() * 0.18215).float()
    wt, zs, wts = inversion_forward_process(pipe, w0, etas=eta, prompt=prompt_src, cfg_scale=cfg_scale_src,
                                            prog_bar=True, num_inference_steps=num_diffusion_steps)
    return zs, wts


def inversion_forward_process(model, x0,
                              etas=None,
                              prog_bar=False,
                              prompt="",
                              cfg_scale=3.5,
                              num_inference_steps=50, eps=None
                              ):
    if not prompt == "":
        text_embeddings = encode_text(model, prompt)
    uncond_embedding = encode_text(model, "")
    timesteps = model.scheduler.timesteps.to(model.device)
    variance_noise_shape = (
        num_inference_steps,
        model.unet.in_channels,
        model.unet.sample_size,
        model.unet.sample_size)
    if etas is None or (type(etas) in [int, float] and etas == 0):
        eta_is_zero = True
        zs = None
    else:
        eta_is_zero = False
        if type(etas) in [int, float]: etas = [etas] * model.scheduler.num_inference_steps
        xts = sample_xts_from_x0(model, x0, num_inference_steps=num_inference_steps)
        alpha_bar = model.scheduler.alphas_cumprod
        zs = torch.zeros(size=variance_noise_shape, device=model.device)

    t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
    xt = x0
    op = tqdm(reversed(timesteps)) if prog_bar else reversed(timesteps)

    for t in op:
        idx = t_to_idx[int(t)]
        # 1. predict noise residual
        if not eta_is_zero:
            xt = xts[idx][None]

        with torch.no_grad():
            out = model.unet.forward(xt, timestep=t, encoder_hidden_states=uncond_embedding)
            if not prompt == "":
                cond_out = model.unet.forward(xt, timestep=t, encoder_hidden_states=text_embeddings)

        if not prompt == "":
            ## classifier free guidance
            noise_pred = out.sample + cfg_scale * (cond_out.sample - out.sample)
        else:
            noise_pred = out.sample

        if eta_is_zero:
            # 2. compute more noisy image and set x_t -> x_t+1
            xt = forward_step(model, noise_pred, t, xt)

        else:
            xtm1 = xts[idx + 1][None]
            # pred of x0
            pred_original_sample = (xt - (1 - alpha_bar[t]) ** 0.5 * noise_pred) / alpha_bar[t] ** 0.5

            # direction to xt
            prev_timestep = t - model.scheduler.config.num_train_timesteps // model.scheduler.num_inference_steps
            alpha_prod_t_prev = model.scheduler.alphas_cumprod[
                prev_timestep] if prev_timestep >= 0 else model.scheduler.final_alpha_cumprod

            variance = get_variance(model, t)
            pred_sample_direction = (1 - alpha_prod_t_prev - etas[idx] * variance) ** (0.5) * noise_pred

            mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction

            z = (xtm1 - mu_xt) / (etas[idx] * variance ** 0.5)
            zs[idx] = z

            # correction to avoid error accumulation
            xtm1 = mu_xt + (etas[idx] * variance ** 0.5) * z
            xts[idx + 1] = xtm1

    if not zs is None:
        zs[-1] = torch.zeros_like(zs[-1])

    return xt, zs, xts


def encode_text(model, prompts):
    text_input = model.tokenizer(
        prompts,
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        text_encoding = model.text_encoder(text_input.input_ids.to(model.device))[0]
    return text_encoding


def sample_xts_from_x0(model, x0, num_inference_steps=50):
    """
    Samples from P(x_1:T|x_0)
    """
    # torch.manual_seed(43256465436)
    alpha_bar = model.scheduler.alphas_cumprod
    sqrt_one_minus_alpha_bar = (1 - alpha_bar) ** 0.5
    alphas = model.scheduler.alphas
    betas = 1 - alphas
    variance_noise_shape = (
        num_inference_steps,
        model.unet.in_channels,
        model.unet.sample_size,
        model.unet.sample_size)

    timesteps = model.scheduler.timesteps.to(model.device)
    t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
    xts = torch.zeros(variance_noise_shape).to(x0.device)
    for t in reversed(timesteps):
        idx = t_to_idx[int(t)]
        xts[idx] = x0 * (alpha_bar[t] ** 0.5) + torch.randn_like(x0) * sqrt_one_minus_alpha_bar[t]
    xts = torch.cat([xts, x0], dim=0)

    return xts


def forward_step(model, model_output, timestep, sample):
    next_timestep = min(model.scheduler.config.num_train_timesteps - 2,
                        timestep + model.scheduler.config.num_train_timesteps // model.scheduler.num_inference_steps)

    # 2. compute alphas, betas
    alpha_prod_t = model.scheduler.alphas_cumprod[timestep]

    beta_prod_t = 1 - alpha_prod_t

    # 3. compute predicted original sample from predicted noise also called
    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
    pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
    next_sample = model.scheduler.add_noise(pred_original_sample,
                                            model_output,
                                            torch.LongTensor([next_timestep]))
    return next_sample


def get_variance(model, timestep):
    prev_timestep = timestep - model.scheduler.config.num_train_timesteps // model.scheduler.num_inference_steps
    alpha_prod_t = model.scheduler.alphas_cumprod[timestep]
    alpha_prod_t_prev = model.scheduler.alphas_cumprod[
        prev_timestep] if prev_timestep >= 0 else model.scheduler.final_alpha_cumprod
    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev
    variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
    return variance


class AttentionControl(abc.ABC):

    def step_callback(self, x_t):
        return x_t

    def between_steps(self):
        return

    @property
    def num_uncond_att_layers(self):
        return self.num_att_layers if LOW_RESOURCE else 0

    @abc.abstractmethod
    def forward(self, attn, is_cross: bool, place_in_unet: str):
        raise NotImplementedError

    def __call__(self, attn, is_cross: bool, place_in_unet: str):
        if self.cur_att_layer >= self.num_uncond_att_layers:
            if LOW_RESOURCE:
                attn = self.forward(attn, is_cross, place_in_unet)
            else:
                h = attn.shape[0]
                attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
        self.cur_att_layer += 1
        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
            self.cur_att_layer = 0
            self.cur_step += 1
            self.between_steps()
        return attn

    def reset(self):
        self.cur_step = 0
        self.cur_att_layer = 0

    def __init__(self):
        self.cur_step = 0
        self.num_att_layers = -1
        self.cur_att_layer = 0


class AttentionStore(AttentionControl):

    @staticmethod
    def get_empty_store():
        return {"down_cross": [], "mid_cross": [], "up_cross": [],
                "down_self": [], "mid_self": [], "up_self": []}

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
        if attn.shape[1] <= 32 ** 2:  # avoid memory overhead
            self.step_store[key].append(attn)
        return attn

    def between_steps(self):
        if len(self.attention_store) == 0:
            self.attention_store = self.step_store
        else:
            for key in self.attention_store:
                for i in range(len(self.attention_store[key])):
                    self.attention_store[key][i] += self.step_store[key][i]
        self.step_store = self.get_empty_store()

    def get_average_attention(self):
        average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in
                             self.attention_store}
        return average_attention

    def reset(self):
        super(AttentionStore, self).reset()
        self.step_store = self.get_empty_store()
        self.attention_store = {}

    def __init__(self):
        super(AttentionStore, self).__init__()
        self.step_store = self.get_empty_store()
        self.attention_store = {}


def register_attention_control(model, controller):
    def ca_forward(self, place_in_unet):
        to_out = self.to_out
        if type(to_out) is torch.nn.modules.container.ModuleList:
            to_out = self.to_out[0]
        else:
            to_out = self.to_out

        def forward(x, context=None, mask=None):
            batch_size, sequence_length, dim = x.shape
            h = self.heads
            q = self.to_q(x)
            is_cross = context is not None
            context = context if is_cross else x
            k = self.to_k(context)
            v = self.to_v(context)
            q = self.reshape_heads_to_batch_dim(q)
            k = self.reshape_heads_to_batch_dim(k)
            v = self.reshape_heads_to_batch_dim(v)

            sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale

            if mask is not None:
                mask = mask.reshape(batch_size, -1)
                max_neg_value = -torch.finfo(sim.dtype).max
                mask = mask[:, None, :].repeat(h, 1, 1)
                sim.masked_fill_(~mask, max_neg_value)

            # attention, what we cannot get enough of
            attn = sim.softmax(dim=-1)
            attn = controller(attn, is_cross, place_in_unet)
            out = torch.einsum("b i j, b j d -> b i d", attn, v)
            out = self.reshape_batch_dim_to_heads(out)
            return to_out(out)

        return forward

    class DummyController:

        def __call__(self, *args):
            return args[0]

        def __init__(self):
            self.num_att_layers = 0

    if controller is None:
        controller = DummyController()

    def register_recr(net_, count, place_in_unet):
        if net_.__class__.__name__ == 'CrossAttention':
            net_.forward = ca_forward(net_, place_in_unet)
            return count + 1
        elif hasattr(net_, 'children'):
            for net__ in net_.children():
                count = register_recr(net__, count, place_in_unet)
        return count

    cross_att_count = 0
    sub_nets = model.unet.named_children()
    for net in sub_nets:
        if "down" in net[0]:
            cross_att_count += register_recr(net[1], 0, "down")
        elif "up" in net[0]:
            cross_att_count += register_recr(net[1], 0, "up")
        elif "mid" in net[0]:
            cross_att_count += register_recr(net[1], 0, "mid")

    controller.num_att_layers = cross_att_count
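A rough usage sketch of the inversion entry point, mirroring the call made in utils/latent_utils.py further below; the image path is a placeholder and a CUDA device is assumed:

import numpy as np
import torch
from PIL import Image

from utils.ddpm_inversion import invert
from utils.model_utils import get_stable_diffusion_model

pipe = get_stable_diffusion_model()

# Placeholder input: any 512x512 RGB image, scaled to [-1, 1] and shaped (1, 3, 512, 512).
image = np.array(Image.open("example.png").convert("RGB").resize((512, 512)))
x0 = torch.from_numpy(image).float() / 127.5 - 1.0
x0 = x0.permute(2, 0, 1).unsqueeze(0).to("cuda")

# zs holds the per-step noise maps, wts the intermediate inverted latents.
zs, wts = invert(x0=x0, pipe=pipe, prompt_src="a photo", num_diffusion_steps=100)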
utils/image_utils.py
ADDED
@@ -0,0 +1,59 @@
import pathlib
from typing import Optional, Tuple

import numpy as np
from PIL import Image

from config import RunConfig


def load_images(cfg: RunConfig, save_path: Optional[pathlib.Path] = None) -> Tuple[Image.Image, Image.Image]:
    image_style = load_size(cfg.app_image_path)
    image_struct = load_size(cfg.struct_image_path)
    if save_path is not None:
        Image.fromarray(image_style).save(save_path / f"in_style.png")
        Image.fromarray(image_struct).save(save_path / f"in_struct.png")
    return image_style, image_struct


def load_size(image_path: pathlib.Path,
              left: int = 0,
              right: int = 0,
              top: int = 0,
              bottom: int = 0,
              size: int = 512) -> Image.Image:
    if type(image_path) is str or type(image_path) is pathlib.PosixPath:
        image = np.array(Image.open(image_path).convert('RGB'))
    else:
        image = image_path

    h, w, c = image.shape

    left = min(left, w - 1)
    right = min(right, w - left - 1)
    top = min(top, h - left - 1)
    bottom = min(bottom, h - top - 1)
    image = image[top:h - bottom, left:w - right]

    h, w, c = image.shape

    if h < w:
        offset = (w - h) // 2
        image = image[:, offset:offset + h]
    elif w < h:
        offset = (h - w) // 2
        image = image[offset:offset + w]

    image = np.array(Image.fromarray(image).resize((size, size)))
    return image


def save_generated_masks(model, cfg: RunConfig):
    tensor2im(model.image_app_mask_32).save(cfg.output_path / f"mask_style_32.png")
    tensor2im(model.image_struct_mask_32).save(cfg.output_path / f"mask_struct_32.png")
    tensor2im(model.image_app_mask_64).save(cfg.output_path / f"mask_style_64.png")
    tensor2im(model.image_struct_mask_64).save(cfg.output_path / f"mask_struct_64.png")


def tensor2im(x) -> Image.Image:
    return Image.fromarray(x.cpu().numpy().astype(np.uint8) * 255)
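A short sketch of the loader above; the input path is a placeholder:

from utils.image_utils import load_size

# Center-crops to a square and resizes to 512x512, returning an RGB NumPy array.
image = load_size("example.jpg")
print(image.shape)  # (512, 512, 3)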
utils/latent_utils.py
ADDED
@@ -0,0 +1,81 @@
from pathlib import Path
from typing import Tuple

import numpy as np
import torch
from PIL import Image

from appearance_transfer_model import AppearanceTransferModel
from config import RunConfig
from utils import image_utils
from utils.ddpm_inversion import invert


def load_latents_or_invert_images(model: AppearanceTransferModel, cfg: RunConfig):
    if cfg.load_latents and cfg.app_latent_save_path.exists() and cfg.struct_latent_save_path.exists():
        print("Loading existing latents...")
        latents_app, latents_struct = load_latents(cfg.app_latent_save_path, cfg.struct_latent_save_path)
        noise_app, noise_struct = load_noise(cfg.app_latent_save_path, cfg.struct_latent_save_path)
        print("Done.")
    else:
        print("Inverting images...")
        app_image, struct_image = image_utils.load_images(cfg=cfg, save_path=cfg.output_path)
        model.enable_edit = False  # Deactivate the cross-image attention layers
        latents_app, latents_struct, noise_app, noise_struct = invert_images(app_image=app_image,
                                                                             struct_image=struct_image,
                                                                             sd_model=model.pipe,
                                                                             cfg=cfg)
        model.enable_edit = True
        print("Done.")
    return latents_app, latents_struct, noise_app, noise_struct


def load_latents(app_latent_save_path: Path, struct_latent_save_path: Path) -> Tuple[torch.Tensor, torch.Tensor]:
    latents_app = torch.load(app_latent_save_path)
    latents_struct = torch.load(struct_latent_save_path)
    if type(latents_struct) == list:
        latents_app = [l.to("cuda") for l in latents_app]
        latents_struct = [l.to("cuda") for l in latents_struct]
    else:
        latents_app = latents_app.to("cuda")
        latents_struct = latents_struct.to("cuda")
    return latents_app, latents_struct


def load_noise(app_latent_save_path: Path, struct_latent_save_path: Path) -> Tuple[torch.Tensor, torch.Tensor]:
    latents_app = torch.load(app_latent_save_path.parent / (app_latent_save_path.stem + "_ddpm_noise.pt"))
    latents_struct = torch.load(struct_latent_save_path.parent / (struct_latent_save_path.stem + "_ddpm_noise.pt"))
    latents_app = latents_app.to("cuda")
    latents_struct = latents_struct.to("cuda")
    return latents_app, latents_struct


def invert_images(sd_model: AppearanceTransferModel, app_image: Image.Image, struct_image: Image.Image, cfg: RunConfig):
    input_app = torch.from_numpy(np.array(app_image)).float() / 127.5 - 1.0
    input_struct = torch.from_numpy(np.array(struct_image)).float() / 127.5 - 1.0
    zs_app, latents_app = invert(x0=input_app.permute(2, 0, 1).unsqueeze(0).to('cuda'),
                                 pipe=sd_model,
                                 prompt_src=cfg.prompt,
                                 num_diffusion_steps=cfg.num_timesteps,
                                 cfg_scale_src=3.5)
    zs_struct, latents_struct = invert(x0=input_struct.permute(2, 0, 1).unsqueeze(0).to('cuda'),
                                       pipe=sd_model,
                                       prompt_src=cfg.prompt,
                                       num_diffusion_steps=cfg.num_timesteps,
                                       cfg_scale_src=3.5)
    # Save the inverted latents and noises
    torch.save(latents_app, cfg.latents_path / f"{cfg.app_image_path.stem}.pt")
    torch.save(latents_struct, cfg.latents_path / f"{cfg.struct_image_path.stem}.pt")
    torch.save(zs_app, cfg.latents_path / f"{cfg.app_image_path.stem}_ddpm_noise.pt")
    torch.save(zs_struct, cfg.latents_path / f"{cfg.struct_image_path.stem}_ddpm_noise.pt")
    return latents_app, latents_struct, zs_app, zs_struct


def get_init_latents_and_noises(model: AppearanceTransferModel, cfg: RunConfig) -> Tuple[torch.Tensor, torch.Tensor]:
    # If we stored all the latents along the diffusion process, select the desired one based on the skip_steps
    if model.latents_struct.dim() == 4 and model.latents_app.dim() == 4 and model.latents_app.shape[0] > 1:
        model.latents_struct = model.latents_struct[cfg.skip_steps]
        model.latents_app = model.latents_app[cfg.skip_steps]
    init_latents = torch.stack([model.latents_struct, model.latents_app, model.latents_struct])
    init_zs = [model.zs_struct[cfg.skip_steps:], model.zs_app[cfg.skip_steps:], model.zs_struct[cfg.skip_steps:]]
    return init_latents, init_zs
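A rough end-to-end sketch of how these helpers are driven; it assumes RunConfig accepts app_image_path / struct_image_path as constructor arguments (with the remaining fields left at their defaults), which is an assumption rather than something shown in this commit, and the paths are placeholders:

from pathlib import Path

from appearance_transfer_model import AppearanceTransferModel
from config import RunConfig
from utils.latent_utils import load_latents_or_invert_images, get_init_latents_and_noises

# Placeholder inputs; assumed RunConfig constructor arguments.
cfg = RunConfig(app_image_path=Path("appearance.png"),
                struct_image_path=Path("structure.png"))
model = AppearanceTransferModel(cfg)

# Invert (or load cached) latents and noise maps, then attach them to the model.
latents_app, latents_struct, noise_app, noise_struct = load_latents_or_invert_images(model=model, cfg=cfg)
model.set_latents(latents_app, latents_struct)
model.set_noise(noise_app, noise_struct)

# Starting latents are stacked as [struct, app, struct] with their matching noise schedules.
init_latents, init_zs = get_init_latents_and_noises(model=model, cfg=cfg)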
utils/model_utils.py
ADDED
@@ -0,0 +1,16 @@
import torch
from diffusers import DDIMScheduler

from models.stable_diffusion import CrossImageAttentionStableDiffusionPipeline
from models.unet_2d_condition import FreeUUNet2DConditionModel


def get_stable_diffusion_model() -> CrossImageAttentionStableDiffusionPipeline:
    print("Loading Stable Diffusion model...")
    device = torch.device(f'cuda') if torch.cuda.is_available() else torch.device('cpu')
    pipe = CrossImageAttentionStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
                                                                      safety_checker=None).to(device)
    pipe.unet = FreeUUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet").to(device)
    pipe.scheduler = DDIMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
    print("Done.")
    return pipe
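Since loading the pipeline is the slow step, a reasonable pattern (a sketch, assuming an already-built RunConfig named cfg) is to load it once and hand it to the transfer model:

from appearance_transfer_model import AppearanceTransferModel
from utils.model_utils import get_stable_diffusion_model

pipe = get_stable_diffusion_model()               # loads SD 1.5 with the FreeU UNet and a DDIM scheduler
model = AppearanceTransferModel(cfg, pipe=pipe)   # reuses the loaded pipeline; `cfg` is an existing RunConfig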
utils/segmentation.py
ADDED
@@ -0,0 +1,111 @@
from typing import Tuple, List

import nltk
import numpy as np
import torch
from sklearn.cluster import KMeans

from constants import STYLE_INDEX, STRUCT_INDEX

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

"""
Self-segmentation technique taken from Prompt Mixing: https://github.com/orpatashnik/local-prompt-mixing
"""

class Segmentor:

    def __init__(self, prompt: str, object_nouns: List[str], num_segments: int = 5, res: int = 32):
        self.prompt = prompt
        self.num_segments = num_segments
        self.resolution = res
        self.object_nouns = object_nouns
        tokenized_prompt = nltk.word_tokenize(prompt)
        forbidden_words = [word.upper() for word in ["photo", "image", "picture"]]
        self.nouns = [(i, word) for (i, (word, pos)) in enumerate(nltk.pos_tag(tokenized_prompt))
                      if pos[:2] == 'NN' and word.upper() not in forbidden_words]

    def update_attention(self, attn, is_cross):
        res = int(attn.shape[2] ** 0.5)
        if is_cross:
            if res == 16:
                self.cross_attention_32 = attn
            elif res == 32:
                self.cross_attention_64 = attn
        else:
            if res == 32:
                self.self_attention_32 = attn
            elif res == 64:
                self.self_attention_64 = attn

    def __call__(self, *args, **kwargs):
        clusters = self.cluster()
        cluster2noun = self.cluster2noun(clusters)
        return cluster2noun

    def cluster(self, res: int = 32):
        np.random.seed(1)
        self_attn = self.self_attention_32 if res == 32 else self.self_attention_64

        style_attn = self_attn[STYLE_INDEX].mean(dim=0).cpu().numpy()
        style_kmeans = KMeans(n_clusters=self.num_segments, n_init=10).fit(style_attn)
        style_clusters = style_kmeans.labels_.reshape(res, res)

        struct_attn = self_attn[STRUCT_INDEX].mean(dim=0).cpu().numpy()
        struct_kmeans = KMeans(n_clusters=self.num_segments, n_init=10).fit(struct_attn)
        struct_clusters = struct_kmeans.labels_.reshape(res, res)

        return style_clusters, struct_clusters

    def cluster2noun(self, clusters, cross_attn, attn_index):
        result = {}
        res = int(cross_attn.shape[2] ** 0.5)
        nouns_indices = [index for (index, word) in self.nouns]
        cross_attn = cross_attn[attn_index].mean(dim=0).reshape(res, res, -1)
        nouns_maps = cross_attn.cpu().numpy()[:, :, [i + 1 for i in nouns_indices]]
        normalized_nouns_maps = np.zeros_like(nouns_maps).repeat(2, axis=0).repeat(2, axis=1)
        for i in range(nouns_maps.shape[-1]):
            curr_noun_map = nouns_maps[:, :, i].repeat(2, axis=0).repeat(2, axis=1)
            normalized_nouns_maps[:, :, i] = (curr_noun_map - np.abs(curr_noun_map.min())) / curr_noun_map.max()

        max_score = 0
        all_scores = []
        for c in range(self.num_segments):
            cluster_mask = np.zeros_like(clusters)
            cluster_mask[clusters == c] = 1
            score_maps = [cluster_mask * normalized_nouns_maps[:, :, i] for i in range(len(nouns_indices))]
            scores = [score_map.sum() / cluster_mask.sum() for score_map in score_maps]
            all_scores.append(max(scores))
            max_score = max(max(scores), max_score)

        all_scores.remove(max_score)
        mean_score = sum(all_scores) / len(all_scores)

        for c in range(self.num_segments):
            cluster_mask = np.zeros_like(clusters)
            cluster_mask[clusters == c] = 1
            score_maps = [cluster_mask * normalized_nouns_maps[:, :, i] for i in range(len(nouns_indices))]
            scores = [score_map.sum() / cluster_mask.sum() for score_map in score_maps]
            result[c] = self.nouns[np.argmax(np.array(scores))] if max(scores) > 1.4 * mean_score else "BG"

        return result

    def create_mask(self, clusters, cross_attention, attn_index):
        cluster2noun = self.cluster2noun(clusters, cross_attention, attn_index)
        mask = clusters.copy()
        obj_segments = [c for c in cluster2noun if cluster2noun[c][1] in self.object_nouns]
        for c in range(self.num_segments):
            mask[clusters == c] = 1 if c in obj_segments else 0
        return torch.from_numpy(mask).to("cuda")

    def get_object_masks(self) -> Tuple[torch.Tensor]:
        clusters_style_32, clusters_struct_32 = self.cluster(res=32)
        clusters_style_64, clusters_struct_64 = self.cluster(res=64)

        mask_style_32 = self.create_mask(clusters_style_32, self.cross_attention_32, STYLE_INDEX)
        mask_struct_32 = self.create_mask(clusters_struct_32, self.cross_attention_32, STRUCT_INDEX)
        mask_style_64 = self.create_mask(clusters_style_64, self.cross_attention_64, STYLE_INDEX)
        mask_struct_64 = self.create_mask(clusters_struct_64, self.cross_attention_64, STRUCT_INDEX)

        return mask_style_32, mask_struct_32, mask_style_64, mask_struct_64
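A small sketch of how the Segmentor is driven; the prompt and object noun are placeholders, and the attention maps are expected to be filled in by the pipeline during denoising:

from utils.segmentation import Segmentor

segmentor = Segmentor(prompt="a photo of an animal", object_nouns=["animal"])
print(segmentor.nouns)  # typically [(4, 'animal')] -- "photo" is filtered out as a forbidden word

# During denoising, the pipeline feeds self- and cross-attention maps in via:
#   segmentor.update_attention(attn, is_cross=...)
# Once the 32x32 and 64x64 maps are populated, the object masks can be read with:
#   mask_style_32, mask_struct_32, mask_style_64, mask_struct_64 = segmentor.get_object_masks()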