Upload 2 files
- attention.py +294 -0
- functions.py +605 -0
attention.py
ADDED
@@ -0,0 +1,294 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.models.lora import LoRALinearLayer
from functions import AttentionMLP
from diffusers.utils.import_utils import is_xformers_available

if is_xformers_available():
    import xformers


class FuseModule(nn.Module):
    """Fuses ID embeddings into the prompt embeddings at the class-token positions."""

    def __init__(self, embed_dim):
        super().__init__()
        self.mlp1 = MLP(embed_dim * 2, embed_dim, embed_dim, use_residual=False)
        self.mlp2 = MLP(embed_dim, embed_dim, embed_dim, use_residual=True)
        self.layer_norm = nn.LayerNorm(embed_dim)

    def fuse_fn(self, prompt_embeds, id_embeds):
        stacked_id_embeds = torch.cat([prompt_embeds, id_embeds], dim=-1)
        stacked_id_embeds = self.mlp1(stacked_id_embeds) + prompt_embeds
        stacked_id_embeds = self.mlp2(stacked_id_embeds)
        stacked_id_embeds = self.layer_norm(stacked_id_embeds)
        return stacked_id_embeds

    def forward(
        self,
        prompt_embeds,
        id_embeds,
        class_tokens_mask,
        valid_id_mask,
    ) -> torch.Tensor:
        id_embeds = id_embeds.to(prompt_embeds.dtype)
        batch_size, max_num_inputs = id_embeds.shape[:2]
        seq_length = prompt_embeds.shape[1]
        flat_id_embeds = id_embeds.view(-1, id_embeds.shape[-2], id_embeds.shape[-1])

        # Keep only the ID embeddings that correspond to real (non-padded) inputs.
        valid_id_embeds = flat_id_embeds[valid_id_mask.flatten()]

        prompt_embeds = prompt_embeds.view(-1, prompt_embeds.shape[-1])
        class_tokens_mask = class_tokens_mask.view(-1)
        valid_id_embeds = valid_id_embeds.view(-1, valid_id_embeds.shape[-1])
        image_token_embeds = prompt_embeds[class_tokens_mask]
        stacked_id_embeds = self.fuse_fn(image_token_embeds, valid_id_embeds)
        assert class_tokens_mask.sum() == stacked_id_embeds.shape[0], f"{class_tokens_mask.sum()} != {stacked_id_embeds.shape[0]}"
        # Write the fused embeddings back into the prompt at the class-token positions.
        prompt_embeds.masked_scatter_(class_tokens_mask[:, None], stacked_id_embeds.to(prompt_embeds.dtype))
        updated_prompt_embeds = prompt_embeds.view(batch_size, seq_length, -1)

        return updated_prompt_embeds


class MLP(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_dim, use_residual=True):
        super().__init__()
        if use_residual:
            assert in_dim == out_dim
        self.layernorm = nn.LayerNorm(in_dim)
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.use_residual = use_residual
        self.act_fn = nn.GELU()

    def forward(self, x):
        residual = x
        x = self.layernorm(x)
        x = self.fc1(x)
        x = self.act_fn(x)
        x = self.fc2(x)
        if self.use_residual:
            x = x + residual
        return x


class FacialEncoder(nn.Module):
    def __init__(self, image_CLIPModel_encoder=None, embedding_dim=1280, output_dim=768, embed_dim=768):
        super().__init__()
        self.visual_projection = AttentionMLP(embedding_dim=embedding_dim, output_dim=output_dim)
        self.fuse_module = FuseModule(embed_dim=embed_dim)

    def forward(self, prompt_embeds, multi_image_embeds, class_tokens_mask, valid_id_mask):
        bs, num_inputs, token_length, image_dim = multi_image_embeds.shape
        multi_image_embeds_view = multi_image_embeds.view(bs * num_inputs, token_length, image_dim)

        # Project each facial-feature embedding sequence to a single ID token.
        id_embeds = self.visual_projection(multi_image_embeds_view)
        id_embeds = id_embeds.view(bs, num_inputs, 1, -1)

        updated_prompt_embeds = self.fuse_module(prompt_embeds, id_embeds, class_tokens_mask, valid_id_mask)

        return updated_prompt_embeds


class Consistent_AttProcessor(nn.Module):
    """Attention processor with trainable LoRA layers on the q/k/v/out projections."""

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
        rank=4,
        network_alpha=None,
        lora_scale=1.0,
    ):
        super().__init__()

        self.rank = rank
        self.lora_scale = lora_scale

        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        if is_xformers_available():
            # Memory-efficient attention via xformers.
            hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
            hidden_states = hidden_states.to(query.dtype)
        else:
            attention_probs = attn.get_attention_scores(query, key, attention_mask)
            hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class Consistent_IPAttProcessor(nn.Module):
    """LoRA attention processor with a decoupled image-prompt (IP) branch.

    Expects cross-attention input whose last `num_tokens` entries are image
    (ID) tokens; they are split off and attended to separately.
    """

    def __init__(
        self,
        hidden_size,
        cross_attention_dim=None,
        rank=4,
        network_alpha=None,
        lora_scale=1.0,
        scale=1.0,
        num_tokens=4,
    ):
        super().__init__()

        self.rank = rank
        self.lora_scale = lora_scale
        self.num_tokens = num_tokens

        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale

        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        scale=1.0,
        temb=None,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        else:
            # Split off the trailing image (ID) tokens from the text tokens.
            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
            encoder_hidden_states, ip_hidden_states = (
                encoder_hidden_states[:, :end_pos, :],
                encoder_hidden_states[:, end_pos:, :],
            )
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # Decoupled attention branch over the image (ID) tokens.
        ip_key = self.to_k_ip(ip_hidden_states)
        ip_value = self.to_v_ip(ip_hidden_states)
        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        ip_hidden_states = F.scaled_dot_product_attention(
            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
        )

        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        ip_hidden_states = ip_hidden_states.to(query.dtype)

        hidden_states = hidden_states + self.scale * ip_hidden_states

        # linear proj
        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
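Both processors implement the standard diffusers attention-processor interface, so they can be registered on a UNet with `set_attn_processor`. A minimal wiring sketch follows, under stated assumptions: a Stable Diffusion 1.5 UNet, `rank=4`, and `num_tokens=4` are illustrative defaults, not values fixed by this commit. Self-attention layers ("attn1") get `Consistent_AttProcessor`; cross-attention layers get the decoupled IP variant.

# Illustrative wiring sketch (assumptions flagged above); assumes attention.py is importable.
import torch
from diffusers import UNet2DConditionModel
from attention import Consistent_AttProcessor, Consistent_IPAttProcessor

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"  # hypothetical base model
)

attn_procs = {}
for name in unet.attn_processors.keys():
    # attn1 = self-attention (no text conditioning); attn2 = cross-attention.
    is_self_attn = name.endswith("attn1.processor")
    cross_attention_dim = None if is_self_attn else unet.config.cross_attention_dim
    # Recover the hidden size of the block this processor lives in.
    if name.startswith("mid_block"):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    else:  # down_blocks
        block_id = int(name[len("down_blocks.")])
        hidden_size = unet.config.block_out_channels[block_id]
    if is_self_attn:
        attn_procs[name] = Consistent_AttProcessor(
            hidden_size=hidden_size, rank=4, lora_scale=1.0
        )
    else:
        attn_procs[name] = Consistent_IPAttProcessor(
            hidden_size=hidden_size,
            cross_attention_dim=cross_attention_dim,
            rank=4,
            scale=1.0,
            num_tokens=4,  # must match the number of ID tokens appended to the prompt
        )
unet.set_attn_processor(attn_procs)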
functions.py
ADDED
@@ -0,0 +1,605 @@
import math
import re
import types

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange
from PIL import Image


def extract_first_sentence(text):
    end_index = text.find('.')
    if end_index != -1:
        first_sentence = text[:end_index + 1]
        return first_sentence.strip()
    else:
        return text.strip()


def remove_duplicate_keywords(text, keywords):  # NOTE: this function could be optimized further
    # Keep only the first occurrence of each keyword; blank out the repeats.
    keyword_counts = {}

    words = re.findall(r'\b\w+\b|[.,;!?]', text)

    for keyword in keywords:
        keyword_counts[keyword] = 0
        for i, word in enumerate(words):
            if word.lower() == keyword.lower():
                keyword_counts[keyword] += 1
                if keyword_counts[keyword] > 1:
                    words[i] = ""
    processed_text = " ".join(words)

    return processed_text


def process_text_with_markers(text, parsing_mask_list):
    keywords = ["face", "ears", "eyes", "nose", "mouth"]
    text = remove_duplicate_keywords(text, keywords)
    key_parsing_mask_markers = ["Face", "Left_Ear", "Right_Ear", "Left_Eye", "Right_Eye", "Nose", "Upper_Lip", "Lower_Lip"]
    mapping = {
        "Face": "face",
        "Left_Ear": "ears",
        "Right_Ear": "ears",
        "Left_Eye": "eyes",
        "Right_Eye": "eyes",
        "Nose": "nose",
        "Upper_Lip": "mouth",
        "Lower_Lip": "mouth",
    }
    facial_features_align = []
    markers_align = []
    for key in key_parsing_mask_markers:
        if key in parsing_mask_list:
            mapped_key = mapping.get(key, key.lower())
            if mapped_key not in facial_features_align:
                facial_features_align.append(mapped_key)
                markers_align.append("<|" + mapped_key + "|>")

    text_marked = text
    align_parsing_mask_list = parsing_mask_list
    for feature, marker in zip(facial_features_align[::-1], markers_align[::-1]):
        pattern = rf'\b{feature}\b'
        text_marked_new = re.sub(pattern, f'{feature} {marker}', text_marked, count=1)
        if text_marked == text_marked_new:
            # The feature is not mentioned in the caption, so drop its parsing masks.
            for key, value in mapping.items():
                if value == feature:
                    if key in align_parsing_mask_list:
                        del align_parsing_mask_list[key]

        text_marked = text_marked_new

    text_marked = text_marked.replace('\n', '')

    ordered_text = []
    text_none_makers = []
    facial_marked_count = 0
    skip_count = 0
    for marker in markers_align:
        start_idx = text_marked.find(marker)
        end_idx = start_idx + len(marker)

        # Expand to the enclosing clause (up to the nearest punctuation).
        while start_idx > 0 and text_marked[start_idx - 1] not in [",", ".", ";"]:
            start_idx -= 1

        while end_idx < len(text_marked) and text_marked[end_idx] not in [",", ".", ";"]:
            end_idx += 1

        context = text_marked[start_idx:end_idx].strip()
        if context == "":
            text_none_makers.append(text_marked[:end_idx])
        else:
            if skip_count != 0:
                skip_count -= 1
                continue
            else:
                ordered_text.append(context + ",")
                text_delete_makers = text_marked[:start_idx] + text_marked[end_idx:]
                text_marked = text_delete_makers
                facial_marked_count += 1

    align_marked_text = " ".join(ordered_text)
    replace_list = ["<|face|>", "<|ears|>", "<|nose|>", "<|eyes|>", "<|mouth|>"]
    for item in replace_list:
        align_marked_text = align_marked_text.replace(item, "<|facial|>")

    return align_marked_text, align_parsing_mask_list


def tokenize_and_mask_noun_phrases_ends(text, image_token_id, facial_token_id, tokenizer):
    input_ids = tokenizer.encode(text)
    image_noun_phrase_end_mask = [False for _ in input_ids]
    facial_noun_phrase_end_mask = [False for _ in input_ids]
    clean_input_ids = []
    clean_index = 0
    image_num = 0

    for i, id in enumerate(input_ids):
        if id == image_token_id:
            image_noun_phrase_end_mask[clean_index + image_num - 1] = True
            image_num += 1
        elif id == facial_token_id:
            facial_noun_phrase_end_mask[clean_index - 1] = True
        else:
            clean_input_ids.append(id)
            clean_index += 1

    max_len = tokenizer.model_max_length

    if len(clean_input_ids) > max_len:
        clean_input_ids = clean_input_ids[:max_len]
    else:
        clean_input_ids = clean_input_ids + [tokenizer.pad_token_id] * (
            max_len - len(clean_input_ids)
        )

    if len(image_noun_phrase_end_mask) > max_len:
        image_noun_phrase_end_mask = image_noun_phrase_end_mask[:max_len]
    else:
        image_noun_phrase_end_mask = image_noun_phrase_end_mask + [False] * (
            max_len - len(image_noun_phrase_end_mask)
        )

    if len(facial_noun_phrase_end_mask) > max_len:
        facial_noun_phrase_end_mask = facial_noun_phrase_end_mask[:max_len]
    else:
        facial_noun_phrase_end_mask = facial_noun_phrase_end_mask + [False] * (
            max_len - len(facial_noun_phrase_end_mask)
        )

    clean_input_ids = torch.tensor(clean_input_ids, dtype=torch.long)
    image_noun_phrase_end_mask = torch.tensor(image_noun_phrase_end_mask, dtype=torch.bool)
    facial_noun_phrase_end_mask = torch.tensor(facial_noun_phrase_end_mask, dtype=torch.bool)

    return clean_input_ids.unsqueeze(0), image_noun_phrase_end_mask.unsqueeze(0), facial_noun_phrase_end_mask.unsqueeze(0)


def prepare_image_token_idx(image_token_mask, facial_token_mask, max_num_objects=2, max_num_facials=5):
    image_token_idx = torch.nonzero(image_token_mask, as_tuple=True)[1]
    image_token_idx_mask = torch.ones_like(image_token_idx, dtype=torch.bool)
    if len(image_token_idx) < max_num_objects:
        # Pad to a fixed number of object slots.
        image_token_idx = torch.cat(
            [
                image_token_idx,
                torch.zeros(max_num_objects - len(image_token_idx), dtype=torch.long),
            ]
        )
        image_token_idx_mask = torch.cat(
            [
                image_token_idx_mask,
                torch.zeros(
                    max_num_objects - len(image_token_idx_mask),
                    dtype=torch.bool,
                ),
            ]
        )

    facial_token_idx = torch.nonzero(facial_token_mask, as_tuple=True)[1]
    facial_token_idx_mask = torch.ones_like(facial_token_idx, dtype=torch.bool)
    if len(facial_token_idx) < max_num_facials:
        facial_token_idx = torch.cat(
            [
                facial_token_idx,
                torch.zeros(max_num_facials - len(facial_token_idx), dtype=torch.long),
            ]
        )
        facial_token_idx_mask = torch.cat(
            [
                facial_token_idx_mask,
                torch.zeros(
                    max_num_facials - len(facial_token_idx_mask),
                    dtype=torch.bool,
                ),
            ]
        )

    image_token_idx = image_token_idx.unsqueeze(0)
    image_token_idx_mask = image_token_idx_mask.unsqueeze(0)

    facial_token_idx = facial_token_idx.unsqueeze(0)
    facial_token_idx_mask = facial_token_idx_mask.unsqueeze(0)

    return image_token_idx, image_token_idx_mask, facial_token_idx, facial_token_idx_mask


def get_object_localization_loss_for_one_layer(
    cross_attention_scores,
    object_segmaps,
    object_token_idx,
    object_token_idx_mask,
    loss_fn,
):
    bxh, num_noise_latents, num_text_tokens = cross_attention_scores.shape
    b, max_num_objects, _, _ = object_segmaps.shape
    size = int(num_noise_latents**0.5)

    # Downsample the segmentation maps to the attention map's spatial resolution.
    object_segmaps = F.interpolate(object_segmaps, size=(size, size), mode="bilinear", antialias=True)

    object_segmaps = object_segmaps.view(
        b, max_num_objects, -1
    )

    num_heads = bxh // b
    cross_attention_scores = cross_attention_scores.view(b, num_heads, num_noise_latents, num_text_tokens)

    # Gather the attention probability assigned to each object's text token.
    object_token_attn_prob = torch.gather(
        cross_attention_scores,
        dim=3,
        index=object_token_idx.view(b, 1, 1, max_num_objects).expand(
            b, num_heads, num_noise_latents, max_num_objects
        ),
    )
    object_segmaps = (
        object_segmaps.permute(0, 2, 1)
        .unsqueeze(1)
        .expand(b, num_heads, num_noise_latents, max_num_objects)
    )
    loss = loss_fn(object_token_attn_prob, object_segmaps)

    # Mask out padded object slots and average over the real ones.
    loss = loss * object_token_idx_mask.view(b, 1, max_num_objects)
    object_token_cnt = object_token_idx_mask.sum(dim=1).view(b, 1) + 1e-5
    loss = (loss.sum(dim=2) / object_token_cnt).mean()

    return loss


def get_object_localization_loss(
    cross_attention_scores,
    object_segmaps,
    image_token_idx,
    image_token_idx_mask,
    loss_fn,
):
    num_layers = len(cross_attention_scores)
    loss = 0
    for k, v in cross_attention_scores.items():
        layer_loss = get_object_localization_loss_for_one_layer(
            v, object_segmaps, image_token_idx, image_token_idx_mask, loss_fn
        )
        loss += layer_loss
    return loss / num_layers

def unet_store_cross_attention_scores(unet, attention_scores, layers=5):
    from diffusers.models.attention_processor import Attention

    UNET_LAYER_NAMES = [
        "down_blocks.0",
        "down_blocks.1",
        "down_blocks.2",
        "mid_block",
        "up_blocks.1",
        "up_blocks.2",
        "up_blocks.3",
    ]

    start_layer = (len(UNET_LAYER_NAMES) - layers) // 2
    end_layer = start_layer + layers
    applicable_layers = UNET_LAYER_NAMES[start_layer:end_layer]

    def make_new_get_attention_scores_fn(name):
        def new_get_attention_scores(module, query, key, attention_mask=None):
            attention_probs = module.old_get_attention_scores(
                query, key, attention_mask
            )
            attention_scores[name] = attention_probs
            return attention_probs

        return new_get_attention_scores

    for name, module in unet.named_modules():
        # Cross-attention layers are named "attn2" in diffusers ("attn1" is self-attention
        # and has no text-token axis); the uploaded line checked "attn1", which would store
        # self-attention maps instead of the cross-attention scores this function advertises.
        if isinstance(module, Attention) and "attn2" in name:
            if not any(layer in name for layer in applicable_layers):
                continue

            module.old_get_attention_scores = module.get_attention_scores
            module.get_attention_scores = types.MethodType(
                make_new_get_attention_scores_fn(name), module
            )
    return unet

class BalancedL1Loss(nn.Module):
    def __init__(self, threshold=1.0, normalize=False):
        super().__init__()
        self.threshold = threshold
        self.normalize = normalize

    def forward(self, object_token_attn_prob, object_segmaps):
        if self.normalize:
            object_token_attn_prob = object_token_attn_prob / (
                object_token_attn_prob.max(dim=2, keepdim=True)[0] + 1e-5
            )
        background_segmaps = 1 - object_segmaps
        background_segmaps_sum = background_segmaps.sum(dim=2) + 1e-5
        object_segmaps_sum = object_segmaps.sum(dim=2) + 1e-5

        background_loss = (object_token_attn_prob * background_segmaps).sum(
            dim=2
        ) / background_segmaps_sum

        object_loss = (object_token_attn_prob * object_segmaps).sum(
            dim=2
        ) / object_segmaps_sum

        # Reward attention inside the object mask, penalize attention on the background.
        return background_loss - object_loss


def fetch_mask_raw_image(raw_image, mask_image):
    # Black out everything outside the mask.
    mask_image = mask_image.resize(raw_image.size)
    mask_raw_image = Image.composite(raw_image, Image.new('RGB', raw_image.size, (0, 0, 0)), mask_image)

    return mask_raw_image


mapping_table = [
    {"Mask Value": 0, "Body Part": "Background", "RGB Color": [0, 0, 0]},
    {"Mask Value": 1, "Body Part": "Face", "RGB Color": [255, 0, 0]},
    {"Mask Value": 2, "Body Part": "Left_Eyebrow", "RGB Color": [255, 85, 0]},
    {"Mask Value": 3, "Body Part": "Right_Eyebrow", "RGB Color": [255, 170, 0]},
    {"Mask Value": 4, "Body Part": "Left_Eye", "RGB Color": [255, 0, 85]},
    {"Mask Value": 5, "Body Part": "Right_Eye", "RGB Color": [255, 0, 170]},
    {"Mask Value": 6, "Body Part": "Hair", "RGB Color": [0, 0, 255]},
    {"Mask Value": 7, "Body Part": "Left_Ear", "RGB Color": [85, 0, 255]},
    {"Mask Value": 8, "Body Part": "Right_Ear", "RGB Color": [170, 0, 255]},
    {"Mask Value": 9, "Body Part": "Mouth_External Contour", "RGB Color": [0, 255, 85]},
    {"Mask Value": 10, "Body Part": "Nose", "RGB Color": [0, 255, 0]},
    {"Mask Value": 11, "Body Part": "Mouth_Inner_Contour", "RGB Color": [0, 255, 170]},
    {"Mask Value": 12, "Body Part": "Upper_Lip", "RGB Color": [85, 255, 0]},
    {"Mask Value": 13, "Body Part": "Lower_Lip", "RGB Color": [170, 255, 0]},
    {"Mask Value": 14, "Body Part": "Neck", "RGB Color": [0, 85, 255]},
    {"Mask Value": 15, "Body Part": "Neck_Inner Contour", "RGB Color": [0, 170, 255]},
    {"Mask Value": 16, "Body Part": "Cloth", "RGB Color": [255, 255, 0]},
    {"Mask Value": 17, "Body Part": "Hat", "RGB Color": [255, 0, 255]},
    {"Mask Value": 18, "Body Part": "Earring", "RGB Color": [255, 85, 255]},
    {"Mask Value": 19, "Body Part": "Necklace", "RGB Color": [255, 255, 85]},
    {"Mask Value": 20, "Body Part": "Glasses", "RGB Color": [255, 170, 255]},
    {"Mask Value": 21, "Body Part": "Hand", "RGB Color": [255, 0, 255]},
    {"Mask Value": 22, "Body Part": "Wristband", "RGB Color": [0, 255, 255]},
    {"Mask Value": 23, "Body Part": "Clothes_Upper", "RGB Color": [85, 255, 255]},
    {"Mask Value": 24, "Body Part": "Clothes_Lower", "RGB Color": [170, 255, 255]},
]


def masks_for_unique_values(image_raw_mask):
    image_array = np.array(image_raw_mask)
    unique_values, counts = np.unique(image_array, return_counts=True)
    masks_dict = {}
    for value in unique_values:
        binary_image = np.uint8(image_array == value) * 255

        contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        mask = np.zeros_like(image_array)

        for contour in contours:
            cv2.drawContours(mask, [contour], -1, (255), thickness=cv2.FILLED)

        if value == 0:
            # Also store the complement of the background as "WithoutBackground".
            body_part = "WithoutBackground"
            mask2 = np.where(mask == 255, 0, 255).astype(mask.dtype)
            masks_dict[body_part] = Image.fromarray(mask2)

        body_part = next((entry["Body Part"] for entry in mapping_table if entry["Mask Value"] == value), f"Unknown_{value}")
        if body_part.startswith("Unknown_"):
            continue

        masks_dict[body_part] = Image.fromarray(mask)

    return masks_dict


# FFN
def FeedForward(dim, mult=4):
    inner_dim = int(dim * mult)
    return nn.Sequential(
        nn.LayerNorm(dim),
        nn.Linear(dim, inner_dim, bias=False),
        nn.GELU(),
        nn.Linear(inner_dim, dim, bias=False),
    )


def reshape_tensor(x, heads):
    # (bs, length, heads * dim_head) -> (bs, heads, length, dim_head)
    bs, length, width = x.shape
    x = x.view(bs, length, heads, -1)
    x = x.transpose(1, 2)
    x = x.reshape(bs, heads, length, -1)
    return x


class PerceiverAttention(nn.Module):
    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features, shape (b, n1, D)
            latents (torch.Tensor): latent features, shape (b, n2, D)
        """
        x = self.norm1(x)
        latents = self.norm2(latents)

        b, l, _ = latents.shape

        q = self.to_q(latents)
        kv_input = torch.cat((x, latents), dim=-2)
        k, v = self.to_kv(kv_input).chunk(2, dim=-1)

        q = reshape_tensor(q, self.heads)
        k = reshape_tensor(k, self.heads)
        v = reshape_tensor(v, self.heads)

        # attention (scale split across q and k for numerical stability)
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        weight = (q * scale) @ (k * scale).transpose(-2, -1)
        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
        out = weight @ v

        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)

        return self.to_out(out)


class FacePerceiverResampler(torch.nn.Module):
    def __init__(
        self,
        *,
        dim=768,
        depth=4,
        dim_head=64,
        heads=16,
        embedding_dim=1280,
        output_dim=768,
        ff_mult=4,
    ):
        super().__init__()

        self.proj_in = torch.nn.Linear(embedding_dim, dim)
        self.proj_out = torch.nn.Linear(dim, output_dim)
        self.norm_out = torch.nn.LayerNorm(output_dim)
        self.layers = torch.nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                torch.nn.ModuleList(
                    [
                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

    def forward(self, latents, x):
        x = self.proj_in(x)
        for attn, ff in self.layers:
            latents = attn(x, latents) + latents
            latents = ff(latents) + latents
        latents = self.proj_out(latents)
        return self.norm_out(latents)


class ProjPlusModel(torch.nn.Module):
    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
        super().__init__()

        self.cross_attention_dim = cross_attention_dim
        self.num_tokens = num_tokens

        self.proj = torch.nn.Sequential(
            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim * 2),
            torch.nn.GELU(),
            torch.nn.Linear(id_embeddings_dim * 2, cross_attention_dim * num_tokens),
        )
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

        self.perceiver_resampler = FacePerceiverResampler(
            dim=cross_attention_dim,
            depth=4,
            dim_head=64,
            heads=cross_attention_dim // 64,
            embedding_dim=clip_embeddings_dim,
            output_dim=cross_attention_dim,
            ff_mult=4,
        )

    def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
        x = self.proj(id_embeds)
        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
        x = self.norm(x)
        out = self.perceiver_resampler(x, clip_embeds)
        if shortcut:
            out = x + scale * out
        return out


class AttentionMLP(nn.Module):
    def __init__(
        self,
        dtype=torch.float16,
        dim=1024,
        depth=8,
        dim_head=64,
        heads=16,
        single_num_tokens=1,
        embedding_dim=1280,
        output_dim=768,
        ff_mult=4,
        max_seq_len: int = 257 * 2,
        apply_pos_emb: bool = False,
        num_latents_mean_pooled: int = 0,
    ):
        super().__init__()
        self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None

        self.single_num_tokens = single_num_tokens
        self.latents = nn.Parameter(torch.randn(1, self.single_num_tokens, dim) / dim**0.5)

        self.proj_in = nn.Linear(embedding_dim, dim)

        self.proj_out = nn.Linear(dim, output_dim)
        self.norm_out = nn.LayerNorm(output_dim)

        self.to_latents_from_mean_pooled_seq = (
            nn.Sequential(
                nn.LayerNorm(dim),
                nn.Linear(dim, dim * num_latents_mean_pooled),
                Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled),
            )
            if num_latents_mean_pooled > 0
            else None
        )

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

    def forward(self, x):
        if self.pos_emb is not None:
            n, device = x.shape[1], x.device
            pos_emb = self.pos_emb(torch.arange(n, device=device))
            x = x + pos_emb

        latents = self.latents.repeat(x.size(0), 1, 1)

        x = self.proj_in(x)

        if self.to_latents_from_mean_pooled_seq:
            meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool))
            meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
            latents = torch.cat((meanpooled_latents, latents), dim=-2)

        for attn, ff in self.layers:
            latents = attn(x, latents) + latents
            latents = ff(latents) + latents

        latents = self.proj_out(latents)
        return self.norm_out(latents)


def masked_mean(t, *, dim, mask=None):
    if mask is None:
        return t.mean(dim=dim)

    denom = mask.sum(dim=dim, keepdim=True)
    mask = rearrange(mask, "b n -> b n 1")
    masked_t = t.masked_fill(~mask, 0.0)

    return masked_t.sum(dim=dim) / denom.clamp(min=1e-5)