Spaces:

yiren98
/

OmniConsistency

Running on Zero

App Files Files Community

yiren98 committed 6 days ago

Commit

cc6558b

1 Parent(s): 33b3c45

update app.py

Browse files

Files changed (13) hide show

.gitignore +9 -0
README.md +3 -4
app.py +147 -0
requirements.txt +17 -0
src_inference/__init__.py +0 -0
src_inference/layers_cache.py +366 -0
src_inference/lora_helper.py +194 -0
src_inference/pipeline.py +746 -0
test_imgs/00.png +0 -0
test_imgs/01.png +0 -0
test_imgs/02.png +0 -0
test_imgs/03.png +0 -0
test_imgs/04.png +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+output/
+results/
+datasets/
+wandb/
+scripts/
+__pycache__/
+default_config.yaml
+getDataset.py
+train.py

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
 title: OmniConsistency
-emoji: 🐨
-colorFrom: blue
-colorTo: gray
 sdk: gradio
 sdk_version: 5.31.0
 app_file: app.py
 pinned: false
-license: mit
 short_description: Generate styled image from reference image and external LoRA
 ---

 ---
 title: OmniConsistency
+emoji: 🚀
+colorFrom: gray
+colorTo: pink
 sdk: gradio
 sdk_version: 5.31.0
 app_file: app.py
 pinned: false
 short_description: Generate styled image from reference image and external LoRA
 ---

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import spaces
+import time
+import torch
+import gradio as gr
+from PIL import Image
+from huggingface_hub import hf_hub_download
+from src_inference.pipeline import FluxPipeline
+from src_inference.lora_helper import set_single_lora
+import random
+# Hugging Face repo id of the base FLUX model the pipeline is loaded from.
+base_path = "black-forest-labs/FLUX.1-dev"
+# Download OmniConsistency LoRA using hf_hub_download (stored under ./Model);
+# the returned value is used below as the checkpoint path.
+omni_consistency_path = hf_hub_download(repo_id="showlab/OmniConsistency",
+                                        filename="OmniConsistency.safetensors",
+                                        local_dir="./Model")
+# Initialize the pipeline with the model in bfloat16 and move it to CUDA.
+# This runs at import time, before the UI is built.
+pipe = FluxPipeline.from_pretrained(base_path, torch_dtype=torch.bfloat16).to("cuda")
+# Set LoRA weights: a single OmniConsistency LoRA with weight 1 and cond_size 512.
+set_single_lora(pipe.transformer, omni_consistency_path, lora_weights=[1], cond_size=512)
+# Function to clear cache
+def clear_cache(transformer):
+    """Empty the cached key/value bank of every attention processor.
+
+    Args:
+        transformer: object exposing ``attn_processors``, a mapping from
+            processor name to processor instance.
+
+    Processors that do not carry a ``bank_kv`` list (for example a stock
+    processor without a cache) are skipped instead of raising AttributeError.
+    """
+    for attn_processor in transformer.attn_processors.values():
+        bank_kv = getattr(attn_processor, "bank_kv", None)
+        if bank_kv is not None:
+            bank_kv.clear()
+# Function to download all LoRAs in advance
+def download_all_loras():
+    """Fetch every style LoRA from the showlab/OmniConsistency repo into ./LoRAs."""
+    style_names = (
+        "3D_Chibi", "American_Cartoon", "Chinese_Ink",
+        "Clay_Toy", "Fabric", "Ghibli", "Irasutoya",
+        "Jojo", "LEGO", "Line", "Macaron",
+        "Oil_Painting", "Origami", "Paper_Cutting",
+        "Picasso", "Pixel", "Poly", "Pop_Art",
+        "Rick_Morty", "Snoopy", "Van_Gogh", "Vector",
+    )
+    for style in style_names:
+        # The repo keeps the style weights under its own "LoRAs/" folder.
+        weight_file = f"LoRAs/{style}_rank128_bf16.safetensors"
+        hf_hub_download(repo_id="showlab/OmniConsistency",
+                        filename=weight_file,
+                        local_dir="./LoRAs")
+# Download all LoRAs in advance before the interface is launched
+download_all_loras()
+# Main function to generate the image
+@spaces.GPU()
+def generate_image(lora_name, prompt, uploaded_image, width, height, guidance_scale, num_inference_steps, seed):
+    """Generate a stylized image from the uploaded image with the chosen LoRA.
+
+    Args:
+        lora_name: style name; selects ``{lora_name}_rank128_bf16.safetensors``.
+        prompt: text prompt passed to the pipeline.
+        uploaded_image: PIL image used as the spatial condition.
+        width, height: requested output size (str or number); each is rounded
+            down to a multiple of 8.
+        guidance_scale: guidance scale forwarded to the pipeline.
+        num_inference_steps: number of denoising steps.
+        seed: seed for the CPU random generator.
+
+    Returns:
+        The first image produced by the pipeline.
+
+    Raises:
+        gr.Error: if no LoRA is selected or no image was uploaded.
+    """
+    if not lora_name:
+        raise gr.Error("Please select a LoRA before generating.")
+    if uploaded_image is None:
+        raise gr.Error("Please upload an image before generating.")
+    # The LoRAs were downloaded in advance; the repo's "LoRAs/" sub-folder is
+    # reproduced inside local_dir "./LoRAs", hence the doubled directory name.
+    pipe.unload_lora_weights()
+    pipe.load_lora_weights("./LoRAs/LoRAs", weight_name=f"{lora_name}_rank128_bf16.safetensors")
+    # Prepare input image
+    spatial_image = [uploaded_image.convert("RGB")]
+    subject_images = []
+    start_time = time.time()
+    try:
+        # Generate the image. Slider values may arrive as floats, so the step
+        # count and the seed are converted to int explicitly.
+        image = pipe(
+            prompt,
+            height=(int(height) // 8) * 8,
+            width=(int(width) // 8) * 8,
+            guidance_scale=guidance_scale,
+            num_inference_steps=int(num_inference_steps),
+            max_sequence_length=512,
+            generator=torch.Generator("cpu").manual_seed(int(seed)),
+            spatial_images=spatial_image,
+            subject_images=subject_images,
+            cond_size=512,
+        ).images[0]
+    finally:
+        # Always drop the cached condition K/V, even when generation fails;
+        # otherwise the next request would reuse a stale cache.
+        clear_cache(pipe.transformer)
+    elapsed_time = time.time() - start_time
+    print(f"code running time: {elapsed_time} s")
+    return image
+# Example data
+# Each row is fed to gr.Examples in the order of its `inputs` list:
+# LoRA name, prompt, input image, height, width, guidance scale, steps, seed.
+# The images are opened with PIL when this module is imported.
+examples = [
+    ["3D_Chibi", "3D Chibi style",                  Image.open("./test_imgs/00.png"), 680, 1024, 3.5, 24, 42],
+    ["Origami", "Origami style",                    Image.open("./test_imgs/01.png"), 560, 1024, 3.5, 24, 42],
+    ["American_Cartoon", "American Cartoon style",  Image.open("./test_imgs/02.png"), 568, 1024, 3.5, 24, 42],
+    ["Origami", "Origami style",                    Image.open("./test_imgs/03.png"), 768, 672, 3.5, 24, 42],
+    ["Paper_Cutting", "Paper Cutting style",        Image.open("./test_imgs/04.png"), 696, 1024, 3.5, 24, 42]
+]
+# Gradio interface setup
+def create_gradio_interface():
+    """Build and return the Gradio Blocks UI of the demo.
+
+    The first column holds the LoRA dropdown, the prompt box and the image
+    upload; the second column holds the size, guidance, steps and seed
+    controls, the Generate button and the output image. Both the Examples
+    widget and the button use ``generate_image`` as their function.
+
+    Returns:
+        The constructed ``gr.Blocks`` object (not yet launched).
+    """
+    # Style names offered in the dropdown.
+    lora_names = [
+        "3D_Chibi", "American_Cartoon", "Chinese_Ink",
+        "Clay_Toy", "Fabric", "Ghibli", "Irasutoya",
+        "Jojo", "LEGO", "Line", "Macaron",
+        "Oil_Painting", "Origami", "Paper_Cutting",
+        "Picasso", "Pixel", "Poly", "Pop_Art",
+        "Rick_Morty", "Snoopy", "Van_Gogh", "Vector"
+    ]
+    with gr.Blocks() as demo:
+        gr.Markdown("# OmniConsistency LoRA Image Generation")
+        gr.Markdown("Select a LoRA, enter a prompt, and upload an image to generate a new image with OmniConsistency.")
+        with gr.Row():
+            with gr.Column(scale=1):
+                lora_dropdown = gr.Dropdown(lora_names, label="Select LoRA")
+                prompt_box = gr.Textbox(label="Prompt", placeholder="Enter a prompt...")
+                image_input = gr.Image(type="pil", label="Upload Image")
+            with gr.Column(scale=1):
+                # Width and height are text boxes, so their values are strings.
+                width_box = gr.Textbox(label="Width", value="1024")
+                height_box = gr.Textbox(label="Height", value="1024")
+                guidance_slider = gr.Slider(minimum=0.1, maximum=20, value=3.5, step=0.1, label="Guidance Scale")
+                steps_slider = gr.Slider(minimum=1, maximum=50, value=25, step=1, label="Inference Steps")
+                seed_slider = gr.Slider(minimum=1, maximum=10000000000, value=42, step=1, label="Seed")
+                generate_button = gr.Button("Generate")
+                output_image = gr.Image(type="pil", label="Generated Image")
+        # Add examples for Generation
+        # NOTE(review): height_box precedes width_box in `inputs` here, while
+        # generate_button.click below passes width_box before height_box.
+        # Confirm the example rows are really meant as (height, width).
+        gr.Examples(
+            examples=examples,
+            inputs=[lora_dropdown, prompt_box, image_input, height_box, width_box, guidance_slider, steps_slider, seed_slider],
+            outputs=output_image,
+            fn=generate_image,
+            cache_examples=False,
+            label="Examples"
+        )
+        # Run generation with the current control values when the button is clicked.
+        generate_button.click(
+            fn=generate_image,
+            inputs=[
+                lora_dropdown, prompt_box, image_input,
+                width_box, height_box, guidance_slider,
+                steps_slider, seed_slider
+            ],
+            outputs=output_image
+        )
+    return demo
+# Launch the Gradio interface
+interface = create_gradio_interface()
+interface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+--extra-index-url https://download.pytorch.org/whl/cu124
+torch
+torchvision
+torchaudio==2.3.1
+diffusers==0.32.2
+easydict==1.13
+einops==0.8.1
+peft==0.14.0
+pillow==11.0.0
+protobuf==5.29.3
+requests==2.32.3
+safetensors==0.5.2
+sentencepiece==0.2.0
+spaces==0.34.1
+transformers==4.49.0
+datasets
+wandb

src_inference/__init__.py ADDED Viewed

File without changes

src_inference/layers_cache.py ADDED Viewed

	@@ -0,0 +1,366 @@

+import inspect
+import math
+from typing import Callable, List, Optional, Tuple, Union
+from einops import rearrange
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch import Tensor
+from diffusers.models.attention_processor import Attention
+class LoRALinearLayer(nn.Module):
+    """Low-rank (down/up) adapter that acts on one condition segment only.
+
+    The sequence is treated as ``block_size`` leading tokens followed by
+    ``n_loras`` condition segments of equal length. Before the low-rank
+    projection, every token outside segment ``number`` is zeroed, so the
+    (bias-free) adapter output is non-zero only for that segment.
+
+    Args:
+        in_features: size of the last input dimension.
+        out_features: size of the last output dimension.
+        rank: inner dimension of the low-rank projection.
+        network_alpha: if given, the output is scaled by ``network_alpha / rank``.
+        device, dtype: forwarded to the two ``nn.Linear`` layers.
+        cond_width, cond_height: condition image size used to derive the number
+            of tokens per condition segment.
+        number: index of the condition segment this adapter acts on.
+        n_loras: total number of condition segments in the sequence.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        rank: int = 4,
+        network_alpha: Optional[float] = None,
+        device: Optional[Union[torch.device, str]] = None,
+        dtype: Optional[torch.dtype] = None,
+        cond_width=512,
+        cond_height=512,
+        number=0,
+        n_loras=1
+    ):
+        super().__init__()
+        self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
+        self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
+        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
+        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        self.network_alpha = network_alpha
+        self.rank = rank
+        self.out_features = out_features
+        self.in_features = in_features
+        # `up` starts at zero, so a freshly created adapter contributes nothing.
+        nn.init.normal_(self.down.weight, std=1 / rank)
+        nn.init.zeros_(self.up.weight)
+        self.cond_height = cond_height
+        self.cond_width = cond_width
+        self.number = number
+        self.n_loras = n_loras
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the adapter to the tokens of condition segment ``number``.
+
+        Args:
+            hidden_states: tensor indexed as (batch, sequence, features).
+
+        Returns:
+            The adapter output, cast back to the dtype of ``hidden_states``.
+        """
+        orig_dtype = hidden_states.dtype
+        dtype = self.down.weight.dtype
+        batch_size = hidden_states.shape[0]
+        # Tokens per condition segment (evaluated left to right).
+        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
+        block_size = hidden_states.shape[1] - cond_size * self.n_loras
+        # The mask width follows the input instead of a hard-coded 3072, so the
+        # layer works for any feature size.
+        shape = (batch_size, hidden_states.shape[1], hidden_states.shape[2])
+        mask = torch.ones(shape, device=hidden_states.device, dtype=dtype)
+        # Zero everything before and after this adapter's own segment.
+        mask[:, :block_size+self.number*cond_size, :] = 0
+        mask[:, block_size+(self.number+1)*cond_size:, :] = 0
+        hidden_states = mask * hidden_states
+        down_hidden_states = self.down(hidden_states.to(dtype))
+        up_hidden_states = self.up(down_hidden_states)
+        if self.network_alpha is not None:
+            up_hidden_states *= self.network_alpha / self.rank
+        return up_hidden_states.to(orig_dtype)
+class MultiSingleStreamBlockLoraProcessor(nn.Module):
+    """Attention processor with per-condition q/k/v LoRAs and a key/value cache.
+
+    While ``bank_kv`` is empty, a call computes q/k/v with the LoRA deltas and
+    stores the key/value of the trailing condition tokens in ``bank_kv``. Later
+    calls skip the LoRAs and append the cached key/value to the freshly
+    computed ones. The cache is only emptied from outside, by clearing
+    ``bank_kv``.
+    """
+    def __init__(self, dim: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
+        # NOTE(review): the list defaults are shared between calls; they are only read here.
+        super().__init__()
+        # Initialize a list to store the LoRA layers
+        self.n_loras = n_loras
+        self.cond_width = cond_width
+        self.cond_height = cond_height
+        # One LoRA per condition segment for each of the q, k and v projections.
+        self.q_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.k_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.v_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.lora_weights = lora_weights
+        self.bank_attn = None
+        # Holds [key, value] of the condition tokens once the first call has run.
+        self.bank_kv = []
+    def __call__(self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        use_cond = False,
+        image_emb: torch.FloatTensor = None
+    ) -> torch.FloatTensor:
+        """Run attention, using the caching path when ``bank_kv`` is empty.
+
+        ``attention_mask``, ``use_cond`` and ``image_emb`` are accepted but not
+        read in this method. Returns the attention output truncated to the
+        input sequence length.
+        """
+        # Tokens per condition segment (evaluated left to right).
+        scaled_cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
+        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        scaled_seq_len = hidden_states.shape[1]
+        # Length of the sequence part that precedes the condition segments.
+        block_size =  scaled_seq_len - scaled_cond_size * self.n_loras
+        # An empty bank means this is the caching pass.
+        if len(self.bank_kv)== 0:
+            cache = True
+        else:
+            cache = False
+        if cache:
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            # Add each LoRA's weighted delta to the base projections.
+            for i in range(self.n_loras):
+                query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
+                key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
+                value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
+            inner_dim = key.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            # Cache the condition tokens' key/value (before norm and rotary embedding).
+            self.bank_kv.append(key[:, :, block_size:, :])
+            self.bank_kv.append(value[:, :, block_size:, :])
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            num_cond_blocks = self.n_loras
+            # Entries left at 1 become -1e20 below, i.e. blocked attention.
+            mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
+            mask[ :block_size, :] = 0  # The first block_size query rows may attend everywhere
+            for i in range(num_cond_blocks):
+                start = i * scaled_cond_size + block_size
+                end = (i + 1) * scaled_cond_size + block_size
+                mask[start:end, start:end] = 0  # Each condition segment attends only to itself
+            mask = mask * -1e20
+            mask = mask.to(query.dtype)
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
+        else:
+            # Cached pass: base projections only, no LoRA deltas.
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            inner_dim = query.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            zero_pad = torch.zeros_like(self.bank_kv[0], dtype=query.dtype, device=query.device)
+            # Append the cached condition key/value to the fresh ones.
+            key = torch.concat([key[:, :, :scaled_seq_len, :], self.bank_kv[0]], dim=-2)
+            value = torch.concat([value[:, :, :scaled_seq_len, :], self.bank_kv[1]], dim=-2)
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            # Pad the query with zeros to the key's length before the rotary
+            # embedding is applied; the padding is sliced off again below.
+            query = torch.concat([query[:, :, :scaled_seq_len, :], zero_pad], dim=-2)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            query = query[:, :, :scaled_seq_len, :]
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
+        # Merge the heads back into the feature dimension.
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = hidden_states[:, : scaled_seq_len,:]
+        return hidden_states
+class MultiDoubleStreamBlockLoraProcessor(nn.Module):
+    """Joint text/image attention processor with q/k/v LoRAs and a key/value cache.
+
+    While ``bank_kv`` is empty, a call computes q/k/v of ``hidden_states`` with
+    the LoRA deltas and stores that key/value in ``bank_kv``. Later calls skip
+    the LoRAs and append the cached key/value to the freshly computed ones.
+    ``proj_loras`` is created here but is not applied anywhere in ``__call__``.
+    """
+    def __init__(self, dim: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
+        # NOTE(review): the list defaults are shared between calls; they are only read here.
+        super().__init__()
+        # Initialize a list to store the LoRA layers
+        self.n_loras = n_loras
+        self.cond_width = cond_width
+        self.cond_height = cond_height
+        # One LoRA per condition segment for each of q, k, v and the output projection.
+        self.q_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.k_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.v_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.proj_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.lora_weights = lora_weights
+        self.bank_attn = None
+        # Holds [key, value] of hidden_states once the first call has run.
+        self.bank_kv = []
+    def __call__(self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        use_cond=False,
+        image_emb: torch.FloatTensor = None
+    ) -> torch.FloatTensor:
+        """Run joint attention over ``encoder_hidden_states`` and ``hidden_states``.
+
+        Despite its ``None`` default, ``encoder_hidden_states`` is required:
+        its shape is read unconditionally. ``attention_mask``, ``use_cond`` and
+        ``image_emb`` are accepted but not read in this method.
+
+        Returns:
+            A ``(hidden_states, encoder_hidden_states)`` tuple of projected outputs.
+        """
+        # Tokens per condition segment (evaluated left to right).
+        scaled_cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
+        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        block_size =  hidden_states.shape[1]
+        # Total length of the concatenated [encoder, hidden] sequence.
+        scaled_seq_len = encoder_hidden_states.shape[1] + hidden_states.shape[1]
+        scaled_block_size = scaled_seq_len
+        # `context` projections.
+        inner_dim = 3072
+        head_dim = inner_dim // attn.heads
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        if attn.norm_added_q is not None:
+            encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+        if attn.norm_added_k is not None:
+            encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+        # An empty bank means this is the caching pass.
+        if len(self.bank_kv)== 0:
+            cache = True
+        else:
+            cache = False
+        if cache:
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            # Add each LoRA's weighted delta to the base projections.
+            for i in range(self.n_loras):
+                query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
+                key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
+                value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
+            inner_dim = key.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            # Cache the full key/value of hidden_states (before norm and rotary embedding).
+            self.bank_kv.append(key)
+            self.bank_kv.append(value)
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            num_cond_blocks = self.n_loras
+            # Entries left at 1 become -1e20 below, i.e. blocked attention.
+            mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
+            mask[ :scaled_block_size-block_size, :] = 0  # The encoder-token query rows may attend everywhere
+            for i in range(num_cond_blocks):
+                start = i * scaled_cond_size + scaled_block_size-block_size
+                end = (i + 1) * scaled_cond_size + scaled_block_size-block_size
+                mask[start:end, start:end] = 0  # Each condition segment attends only to itself
+            mask = mask * -1e20
+            mask = mask.to(query.dtype)
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
+        else:
+            # Cached pass: base projections only, no LoRA deltas.
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            inner_dim = query.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            zero_pad = torch.zeros_like(self.bank_kv[0], dtype=query.dtype, device=query.device)
+            # Append the cached key/value to the fresh ones.
+            key = torch.concat([key[:, :, :block_size, :], self.bank_kv[0]], dim=-2)
+            value = torch.concat([value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            # Pad the query with zeros to the key's length before the rotary
+            # embedding is applied; the padding is sliced off again below.
+            query = torch.concat([query[:, :, :block_size, :], zero_pad], dim=-2)
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            query = query[:, :, :scaled_block_size, :]
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
+        # Merge the heads back into the feature dimension.
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # Split the joint output back into its encoder part and its hidden part.
+        encoder_hidden_states, hidden_states = (
+            hidden_states[:, : encoder_hidden_states.shape[1]],
+            hidden_states[:, encoder_hidden_states.shape[1] :],
+        )
+        # Linear output projections (proj_loras is not applied here).
+        hidden_states = attn.to_out[0](hidden_states)
+        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+        hidden_states = hidden_states[:, :block_size,:]
+        return hidden_states, encoder_hidden_states

src_inference/lora_helper.py ADDED Viewed

	@@ -0,0 +1,194 @@

+from diffusers.models.attention_processor import FluxAttnProcessor2_0
+from safetensors import safe_open
+import re
+import torch
+from .layers_cache import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
+device = "cuda"
+def load_safetensors(path):
+    """Read every tensor of the safetensors file at *path* into a dict, on CPU."""
+    with safe_open(path, framework="pt", device="cpu") as handle:
+        return {key: handle.get_tensor(key) for key in handle.keys()}
+def get_lora_rank(checkpoint):
+    """Return the LoRA rank found in *checkpoint*.
+
+    The rank is the first dimension of the first entry whose key ends with
+    ".down.weight"; None is returned when no such key exists.
+    """
+    down_keys = (key for key in checkpoint.keys() if key.endswith(".down.weight"))
+    first_down = next(down_keys, None)
+    if first_down is None:
+        return None
+    return checkpoint[first_down].shape[0]
+def load_checkpoint(local_path):
+    """Load a checkpoint from *local_path* and return its contents.
+
+    Paths containing '.safetensors' are read with ``load_safetensors``; any
+    other path is read with ``torch.load`` on CPU.
+
+    Raises:
+        ValueError: if *local_path* is None (previously this surfaced as an
+            UnboundLocalError at the return statement).
+    """
+    if local_path is None:
+        raise ValueError("local_path must not be None")
+    if '.safetensors' in local_path:
+        print(f"Loading .safetensors checkpoint from {local_path}")
+        return load_safetensors(local_path)
+    print(f"Loading checkpoint from {local_path}")
+    # NOTE(security): torch.load unpickles the file; only use it on trusted
+    # checkpoints, or prefer the .safetensors format.
+    return torch.load(local_path, map_location='cpu')
+def update_model_with_lora(checkpoint, lora_weights, transformer, cond_size):
+    """Install LoRA attention processors on *transformer* from one checkpoint.
+
+    Processors named ``transformer_blocks.<i>...`` (i < 19) become
+    MultiDoubleStreamBlockLoraProcessor, those named
+    ``single_transformer_blocks.<i>...`` (i < 38) become
+    MultiSingleStreamBlockLoraProcessor, and every other processor is replaced
+    by a plain FluxAttnProcessor2_0. LoRA weights are read from *checkpoint*
+    under keys of the form ``{name}.{q|k|v|proj}_loras.{n}.{down|up}.weight``.
+
+    Args:
+        checkpoint: mapping from weight name to tensor.
+        lora_weights: one scale per condition LoRA; its length sets the count.
+        transformer: model exposing ``attn_processors`` and ``set_attn_processor``.
+        cond_size: condition size, used as both condition width and height.
+
+    Raises:
+        KeyError: if an expected LoRA weight is missing from *checkpoint*
+            (previously None was assigned to ``.data``).
+    """
+    number = len(lora_weights)
+    ranks = [get_lora_rank(checkpoint) for _ in range(number)]
+    double_blocks_idx = range(19)
+    single_blocks_idx = range(38)
+    lora_attn_procs = {}
+    for name in transformer.attn_processors:
+        # A name without a numeric segment gets no layer index, so it falls
+        # through to the plain processor instead of reusing a stale index.
+        match = re.search(r'\.(\d+)\.', name)
+        layer_index = int(match.group(1)) if match else None
+        if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
+            processor_cls = MultiDoubleStreamBlockLoraProcessor
+            branches = ("q_loras", "k_loras", "v_loras", "proj_loras")
+        elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
+            processor_cls = MultiSingleStreamBlockLoraProcessor
+            branches = ("q_loras", "k_loras", "v_loras")
+        else:
+            lora_attn_procs[name] = FluxAttnProcessor2_0()
+            continue
+        processor = processor_cls(
+            dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=number
+        )
+        # Checkpoint keys begin with the processor name, so a direct lookup
+        # replaces the per-layer regex scan over the whole checkpoint.
+        for n in range(number):
+            for branch in branches:
+                lora_layer = getattr(processor, branch)[n]
+                for part in ("down", "up"):
+                    key = f'{name}.{branch}.{n}.{part}.weight'
+                    if key not in checkpoint:
+                        raise KeyError(f"LoRA checkpoint is missing weight '{key}'")
+                    getattr(lora_layer, part).weight.data = checkpoint[key]
+        # Move the loaded weights to the target device once per processor.
+        processor.to(device)
+        lora_attn_procs[name] = processor
+    transformer.set_attn_processor(lora_attn_procs)
+def update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size):
+        ck_number = len(checkpoints)
+        cond_lora_number = [len(ls) for ls in lora_weights]
+        cond_number = sum(cond_lora_number)
+        ranks = [get_lora_rank(checkpoint) for checkpoint in checkpoints]
+        multi_lora_weight = []
+        for ls in lora_weights:
+            for n in ls:
+                multi_lora_weight.append(n)
+        lora_attn_procs = {}
+        double_blocks_idx = list(range(19))
+        single_blocks_idx = list(range(38))
+        for name, attn_processor in transformer.attn_processors.items():
+            match = re.search(r'\.(\d+)\.', name)
+            if match:
+                layer_index = int(match.group(1))
+            if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
+                lora_state_dicts = [{} for _ in range(ck_number)]
+                for idx, checkpoint in enumerate(checkpoints):
+                    for key, value in checkpoint.items():
+                        # Match based on the layer index in the key (assuming the key contains layer index)
+                        if re.search(r'\.(\d+)\.', key):
+                            checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
+                            if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
+                                lora_state_dicts[idx][key] = value
+                lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
+                    dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=cond_number
+                )
+                # Load the weights from the checkpoint dictionary into the corresponding layers
+                num = 0
+                for idx in range(ck_number):
+                    for n in range(cond_lora_number[idx]):
+                        lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].proj_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].proj_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].to(device)
+                        num += 1
+            elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
+                lora_state_dicts = [{} for _ in range(ck_number)]
+                for idx, checkpoint in enumerate(checkpoints):
+                    for key, value in checkpoint.items():
+                        # Match based on the layer index in the key (assuming the key contains layer index)
+                        if re.search(r'\.(\d+)\.', key):
+                            checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
+                            if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
+                                lora_state_dicts[idx][key] = value
+                lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
+                    dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=cond_number
+                )
+                # Load the weights from the checkpoint dictionary into the corresponding layers
+                num = 0
+                for idx in range(ck_number):
+                    for n in range(cond_lora_number[idx]):
+                        lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].to(device)
+                        num += 1
+            else:
+                lora_attn_procs[name] = FluxAttnProcessor2_0()
+        transformer.set_attn_processor(lora_attn_procs)
+def set_single_lora(transformer, local_path, lora_weights=[], cond_size=512):
+    checkpoint = load_checkpoint(local_path)
+    update_model_with_lora(checkpoint, lora_weights, transformer, cond_size)
+def set_multi_lora(transformer, local_paths, lora_weights=[[]], cond_size=512):
+    checkpoints = [load_checkpoint(local_path) for local_path in local_paths]
+    update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size)
+def unset_lora(transformer):
+    lora_attn_procs = {}
+    for name, attn_processor in transformer.attn_processors.items():
+        lora_attn_procs[name] = FluxAttnProcessor2_0()
+    transformer.set_attn_processor(lora_attn_procs)
+'''
+unset_lora(pipe.transformer)
+lora_path = "./lora.safetensors"
+lora_weights = [1, 1]
+set_single_lora(pipe.transformer, local_path=lora_path, lora_weights=lora_weights, cond_size=512)
+'''

src_inference/pipeline.py ADDED Viewed

	@@ -0,0 +1,746 @@

+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from diffusers.image_processor import (VaeImageProcessor)
+from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
+from torchvision.transforms.functional import pad
+from diffusers import FluxTransformer2DModel
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+def calculate_shift(
+        image_seq_len,
+        base_seq_len: int = 256,
+        max_seq_len: int = 4096,
+        base_shift: float = 0.5,
+        max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+def prepare_latent_image_ids_(height, width, device, dtype):
+    latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype)
+    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None]  # y
+    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :]   # x
+    return latent_image_ids
+def prepare_latent_subject_ids(height, width, device, dtype):
+    latent_image_ids = torch.zeros(height // 2, width // 2, 3, device=device, dtype=dtype)
+    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2, device=device)[:, None]
+    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2, device=device)[None, :]
+    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+    latent_image_ids = latent_image_ids.reshape(
+        latent_image_id_height * latent_image_id_width, latent_image_id_channels
+    )
+    return latent_image_ids.to(device=device, dtype=dtype)
+def resize_position_encoding(batch_size, original_height, original_width, target_height, target_width, device, dtype):
+    latent_image_ids = prepare_latent_image_ids_(original_height, original_width, device, dtype)
+    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+    latent_image_ids = latent_image_ids.reshape(
+        latent_image_id_height * latent_image_id_width, latent_image_id_channels
+    )
+    scale_h = original_height / target_height
+    scale_w = original_width / target_width
+    latent_image_ids_resized = torch.zeros(target_height//2, target_width//2, 3, device=device, dtype=dtype)
+    latent_image_ids_resized[..., 1] = latent_image_ids_resized[..., 1] + torch.arange(target_height//2, device=device)[:, None] * scale_h
+    latent_image_ids_resized[..., 2] = latent_image_ids_resized[..., 2] + torch.arange(target_width//2, device=device)[None, :] * scale_w
+    cond_latent_image_id_height, cond_latent_image_id_width, cond_latent_image_id_channels = latent_image_ids_resized.shape
+    cond_latent_image_ids = latent_image_ids_resized.reshape(
+            cond_latent_image_id_height * cond_latent_image_id_width, cond_latent_image_id_channels
+        )
+    return latent_image_ids, cond_latent_image_ids
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+        encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+        scheduler,
+        num_inference_steps: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = None,
+        timesteps: Optional[List[int]] = None,
+        sigmas: Optional[List[float]] = None,
+        **kwargs,
+):
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    def __init__(
+            self,
+            scheduler: FlowMatchEulerDiscreteScheduler,
+            vae: AutoencoderKL,
+            text_encoder: CLIPTextModel,
+            tokenizer: CLIPTokenizer,
+            text_encoder_2: T5EncoderModel,
+            tokenizer_2: T5TokenizerFast,
+            transformer: FluxTransformer2DModel,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+    def _get_t5_prompt_embeds(
+            self,
+            prompt: Union[str, List[str]] = None,
+            num_images_per_prompt: int = 1,
+            max_sequence_length: int = 512,
+            device: Optional[torch.device] = None,
+            dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        return prompt_embeds
+    def _get_clip_prompt_embeds(
+            self,
+            prompt: Union[str, List[str]],
+            num_images_per_prompt: int = 1,
+            device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+        return prompt_embeds
+    def encode_prompt(
+            self,
+            prompt: Union[str, List[str]],
+            prompt_2: Union[str, List[str]],
+            device: Optional[torch.device] = None,
+            num_images_per_prompt: int = 1,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+            max_sequence_length: int = 512,
+            lora_scale: Optional[float] = None,
+    ):
+        device = device or self._execution_device
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i: i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+        return image_latents
+    def check_inputs(
+            self,
+            prompt,
+            prompt_2,
+            height,
+            width,
+            prompt_embeds=None,
+            pooled_prompt_embeds=None,
+            callback_on_step_end_tensor_inputs=None,
+            max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+        return latent_image_ids.to(device=device, dtype=dtype)
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+        return latents
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        return latents
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+    def prepare_latents(
+            self,
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+            dtype,
+            device,
+            generator,
+            subject_image,
+            condition_image,
+            latents=None,
+            cond_number=1,
+            sub_number=1
+    ):
+        height_cond = 2 * (self.cond_size // self.vae_scale_factor)
+        width_cond = 2 * (self.cond_size // self.vae_scale_factor)
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+        shape = (batch_size, num_channels_latents, height, width)  # 1 16 106 80
+        noise_latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        noise_latents = self._pack_latents(noise_latents, batch_size, num_channels_latents, height, width)
+        noise_latent_image_ids, cond_latent_image_ids = resize_position_encoding(
+                batch_size,
+                height,
+                width,
+                height_cond,
+                width_cond,
+                device,
+                dtype,
+            )
+        latents_to_concat = []
+        latents_ids_to_concat = [noise_latent_image_ids]
+        # subject
+        if subject_image is not None:
+            shape_subject = (batch_size, num_channels_latents, height_cond*sub_number, width_cond)
+            subject_image = subject_image.to(device=device, dtype=dtype)
+            subject_image_latents = self._encode_vae_image(image=subject_image, generator=generator)
+            subject_latents = self._pack_latents(subject_image_latents, batch_size, num_channels_latents, height_cond*sub_number, width_cond)
+            mask2 = torch.zeros(shape_subject, device=device, dtype=dtype)
+            mask2 = self._pack_latents(mask2, batch_size, num_channels_latents, height_cond*sub_number, width_cond)
+            latent_subject_ids = prepare_latent_subject_ids(height_cond, width_cond, device, dtype)
+            latent_subject_ids[:, 1] += 64  # fixed offset
+            subject_latent_image_ids = torch.concat([latent_subject_ids for _ in range(sub_number)], dim=-2)
+            latents_to_concat.append(subject_latents)
+            latents_ids_to_concat.append(subject_latent_image_ids)
+        # spatial
+        if condition_image is not None:
+            shape_cond = (batch_size, num_channels_latents, height_cond*cond_number, width_cond)
+            condition_image = condition_image.to(device=device, dtype=dtype)
+            image_latents = self._encode_vae_image(image=condition_image, generator=generator)
+            cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height_cond*cond_number, width_cond)
+            mask3 = torch.zeros(shape_cond, device=device, dtype=dtype)
+            mask3 = self._pack_latents(mask3, batch_size, num_channels_latents, height_cond*cond_number, width_cond)
+            cond_latent_image_ids = cond_latent_image_ids
+            cond_latent_image_ids = torch.concat([cond_latent_image_ids for _ in range(cond_number)], dim=-2)
+            latents_ids_to_concat.append(cond_latent_image_ids)
+            latents_to_concat.append(cond_latents)
+        cond_latents = torch.concat(latents_to_concat, dim=-2)
+        latent_image_ids = torch.concat(latents_ids_to_concat, dim=-2)
+        return cond_latents, latent_image_ids, noise_latents
+    @property
+    def guidance_scale(self):
+        """Value of ``self._guidance_scale``, which ``__call__`` sets from its ``guidance_scale`` argument."""
+        return self._guidance_scale
+    @property
+    def joint_attention_kwargs(self):
+        """Value of ``self._joint_attention_kwargs``, which ``__call__`` sets from its argument of the same name."""
+        return self._joint_attention_kwargs
+    @property
+    def num_timesteps(self):
+        """Value of ``self._num_timesteps``, which ``__call__`` sets to the length of the timestep schedule."""
+        return self._num_timesteps
+    @property
+    def interrupt(self):
+        """Value of ``self._interrupt``, which ``__call__`` resets to ``False`` when it starts."""
+        return self._interrupt
+    @torch.no_grad()
+    def __call__(
+            self,
+            prompt: Union[str, List[str]] = None,
+            prompt_2: Optional[Union[str, List[str]]] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 28,
+            timesteps: List[int] = None,
+            guidance_scale: float = 3.5,
+            num_images_per_prompt: Optional[int] = 1,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.FloatTensor] = None,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+            callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+            callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+            max_sequence_length: int = 512,
+            spatial_images=[],
+            subject_images=[],
+            cond_size=512,
+    ):
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        self.cond_size = cond_size
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        cond_number = len(spatial_images)
+        sub_number = len(subject_images)
+        if sub_number > 0:
+            subject_image_ls = []
+            for subject_image in subject_images:
+                w, h = subject_image.size[:2]
+                scale = self.cond_size / max(h, w)
+                new_h, new_w = int(h * scale), int(w * scale)
+                subject_image = self.image_processor.preprocess(subject_image, height=new_h, width=new_w)
+                subject_image = subject_image.to(dtype=torch.float32)
+                pad_h = cond_size - subject_image.shape[-2]
+                pad_w = cond_size - subject_image.shape[-1]
+                subject_image = pad(
+                    subject_image,
+                    padding=(int(pad_w / 2), int(pad_h / 2), int(pad_w / 2), int(pad_h / 2)),
+                    fill=0
+                )
+                subject_image_ls.append(subject_image)
+            subject_image = torch.concat(subject_image_ls, dim=-2)
+        else:
+            subject_image = None
+        if cond_number > 0:
+            condition_image_ls = []
+            for img in spatial_images:
+                print(img)
+                condition_image = self.image_processor.preprocess(img, height=self.cond_size, width=self.cond_size)
+                condition_image = condition_image.to(dtype=torch.float32)
+                condition_image_ls.append(condition_image)
+            condition_image = torch.concat(condition_image_ls, dim=-2)
+        else:
+            condition_image = None
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4  # 16
+        cond_latents, latent_image_ids, noise_latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            subject_image,
+            condition_image,
+            latents,
+            cond_number,
+            sub_number
+        )
+        latents = noise_latents
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+        # handle guidance
+        if self.transformer.config.guidance_embeds:
+            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+            guidance = guidance.expand(latents.shape[0])
+        else:
+            guidance = None
+        ## Caching conditions
+        # clean the cache
+        try:
+            for name, attn_processor in self.transformer.attn_processors.items():
+                attn_processor.bank_kv.clear()
+        except:
+            pass
+        # cache with warmup latents
+        t = torch.tensor([timesteps[0]], device=device)
+        timestep = t.expand(cond_latents.shape[0]).to(latents.dtype)
+        warmup_image_ids = latent_image_ids[latents.shape[1]:, :]
+        _ = self.transformer(
+                    hidden_states=cond_latents,
+                    timestep=torch.ones_like(timestep) * 0,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=warmup_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+        del cond_latents, spatial_images, condition_image, condition_image_ls, img, _
+        torch.cuda.empty_cache()
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return FluxPipelineOutput(images=image)

test_imgs/00.png ADDED Viewed

test_imgs/01.png ADDED Viewed

test_imgs/02.png ADDED Viewed

test_imgs/03.png ADDED Viewed

test_imgs/04.png ADDED Viewed