kaupane committed on
Commit c1b39e1 · verified · 1 Parent(s): 1af442b

Upload 5 files


Initial commit.

Files changed (5)
  1. app.py +245 -0
  2. ckpts/DiT_S_final.pth +3 -0
  3. mapping.py +235 -0
  4. models/DiT.py +184 -0
  5. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,245 @@
+ import torch
+ import gradio as gr
+ import numpy as np
+ import os
+ import random
+ from mapping import reduced_genre_mapping, reduced_style_mapping, reverse_reduced_genre_mapping, reverse_reduced_style_mapping
+ from diffusers import AutoencoderKL
+ from models.DiT import DiT
+
+ # Global settings
+ num_timesteps = 1000
+ beta_start = 1e-4
+ beta_end = 0.02
+ latent_scale_factor = 0.18215  # Same as in DiTTrainer
+
+ # For tracking progress in UI
+ global_progress = 0
+
+ def load_dit_model(dit_size):
+     """Load DiT model of specified size"""
+     ckpt_path = f"./ckpts/DiT_{dit_size}_final.pth"
+     if not os.path.exists(ckpt_path):
+         raise FileNotFoundError(f"Checkpoint not found at {ckpt_path}")
+
+     # Configure model based on size
+     if dit_size == "S":
+         model = DiT(num_blocks=8, hidden_size=384, num_heads=6)
+     elif dit_size == "B":
+         model = DiT(num_blocks=12, hidden_size=640, num_heads=10)
+     elif dit_size == "L":
+         model = DiT(num_blocks=16, hidden_size=896, num_heads=14)
+     else:
+         raise ValueError(f"Invalid DiT size: {dit_size}")
+
+     # Load checkpoint
+     checkpoint = torch.load(ckpt_path, map_location="cpu")
+     model.load_state_dict(checkpoint["model_state_dict"])
+
+     return model
+
+ class DiffusionSampler:
+     def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
+         self.device = device
+         self.vae = None
+
+         # Pre-compute diffusion parameters (linear beta schedule)
+         self.betas = torch.linspace(beta_start, beta_end, num_timesteps)
+         self.alphas = 1.0 - self.betas
+         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+         self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+         self.sqrt_one_minus_alpha_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
+         self.sqrt_recip_alphas = torch.sqrt(1.0 / self.alphas)
+         self.alphas_cumprod_prev = torch.cat([torch.tensor([1.0]), self.alphas_cumprod[:-1]])
+         self.posterior_variance = self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+
+         # Move to device
+         self.sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(self.device)
+         self.sqrt_one_minus_alpha_cumprod = self.sqrt_one_minus_alpha_cumprod.to(self.device)
+         self.sqrt_recip_alphas = self.sqrt_recip_alphas.to(self.device)
+         self.betas = self.betas.to(self.device)
+         self.posterior_variance = self.posterior_variance.to(self.device)
+
+     def load_vae(self):
+         """Load VAE model (done lazily to save memory until needed)"""
+         if self.vae is None:
+             self.vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema").to(self.device)
+             self.vae.eval()
+
+     def generate_images(self, model, num_samples, genre, style, seed, progress=gr.Progress()):
+         """Generate images with the DiT model"""
+         global global_progress
+         global_progress = 0
+
+         # Set random seed for reproducibility
+         if seed is not None:
+             torch.manual_seed(seed)
+             np.random.seed(seed)
+             random.seed(seed)
+             # Also set CUDA seed if using GPU
+             if torch.cuda.is_available():
+                 torch.cuda.manual_seed(seed)
+                 torch.cuda.manual_seed_all(seed)
+
+         model.to(self.device)
+         model.eval()
+
+         # Convert genre and style to tensors
+         g_cond = torch.tensor([genre] * num_samples, device=self.device, dtype=torch.long)
+         s_cond = torch.tensor([style] * num_samples, device=self.device, dtype=torch.long)
+         g_null = torch.tensor([model.num_genres] * num_samples, device=self.device, dtype=torch.long)
+         s_null = torch.tensor([model.num_styles] * num_samples, device=self.device, dtype=torch.long)
+
+         # Start with random latents
+         latents = torch.randn((num_samples, 4, 32, 32), device=self.device)
+
+         # Use classifier-free guidance for better quality
+         cfg_scale = 2.5
+
+         # Go through the reverse diffusion process
+         timesteps = torch.arange(num_timesteps - 1, -1, -1, device=self.device)
+         total_steps = len(timesteps)
+
+         with torch.no_grad():
+             for i, t_val in enumerate(timesteps):
+                 # Update progress
+                 global_progress = int(100 * i / total_steps)
+                 progress(global_progress / 100, desc="Generating images...")
+
+                 t = torch.full((num_samples,), t_val, device=self.device, dtype=torch.long)
+
+                 sqrt_recip_alphas_t = self.sqrt_recip_alphas[t].view(-1, 1, 1, 1)
+                 sqrt_one_minus_alphas_cumprod_t = self.sqrt_one_minus_alpha_cumprod[t].view(-1, 1, 1, 1)
+                 beta_t = self.betas[t].view(-1, 1, 1, 1)
+                 posterior_variance_t = self.posterior_variance[t].view(-1, 1, 1, 1)
+
+                 # Get noise prediction with classifier-free guidance
+                 eps_theta_cond = model(latents, t, g_cond, s_cond)
+                 eps_theta_uncond = model(latents, t, g_null, s_null)
+                 eps_theta = eps_theta_uncond + cfg_scale * (eps_theta_cond - eps_theta_uncond)
+
+                 # Update latents (posterior mean plus noise, except at the final step)
+                 mean = sqrt_recip_alphas_t * (latents - (beta_t / sqrt_one_minus_alphas_cumprod_t) * eps_theta)
+                 noise = torch.randn_like(latents)
+                 if t_val == 0:
+                     latents = mean
+                 else:
+                     latents = mean + torch.sqrt(posterior_variance_t) * noise
+
+         # Decode latents to images
+         self.load_vae()
+         latents = latents / self.vae.config.scaling_factor
+         latents = latents.to(self.device)
+
+         progress(0.95, desc="Decoding images...")
+         with torch.no_grad():
+             images = self.vae.decode(latents).sample
+             images = (images / 2 + 0.5).clamp(0, 1)
+             images = images.permute(0, 2, 3, 1).cpu().numpy()
+
+         progress(1.0, desc="Done!")
+         global_progress = 100
+
+         # Create image gallery with labels
+         gallery_images = []
+         for i in range(num_samples):
+             # Convert numpy array to a uint8 image for the gallery
+             img = (images[i] * 255).astype(np.uint8)
+             caption = f"Genre: {reverse_reduced_genre_mapping[genre]}, Style: {reverse_reduced_style_mapping[style]}"
+             if seed is not None:
+                 caption += f" (Seed: {seed})"
+             gallery_images.append((img, caption))
+
+         return gallery_images
+
+ # Initialize sampler globally
+ sampler = DiffusionSampler()
+
+ def generate_random_seed():
+     """Generate a random seed between 0 and 2^32 - 1"""
+     return random.randint(0, 2**32 - 1)
+
+ def generate_samples(num_samples, dit_size, genre_name, style_name, seed, progress=gr.Progress()):
+     """Main function for Gradio interface"""
+     if num_samples < 1 or num_samples > 16:
+         return None, gr.update(value="Number of samples must be between 1 and 16", visible=True)
+
+     # Get genre and style IDs from mappings
+     genre_id = reduced_genre_mapping.get(genre_name)
+     style_id = reduced_style_mapping.get(style_name)
+
+     if genre_id is None:
+         return None, gr.update(value=f"Unknown genre: {genre_name}", visible=True)
+     if style_id is None:
+         return None, gr.update(value=f"Unknown style: {style_name}", visible=True)
+
+     try:
+         # Load model
+         progress(0.05, desc="Loading DiT model...")
+         model = load_dit_model(dit_size)
+
+         # Generate images
+         gallery_images = sampler.generate_images(model, num_samples, genre_id, style_id, seed, progress)
+
+         return gallery_images, gr.update(value="", visible=False)
+     except Exception as e:
+         error_msg = f"Error: {str(e)}"
+         return None, gr.update(value=error_msg, visible=True)
+
+ def clear_gallery():
+     """Clear the gallery display"""
+     return None, gr.update(value="", visible=False)
+
+ # Create the Gradio interface
+ with gr.Blocks(title="DiT Diffusion Model Generator", theme=gr.themes.Soft()) as app:
+     gr.Markdown("# DiT Diffusion Model Generator")
+     gr.Markdown("Generate art images using a Diffusion Transformer (DiT) model")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             num_samples = gr.Slider(minimum=1, maximum=16, value=4, step=1, label="Number of Samples", info="How many images to generate (1-16)")
+             dit_size = gr.Radio(choices=["S", "B", "L"], value="S", label="DiT Model Size", info="Larger models produce better quality but take longer")
+
+             genre_names = list(reduced_genre_mapping.keys())
+             style_names = list(reduced_style_mapping.keys())
+
+             # Sort dropdown choices alphabetically
+             genre_names.sort()
+             style_names.sort()
+
+             genre = gr.Dropdown(choices=genre_names, value="landscape", label="Art Genre")
+             style = gr.Dropdown(choices=style_names, value="impressionism", label="Art Style")
+
+             with gr.Row():
+                 seed = gr.Number(label="Seed", value=generate_random_seed(), precision=0, info="Set for reproducible results")
+                 reset_seed_btn = gr.Button("🎲 New Seed")
+
+             with gr.Row():
+                 generate_btn = gr.Button("Generate Images", variant="primary")
+                 clear_btn = gr.Button("🗑️ Clear Gallery")
+
+             progress_bar = gr.Progress(track_tqdm=True)
+
+         with gr.Column(scale=2):
+             output_gallery = gr.Gallery(label="Generated Images", columns=4, rows=4, object_fit="contain", height=600)
+             error_message = gr.Textbox(label="Error", visible=False, max_lines=3, container=True, elem_id="error_box")
+
+     # Seed reset button functionality
+     reset_seed_btn.click(generate_random_seed, inputs=[], outputs=[seed])
+
+     # Clear gallery button functionality
+     clear_btn.click(clear_gallery, inputs=[], outputs=[output_gallery, error_message])
+
+     # Connect the generate button
+     generate_btn.click(
+         fn=generate_samples,
+         inputs=[num_samples, dit_size, genre, style, seed],
+         outputs=[output_gallery, error_message],
+     )
+
+
+ if __name__ == "__main__":
+     app.launch()
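
For reference, the denoising loop in DiffusionSampler is a standard DDPM ancestral-sampling update with classifier-free guidance; in the notation of the code above (a transcription of the loop, not an additional feature):

\[
\tilde{\epsilon} = \epsilon_\theta(x_t, t, \varnothing) + w\,\big(\epsilon_\theta(x_t, t, g, s) - \epsilon_\theta(x_t, t, \varnothing)\big), \qquad w = 2.5
\]
\[
x_{t-1} = \frac{1}{\sqrt{\alpha_t}}\Big(x_t - \frac{\beta_t}{\sqrt{1-\bar{\alpha}_t}}\,\tilde{\epsilon}\Big) + \sqrt{\tilde{\beta}_t}\, z, \qquad z \sim \mathcal{N}(0, I), \quad \tilde{\beta}_t = \frac{\beta_t\,(1-\bar{\alpha}_{t-1})}{1-\bar{\alpha}_t},
\]

with the noise term dropped at the final step (t = 0). The resulting latents are rescaled by the VAE scaling factor (0.18215) and decoded with the sd-vae-ft-ema VAE.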
ckpts/DiT_S_final.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f20bce4f40e4112f73fcd89dd2d5d9b7e4a5560f265ca0741af134d1a7ab355
+ size 264237994
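
This file is a Git LFS pointer rather than the weights themselves; the actual DiT-S checkpoint (~264 MB) is fetched by Git LFS. A minimal sketch (assuming the weights have already been pulled locally) for checking a copy against the pointer above:

import hashlib
import os

path = "ckpts/DiT_S_final.pth"
print(os.path.getsize(path))  # expected: 264237994 bytes, per the pointer

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)
print(sha256.hexdigest())  # expected to match the oid above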
mapping.py ADDED
@@ -0,0 +1,235 @@
+
+ genre_mapping = {
+     'None': 0,
+     'abstract': 1,
+     'advertisement': 2,
+     'allegorical painting': 3,
+     'animal painting': 4,
+     'battle painting': 5,
+     'bijinga': 6,
+     'bird-and-flower painting': 7,
+     'calligraphy': 8,
+     'capriccio': 9,
+     'caricature': 10,
+     'cityscape': 11,
+     'cloudscape': 12,
+     'design': 13,
+     'figurative': 14,
+     'flower painting': 15,
+     'genre painting': 16,
+     'history painting': 17,
+     'illustration': 18,
+     'interior': 19,
+     'landscape': 20,
+     'literary painting': 21,
+     'marina': 22,
+     'miniature': 23,
+     'mythological painting': 24,
+     'nude painting (nu)': 25,
+     'panorama': 26,
+     'pastorale': 27,
+     'portrait': 28,
+     'poster': 29,
+     'quadratura': 30,
+     'religious painting': 31,
+     'self-portrait': 32,
+     'shan shui': 33,
+     'sketch and study': 34,
+     'still life': 35,
+     'symbolic painting': 36,
+     'tessellation': 37,
+     'urushi-e': 38,
+     'vanitas': 39,
+     'veduta': 40,
+     'wildlife painting': 41,
+     'yakusha-e': 42
+ }
+
+ style_mapping = {
+     'Abstract Art': 0,
+     'Abstract Expressionism': 1,
+     'Academicism': 2,
+     'Action painting': 3,
+     'American Realism': 4,
+     'Analytical Cubism': 5,
+     'Analytical\xa0Realism': 6,
+     'Art Brut': 7,
+     'Art Deco': 8,
+     'Art Informel': 9,
+     'Art Nouveau (Modern)': 10,
+     'Automatic Painting': 11,
+     'Baroque': 12,
+     'Biedermeier': 13,
+     'Byzantine': 14,
+     'Cartographic Art': 15,
+     'Classicism': 16,
+     'Cloisonism': 17,
+     'Color Field Painting': 18,
+     'Conceptual Art': 19,
+     'Concretism': 20,
+     'Constructivism': 21,
+     'Contemporary Realism': 22,
+     'Costumbrismo': 23,
+     'Cubism': 24,
+     'Cubo-Expressionism': 25,
+     'Cubo-Futurism': 26,
+     'Dada': 27,
+     'Divisionism': 28,
+     'Early Renaissance': 29,
+     'Environmental (Land) Art': 30,
+     'Existential Art': 31,
+     'Expressionism': 32,
+     'Fantastic Realism': 33,
+     'Fauvism': 34,
+     'Feminist Art': 35,
+     'Figurative Expressionism': 36,
+     'Futurism': 37,
+     'Gongbi': 38,
+     'Gothic': 39,
+     'Hard Edge Painting': 40,
+     'High Renaissance': 41,
+     'Hyper-Realism': 42,
+     'Ilkhanid': 43,
+     'Impressionism': 44,
+     'Indian Space painting': 45,
+     'Ink and wash painting': 46,
+     'International Gothic': 47,
+     'Intimism': 48,
+     'Japonism': 49,
+     'Joseon Dynasty': 50,
+     'Kinetic Art': 51,
+     'Kitsch': 52,
+     'Lettrism': 53,
+     'Light and Space': 54,
+     'Luminism': 55,
+     'Lyrical Abstraction': 56,
+     'Magic Realism': 57,
+     'Mail Art': 58,
+     'Mannerism (Late Renaissance)': 59,
+     'Mechanistic Cubism': 60,
+     'Metaphysical art': 61,
+     'Minimalism': 62,
+     'Miserabilism': 63,
+     'Modernismo': 64,
+     'Mosan art': 65,
+     'Muralism': 66,
+     'Nanga (Bunjinga)': 67,
+     'Nats-Taliq': 68,
+     'Native Art': 69,
+     'Naturalism': 70,
+     'Naïve Art (Primitivism)': 71,
+     'Neo-Byzantine': 72,
+     'Neo-Concretism': 73,
+     'Neo-Dada': 74,
+     'Neo-Expressionism': 75,
+     'Neo-Figurative Art': 76,
+     'Neo-Rococo': 77,
+     'Neo-Romanticism': 78,
+     'Neo-baroque': 79,
+     'Neoclassicism': 80,
+     'Neoplasticism': 81,
+     'New Casualism': 82,
+     'New European Painting': 83,
+     'New Realism': 84,
+     'Nihonga': 85,
+     'None': 86,
+     'Northern Renaissance': 87,
+     'Nouveau Réalisme': 88,
+     'Op Art': 89,
+     'Orientalism': 90,
+     'Orphism': 91,
+     'Ottoman Period': 92,
+     'Outsider art': 93,
+     'Perceptism': 94,
+     'Photorealism': 95,
+     'Pointillism': 96,
+     'Pop Art': 97,
+     'Post-Impressionism': 98,
+     'Post-Minimalism': 99,
+     'Post-Painterly Abstraction': 100,
+     'Poster Art Realism': 101,
+     'Precisionism': 102,
+     'Primitivism': 103,
+     'Proto Renaissance': 104,
+     'Purism': 105,
+     'Rayonism': 106,
+     'Realism': 107,
+     'Regionalism': 108,
+     'Renaissance': 109,
+     'Rococo': 110,
+     'Romanesque': 111,
+     'Romanticism': 112,
+     'Safavid Period': 113,
+     'Shin-hanga': 114,
+     'Social Realism': 115,
+     'Socialist Realism': 116,
+     'Spatialism': 117,
+     'Spectralism': 118,
+     'Street art': 119,
+     'Suprematism': 120,
+     'Surrealism': 121,
+     'Symbolism': 122,
+     'Synchromism': 123,
+     'Synthetic Cubism': 124,
+     'Synthetism': 125,
+     'Sōsaku hanga': 126,
+     'Tachisme': 127,
+     'Tenebrism': 128,
+     'Timurid Period': 129,
+     'Tonalism': 130,
+     'Transautomatism': 131,
+     'Tubism': 132,
+     'Ukiyo-e': 133,
+     'Verism': 134,
+     'Yamato-e': 135,
+     'Zen': 136
+ }
+
+ reverse_genre_mapping = {v: k for k, v in genre_mapping.items()}
+ reverse_style_mapping = {v: k for k, v in style_mapping.items()}
+
+ reduced_genre_mapping = {
+     'abstract': 1,
+     'capriccio': 9,
+     'cityscape': 11,
+     'cloudscape': 12,
+     'flower painting': 15,
+     'genre painting': 16,
+     'interior': 19,
+     'landscape': 20,
+     'marina': 22,
+     'panorama': 26,
+     'pastorale': 27,
+     'quadratura': 30,
+     'shan shui': 33,
+     'sketch and study': 34,
+     'still life': 35,
+     'symbolic painting': 36,
+     'tessellation': 37,
+     'veduta': 40
+ }
+
+ reduced_style_mapping = {
+     'abstract expressionism': 1,
+     'art deco': 8,
+     'art nouveau': 10,
+     'baroque': 12,
+     'conceptual art': 19,
+     'cubism': 24,
+     'expressionism': 32,
+     'gothic': 39,
+     'impressionism': 44,
+     'minimalism': 62,
+     'modernism': 64,
+     'neoclassicism': 80,
+     'pop-art': 97,
+     'post-impressionism': 98,
+     'renaissance': 109,
+     'rococo': 110,
+     'romanticism': 112,
+     'surrealism': 121
+ }
+
+ reverse_reduced_genre_mapping = {v: k for k, v in reduced_genre_mapping.items()}
+ reverse_reduced_style_mapping = {v: k for k, v in reduced_style_mapping.items()}
+
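
For orientation, the reduced_* dictionaries are what the Gradio dropdowns expose (a curated subset of the full label set), while the reverse_* dictionaries turn class IDs back into names for the gallery captions. A minimal illustrative sketch of how app.py uses them (not part of the commit):

from mapping import reduced_genre_mapping, reverse_reduced_genre_mapping

genre_id = reduced_genre_mapping["landscape"]     # 20 -- the class ID passed to the DiT
print(reverse_reduced_genre_mapping[genre_id])    # "landscape" -- used in gallery captions
# IDs 42 (genres) and 137 (styles) are reserved as the null condition for classifier-free guidance.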
models/DiT.py ADDED
@@ -0,0 +1,184 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+
+ class TimestepEmbedder(nn.Module):
+     """Module to create the timestep embedding (sinusoidal features followed by an MLP)."""
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(frequency_embedding_size, hidden_size),
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size)
+         )
+         self.frequency_embedding_size = frequency_embedding_size
+
+     def forward(self, t):
+         half = self.frequency_embedding_size // 2
+         freqs = torch.exp(
+             -math.log(10000) * torch.arange(start=0, end=half) / half
+         ).to(device=t.device)
+         # Cast t to float so the outer product with the float frequencies is well defined
+         args = torch.einsum('i,j->ij', t.float(), freqs)
+         freqs = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         return self.mlp(freqs)
+
+ class ViTAttn(nn.Module):
28
+ def __init__(self,hidden_size,num_heads):
29
+ super().__init__()
30
+ self.attn = nn.MultiheadAttention(hidden_size,num_heads,bias=True,add_bias_kv=True,batch_first=True)
31
+
32
+ def forward(self,x):
33
+ attn, _ = self.attn(x,x,x)
34
+ return attn
35
+
36
+ class DiTBlock(nn.Module):
37
+ """
38
+ DiT Block with adaptive layer norm zero (adaLN-Zero) conditioning.
39
+ Using post-norm
40
+ """
41
+ def __init__(self,hidden_size,num_heads):
42
+ super().__init__()
43
+ self.norm1 = nn.LayerNorm(hidden_size,elementwise_affine=False,eps=1e-6)
44
+ self.attn = ViTAttn(hidden_size,num_heads)
45
+ self.norm2 = nn.LayerNorm(hidden_size,elementwise_affine=False,eps=1e-6)
46
+ self.mlp = nn.Sequential(
47
+ nn.Linear(hidden_size,4*hidden_size),
48
+ nn.GELU(approximate="tanh"),
49
+ nn.Linear(4*hidden_size,hidden_size)
50
+ )
51
+ self.adaLN = nn.Sequential(
52
+ nn.SiLU(),
53
+ nn.Linear(hidden_size,6*hidden_size)
54
+ )
55
+
56
+ def forward(self,x,c):
57
+ gamma_1,beta_1,alpha_1,gamma_2,beta_2,alpha_2 = self.adaLN(c).chunk(6,dim=1)
58
+ x = self.norm1(x + alpha_1.unsqueeze(1) * self.attn(x))
59
+ x = x * (1+gamma_1.unsqueeze(1)) + beta_1.unsqueeze(1)
60
+ x = self.norm2(x + alpha_2.unsqueeze(1) * self.mlp(x))
61
+ x = x * (1+gamma_2.unsqueeze(1)) + beta_2.unsqueeze(1)
62
+ return x
63
+
64
+ class DiT(nn.Module):
65
+ def __init__(self,
66
+ num_blocks=10,
67
+ hidden_size=640,
68
+ num_heads=10,
69
+ patch_size=2,
70
+ num_channels=4,
71
+ img_size=32,
72
+ num_genres=42,
73
+ num_styles=137):
74
+ super().__init__()
75
+ self.hidden_size = hidden_size
76
+ self.patch_size = patch_size
77
+ self.num_channels = num_channels
78
+ self.seq_len = (img_size // patch_size)**2
79
+ self.img_size = img_size
80
+ self.blocks = nn.ModuleList(
81
+ DiTBlock(hidden_size,num_heads) for _ in range(num_blocks)
82
+ )
83
+ self.timestep_embed = TimestepEmbedder(hidden_size)
84
+
85
+ self.num_genres = num_genres
86
+ self.num_styles = num_styles
87
+ self.genre_condition = nn.Embedding(num_genres+1,hidden_size) # +1 for null condition
88
+ self.style_condition = nn.Embedding(num_styles+1,hidden_size)
89
+
90
+ self.pos_embed = nn.Parameter(torch.zeros(1, self.seq_len, hidden_size))
91
+
92
+ patch_dim = num_channels * patch_size * patch_size
93
+ self.proj_in = nn.Linear(patch_dim,hidden_size)
94
+ self.proj_out = nn.Linear(hidden_size,patch_dim)
95
+
96
+ self.norm_out = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
97
+ self.adaLN_final = nn.Sequential(
98
+ nn.SiLU(),
99
+ nn.Linear(hidden_size, 2*hidden_size)
100
+ )
101
+
102
+ self.initialize_weights()
103
+
104
+ def initialize_weights(self):
105
+ nn.init.normal_(self.pos_embed, std=0.02)
106
+ nn.init.normal_(self.proj_out.weight, std=0.02)
107
+ nn.init.zeros_(self.proj_out.bias)
108
+ nn.init.normal_(self.proj_in.weight, std=0.02)
109
+ nn.init.zeros_(self.proj_in.bias)
110
+
111
+ nn.init.normal_(self.timestep_embed.mlp[0].weight, std=0.02)
112
+ nn.init.zeros_(self.timestep_embed.mlp[0].bias)
113
+ nn.init.normal_(self.timestep_embed.mlp[2].weight, std=0.02)
114
+ nn.init.zeros_(self.timestep_embed.mlp[2].bias)
115
+
116
+ for block in self.blocks:
117
+ nn.init.zeros_(block.adaLN[-1].weight)
118
+ nn.init.zeros_(block.adaLN[-1].bias)
119
+
120
+ nn.init.zeros_(self.adaLN_final[-1].weight)
121
+ nn.init.zeros_(self.adaLN_final[-1].bias)
122
+
123
+ nn.init.normal_(self.genre_condition.weight, std=0.02)
124
+ nn.init.normal_(self.style_condition.weight, std=0.02)
125
+
126
+ def patchify(self,z):
127
+ """
128
+ from (batch_size,6,32,32) -> (batch_size,256,24) -> (batch_size,256,hidden_size)
129
+ """
130
+ b,_,_,_ = z.shape
131
+ c = self.num_channels
132
+ p = self.patch_size
133
+ z = z.unfold(2,p,p).unfold(3,p,p) # (b,c,h//p,p,w//p,p)
134
+ z = z.contiguous().view(b,c,-1,p,p) # (b,c,hw//p**2,p,p)
135
+ z = torch.einsum('bcapq->bacpq',z).contiguous().view(b,-1,c*p**2) # (b,hw//p**2,c*p**2)
136
+ return self.proj_in(z) # (b,hw//p**2,hidden_size)
137
+
138
+ def unpatchify(self,z):
139
+ """
140
+ from (batch_size,256,hidden_size) -> (batch_size,256,24) -> (batch_size,6,32,32)
141
+ """
142
+ b,_,_ = z.shape
143
+ c = self.num_channels
144
+ p = self.patch_size
145
+ s = int(self.seq_len ** 0.5)
146
+ i = self.img_size
147
+ z = self.proj_out(z) # (b,hw//p**2,c*p**2)
148
+ z = z.view(b,s,s,c,p,p) # (b,h/p,w/p,c,p,p)
149
+ z = torch.einsum('befcpq->bcepfq',z) # (b,c,h/p,p,w/p,p)
150
+ z = z.contiguous().view(b,c,i,i)
151
+ return z
152
+
153
+
154
+ def forward(self,z,t,g,s):
155
+ t_embed = self.timestep_embed(t) # t_embed: (batch_size, hidden_size)
156
+ g_embed = self.genre_condition(g)
157
+ s_embed = self.style_condition(s)
158
+
159
+ c = t_embed + g_embed + s_embed
160
+
161
+ z = self.patchify(z)
162
+ z = z + self.pos_embed
163
+
164
+ for block in self.blocks:
165
+ z = block(z,c)
166
+
167
+ gamma, beta = self.adaLN_final(c).chunk(2,dim=-1)
168
+ z = self.norm_out(z)
169
+ z = z * (1+gamma.unsqueeze(1)) + beta.unsqueeze(1)
170
+
171
+ return self.unpatchify(z)
172
+
173
+
174
+
175
+
176
+ if __name__ == "__main__":
177
+ model = DiT(1,768,12,2,6,32)
178
+ z = torch.randn(2,6,32,32)
179
+ c = torch.randn(2,768)
180
+ t = torch.randint(0,1000,(2,))
181
+ output = model(z,c,t)
182
+ print(z.shape,c.shape,t.shape,output.shape)
183
+ output_cfg = model.forward_cfg(z,t)
184
+ print(output_cfg.shape)
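
For reference, each DiTBlock applies the conditioning vector c = t_embed + g_embed + s_embed through adaLN-Zero modulation in a post-norm arrangement; written out, its forward pass is

\[
x \leftarrow (1+\gamma_1)\odot \mathrm{LN}\big(x + \alpha_1 \odot \mathrm{Attn}(x)\big) + \beta_1,
\qquad
x \leftarrow (1+\gamma_2)\odot \mathrm{LN}\big(x + \alpha_2 \odot \mathrm{MLP}(x)\big) + \beta_2,
\]

where \((\gamma_1,\beta_1,\alpha_1,\gamma_2,\beta_2,\alpha_2) = \mathrm{Linear}(\mathrm{SiLU}(c))\). Because these modulation layers are zero-initialized, each block initially reduces to a plain LayerNorm of its input and the conditioning pathway is learned from zero.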
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ diffusers
+ gradio
+ numpy
+ tqdm
+ matplotlib