Spaces:
Running
Running
File size: 5,907 Bytes
e0336bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import copy
import os
import torch
from easydict import EasyDict
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from .shared_config import wan_shared_cfg
from .wan_i2v_14B import i2v_14B
from .wan_t2v_1_3B import t2v_1_3B
from .wan_t2v_14B import t2v_14B
# Text-to-image 14B reuses the text-to-video 14B settings: take an independent
# deep copy (so later edits to one config do not affect the other) and give it
# its own display name.
t2i_14B = copy.deepcopy(t2v_14B)
t2i_14B.__name__ = "Config: Wan T2I 14B"
# ---------------------------------------------------------------------------
# Wan I2V 1.3B ("new") config
# ---------------------------------------------------------------------------
# Built in two steps: seed the config with the shared defaults, then apply all
# model-specific values in one update. Any key below that also exists in the
# shared config replaces the shared value.
# No DiT checkpoint path is stored here; per the original notes it is supplied
# by the user (--dit), and the VAE / T5 / CLIP entries are assumed defaults
# that can be overridden (--vae / --t5 / --clip).
i2v_1_3B_new = EasyDict(__name__="Config: Wan I2V 1.3B New")
i2v_1_3B_new.update(wan_shared_cfg)
i2v_1_3B_new.update(
    {
        # Core model parameters (original notes: copied from the model's config.json).
        "dim": 1536,
        "ffn_dim": 8960,
        "num_heads": 12,
        "num_layers": 30,
        "in_dim": 36,  # original note: latent + mask
        "out_dim": 16,
        "freq_dim": 256,
        "text_len": 512,
        "eps": 1e-06,
        # Task flags: an image-to-video model that is not a Fun-Control model.
        "i2v": True,
        "is_fun_control": False,
        # VAE (assumed to be the standard one).
        "vae_checkpoint": "Wan2.1_VAE.pth",
        "vae_stride": (4, 8, 8),
        # T5 text encoder (assumed to be the standard one).
        "t5_checkpoint": "models_t5_umt5-xxl-enc-bf16.pth",
        "t5_tokenizer": "google/umt5-xxl",
        "t5_dtype": torch.bfloat16,
        # CLIP image encoder; the original notes say I2V requires it.
        "clip_model": "clip_xlm_roberta_vit_h_14",
        "clip_dtype": torch.float16,
        "clip_checkpoint": "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
        "clip_tokenizer": "xlm-roberta-large",
        # Transformer structure (assumed standard values).
        "patch_size": (1, 2, 2),
        "window_size": (-1, -1),  # original note: global attention
        "qk_norm": True,
        "cross_attn_norm": True,
        # Sampling defaults.
        "sample_prompts": ["cinematic video of a sports car"],
        "sample_neg_prompt": "text, watermark, copyright, blurry, low quality, noisy",
        "num_train_timesteps": 1000,
    }
)
# Fun-Control ("FC") variants: each one is a deep copy of a base config with a
# few fields overridden.
#
# T2V 1.3B Fun-Control. The overrides mirror the T2V 14B Fun-Control config:
# the i2v flag is enabled (the 14B config notes that Fun-Control models have
# image cross-attention), in_dim is 48, and the config is explicitly marked as
# Fun-Control. Previously only in_dim was overridden, so this config kept the
# base model's flags.
t2v_1_3B_FC = copy.deepcopy(t2v_1_3B)
t2v_1_3B_FC.__name__ = "Config: Wan-Fun-Control T2V 1.3B"
t2v_1_3B_FC.i2v = True
t2v_1_3B_FC.in_dim = 48
t2v_1_3B_FC.is_fun_control = True
# Kept from the original for backward compatibility: the base I2V 14B config is
# explicitly flagged as not being a Fun-Control model.
i2v_14B.is_fun_control = False
# T2V 14B Fun-Control: deep copy of the T2V 14B config with three overrides.
t2v_14B_FC = copy.deepcopy(t2v_14B)
t2v_14B_FC.__name__ = "Config: Wan-Fun-Control T2V 14B"
# Counter-intuitive for a T2V config, but the Fun-Control model has image
# cross-attention, so the i2v flag must be set.
t2v_14B_FC.i2v = True
# Original note: same as i2v_14B, with zeros used for the image latents.
t2v_14B_FC.in_dim = 48
t2v_14B_FC.is_fun_control = True
# I2V 14B Fun-Control: deep copy of the I2V 14B config with in_dim and the
# Fun-Control flag overridden.
i2v_14B_FC = copy.deepcopy(i2v_14B)
i2v_14B_FC.__name__ = "Config: Wan-Fun-Control I2V 14B"
i2v_14B_FC.in_dim = 48
i2v_14B_FC.is_fun_control = True
# I2V 14B Fun-Control v1.1: identical to i2v_14B_FC except for the display
# name; every other value (in_dim=48, is_fun_control=True, ...) is inherited
# through the deep copy.
i2v_14B_FC_1_1 = copy.deepcopy(i2v_14B_FC)
i2v_14B_FC_1_1.__name__ = "Config: Wan-Fun-Control I2V 14B v1.1"
# NOTE(review): per the original author's notes, v1.1 checkpoints are expected
# to contain a 'ref_conv.weight' tensor and the loading code derives the extra
# layer from the weights, so no flag (e.g. add_ref_conv) is stored in this
# config. That loading logic is not visible in this file -- confirm there.
# Registry of every available model config, keyed by task name.
WAN_CONFIGS = {
    # Base models
    "t2v-14B": t2v_14B,
    "t2v-1.3B": t2v_1_3B,
    "i2v-14B": i2v_14B,
    "t2i-14B": t2i_14B,
    "i2v-1.3B-new": i2v_1_3B_new,
    # Fun-Control models
    "t2v-1.3B-FC": t2v_1_3B_FC,
    "t2v-14B-FC": t2v_14B_FC,
    "i2v-14B-FC": i2v_14B_FC,
    "i2v-14B-FC-1.1": i2v_14B_FC_1_1,
}
# Resolution tables.
#
# SIZE_CONFIGS is the single source of truth for the known resolutions: each
# "A*B" key maps to the integer pair (A, B). To add a resolution, add it here
# and list its key under the tasks that support it in SUPPORTED_SIZES;
# MAX_AREA_CONFIGS follows automatically.
SIZE_CONFIGS = {
    "720*1280": (720, 1280),
    "1280*720": (1280, 720),
    "480*832": (480, 832),
    "832*480": (832, 480),
    "1024*1024": (1024, 1024),
    "512*512": (512, 512),
    "672*352": (672, 352),
    "352*672": (352, 672),
}

# Pixel count for every resolution key. Derived from SIZE_CONFIGS (same keys,
# same order) instead of being maintained by hand, so the two tables cannot
# drift apart.
MAX_AREA_CONFIGS = {
    size_key: first * second for size_key, (first, second) in SIZE_CONFIGS.items()
}

# Resolution keys accepted by each task; the task names are the WAN_CONFIGS keys.
SUPPORTED_SIZES = {
    "t2v-14B": ("720*1280", "1280*720", "480*832", "832*480"),
    "t2v-1.3B": ("480*832", "832*480"),
    "i2v-14B": ("720*1280", "1280*720", "480*832", "832*480"),
    # t2i accepts every resolution known to SIZE_CONFIGS.
    "t2i-14B": tuple(SIZE_CONFIGS),
    # Fun-Control models
    "t2v-1.3B-FC": ("480*832", "832*480"),
    "t2v-14B-FC": ("720*1280", "1280*720", "480*832", "832*480"),
    "i2v-14B-FC": ("720*1280", "1280*720", "480*832", "832*480"),
    "i2v-14B-FC-1.1": ("720*1280", "1280*720", "480*832", "832*480"),
    # New I2V 1.3B model
    "i2v-1.3B-new": ("480*832", "832*480", "512*512", "672*352", "352*672"),
}
|