File size: 5,907 Bytes
e0336bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import copy
import os
import torch
from easydict import EasyDict

os.environ["TOKENIZERS_PARALLELISM"] = "false"
from .shared_config import wan_shared_cfg
from .wan_i2v_14B import i2v_14B
from .wan_t2v_1_3B import t2v_1_3B
from .wan_t2v_14B import t2v_14B

# the config of t2i_14B is the same as t2v_14B
t2i_14B = copy.deepcopy(t2v_14B)
t2i_14B.__name__ = "Config: Wan T2I 14B"

# ================== START: Add New 1.3B I2V Model Config ==================
i2v_1_3B_new = EasyDict(__name__="Config: Wan I2V 1.3B New")
i2v_1_3B_new.update(wan_shared_cfg) # Start with shared defaults

# --- Core Model Parameters from your config.json ---
i2v_1_3B_new.dim = 1536
i2v_1_3B_new.ffn_dim = 8960
i2v_1_3B_new.num_heads = 12
i2v_1_3B_new.num_layers = 30
i2v_1_3B_new.in_dim = 36  # From config.json (latent + mask)
i2v_1_3B_new.out_dim = 16 # From config.json
i2v_1_3B_new.freq_dim = 256 # From config.json
i2v_1_3B_new.text_len = 512 # From config.json
i2v_1_3B_new.eps = 1e-06 # From config.json

# --- I2V Specific Settings ---
i2v_1_3B_new.i2v = True # Mark as I2V
i2v_1_3B_new.is_fun_control = False # This is NOT a FunControl model

# --- Assumed Component Checkpoints & Settings (ADJUST IF NEEDED) ---
# Assume it uses the same components as other models unless specified
# DiT: User MUST provide this path via --dit
# VAE: Assume standard VAE, user can override with --vae
i2v_1_3B_new.vae_checkpoint = "Wan2.1_VAE.pth" # Or specific VAE if different
i2v_1_3B_new.vae_stride = (4, 8, 8) # Standard stride

# T5: Assume standard T5, user can override with --t5
i2v_1_3B_new.t5_checkpoint = "models_t5_umt5-xxl-enc-bf16.pth" # Or smaller T5 if available
i2v_1_3B_new.t5_tokenizer = "google/umt5-xxl"
i2v_1_3B_new.t5_dtype = torch.bfloat16 # Default T5 dtype

# CLIP: Needed for I2V, assume standard CLIP, user can override with --clip
i2v_1_3B_new.clip_model = "clip_xlm_roberta_vit_h_14"
i2v_1_3B_new.clip_dtype = torch.float16 # Default CLIP dtype
i2v_1_3B_new.clip_checkpoint = "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"
i2v_1_3B_new.clip_tokenizer = "xlm-roberta-large"

# Transformer structure (Assume standard based on WanModel)
i2v_1_3B_new.patch_size = (1, 2, 2) # Standard patch size
i2v_1_3B_new.window_size = (-1, -1) # Global attention
i2v_1_3B_new.qk_norm = True # Standard norm
i2v_1_3B_new.cross_attn_norm = True # Standard norm (often True for I2V)

# Default sample prompts (can be kept or adjusted)
i2v_1_3B_new.sample_prompts = ["cinematic video of a sports car"]
i2v_1_3B_new.sample_neg_prompt = "text, watermark, copyright, blurry, low quality, noisy"
i2v_1_3B_new.num_train_timesteps = 1000 # Standard diffusion timesteps

# ================== END: Add New 1.3B I2V Model Config ==================

# support Fun models: deepcopy and change some configs. FC denotes Fun Control
t2v_1_3B_FC = copy.deepcopy(t2v_1_3B)
t2v_1_3B_FC.__name__ = "Config: Wan-Fun-Control T2V 1.3B"
t2v_1_3B_FC.in_dim = 48
i2v_14B.is_fun_control = False
t2v_14B_FC = copy.deepcopy(t2v_14B)
t2v_14B_FC.__name__ = "Config: Wan-Fun-Control T2V 14B"
t2v_14B_FC.i2v = True  # this is strange, but Fun-Control model needs this because it has img cross-attention
t2v_14B_FC.in_dim = 48  # same as i2v_14B, use zeros for image latents
t2v_14B_FC.is_fun_control = True
i2v_14B_FC = copy.deepcopy(i2v_14B)
i2v_14B_FC.__name__ = "Config: Wan-Fun-Control I2V 14B"
i2v_14B_FC.in_dim = 48
i2v_14B_FC.is_fun_control = True

i2v_14B_FC_1_1 = copy.deepcopy(i2v_14B_FC) # Copy the existing FunControl I2V 14B config
i2v_14B_FC_1_1.__name__ = "Config: Wan-Fun-Control I2V 14B v1.1"
# Explicitly add the flag for clarity, though loading logic will derive it
# i2v_14B_FC_1_1.add_ref_conv = True # This flag isn't directly used by the Python config struct, but good for documentation
# The key is that the loaded weights for this model WILL contain 'ref_conv.weight'
# All other parameters are inherited from i2v_14B_FC (in_dim=48, is_fun_control=True, etc.)

WAN_CONFIGS = {
    "t2v-14B": t2v_14B,
    "t2v-1.3B": t2v_1_3B,
    "i2v-14B": i2v_14B,
    "t2i-14B": t2i_14B,
    "i2v-1.3B-new": i2v_1_3B_new,
    # Fun Control models
    "t2v-1.3B-FC": t2v_1_3B_FC,
    "t2v-14B-FC": t2v_14B_FC,
    "i2v-14B-FC": i2v_14B_FC,
    "i2v-14B-FC-1.1": i2v_14B_FC_1_1,
}

SIZE_CONFIGS = {
    "720*1280": (720, 1280),
    "1280*720": (1280, 720),
    "480*832": (480, 832),
    "832*480": (832, 480),
    "1024*1024": (1024, 1024),
    "512*512": (512, 512),     # <--- Example: Added 512x512 if used
    "672*352": (672, 352),     # <--- Added from your command line example
    "352*672": (352, 672),     # <--- Added from your command line example (vertical)
}
# --- ^^^ MODIFY THIS DICTIONARY ^^^ ---


# --- vvv MODIFY THIS DICTIONARY vvv ---
MAX_AREA_CONFIGS = {
    "720*1280": 720 * 1280,
    "1280*720": 1280 * 720,
    "480*832": 480 * 832,
    "832*480": 832 * 480,
    "1024*1024": 1024 * 1024,
    "512*512": 512 * 512,         # <--- Added 512x512 if used
    "672*352": 672 * 352,         # <--- Added from your command line example
    "352*672": 352 * 672,         # <--- Added from your command line example (vertical)
}
# --- ^^^ MODIFY THIS DICTIONARY ^^^ ---


# --- vvv MODIFY THIS DICTIONARY vvv ---
SUPPORTED_SIZES = {
    "t2v-14B": ("720*1280", "1280*720", "480*832", "832*480"),
    "t2v-1.3B": ("480*832", "832*480"),
    "i2v-14B": ("720*1280", "1280*720", "480*832", "832*480"),
    "t2i-14B": tuple(SIZE_CONFIGS.keys()),
    # Fun Control models
    "t2v-1.3B-FC": ("480*832", "832*480"),
    "t2v-14B-FC": ("720*1280", "1280*720", "480*832", "832*480"),
    "i2v-14B-FC": ("720*1280", "1280*720", "480*832", "832*480"),
    "i2v-14B-FC-1.1": ("720*1280", "1280*720", "480*832", "832*480"),
    # Add supported sizes for the new model
    "i2v-1.3B-new": ("480*832", "832*480", "512*512", "672*352", "352*672"), 
    
}