File size: 1,300 Bytes
4f7b5ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
export WANDB_BASE_URL="https://api.wandb.ai"
#export WANDB_MODE=online
export WANDB_MODE=offline
export WANDB_API_KEY=xxx
export HF_ENDPOINT="https://hf-mirror.com"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True #
export TORCH_NCCL_TRACE_BUFFER_SIZE=1048576  # 1MB
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export CUBLAS_WORKSPACE_CONFIG=:4096:8

torchrun --nnodes 1 --nproc_per_node 8  --node_rank=0 --master_addr=127.0.0.1 --master_port=24431 \
    fastvideo/train.py \
    --seed 142 \
    --pretrained_model_name_or_path /DATA/bvac/personal/wan21/Wan2.1-I2V-14B-480P \
    --model_type "wan" \
    --data_json_path test_data/data.json \
    --gradient_checkpointing \
    --train_batch_size=1 \
    --num_latent_t 240 \
    --sp_size 8 \
    --train_sp_batch_size 1 \
    --dataloader_num_workers 4 \
    --gradient_accumulation_steps=1 \
    --max_train_steps=20000 \
    --learning_rate=1e-5 \
    --mixed_precision=bf16 \
    --checkpointing_steps=400 \
    --validation_steps 20000 \
    --validation_sampling_steps 64 \
    --checkpoints_total_limit 3 \
    --allow_tf32 \
    --ema_start_step 0 \
    --cfg 0.0 \
    --ema_decay 0.999 \
    --log_validation \
    --output_dir=outputs-sp8 \
    --tracker_project_name sp8 \
    --validation_guidance_scale "1.0" \
    --group_frame