File size: 3,106 Bytes
e8bdafd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env bash
# Environment and path setup for the RealCam-I2V / CogVideoX1.5-5B ControlNetXs
# training launch. All paths are anchored at the directory containing this script.

# Exported so that child processes inherit the setting.
export TOKENIZERS_PARALLELISM=false

# Resolve the script directory to an ABSOLUTE path before changing into it.
# With a relative $0 (e.g. `bash scripts/train.sh`), a relative WORKSPACE would
# be re-interpreted against the new working directory after `cd`, so every
# derived path would point at a non-existent nested directory.
# CDPATH is cleared so `cd` cannot print or jump to an unexpected directory.
WORKSPACE=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) || exit 1
cd -- "${WORKSPACE}" || exit 1

ACCELERATE_CONFIG_FILE=${WORKSPACE}/accelerate_config.yaml

PRETRAINED_MODEL_DIR=${WORKSPACE}/pretrained

DATA_ROOT=${WORKSPACE}/data/RealCam-Vid

# Dataset split; interpolated into the metadata file name further down.
SPLIT=train


CHECKPOINT_DIR=${WORKSPACE}/checkpoints
EXPERIMENT_NAME=RealCam-I2V
SUB_EXPERIMENT_NAME=CogVideoX1.5-5B-ControlNetXs
# Per-run output directory: <checkpoints>/<experiment>/<sub-experiment>.
LOG_DIR=${CHECKPOINT_DIR}/${EXPERIMENT_NAME}/${SUB_EXPERIMENT_NAME}
mkdir -p -- "${LOG_DIR}"
# Exported so wandb run files land next to the run's other outputs.
export WANDB_DIR=${LOG_DIR}

# Model Configuration
# Flags selecting the base model, the training variant and the timestep
# sampling distribution. Each flag and each value is its own array element.
MODEL_ARGS=(
    # Quoted so a pretrained directory containing spaces stays one argument.
    --model_path "${PRETRAINED_MODEL_DIR}/CogVideoX1.5-5B-I2V"
    --model_name "cogvideox1.5-i2v"
    --model_type "i2v"
    --training_type "controlnetxs"
    --time_sampling_type "truncated_normal"
    --time_sampling_mean 0.8
    # This line used to repeat `--time_sampling_type` with the value 0.075,
    # which overrode "truncated_normal" above with a number.
    # NOTE(review): `--time_sampling_std` is inferred from the sibling
    # `--time_sampling_mean` flag; confirm the exact name in train.py's parser.
    --time_sampling_std 0.075
    --keep_aspect_ratio
)

# Output Configuration
# Where run outputs go and how the run is named in the tracker.
# Every expansion is quoted so that a value containing whitespace, or an empty
# value, still occupies exactly one argument slot after its flag.
OUTPUT_ARGS=(
    --output_dir "${LOG_DIR}"
    --report_to "wandb"
    --tracker_name "${EXPERIMENT_NAME}"
    --sub_tracker_name "${SUB_EXPERIMENT_NAME}"
)

# Training Configuration
# Optimisation hyper-parameters and memory-saving switches, appended group by
# group; the resulting argument order matches the flat-literal form.
TRAIN_ARGS=()
# Schedule and batch shape.
TRAIN_ARGS+=(--train_steps 50000 --batch_size 1 --gradient_accumulation_steps 1)
# Optimiser settings.
TRAIN_ARGS+=(--learning_rate 4e-5 --weight_decay 1e-4)
# Numeric precision; the other values noted for this flag are "no" and "fp16".
TRAIN_ARGS+=(--mixed_precision "bf16")
# Value-less switches.
TRAIN_ARGS+=(--gradient_checkpointing --enable_slicing --enable_tiling)
# Fixed seed.
TRAIN_ARGS+=(--seed 42)

# System Configuration
# Data-loader and distributed-backend settings, appended one option at a time.
SYSTEM_ARGS=()
SYSTEM_ARGS+=(--num_workers 4)
SYSTEM_ARGS+=(--pin_memory)
SYSTEM_ARGS+=(--nccl_timeout 1800)

# Checkpointing Configuration
# Checkpoint interval (in steps) and checkpoint limit, as flag/value pairs.
CHECKPOINT_ARGS=()
CHECKPOINT_ARGS+=(--checkpointing_steps 100)
CHECKPOINT_ARGS+=(--checkpointing_limit 100)

# Validation Configuration
# Enables validation and names the prompt and image list files to use.
VALIDATION_ARGS=(
    --do_validation
    # Quoted so a path with spaces (or an empty value) stays a single argument
    # instead of splitting or letting the next flag slide into its place.
    # NOTE(review): the validation directory is set to CHECKPOINT_DIR — confirm
    # that prompts.txt / images.txt are really expected there.
    --validation_dir "${CHECKPOINT_DIR}"
    --validation_steps 100
    --validation_prompts "prompts.txt"
    --validation_images "images.txt"
    --gen_fps 8
)

# Data Configuration
# Original note: extract video latents of 81x256x448 ("768//3 x 1360//3").
# Dataset root, cache location and the metadata file for the selected split.
DATA_ARGS=(
    --data_root "${DATA_ROOT}"
    # Built from WORKSPACE (set at the top of the script from the script's
    # directory) rather than calling `dirname "$0"` again: by this point the
    # script has already changed directory, so a fresh relative lookup of $0
    # could resolve somewhere else than the other paths do.
    --cache_root "${WORKSPACE}/data/cache"
    --metadata_path "RealCam-Vid_new_${SPLIT}.npz"
    --enable_align_factor
)

# Distribution arguments for multi-node launches.
# HOST_NUM, NODE_NUM, INDEX and CHIEF_IP are not assigned anywhere in this
# script; they are expected to be supplied by the environment.
# Each value is quoted so that an unset or empty variable still fills its own
# argument slot. Unquoted, the value would vanish and the following flag would
# be consumed as the value of the previous one.
DIST_ARGS=(
    --config_file "${ACCELERATE_CONFIG_FILE}"
    --num_machines "${HOST_NUM}"
    --num_processes "${NODE_NUM}"
    --machine_rank "${INDEX}"
    --main_process_ip "${CHIEF_IP}"
    --main_process_port 29500
)

# First launch: runs train.py with --precompute at resolution 81x768x1360.
# NOTE(review): presumably this pass caches latents/embeddings ahead of the
# real training run — confirm against train.py.
#
# Every continuation backslash must be the LAST character on its line. A space
# after the backslash (as there used to be after TRAIN_ARGS) escapes the space
# instead of the newline: the command ends there with a stray " " argument and
# the remaining lines are executed as a separate, invalid command.
accelerate launch "${DIST_ARGS[@]}" train.py \
    "${MODEL_ARGS[@]}" \
    "${OUTPUT_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${TRAIN_ARGS[@]}" \
    "${SYSTEM_ARGS[@]}" \
    "${CHECKPOINT_ARGS[@]}" \
    "${VALIDATION_ARGS[@]}" \
    --train_resolution "81x768x1360" \
    --precompute

# Optional for landscape/portrait joint training
# accelerate launch "${DIST_ARGS[@]}" train.py \
#     "${MODEL_ARGS[@]}" \
#     "${OUTPUT_ARGS[@]}" \
#     "${DATA_ARGS[@]}" \
#     "${TRAIN_ARGS[@]}" \
#     "${SYSTEM_ARGS[@]}" \
#     "${CHECKPOINT_ARGS[@]}" \
#     "${VALIDATION_ARGS[@]}" \
#     --train_resolution "81x1360x768"  \
#     --precompute

# Second launch: same argument groups and resolution as the launch above, but
# without --precompute.
# Arrays are expanded as "${ARR[@]}" so each stored element reaches train.py as
# exactly one argument; unquoted, elements containing whitespace or glob
# characters would be re-split or expanded.
# The command ends on the --train_resolution line with no trailing backslash,
# so it no longer depends on the following line being a comment.
accelerate launch "${DIST_ARGS[@]}" train.py \
    "${MODEL_ARGS[@]}" \
    "${OUTPUT_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${TRAIN_ARGS[@]}" \
    "${SYSTEM_ARGS[@]}" \
    "${CHECKPOINT_ARGS[@]}" \
    "${VALIDATION_ARGS[@]}" \
    --train_resolution "81x768x1360"
# Optional extra flag (disabled); to enable it, append it to the command above:
#     --allow_switch_hw