File size: 3,128 Bytes
a0d91d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4f9cca
 
 
 
 
 
a0d91d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Base config this file extends.
# NOTE(review): the path suggests KIT-ML dataset settings with batch size
# 128 -- confirm against the base file itself.
_base_ = ['../_base_/datasets/kit_ml_bs128.py']

# ---- runtime --------------------------------------------------------------
# Checkpoint saving interval.
checkpoint_config = {'interval': 1}

dist_params = {'backend': 'nccl'}
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

# ---- optimisation ---------------------------------------------------------
optimizer = {'type': 'Adam', 'lr': 2e-4}
optimizer_config = {'grad_clip': None}  # no gradient clipping configured

# Learning-rate policy; note by_epoch=False even though the runner below is
# epoch based.
lr_config = {
    'policy': 'CosineAnnealing',
    'min_lr_ratio': 2e-5,
    'by_epoch': False,
}
runner = {'type': 'EpochBasedRunner', 'max_epochs': 20}

# ---- logging --------------------------------------------------------------
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
        # Disabled alternative hook: {'type': 'TensorboardLoggerHook'}
    ],
}

# Shared hyper-parameters, reused by several sub-configs of `model` below.
# NOTE(review): in an mmcv-style config these module-level names presumably
# also become top-level config keys -- keep the names unchanged.
input_feats = 251
max_seq_len = 196
latent_dim = 512
time_embed_dim = 2048
text_latent_dim = 256
ff_size = 1024
num_heads = 8
dropout = 0

# Model settings: a 'MotionDiffusion' wrapper around a
# 'ReMoDiffuseTransformer', plus loss and diffusion settings.
model = {
    'type': 'MotionDiffusion',
    'model': {
        'type': 'ReMoDiffuseTransformer',
        'input_feats': input_feats,
        'max_seq_len': max_seq_len,
        'latent_dim': latent_dim,
        'time_embed_dim': time_embed_dim,
        'num_layers': 4,
        # Attention block config.
        'ca_block_cfg': {
            'type': 'SemanticsModulatedAttention',
            'latent_dim': latent_dim,
            'text_latent_dim': text_latent_dim,
            'num_heads': num_heads,
            'dropout': dropout,
            'time_embed_dim': time_embed_dim,
        },
        # Feed-forward block config.
        'ffn_cfg': {
            'latent_dim': latent_dim,
            'ffn_dim': ff_size,
            'dropout': dropout,
            'time_embed_dim': time_embed_dim,
        },
        # Text encoder config; its ff_size is set to 2048 here rather than
        # to the shared `ff_size` constant.
        'text_encoder': {
            'pretrained_model': 'clip',
            'latent_dim': text_latent_dim,
            'num_layers': 2,
            'ff_size': 2048,
            'dropout': dropout,
            'use_text_proj': False,
        },
        # Retrieval config; carries its own nested FFN and self-attention
        # block configs (no time_embed_dim in these).
        'retrieval_cfg': {
            'num_retrieval': 2,
            'stride': 4,
            'num_layers': 2,
            'num_motion_layers': 2,
            'kinematic_coef': 0.1,
            'topk': 2,
            'retrieval_file': 'data/database/kit_text_train.npz',
            'latent_dim': latent_dim,
            'output_dim': latent_dim,
            'max_seq_len': max_seq_len,
            'num_heads': num_heads,
            'ff_size': ff_size,
            'dropout': dropout,
            'ffn_cfg': {
                'latent_dim': latent_dim,
                'ffn_dim': ff_size,
                'dropout': dropout,
            },
            'sa_block_cfg': {
                'type': 'EfficientSelfAttention',
                'latent_dim': latent_dim,
                'num_heads': num_heads,
                'dropout': dropout,
            },
        },
        # Scale-function coefficients.
        'scale_func_cfg': {
            'coarse_scale': 4.0,
            'both_coef': 0.78123,
            'text_coef': 0.39284,
            'retr_coef': -0.12475,
        },
    },
    'loss_recon': {'type': 'MSELoss', 'loss_weight': 1, 'reduction': 'none'},
    'diffusion_train': {
        'beta_scheduler': 'linear',
        'diffusion_steps': 1000,
        'model_mean_type': 'start_x',
        'model_var_type': 'fixed_large',
    },
    # Same settings as diffusion_train, plus a `respace` entry.
    'diffusion_test': {
        'beta_scheduler': 'linear',
        'diffusion_steps': 1000,
        'model_mean_type': 'start_x',
        'model_var_type': 'fixed_large',
        'respace': '15,15,8,6,6',
    },
    'inference_type': 'ddim',
}