robinwitch commited on
Commit
3f17024
·
verified ·
1 Parent(s): 6f80f6a

Upload 35 files

Browse files
Files changed (35) hide show
  1. ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.txt +0 -0
  2. ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.yaml +54 -0
  3. ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.txt +450 -0
  4. ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.yaml +50 -0
  5. ckpt/beatx2_cospeech_diffusion/last_500.bin +3 -0
  6. ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth +3 -0
  7. ckpt/beatx2_rvqvae/RVQVAE_hands/run.log +0 -0
  8. ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth +3 -0
  9. ckpt/beatx2_rvqvae/RVQVAE_lower/run.log +0 -0
  10. ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth +3 -0
  11. ckpt/beatx2_rvqvae/RVQVAE_lower_trans/run.log +0 -0
  12. ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth +3 -0
  13. ckpt/beatx2_rvqvae/RVQVAE_upper/run.log +0 -0
  14. ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.txt +0 -0
  15. ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.yaml +54 -0
  16. ckpt/beatx_1-30_amass_h3d_diffusion/events.out.tfevents.1712023151.FU09.51754.0 +3 -0
  17. ckpt/beatx_1-30_amass_h3d_diffusion/last_600.bin +3 -0
  18. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/events.out.tfevents.1711936445.FU09.1750416.0 +3 -0
  19. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/net_300000.pth +3 -0
  20. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/run.log +0 -0
  21. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/events.out.tfevents.1711936412.FU09.1747951.0 +3 -0
  22. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/net_300000.pth +3 -0
  23. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/run.log +0 -0
  24. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/events.out.tfevents.1711936499.FU09.1757898.0 +3 -0
  25. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/net_300000.pth +3 -0
  26. ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/run.log +0 -0
  27. ckpt/beatx_1-30_amass_h3d_tmr/log +44 -0
  28. ckpt/beatx_1-30_amass_h3d_tmr/motion_epoch=299.ckpt +3 -0
  29. ckpt/beatx_1-30_amass_h3d_tmr/text_epoch=299.ckpt +3 -0
  30. ckpt/distilbert-base-uncased/config.json +24 -0
  31. ckpt/distilbert-base-uncased/pytorch_model.bin +3 -0
  32. ckpt/distilbert-base-uncased/special_tokens_map.json +7 -0
  33. ckpt/distilbert-base-uncased/tokenizer.json +0 -0
  34. ckpt/distilbert-base-uncased/tokenizer_config.json +13 -0
  35. ckpt/distilbert-base-uncased/vocab.txt +0 -0
ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.txt ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {a_encoder: null, a_fix_pre: false, a_pre_encoder: null, acc: 1, acc_weight: 0.0,
2
+ additional_data: false, adv_weight: 20.0, ali_weight: 0.0, amsgrad: false, apex: false,
3
+ asmr: 0.0, atcont: 0.0, atmr: 0.0, aud_prob: 1.0, audio_dims: 1, audio_f: 256, audio_fps: 16000,
4
+ audio_norm: false, audio_rep: onset+amplitude, audio_sr: 16000, batch_size: 40,
5
+ beat_align: true, benchmark: true, cache_only: false, cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/,
6
+ cf: 0.0, ch: 1.0, cl: 1.0, clean_final_seconds: 0, clean_first_seconds: 0, commit: 0.02,
7
+ config: configs/diffusion_rvqvae_128.yaml, csv_name: a2g_0, cu: 1.0, cudnn_enabled: true,
8
+ d_lr_weight: 0.2, d_name: null, data_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/,
9
+ data_path_1: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/hub/,
10
+ dataset: beat_sep_lower, ddp: false, debug: false, decay_epochs: 200, decay_rate: 0.1,
11
+ decode_fusion: null, depth: 3, deterministic: true, dilation_growth_rate: 3, disable_filtering: false,
12
+ div_reg_weight: 0.0, downs_t: [3], dropout_prob: 0.3, e_name: VAESKConv, e_path: weights/AESKConv_240_100.bin,
13
+ emb_width: 512, emo_rep: null, emotion_dims: 8, emotion_f: 0, epoch_stage: 0, epochs: 1000,
14
+ eval_model: motion_representation, f_encoder: 'null', f_fix_pre: false, f_pre_encoder: 'null',
15
+ fac_prob: 1.0, facial_dims: 100, facial_f: 0, facial_fps: 15, facial_norm: false,
16
+ facial_rep: smplxflame_30, fid_weight: 0.0, finger_net: original, freeze_wordembed: false,
17
+ fsmr: 0.0, ftmr: 0.0, fusion_mode: sum, g_name: MDM, gap_weight: 0.0, gpus: [0],
18
+ grad_norm: 0.99, hidden_size: 768, hvqvae_multipliers: [1], id_rep: onehot, input_context: both,
19
+ is_train: true, ita_weight: 0.0, iwa_weight: 0.0, joint_channel: 3, kld_aud_weight: 0.0,
20
+ kld_fac_weight: 0.0, kld_weight: 0.0, l: 4, l_bins: 512, l_mu: 0.99, levels: 1,
21
+ lf: 3.0, lh: 3.0, ll: 3.0, loader_workers: 0, log_period: 10, loss_contrastive_neg_weight: 0.005,
22
+ loss_contrastive_pos_weight: 0.2, loss_gan_weight: 5.0, loss_kld_weight: 0.1, loss_physical_weight: 0.0,
23
+ loss_reg_weight: 0.05, loss_regression_weight: 70.0, lr_base: 5.0e-05, lr_min: 1.0e-07,
24
+ lr_policy: step, lu: 3.0, m_conv: 1.0, m_decoder: null, m_encoder: 'null', m_fix_pre: false,
25
+ m_pre_encoder: 'null', mean_pose_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_330_mean.npy,
26
+ mean_trans_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_trans_mean.npy, model: denoiser,
27
+ momentum: 0.8, motion_f: 256, msmr: 0.0, mtmr: 0.0, multi_length_training: [1.0],
28
+ n_layer: 1, n_poses: 34, n_pre_poses: 4, name: 0403_212319_diffusion_rvqvae_128,
29
+ nesterov: true, new_cache: false, no_adv_epoch: 999, notes: '', opt: adam, opt_betas: [
30
+ 0.5, 0.999], ori_joints: beat_smplx_joints, out_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/,
31
+ pos_encoding_type: sin, pos_prob: 1.0, pose_dims: 330, pose_fps: 30, pose_length: 128,
32
+ pose_norm: true, pose_rep: smplxflame_30, pre_frames: 4, pre_type: zero, pretrain: false,
33
+ project: s2g, queue_size: 1024, random_seed: 2021, rec_aud_weight: 0.0, rec_fac_weight: 0.0,
34
+ rec_pos_weight: 0.0, rec_txt_weight: 0.0, rec_ver_weight: 0.0, rec_weight: 1.0,
35
+ root_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/, root_weight: 1.0,
36
+ rot6d: true, sample_length: 34, sem_rep: null, sparse: 1, speaker_dims: 4, speaker_f: 0,
37
+ speaker_id: onehot, stat: ts, std_pose_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_330_std.npy,
38
+ std_trans_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_trans_std.npy, stride: 20,
39
+ strides_t: [2], t_encoder: 'null', t_fix_pre: false, t_pre_encoder: fasttext, tar_joints: beat_smplx_full,
40
+ test_ckpt: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/custom/0330_140056_diffusion_rvqvae/last_300.bin,
41
+ test_data_path: /datasets/trinity/test/, test_length: 128, test_period: 20, train_data_path: /datasets/trinity/train/,
42
+ train_trans: true, trainer: diffusion_rvqvae, training_speakers: [2], tsmr: 0.0,
43
+ ttmr: 0.0, txt_prob: 1.0, use_amass: false, use_aug: false, use_bottleneck: true,
44
+ use_trans: true, vae_codebook_size: 256, vae_grow: [1, 1, 2, 1], vae_layer: 4, vae_length: 240,
45
+ vae_quantizer_lambda: 1.0, vae_test_dim: 330, vae_test_len: 32, vae_test_stride: 20,
46
+ val_data_path: /datasets/trinity/val/, variational: false, vel: 1, vel_weight: 0.0,
47
+ vqvae_ckpt: null, vqvae_hands_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_hands/net_300000.pth,
48
+ vqvae_latent_scale: 5.0, vqvae_lower_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_lower/net_300000.pth,
49
+ vqvae_lower_trans_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_lower_trans/net_300000.pth,
50
+ vqvae_reverse_decoder_dilation: true, vqvae_squeeze_scale: 4, vqvae_type: rvqvae,
51
+ vqvae_upper_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_upper/net_300000.pth,
52
+ warmup_epochs: 0, warmup_lr: 0.0005, wei_weight: 0.0, weight_decay: 0.0, width: 512,
53
+ word_cache: false, word_dims: 300, word_f: 256, word_index_num: 11195, word_rep: textgrid,
54
+ z_type: speaker}
ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.txt ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 10-02 16:51:21 | {'a_encoder': None,
2
+ 'a_fix_pre': False,
3
+ 'a_pre_encoder': None,
4
+ 'acc': 1,
5
+ 'acc_weight': 0.0,
6
+ 'additional_data': False,
7
+ 'adv_weight': 20.0,
8
+ 'ali_weight': 0.0,
9
+ 'amsgrad': False,
10
+ 'apex': False,
11
+ 'asmr': 0.0,
12
+ 'atcont': 0.0,
13
+ 'atmr': 0.0,
14
+ 'aud_prob': 1.0,
15
+ 'audio_dims': 1,
16
+ 'audio_f': 256,
17
+ 'audio_fps': 16000,
18
+ 'audio_norm': False,
19
+ 'audio_rep': 'onset+amplitude',
20
+ 'audio_sr': 16000,
21
+ 'batch_size': 40,
22
+ 'beat_align': True,
23
+ 'benchmark': True,
24
+ 'cache_only': False,
25
+ 'cache_path': 'datasets/beat_cache/beat_smplx_en_emage_2_128/',
26
+ 'cf': 0.0,
27
+ 'ch': 1.0,
28
+ 'cl': 1.0,
29
+ 'clean_final_seconds': 0,
30
+ 'clean_first_seconds': 0,
31
+ 'commit': 0.02,
32
+ 'config': 'configs/diffusion_rvqvae_128.yaml',
33
+ 'csv_name': 'a2g_0',
34
+ 'cu': 1.0,
35
+ 'cudnn_enabled': True,
36
+ 'd_lr_weight': 0.2,
37
+ 'd_name': None,
38
+ 'data_path': './datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/',
39
+ 'data_path_1': './datasets/hub/',
40
+ 'dataset': 'beat_sep_lower',
41
+ 'ddp': False,
42
+ 'debug': False,
43
+ 'decay_epochs': 500,
44
+ 'decay_rate': 0.1,
45
+ 'decode_fusion': None,
46
+ 'depth': 3,
47
+ 'deterministic': True,
48
+ 'dilation_growth_rate': 3,
49
+ 'disable_filtering': False,
50
+ 'div_reg_weight': 0.0,
51
+ 'downs_t': [3],
52
+ 'dropout_prob': 0.3,
53
+ 'e_name': 'VAESKConv',
54
+ 'e_path': 'weights/AESKConv_240_100.bin',
55
+ 'emb_width': 512,
56
+ 'emo_rep': None,
57
+ 'emotion_dims': 8,
58
+ 'emotion_f': 0,
59
+ 'epoch_stage': 0,
60
+ 'epochs': 2000,
61
+ 'eval_model': 'motion_representation',
62
+ 'f_encoder': 'null',
63
+ 'f_fix_pre': False,
64
+ 'f_pre_encoder': 'null',
65
+ 'fac_prob': 1.0,
66
+ 'facial_dims': 100,
67
+ 'facial_f': 0,
68
+ 'facial_fps': 15,
69
+ 'facial_norm': False,
70
+ 'facial_rep': 'smplxflame_30',
71
+ 'fid_weight': 0.0,
72
+ 'finger_net': 'original',
73
+ 'freeze_wordembed': False,
74
+ 'fsmr': 0.0,
75
+ 'ftmr': 0.0,
76
+ 'fusion_mode': 'sum',
77
+ 'g_name': 'MDM',
78
+ 'gap_weight': 0.0,
79
+ 'gpus': [0],
80
+ 'grad_norm': 0.99,
81
+ 'hidden_size': 768,
82
+ 'hvqvae_multipliers': [1],
83
+ 'id_rep': 'onehot',
84
+ 'input_context': 'both',
85
+ 'is_train': True,
86
+ 'ita_weight': 0.0,
87
+ 'iwa_weight': 0.0,
88
+ 'joint_channel': 3,
89
+ 'kld_aud_weight': 0.0,
90
+ 'kld_fac_weight': 0.0,
91
+ 'kld_weight': 0.0,
92
+ 'l': 4,
93
+ 'l_bins': 512,
94
+ 'l_mu': 0.99,
95
+ 'levels': 1,
96
+ 'lf': 3.0,
97
+ 'lh': 3.0,
98
+ 'll': 3.0,
99
+ 'loader_workers': 0,
100
+ 'log_period': 10,
101
+ 'loss_contrastive_neg_weight': 0.005,
102
+ 'loss_contrastive_pos_weight': 0.2,
103
+ 'loss_gan_weight': 5.0,
104
+ 'loss_kld_weight': 0.1,
105
+ 'loss_physical_weight': 0.0,
106
+ 'loss_reg_weight': 0.05,
107
+ 'loss_regression_weight': 70.0,
108
+ 'lr_base': 5e-05,
109
+ 'lr_min': 1e-07,
110
+ 'lr_policy': 'step',
111
+ 'lu': 3.0,
112
+ 'm_conv': 1.0,
113
+ 'm_decoder': None,
114
+ 'm_encoder': 'null',
115
+ 'm_fix_pre': False,
116
+ 'm_pre_encoder': 'null',
117
+ 'mean_pose_path': './mean_std/beatx_2_330_mean.npy',
118
+ 'mean_trans_path': './mean_std/beatx_2_trans_mean.npy',
119
+ 'model': 'denoiser',
120
+ 'momentum': 0.8,
121
+ 'motion_f': 256,
122
+ 'msmr': 0.0,
123
+ 'mtmr': 0.0,
124
+ 'multi_length_training': [1.0],
125
+ 'n_layer': 1,
126
+ 'n_poses': 34,
127
+ 'n_pre_poses': 4,
128
+ 'name': '1002_165120_diffusion_rvqvae_128',
129
+ 'nesterov': True,
130
+ 'new_cache': False,
131
+ 'no_adv_epoch': 999,
132
+ 'notes': '',
133
+ 'opt': 'adam',
134
+ 'opt_betas': [0.5, 0.999],
135
+ 'ori_joints': 'beat_smplx_joints',
136
+ 'out_path': './outputs/audio2pose/',
137
+ 'pos_encoding_type': 'sin',
138
+ 'pos_prob': 1.0,
139
+ 'pose_dims': 330,
140
+ 'pose_fps': 30,
141
+ 'pose_length': 128,
142
+ 'pose_norm': True,
143
+ 'pose_rep': 'smplxflame_30',
144
+ 'pre_frames': 4,
145
+ 'pre_type': 'zero',
146
+ 'pretrain': False,
147
+ 'project': 's2g',
148
+ 'queue_size': 1024,
149
+ 'random_seed': 2021,
150
+ 'rec_aud_weight': 0.0,
151
+ 'rec_fac_weight': 0.0,
152
+ 'rec_pos_weight': 0.0,
153
+ 'rec_txt_weight': 0.0,
154
+ 'rec_ver_weight': 0.0,
155
+ 'rec_weight': 1.0,
156
+ 'root_path': './',
157
+ 'root_weight': 1.0,
158
+ 'rot6d': True,
159
+ 'sample_length': 34,
160
+ 'sem_rep': None,
161
+ 'sparse': 1,
162
+ 'speaker_dims': 4,
163
+ 'speaker_f': 0,
164
+ 'speaker_id': 'onehot',
165
+ 'stat': 'ts',
166
+ 'std_pose_path': './mean_std/beatx_2_330_std.npy',
167
+ 'std_trans_path': './mean_std/beatx_2_trans_std.npy',
168
+ 'stride': 20,
169
+ 'strides_t': [2],
170
+ 't_encoder': 'null',
171
+ 't_fix_pre': False,
172
+ 't_pre_encoder': 'fasttext',
173
+ 'tar_joints': 'beat_smplx_full',
174
+ 'test_ckpt': './ckpt/beatx2_cospeech_diffusion/last_500.bin',
175
+ 'test_data_path': '/datasets/trinity/test/',
176
+ 'test_length': 128,
177
+ 'test_period': 20,
178
+ 'train_data_path': '/datasets/trinity/train/',
179
+ 'train_trans': True,
180
+ 'trainer': 'diffusion_rvqvae',
181
+ 'training_speakers': [2],
182
+ 'tsmr': 0.0,
183
+ 'ttmr': 0.0,
184
+ 'txt_prob': 1.0,
185
+ 'use_amass': False,
186
+ 'use_aug': False,
187
+ 'use_bottleneck': True,
188
+ 'use_motionclip': False,
189
+ 'use_trans': True,
190
+ 'vae_codebook_size': 256,
191
+ 'vae_grow': [1, 1, 2, 1],
192
+ 'vae_layer': 4,
193
+ 'vae_length': 240,
194
+ 'vae_quantizer_lambda': 1.0,
195
+ 'vae_test_dim': 330,
196
+ 'vae_test_len': 32,
197
+ 'vae_test_stride': 20,
198
+ 'val_data_path': '/datasets/trinity/val/',
199
+ 'variational': False,
200
+ 'vel': 1,
201
+ 'vel_weight': 0.0,
202
+ 'vqvae_ckpt': None,
203
+ 'vqvae_hands_path': './ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth',
204
+ 'vqvae_latent_scale': 5.0,
205
+ 'vqvae_lower_path': './ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth',
206
+ 'vqvae_lower_trans_path': './ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth',
207
+ 'vqvae_reverse_decoder_dilation': True,
208
+ 'vqvae_squeeze_scale': 4,
209
+ 'vqvae_type': 'rvqvae',
210
+ 'vqvae_upper_path': './ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth',
211
+ 'warmup_epochs': 0,
212
+ 'warmup_lr': 0.0005,
213
+ 'wei_weight': 0.0,
214
+ 'weight_decay': 0.0,
215
+ 'width': 512,
216
+ 'word_cache': False,
217
+ 'word_dims': 300,
218
+ 'word_f': 256,
219
+ 'word_index_num': 11195,
220
+ 'word_rep': 'textgrid',
221
+ 'z_type': 'speaker'}
222
+ 10-02 16:51:21 | # ------------ 1002_165120_diffusion_rvqvae_128 ----------- #
223
+ 10-02 16:51:21 | PyTorch version: 2.4.1+cu121
224
+ 10-02 16:51:21 | CUDA version: 12.1
225
+ 10-02 16:51:21 | 1 GPUs
226
+ 10-02 16:51:21 | Random Seed: 2021
227
+ 10-02 16:51:25 | Audio bit rate: 16000
228
+ 10-02 16:51:25 | Reading data './datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/'...
229
+ 10-02 16:51:25 | Creating the dataset cache...
230
+ 10-02 16:51:25 | Found the cache ./datasets/beat_cache/beat_smplx_en_emage_2_128/train/smplxflame_30_cache
231
+ 10-02 16:51:25 | Init train dataloader success
232
+ 10-02 16:51:25 | Init val dataloader success
233
+ 10-02 16:51:25 | Audio bit rate: 16000
234
+ 10-02 16:51:25 | Reading data './datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/'...
235
+ 10-02 16:51:25 | Creating the dataset cache...
236
+ 10-02 16:51:25 | Found the cache ./datasets/beat_cache/beat_smplx_en_emage_2_128/test/smplxflame_30_cache
237
+ 10-02 16:51:25 | Init test dataloader success
238
+ 10-02 16:51:25 | DataParallel(
239
+ (module): MDM(
240
+ (WavEncoder): WavEncoder(
241
+ (feat_extractor): Sequential(
242
+ (0): BasicBlock(
243
+ (conv1): Conv1d(2, 64, kernel_size=(15,), stride=(5,), padding=(1700,))
244
+ (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
245
+ (act1): LeakyReLU(negative_slope=0.01, inplace=True)
246
+ (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
247
+ (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
248
+ (act2): LeakyReLU(negative_slope=0.01, inplace=True)
249
+ (downsample): Sequential(
250
+ (0): Conv1d(2, 64, kernel_size=(15,), stride=(5,), padding=(1700,))
251
+ (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
252
+ )
253
+ )
254
+ (1): BasicBlock(
255
+ (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))
256
+ (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
257
+ (act1): LeakyReLU(negative_slope=0.01, inplace=True)
258
+ (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
259
+ (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
260
+ (act2): LeakyReLU(negative_slope=0.01, inplace=True)
261
+ (downsample): Sequential(
262
+ (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))
263
+ (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
264
+ )
265
+ )
266
+ (2): BasicBlock(
267
+ (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
268
+ (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
269
+ (act1): LeakyReLU(negative_slope=0.01, inplace=True)
270
+ (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
271
+ (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
272
+ (act2): LeakyReLU(negative_slope=0.01, inplace=True)
273
+ )
274
+ (3): BasicBlock(
275
+ (conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))
276
+ (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
277
+ (act1): LeakyReLU(negative_slope=0.01, inplace=True)
278
+ (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))
279
+ (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
280
+ (act2): LeakyReLU(negative_slope=0.01, inplace=True)
281
+ (downsample): Sequential(
282
+ (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))
283
+ (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
284
+ )
285
+ )
286
+ (4): BasicBlock(
287
+ (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))
288
+ (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
289
+ (act1): LeakyReLU(negative_slope=0.01, inplace=True)
290
+ (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))
291
+ (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
292
+ (act2): LeakyReLU(negative_slope=0.01, inplace=True)
293
+ )
294
+ (5): BasicBlock(
295
+ (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))
296
+ (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
297
+ (act1): LeakyReLU(negative_slope=0.01, inplace=True)
298
+ (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))
299
+ (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
300
+ (act2): LeakyReLU(negative_slope=0.01, inplace=True)
301
+ (downsample): Sequential(
302
+ (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))
303
+ (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
304
+ )
305
+ )
306
+ )
307
+ )
308
+ (text_encoder_body): Linear(in_features=300, out_features=256, bias=True)
309
+ (text_pre_encoder_body): Embedding(11195, 300)
310
+ (sequence_pos_encoder): PositionalEncoding(
311
+ (dropout): Dropout(p=0.1, inplace=False)
312
+ )
313
+ (mytimmblocks): ModuleList(
314
+ (0-7): 8 x Block(
315
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
316
+ (attn): Attention(
317
+ (qkv): Linear(in_features=512, out_features=1536, bias=False)
318
+ (q_norm): Identity()
319
+ (k_norm): Identity()
320
+ (attn_drop): Dropout(p=0.0, inplace=False)
321
+ (proj): Linear(in_features=512, out_features=512, bias=True)
322
+ (proj_drop): Dropout(p=0.0, inplace=False)
323
+ )
324
+ (ls1): Identity()
325
+ (drop_path1): DropPath(drop_prob=0.100)
326
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
327
+ (mlp): Mlp(
328
+ (fc1): Linear(in_features=512, out_features=1024, bias=True)
329
+ (act): GELU(approximate='none')
330
+ (drop1): Dropout(p=0.0, inplace=False)
331
+ (norm): Identity()
332
+ (fc2): Linear(in_features=1024, out_features=512, bias=True)
333
+ (drop2): Dropout(p=0.0, inplace=False)
334
+ )
335
+ (ls2): Identity()
336
+ (drop_path2): DropPath(drop_prob=0.100)
337
+ )
338
+ )
339
+ (embed_timestep): TimestepEmbedder(
340
+ (sequence_pos_encoder): PositionalEncoding(
341
+ (dropout): Dropout(p=0.1, inplace=False)
342
+ )
343
+ (time_embed): Sequential(
344
+ (0): Linear(in_features=512, out_features=512, bias=True)
345
+ (1): SiLU()
346
+ (2): Linear(in_features=512, out_features=512, bias=True)
347
+ )
348
+ )
349
+ (embed_style): Linear(in_features=6, out_features=64, bias=True)
350
+ (embed_text): Linear(in_features=6144, out_features=512, bias=True)
351
+ (output_process): OutputProcess(
352
+ (poseFinal): Linear(in_features=512, out_features=1536, bias=True)
353
+ )
354
+ (rel_pos): SinusoidalEmbeddings()
355
+ (input_process): InputProcess(
356
+ (poseEmbedding): Linear(in_features=1536, out_features=512, bias=True)
357
+ )
358
+ (input_process2): Linear(in_features=1280, out_features=512, bias=True)
359
+ (mix_audio_text): Linear(in_features=512, out_features=256, bias=True)
360
+ )
361
+ )
362
+ 10-02 16:51:25 | init MDM success
363
+ 10-02 16:51:26 | load self-pretrained checkpoints for VAESKConv
364
+ 10-02 16:51:26 | load self-pretrained checkpoints for VAESKConv
365
+ 10-02 16:51:26 | VAESKConv(
366
+ (encoder): LocalEncoder(
367
+ (layers): ModuleList(
368
+ (0): Sequential(
369
+ (0): SkeletonResidual(
370
+ (residual): Sequential(
371
+ (0): SkeletonConv()
372
+ (1): GroupNorm(10, 330, eps=1e-05, affine=True)
373
+ )
374
+ (shortcut): SkeletonConv()
375
+ (common): Sequential(
376
+ (0): SkeletonPool()
377
+ (1): Tanh()
378
+ )
379
+ )
380
+ )
381
+ (1): Sequential(
382
+ (0): SkeletonResidual(
383
+ (residual): Sequential(
384
+ (0): SkeletonConv()
385
+ (1): GroupNorm(10, 210, eps=1e-05, affine=True)
386
+ )
387
+ (shortcut): SkeletonConv()
388
+ (common): Sequential(
389
+ (0): SkeletonPool()
390
+ (1): Tanh()
391
+ )
392
+ )
393
+ )
394
+ (2-3): 2 x Sequential(
395
+ (0): SkeletonResidual(
396
+ (residual): Sequential(
397
+ (0): SkeletonConv()
398
+ (1): GroupNorm(10, 240, eps=1e-05, affine=True)
399
+ )
400
+ (shortcut): SkeletonConv()
401
+ (common): Sequential(
402
+ (0): Tanh()
403
+ )
404
+ )
405
+ )
406
+ )
407
+ )
408
+ (decoder): VQDecoderV3(
409
+ (main): Sequential(
410
+ (0): ResBlock(
411
+ (model): Sequential(
412
+ (0): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
413
+ (1): LeakyReLU(negative_slope=0.2, inplace=True)
414
+ (2): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
415
+ )
416
+ )
417
+ (1): ResBlock(
418
+ (model): Sequential(
419
+ (0): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
420
+ (1): LeakyReLU(negative_slope=0.2, inplace=True)
421
+ (2): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
422
+ )
423
+ )
424
+ (2): Upsample(scale_factor=2.0, mode='nearest')
425
+ (3): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
426
+ (4): LeakyReLU(negative_slope=0.2, inplace=True)
427
+ (5): Upsample(scale_factor=2.0, mode='nearest')
428
+ (6): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
429
+ (7): LeakyReLU(negative_slope=0.2, inplace=True)
430
+ (8): Upsample(scale_factor=2.0, mode='nearest')
431
+ (9): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
432
+ (10): LeakyReLU(negative_slope=0.2, inplace=True)
433
+ (11): Upsample(scale_factor=2.0, mode='nearest')
434
+ (12): Conv1d(240, 330, kernel_size=(3,), stride=(1,), padding=(1,))
435
+ (13): LeakyReLU(negative_slope=0.2, inplace=True)
436
+ (14): Conv1d(330, 330, kernel_size=(3,), stride=(1,), padding=(1,))
437
+ )
438
+ )
439
+ (fc_mu): Linear(in_features=240, out_features=240, bias=True)
440
+ (fc_logvar): Linear(in_features=240, out_features=240, bias=True)
441
+ )
442
+ 10-02 16:51:26 | init VAESKConv success
443
+ 10-02 16:51:26 | load self-pretrained checkpoints for VAESKConv
444
+ 10-02 16:51:27 | load self-pretrained checkpoints for MDM
445
+ 10-02 17:20:08 | l2 loss: 0.0
446
+ 10-02 17:20:08 | lvel loss: 0.0
447
+ 10-02 17:20:09 | fid score: 0.46520193405646815
448
+ 10-02 17:20:09 | align score: 0.7362512849757596
449
+ 10-02 17:20:09 | l1div score: 12.308517456054688
450
+ 10-02 17:20:09 | total inference time: 1721 s for 945 s motion
ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {a_encoder: null, a_fix_pre: false, a_pre_encoder: null, acc: 1, acc_weight: 0.0,
2
+ additional_data: false, adv_weight: 20.0, ali_weight: 0.0, amsgrad: false, apex: false,
3
+ asmr: 0.0, atcont: 0.0, atmr: 0.0, aud_prob: 1.0, audio_dims: 1, audio_f: 256, audio_fps: 16000,
4
+ audio_norm: false, audio_rep: onset+amplitude, audio_sr: 16000, batch_size: 40,
5
+ beat_align: true, benchmark: true, cache_only: false, cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/,
6
+ cf: 0.0, ch: 1.0, cl: 1.0, clean_final_seconds: 0, clean_first_seconds: 0, commit: 0.02,
7
+ config: configs/diffusion_rvqvae_128.yaml, csv_name: a2g_0, cu: 1.0, cudnn_enabled: true,
8
+ d_lr_weight: 0.2, d_name: null, data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/,
9
+ data_path_1: ./datasets/hub/, dataset: beat_sep_lower, ddp: false, debug: false,
10
+ decay_epochs: 500, decay_rate: 0.1, decode_fusion: null, depth: 3, deterministic: true,
11
+ dilation_growth_rate: 3, disable_filtering: false, div_reg_weight: 0.0, downs_t: [
12
+ 3], dropout_prob: 0.3, e_name: VAESKConv, e_path: weights/AESKConv_240_100.bin,
13
+ emb_width: 512, emo_rep: null, emotion_dims: 8, emotion_f: 0, epoch_stage: 0, epochs: 2000,
14
+ eval_model: motion_representation, f_encoder: 'null', f_fix_pre: false, f_pre_encoder: 'null',
15
+ fac_prob: 1.0, facial_dims: 100, facial_f: 0, facial_fps: 15, facial_norm: false,
16
+ facial_rep: smplxflame_30, fid_weight: 0.0, finger_net: original, freeze_wordembed: false,
17
+ fsmr: 0.0, ftmr: 0.0, fusion_mode: sum, g_name: MDM, gap_weight: 0.0, gpus: [0],
18
+ grad_norm: 0.99, hidden_size: 768, hvqvae_multipliers: [1], id_rep: onehot, input_context: both,
19
+ is_train: true, ita_weight: 0.0, iwa_weight: 0.0, joint_channel: 3, kld_aud_weight: 0.0,
20
+ kld_fac_weight: 0.0, kld_weight: 0.0, l: 4, l_bins: 512, l_mu: 0.99, levels: 1,
21
+ lf: 3.0, lh: 3.0, ll: 3.0, loader_workers: 0, log_period: 10, loss_contrastive_neg_weight: 0.005,
22
+ loss_contrastive_pos_weight: 0.2, loss_gan_weight: 5.0, loss_kld_weight: 0.1, loss_physical_weight: 0.0,
23
+ loss_reg_weight: 0.05, loss_regression_weight: 70.0, lr_base: 5.0e-05, lr_min: 1.0e-07,
24
+ lr_policy: step, lu: 3.0, m_conv: 1.0, m_decoder: null, m_encoder: 'null', m_fix_pre: false,
25
+ m_pre_encoder: 'null', mean_pose_path: ./mean_std/beatx_2_330_mean.npy, mean_trans_path: ./mean_std/beatx_2_trans_mean.npy,
26
+ model: denoiser, momentum: 0.8, motion_f: 256, msmr: 0.0, mtmr: 0.0, multi_length_training: [
27
+ 1.0], n_layer: 1, n_poses: 34, n_pre_poses: 4, name: 1002_165120_diffusion_rvqvae_128,
28
+ nesterov: true, new_cache: false, no_adv_epoch: 999, notes: '', opt: adam, opt_betas: [
29
+ 0.5, 0.999], ori_joints: beat_smplx_joints, out_path: ./outputs/audio2pose/, pos_encoding_type: sin,
30
+ pos_prob: 1.0, pose_dims: 330, pose_fps: 30, pose_length: 128, pose_norm: true,
31
+ pose_rep: smplxflame_30, pre_frames: 4, pre_type: zero, pretrain: false, project: s2g,
32
+ queue_size: 1024, random_seed: 2021, rec_aud_weight: 0.0, rec_fac_weight: 0.0, rec_pos_weight: 0.0,
33
+ rec_txt_weight: 0.0, rec_ver_weight: 0.0, rec_weight: 1.0, root_path: ./, root_weight: 1.0,
34
+ rot6d: true, sample_length: 34, sem_rep: null, sparse: 1, speaker_dims: 4, speaker_f: 0,
35
+ speaker_id: onehot, stat: ts, std_pose_path: ./mean_std/beatx_2_330_std.npy, std_trans_path: ./mean_std/beatx_2_trans_std.npy,
36
+ stride: 20, strides_t: [2], t_encoder: 'null', t_fix_pre: false, t_pre_encoder: fasttext,
37
+ tar_joints: beat_smplx_full, test_ckpt: ./ckpt/beatx2_cospeech_diffusion/last_500.bin,
38
+ test_data_path: /datasets/trinity/test/, test_length: 128, test_period: 20, train_data_path: /datasets/trinity/train/,
39
+ train_trans: true, trainer: diffusion_rvqvae, training_speakers: [2], tsmr: 0.0,
40
+ ttmr: 0.0, txt_prob: 1.0, use_amass: false, use_aug: false, use_bottleneck: true,
41
+ use_motionclip: false, use_trans: true, vae_codebook_size: 256, vae_grow: [1, 1,
42
+ 2, 1], vae_layer: 4, vae_length: 240, vae_quantizer_lambda: 1.0, vae_test_dim: 330,
43
+ vae_test_len: 32, vae_test_stride: 20, val_data_path: /datasets/trinity/val/, variational: false,
44
+ vel: 1, vel_weight: 0.0, vqvae_ckpt: null, vqvae_hands_path: ./ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth,
45
+ vqvae_latent_scale: 5.0, vqvae_lower_path: ./ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth,
46
+ vqvae_lower_trans_path: ./ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth,
47
+ vqvae_reverse_decoder_dilation: true, vqvae_squeeze_scale: 4, vqvae_type: rvqvae,
48
+ vqvae_upper_path: ./ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth, warmup_epochs: 0,
49
+ warmup_lr: 0.0005, wei_weight: 0.0, weight_decay: 0.0, width: 512, word_cache: false,
50
+ word_dims: 300, word_f: 256, word_index_num: 11195, word_rep: textgrid, z_type: speaker}
ckpt/beatx2_cospeech_diffusion/last_500.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d79b6fd3e412f7e3cb61eb6795ff686f6cdf80d32ce2bf941cd985d8cae24cc1
3
+ size 128770342
ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4eb84ff69009be0b3e68419c5382aa10443b73739dfe2e2928b046e2db59a8b
3
+ size 83048747
ckpt/beatx2_rvqvae/RVQVAE_hands/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a29217af4f33b7b50ae9aebfdfc2bf2c0e80bed48316ab218cdc40043bb03d20
3
+ size 81499947
ckpt/beatx2_rvqvae/RVQVAE_lower/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb81af9ebd6c34b473db39e4c343e76fb3b30e4dbbab60d56460544f9cea7f6f
3
+ size 81536811
ckpt/beatx2_rvqvae/RVQVAE_lower_trans/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:959d066138b293455a98fb0175b1fe2fcc31da9a1de83c9bfbf093ffea746a0e
3
+ size 81794923
ckpt/beatx2_rvqvae/RVQVAE_upper/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.txt ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {a_encoder: null, a_fix_pre: false, a_pre_encoder: null, acc: 1, acc_weight: 0.0,
2
+ additional_data: false, adv_weight: 20.0, ali_weight: 0.0, amsgrad: false, apex: false,
3
+ asmr: 0.0, atcont: 0.0, atmr: 0.0, aud_prob: 1.0, audio_dims: 1, audio_f: 256, audio_fps: 16000,
4
+ audio_norm: false, audio_rep: onset+amplitude, audio_scale: 1.0, audio_sr: 16000,
5
+ batch_size: 200, beat_align: true, benchmark: true, cache_only: false, cache_path: datasets/beat_cache/h3d623_smplx_en_emage/,
6
+ cf: 0.0, ch: 1.0, cl: 1.0, clean_final_seconds: 0, clean_first_seconds: 0, commit: 0.02,
7
+ config: configs/diffusion_h3d_new.yaml, csv_name: a2g_0, cu: 1.0, cudnn_enabled: true,
8
+ d_lr_weight: 0.2, d_name: null, data_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/,
9
+ data_path_1: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/hub/,
10
+ dataset: beat_sep_lower_new, ddp: false, debug: false, decay_epochs: 200, decay_rate: 0.1,
11
+ decode_fusion: null, depth: 3, deterministic: true, dilation_growth_rate: 3, disable_filtering: false,
12
+ div_reg_weight: 0.0, downs_t: [3], dropout_prob: 0.3, e_name: VAESKConv, e_path: weights/AESKConv_240_100.bin,
13
+ emb_width: 512, emo_rep: null, emotion_dims: 8, emotion_f: 0, epoch_stage: 0, epochs: 2000,
14
+ eval: false, eval_model: motion_representation, f_encoder: 'null', f_fix_pre: false,
15
+ f_pre_encoder: 'null', fac_prob: 1.0, facial_dims: 100, facial_f: 0, facial_fps: 15,
16
+ facial_norm: false, facial_rep: smplxflame_30, fid_weight: 0.0, finger_net: original,
17
+ freeze_wordembed: false, fsmr: 0.0, ftmr: 0.0, fusion_mode: sum, g_name: MDM, gap_weight: 0.0,
18
+ gpus: [0], grad_norm: 0.99, hidden_size: 768, hvqvae_multipliers: [1], id_rep: onehot,
19
+ input_context: both, is_train: true, ita_weight: 0.0, iwa_weight: 0.0, joint_channel: 3,
20
+ kld_aud_weight: 0.0, kld_fac_weight: 0.0, kld_weight: 0.0, l: 4, l_bins: 512, l_mu: 0.99,
21
+ levels: 1, lf: 3.0, lh: 3.0, ll: 3.0, loader_workers: 0, log_period: 10, loss_contrastive_neg_weight: 0.005,
22
+ loss_contrastive_pos_weight: 0.2, loss_gan_weight: 5.0, loss_kld_weight: 0.1, loss_physical_weight: 0.0,
23
+ loss_reg_weight: 0.05, loss_regression_weight: 70.0, lr_base: 5.0e-05, lr_min: 1.0e-07,
24
+ lr_policy: step, lu: 3.0, m_conv: 1.0, m_decoder: null, m_encoder: 'null', m_fix_pre: false,
25
+ m_pre_encoder: 'null', mean_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Mean_new.npy,
26
+ model: denoiser, momentum: 0.8, motion_f: 256, motionclip_path: /mnt/fu09a/chenbohong/gdc/MotionCLIP/exps/my-paper-model-smplx-64-BEATX/checkpoint_0080.pth.tar,
27
+ msmr: 0.0, mtmr: 0.0, multi_length_training: [1.0], n_layer: 1, n_poses: 34, n_pre_poses: 4,
28
+ name: 0402_095910_diffusion_h3d_new_main-tmr-h3d623, nesterov: true, new_cache: true,
29
+ no_adv_epoch: 999, notes: '', only_data: 0, opt: adam, opt_betas: [0.5, 0.999],
30
+ ori_joints: beat_smplx_joints, out_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/,
31
+ pos_encoding_type: sin, pos_prob: 1.0, pose_dims: 330, pose_fps: 30, pose_length: 128,
32
+ pose_norm: true, pose_rep: h3d623, pre_frames: 4, pre_type: zero, pretrain: false,
33
+ project: s2g, prompt_scale: 1.0, queue_size: 1024, random_seed: 2021, rec_aud_weight: 0.0,
34
+ rec_fac_weight: 0.0, rec_pos_weight: 0.0, rec_txt_weight: 0.0, rec_ver_weight: 0.0,
35
+ rec_weight: 1.0, root_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/,
36
+ root_weight: 1.0, rot6d: true, sample_length: 34, sem_rep: null, sparse: 1, speaker_dims: 4,
37
+ speaker_f: 0, speaker_id: onehot, stat: ts, std_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Std_new.npy,
38
+ stride: 20, strides_t: [2], t_encoder: 'null', t_fix_pre: false, t_pre_encoder: fasttext,
39
+ tar_joints: beat_smplx_full, test_ckpt: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/custom/0329_195328_diffusion_h3d_main-tmr-h3d623/last_400.bin,
40
+ test_data_path: /datasets/trinity/test/, test_length: 64, test_period: 20, text_sample_stride: 0,
41
+ tmr_mean_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Mean.npy,
42
+ tmr_std_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Std.npy,
43
+ train_data_path: /datasets/trinity/train/, train_trans: true, trainer: h3d_diffusion,
44
+ training_speakers: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
45
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], tsmr: 0.0, ttmr: 0.0, txt_prob: 1.0,
46
+ use_amass: true, use_aug: false, use_bottleneck: true, vae_codebook_size: 256, vae_grow: [
47
+ 1, 1, 2, 1], vae_layer: 4, vae_length: 240, vae_quantizer_lambda: 1.0, vae_test_dim: 330,
48
+ vae_test_len: 32, vae_test_stride: 20, val_data_path: /datasets/trinity/val/, variational: false,
49
+ vel: 1, vel_weight: 0.0, vqvae_ckpt: null, vqvae_hands_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_new_HB/RVQVAE_hands/net_300000.pth,
50
+ vqvae_latent_scale: 10.0, vqvae_lower_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_new_HB/RVQVAE_lower/net_300000.pth,
51
+ vqvae_reverse_decoder_dilation: true, vqvae_upper_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_new_HB/RVQVAE_upper/net_300000.pth,
52
+ warmup_epochs: 0, warmup_lr: 0.0005, wei_weight: 0.0, weight_decay: 0.0, width: 512,
53
+ word_cache: false, word_dims: 300, word_f: 256, word_index_num: 11195, word_rep: textgrid,
54
+ z_type: speaker}
ckpt/beatx_1-30_amass_h3d_diffusion/events.out.tfevents.1712023151.FU09.51754.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c3039924ad15287a632b3e0f919ed7e652db2a1dffb64f262beb85914a183b3
3
+ size 3750982
ckpt/beatx_1-30_amass_h3d_diffusion/last_600.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ebef33ac1f131420de05565cd9927313446eadc3bd5a2ab4cc436a09da368bf
3
+ size 130348642
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/events.out.tfevents.1711936445.FU09.1750416.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffb969ab2e40916e0d6cf2a3b015f89a4c714ea4f5215031485bf30b688ca7c8
3
+ size 232345
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19cc2cbb68371a91c4107dce073d25feda56af1acc2b8ce938120738d423c8d6
3
+ size 85261291
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/events.out.tfevents.1711936412.FU09.1747951.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6511cf3be93e318cc01d3995ed69636250907f1cccd705babccac92964a6b2b9
3
+ size 232345
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8de38c6be9c927a5d6df1c4e03489d704b961cf6f4777dcf31379381192cfcfe
3
+ size 82151403
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/events.out.tfevents.1711936499.FU09.1757898.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f636dde1b675a489ac993ab9f9eb1d2c36ac0da05dfcb804e77de4b9a23dd1a
3
+ size 232345
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/net_300000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25657855dace8008f769fd6675fdd595a7a3464a0b4ea1def294ed171bc55ca2
3
+ size 82753707
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/run.log ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/beatx_1-30_amass_h3d_tmr/log ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **protocol A**
2
+ T2M
3
+ | Metrics | Recall @1 | Recall @2 | Recall @3 | Recall @5 | Recall @10 |
4
+ |-------------| --------- | --------- | --------- | --------- | --------- |
5
+ |epoch_99 | 4.135 |7.804 |10.679 |16.183 |27.245 |
6
+ |epoch_299 | 4.929 |8.653 |12.103 |17.798 |29.189 |
7
+ |epoch_399 | 4.765 |8.516 |12.432 |18.784 |29.217 |
8
+
9
+ M2T
10
+ | Metrics | Recall @1 | Recall @2 | Recall @3 | Recall @5 | Recall @10 |
11
+ |-------------| --------- | --------- | --------- | --------- | --------- |
12
+ |epoch_99 | 4.436 |8.762 |11.829 |17.579 |27.656 |
13
+ |epoch_299 | 5.285 |9.912 |12.870 |19.168 |29.874 |
14
+ |epoch_399 | 5.449 |9.365 |13.226 |19.387 |30.038 |
15
+
16
+ **protocol B**
17
+ T2M
18
+ | Metrics | Recall @1 | Recall @2 | Recall @3 | Recall @5 | Recall @10 |
19
+ |-------------| --------- | --------- | --------- | --------- | --------- |
20
+ |epoch_99 | 7.996 |13.472 |17.662 |24.644 |36.692 |
21
+ |epoch_299 | 8.899 |14.321 |18.894 |25.575 |37.158 |
22
+ |epoch_399 | 9.337 |14.047 |19.250 |26.643 |37.295 |
23
+
24
+ M2T
25
+ | Metrics | Recall @1 | Recall @2 | Recall @3 | Recall @5 | Recall @10 |
26
+ |-------------| --------- | --------- | --------- | --------- | --------- |
27
+ |epoch_99 | 7.996 |12.623 |16.238 |22.344 |32.421 |
28
+ |epoch_299 | 9.721 |14.266 |17.470 |23.768 |34.858 |
29
+ |epoch_399 | 9.775 |14.485 |18.264 |24.288 |34.995 |
30
+
31
+ **protocol D**
32
+ T2M
33
+ | Metrics | Recall @1 | Recall @2 | Recall @3 | Recall @5 | Recall @10 |
34
+ |-------------| --------- | --------- | --------- | --------- | --------- |
35
+ |epoch_99 | 62.815 |79.409 |85.433 |91.375 |96.933 |
36
+ |epoch_299 | 62.240 |77.026 |83.571 |88.828 |94.003 |
37
+ |epoch_399 | 60.570 |76.369 |82.558 |87.760 |93.182 |
38
+
39
+ M2T
40
+ | Metrics | Recall @1 | Recall @2 | Recall @3 | Recall @5 | Recall @10 |
41
+ |-------------| --------- | --------- | --------- | --------- | --------- |
42
+ |epoch_99 | 65.307 |79.463 |86.884 |92.223 |96.742 |
43
+ |epoch_299 | 62.924 |77.820 |83.708 |89.677 |93.894 |
44
+ |epoch_399 | 62.240 |76.670 |82.284 |87.952 |93.127 |
ckpt/beatx_1-30_amass_h3d_tmr/motion_epoch=299.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcd2b8a7dca0c7855b14029407d5c9102ec91ebd71170bb7cfdb8f4e06671141
3
+ size 18416565
ckpt/beatx_1-30_amass_h3d_tmr/text_epoch=299.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8ac085ed2edf47ed6f20b3ae5ec8c89b98e90237315663f8c17c9f69934b3ed
3
+ size 284055884
ckpt/distilbert-base-uncased/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertModel"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "pad_token_id": 0,
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": false,
20
+ "tie_weights_": true,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.29.1",
23
+ "vocab_size": 30522
24
+ }
ckpt/distilbert-base-uncased/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df15ef6ca103a52df977eec51dd1058d5f6a2fdf5b3ae5d2e7fc225e9801143
3
+ size 265483293
ckpt/distilbert-base-uncased/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
ckpt/distilbert-base-uncased/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/distilbert-base-uncased/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "DistilBertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
ckpt/distilbert-base-uncased/vocab.txt ADDED
The diff for this file is too large to render. See raw diff