robinwitch committed on Nov 19, 2024

Commit

3f17024

verified ·

1 Parent(s): 6f80f6a

Upload 35 files

Browse files

Files changed (35) hide show

ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.txt +0 -0
ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.yaml +54 -0
ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.txt +450 -0
ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.yaml +50 -0
ckpt/beatx2_cospeech_diffusion/last_500.bin +3 -0
ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth +3 -0
ckpt/beatx2_rvqvae/RVQVAE_hands/run.log +0 -0
ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth +3 -0
ckpt/beatx2_rvqvae/RVQVAE_lower/run.log +0 -0
ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth +3 -0
ckpt/beatx2_rvqvae/RVQVAE_lower_trans/run.log +0 -0
ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth +3 -0
ckpt/beatx2_rvqvae/RVQVAE_upper/run.log +0 -0
ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.txt +0 -0
ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.yaml +54 -0
ckpt/beatx_1-30_amass_h3d_diffusion/events.out.tfevents.1712023151.FU09.51754.0 +3 -0
ckpt/beatx_1-30_amass_h3d_diffusion/last_600.bin +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/events.out.tfevents.1711936445.FU09.1750416.0 +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/net_300000.pth +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/run.log +0 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/events.out.tfevents.1711936412.FU09.1747951.0 +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/net_300000.pth +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/run.log +0 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/events.out.tfevents.1711936499.FU09.1757898.0 +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/net_300000.pth +3 -0
ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/run.log +0 -0
ckpt/beatx_1-30_amass_h3d_tmr/log +44 -0
ckpt/beatx_1-30_amass_h3d_tmr/motion_epoch=299.ckpt +3 -0
ckpt/beatx_1-30_amass_h3d_tmr/text_epoch=299.ckpt +3 -0
ckpt/distilbert-base-uncased/config.json +24 -0
ckpt/distilbert-base-uncased/pytorch_model.bin +3 -0
ckpt/distilbert-base-uncased/special_tokens_map.json +7 -0
ckpt/distilbert-base-uncased/tokenizer.json +0 -0
ckpt/distilbert-base-uncased/tokenizer_config.json +13 -0
ckpt/distilbert-base-uncased/vocab.txt +0 -0

ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx2_cospeech_diffusion/0403_212319_diffusion_rvqvae_128.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+{a_encoder: null, a_fix_pre: false, a_pre_encoder: null, acc: 1, acc_weight: 0.0,
+  additional_data: false, adv_weight: 20.0, ali_weight: 0.0, amsgrad: false, apex: false,
+  asmr: 0.0, atcont: 0.0, atmr: 0.0, aud_prob: 1.0, audio_dims: 1, audio_f: 256, audio_fps: 16000,
+  audio_norm: false, audio_rep: onset+amplitude, audio_sr: 16000, batch_size: 40,
+  beat_align: true, benchmark: true, cache_only: false, cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/,
+  cf: 0.0, ch: 1.0, cl: 1.0, clean_final_seconds: 0, clean_first_seconds: 0, commit: 0.02,
+  config: configs/diffusion_rvqvae_128.yaml, csv_name: a2g_0, cu: 1.0, cudnn_enabled: true,
+  d_lr_weight: 0.2, d_name: null, data_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/,
+  data_path_1: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/hub/,
+  dataset: beat_sep_lower, ddp: false, debug: false, decay_epochs: 200, decay_rate: 0.1,
+  decode_fusion: null, depth: 3, deterministic: true, dilation_growth_rate: 3, disable_filtering: false,
+  div_reg_weight: 0.0, downs_t: [3], dropout_prob: 0.3, e_name: VAESKConv, e_path: weights/AESKConv_240_100.bin,
+  emb_width: 512, emo_rep: null, emotion_dims: 8, emotion_f: 0, epoch_stage: 0, epochs: 1000,
+  eval_model: motion_representation, f_encoder: 'null', f_fix_pre: false, f_pre_encoder: 'null',
+  fac_prob: 1.0, facial_dims: 100, facial_f: 0, facial_fps: 15, facial_norm: false,
+  facial_rep: smplxflame_30, fid_weight: 0.0, finger_net: original, freeze_wordembed: false,
+  fsmr: 0.0, ftmr: 0.0, fusion_mode: sum, g_name: MDM, gap_weight: 0.0, gpus: [0],
+  grad_norm: 0.99, hidden_size: 768, hvqvae_multipliers: [1], id_rep: onehot, input_context: both,
+  is_train: true, ita_weight: 0.0, iwa_weight: 0.0, joint_channel: 3, kld_aud_weight: 0.0,
+  kld_fac_weight: 0.0, kld_weight: 0.0, l: 4, l_bins: 512, l_mu: 0.99, levels: 1,
+  lf: 3.0, lh: 3.0, ll: 3.0, loader_workers: 0, log_period: 10, loss_contrastive_neg_weight: 0.005,
+  loss_contrastive_pos_weight: 0.2, loss_gan_weight: 5.0, loss_kld_weight: 0.1, loss_physical_weight: 0.0,
+  loss_reg_weight: 0.05, loss_regression_weight: 70.0, lr_base: 5.0e-05, lr_min: 1.0e-07,
+  lr_policy: step, lu: 3.0, m_conv: 1.0, m_decoder: null, m_encoder: 'null', m_fix_pre: false,
+  m_pre_encoder: 'null', mean_pose_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_330_mean.npy,
+  mean_trans_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_trans_mean.npy, model: denoiser,
+  momentum: 0.8, motion_f: 256, msmr: 0.0, mtmr: 0.0, multi_length_training: [1.0],
+  n_layer: 1, n_poses: 34, n_pre_poses: 4, name: 0403_212319_diffusion_rvqvae_128,
+  nesterov: true, new_cache: false, no_adv_epoch: 999, notes: '', opt: adam, opt_betas: [
+    0.5, 0.999], ori_joints: beat_smplx_joints, out_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/,
+  pos_encoding_type: sin, pos_prob: 1.0, pose_dims: 330, pose_fps: 30, pose_length: 128,
+  pose_norm: true, pose_rep: smplxflame_30, pre_frames: 4, pre_type: zero, pretrain: false,
+  project: s2g, queue_size: 1024, random_seed: 2021, rec_aud_weight: 0.0, rec_fac_weight: 0.0,
+  rec_pos_weight: 0.0, rec_txt_weight: 0.0, rec_ver_weight: 0.0, rec_weight: 1.0,
+  root_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/, root_weight: 1.0,
+  rot6d: true, sample_length: 34, sem_rep: null, sparse: 1, speaker_dims: 4, speaker_f: 0,
+  speaker_id: onehot, stat: ts, std_pose_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_330_std.npy,
+  std_trans_path: /mnt/fu09a/chenbohong/PantoMatrix/beatx_2_trans_std.npy, stride: 20,
+  strides_t: [2], t_encoder: 'null', t_fix_pre: false, t_pre_encoder: fasttext, tar_joints: beat_smplx_full,
+  test_ckpt: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/custom/0330_140056_diffusion_rvqvae/last_300.bin,
+  test_data_path: /datasets/trinity/test/, test_length: 128, test_period: 20, train_data_path: /datasets/trinity/train/,
+  train_trans: true, trainer: diffusion_rvqvae, training_speakers: [2], tsmr: 0.0,
+  ttmr: 0.0, txt_prob: 1.0, use_amass: false, use_aug: false, use_bottleneck: true,
+  use_trans: true, vae_codebook_size: 256, vae_grow: [1, 1, 2, 1], vae_layer: 4, vae_length: 240,
+  vae_quantizer_lambda: 1.0, vae_test_dim: 330, vae_test_len: 32, vae_test_stride: 20,
+  val_data_path: /datasets/trinity/val/, variational: false, vel: 1, vel_weight: 0.0,
+  vqvae_ckpt: null, vqvae_hands_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_hands/net_300000.pth,
+  vqvae_latent_scale: 5.0, vqvae_lower_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_lower/net_300000.pth,
+  vqvae_lower_trans_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_lower_trans/net_300000.pth,
+  vqvae_reverse_decoder_dilation: true, vqvae_squeeze_scale: 4, vqvae_type: rvqvae,
+  vqvae_upper_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_beatx2/RVQVAE_upper/net_300000.pth,
+  warmup_epochs: 0, warmup_lr: 0.0005, wei_weight: 0.0, weight_decay: 0.0, width: 512,
+  word_cache: false, word_dims: 300, word_f: 256, word_index_num: 11195, word_rep: textgrid,
+  z_type: speaker}

ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.txt ADDED Viewed

	@@ -0,0 +1,450 @@

+ 10-02 16:51:21 | {'a_encoder': None,
+ 'a_fix_pre': False,
+ 'a_pre_encoder': None,
+ 'acc': 1,
+ 'acc_weight': 0.0,
+ 'additional_data': False,
+ 'adv_weight': 20.0,
+ 'ali_weight': 0.0,
+ 'amsgrad': False,
+ 'apex': False,
+ 'asmr': 0.0,
+ 'atcont': 0.0,
+ 'atmr': 0.0,
+ 'aud_prob': 1.0,
+ 'audio_dims': 1,
+ 'audio_f': 256,
+ 'audio_fps': 16000,
+ 'audio_norm': False,
+ 'audio_rep': 'onset+amplitude',
+ 'audio_sr': 16000,
+ 'batch_size': 40,
+ 'beat_align': True,
+ 'benchmark': True,
+ 'cache_only': False,
+ 'cache_path': 'datasets/beat_cache/beat_smplx_en_emage_2_128/',
+ 'cf': 0.0,
+ 'ch': 1.0,
+ 'cl': 1.0,
+ 'clean_final_seconds': 0,
+ 'clean_first_seconds': 0,
+ 'commit': 0.02,
+ 'config': 'configs/diffusion_rvqvae_128.yaml',
+ 'csv_name': 'a2g_0',
+ 'cu': 1.0,
+ 'cudnn_enabled': True,
+ 'd_lr_weight': 0.2,
+ 'd_name': None,
+ 'data_path': './datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/',
+ 'data_path_1': './datasets/hub/',
+ 'dataset': 'beat_sep_lower',
+ 'ddp': False,
+ 'debug': False,
+ 'decay_epochs': 500,
+ 'decay_rate': 0.1,
+ 'decode_fusion': None,
+ 'depth': 3,
+ 'deterministic': True,
+ 'dilation_growth_rate': 3,
+ 'disable_filtering': False,
+ 'div_reg_weight': 0.0,
+ 'downs_t': [3],
+ 'dropout_prob': 0.3,
+ 'e_name': 'VAESKConv',
+ 'e_path': 'weights/AESKConv_240_100.bin',
+ 'emb_width': 512,
+ 'emo_rep': None,
+ 'emotion_dims': 8,
+ 'emotion_f': 0,
+ 'epoch_stage': 0,
+ 'epochs': 2000,
+ 'eval_model': 'motion_representation',
+ 'f_encoder': 'null',
+ 'f_fix_pre': False,
+ 'f_pre_encoder': 'null',
+ 'fac_prob': 1.0,
+ 'facial_dims': 100,
+ 'facial_f': 0,
+ 'facial_fps': 15,
+ 'facial_norm': False,
+ 'facial_rep': 'smplxflame_30',
+ 'fid_weight': 0.0,
+ 'finger_net': 'original',
+ 'freeze_wordembed': False,
+ 'fsmr': 0.0,
+ 'ftmr': 0.0,
+ 'fusion_mode': 'sum',
+ 'g_name': 'MDM',
+ 'gap_weight': 0.0,
+ 'gpus': [0],
+ 'grad_norm': 0.99,
+ 'hidden_size': 768,
+ 'hvqvae_multipliers': [1],
+ 'id_rep': 'onehot',
+ 'input_context': 'both',
+ 'is_train': True,
+ 'ita_weight': 0.0,
+ 'iwa_weight': 0.0,
+ 'joint_channel': 3,
+ 'kld_aud_weight': 0.0,
+ 'kld_fac_weight': 0.0,
+ 'kld_weight': 0.0,
+ 'l': 4,
+ 'l_bins': 512,
+ 'l_mu': 0.99,
+ 'levels': 1,
+ 'lf': 3.0,
+ 'lh': 3.0,
+ 'll': 3.0,
+ 'loader_workers': 0,
+ 'log_period': 10,
+ 'loss_contrastive_neg_weight': 0.005,
+ 'loss_contrastive_pos_weight': 0.2,
+ 'loss_gan_weight': 5.0,
+ 'loss_kld_weight': 0.1,
+ 'loss_physical_weight': 0.0,
+ 'loss_reg_weight': 0.05,
+ 'loss_regression_weight': 70.0,
+ 'lr_base': 5e-05,
+ 'lr_min': 1e-07,
+ 'lr_policy': 'step',
+ 'lu': 3.0,
+ 'm_conv': 1.0,
+ 'm_decoder': None,
+ 'm_encoder': 'null',
+ 'm_fix_pre': False,
+ 'm_pre_encoder': 'null',
+ 'mean_pose_path': './mean_std/beatx_2_330_mean.npy',
+ 'mean_trans_path': './mean_std/beatx_2_trans_mean.npy',
+ 'model': 'denoiser',
+ 'momentum': 0.8,
+ 'motion_f': 256,
+ 'msmr': 0.0,
+ 'mtmr': 0.0,
+ 'multi_length_training': [1.0],
+ 'n_layer': 1,
+ 'n_poses': 34,
+ 'n_pre_poses': 4,
+ 'name': '1002_165120_diffusion_rvqvae_128',
+ 'nesterov': True,
+ 'new_cache': False,
+ 'no_adv_epoch': 999,
+ 'notes': '',
+ 'opt': 'adam',
+ 'opt_betas': [0.5, 0.999],
+ 'ori_joints': 'beat_smplx_joints',
+ 'out_path': './outputs/audio2pose/',
+ 'pos_encoding_type': 'sin',
+ 'pos_prob': 1.0,
+ 'pose_dims': 330,
+ 'pose_fps': 30,
+ 'pose_length': 128,
+ 'pose_norm': True,
+ 'pose_rep': 'smplxflame_30',
+ 'pre_frames': 4,
+ 'pre_type': 'zero',
+ 'pretrain': False,
+ 'project': 's2g',
+ 'queue_size': 1024,
+ 'random_seed': 2021,
+ 'rec_aud_weight': 0.0,
+ 'rec_fac_weight': 0.0,
+ 'rec_pos_weight': 0.0,
+ 'rec_txt_weight': 0.0,
+ 'rec_ver_weight': 0.0,
+ 'rec_weight': 1.0,
+ 'root_path': './',
+ 'root_weight': 1.0,
+ 'rot6d': True,
+ 'sample_length': 34,
+ 'sem_rep': None,
+ 'sparse': 1,
+ 'speaker_dims': 4,
+ 'speaker_f': 0,
+ 'speaker_id': 'onehot',
+ 'stat': 'ts',
+ 'std_pose_path': './mean_std/beatx_2_330_std.npy',
+ 'std_trans_path': './mean_std/beatx_2_trans_std.npy',
+ 'stride': 20,
+ 'strides_t': [2],
+ 't_encoder': 'null',
+ 't_fix_pre': False,
+ 't_pre_encoder': 'fasttext',
+ 'tar_joints': 'beat_smplx_full',
+ 'test_ckpt': './ckpt/beatx2_cospeech_diffusion/last_500.bin',
+ 'test_data_path': '/datasets/trinity/test/',
+ 'test_length': 128,
+ 'test_period': 20,
+ 'train_data_path': '/datasets/trinity/train/',
+ 'train_trans': True,
+ 'trainer': 'diffusion_rvqvae',
+ 'training_speakers': [2],
+ 'tsmr': 0.0,
+ 'ttmr': 0.0,
+ 'txt_prob': 1.0,
+ 'use_amass': False,
+ 'use_aug': False,
+ 'use_bottleneck': True,
+ 'use_motionclip': False,
+ 'use_trans': True,
+ 'vae_codebook_size': 256,
+ 'vae_grow': [1, 1, 2, 1],
+ 'vae_layer': 4,
+ 'vae_length': 240,
+ 'vae_quantizer_lambda': 1.0,
+ 'vae_test_dim': 330,
+ 'vae_test_len': 32,
+ 'vae_test_stride': 20,
+ 'val_data_path': '/datasets/trinity/val/',
+ 'variational': False,
+ 'vel': 1,
+ 'vel_weight': 0.0,
+ 'vqvae_ckpt': None,
+ 'vqvae_hands_path': './ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth',
+ 'vqvae_latent_scale': 5.0,
+ 'vqvae_lower_path': './ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth',
+ 'vqvae_lower_trans_path': './ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth',
+ 'vqvae_reverse_decoder_dilation': True,
+ 'vqvae_squeeze_scale': 4,
+ 'vqvae_type': 'rvqvae',
+ 'vqvae_upper_path': './ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth',
+ 'warmup_epochs': 0,
+ 'warmup_lr': 0.0005,
+ 'wei_weight': 0.0,
+ 'weight_decay': 0.0,
+ 'width': 512,
+ 'word_cache': False,
+ 'word_dims': 300,
+ 'word_f': 256,
+ 'word_index_num': 11195,
+ 'word_rep': 'textgrid',
+ 'z_type': 'speaker'}
+ 10-02 16:51:21 | # ------------ 1002_165120_diffusion_rvqvae_128 ----------- #
+ 10-02 16:51:21 | PyTorch version: 2.4.1+cu121
+ 10-02 16:51:21 | CUDA version: 12.1
+ 10-02 16:51:21 | 1 GPUs
+ 10-02 16:51:21 | Random Seed: 2021
+ 10-02 16:51:25 | Audio bit rate: 16000
+ 10-02 16:51:25 | Reading data './datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/'...
+ 10-02 16:51:25 | Creating the dataset cache...
+ 10-02 16:51:25 | Found the cache ./datasets/beat_cache/beat_smplx_en_emage_2_128/train/smplxflame_30_cache
+ 10-02 16:51:25 | Init train dataloader success
+ 10-02 16:51:25 | Init val dataloader success
+ 10-02 16:51:25 | Audio bit rate: 16000
+ 10-02 16:51:25 | Reading data './datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/'...
+ 10-02 16:51:25 | Creating the dataset cache...
+ 10-02 16:51:25 | Found the cache ./datasets/beat_cache/beat_smplx_en_emage_2_128/test/smplxflame_30_cache
+ 10-02 16:51:25 | Init test dataloader success
+ 10-02 16:51:25 | DataParallel(
+  (module): MDM(
+    (WavEncoder): WavEncoder(
+      (feat_extractor): Sequential(
+        (0): BasicBlock(
+          (conv1): Conv1d(2, 64, kernel_size=(15,), stride=(5,), padding=(1700,))
+          (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act1): LeakyReLU(negative_slope=0.01, inplace=True)
+          (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act2): LeakyReLU(negative_slope=0.01, inplace=True)
+          (downsample): Sequential(
+            (0): Conv1d(2, 64, kernel_size=(15,), stride=(5,), padding=(1700,))
+            (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          )
+        )
+        (1): BasicBlock(
+          (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))
+          (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act1): LeakyReLU(negative_slope=0.01, inplace=True)
+          (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act2): LeakyReLU(negative_slope=0.01, inplace=True)
+          (downsample): Sequential(
+            (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))
+            (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          )
+        )
+        (2): BasicBlock(
+          (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act1): LeakyReLU(negative_slope=0.01, inplace=True)
+          (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act2): LeakyReLU(negative_slope=0.01, inplace=True)
+        )
+        (3): BasicBlock(
+          (conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))
+          (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act1): LeakyReLU(negative_slope=0.01, inplace=True)
+          (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act2): LeakyReLU(negative_slope=0.01, inplace=True)
+          (downsample): Sequential(
+            (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))
+            (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          )
+        )
+        (4): BasicBlock(
+          (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act1): LeakyReLU(negative_slope=0.01, inplace=True)
+          (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act2): LeakyReLU(negative_slope=0.01, inplace=True)
+        )
+        (5): BasicBlock(
+          (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))
+          (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act1): LeakyReLU(negative_slope=0.01, inplace=True)
+          (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))
+          (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (act2): LeakyReLU(negative_slope=0.01, inplace=True)
+          (downsample): Sequential(
+            (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))
+            (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          )
+        )
+      )
+    )
+    (text_encoder_body): Linear(in_features=300, out_features=256, bias=True)
+    (text_pre_encoder_body): Embedding(11195, 300)
+    (sequence_pos_encoder): PositionalEncoding(
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (mytimmblocks): ModuleList(
+      (0-7): 8 x Block(
+        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): Attention(
+          (qkv): Linear(in_features=512, out_features=1536, bias=False)
+          (q_norm): Identity()
+          (k_norm): Identity()
+          (attn_drop): Dropout(p=0.0, inplace=False)
+          (proj): Linear(in_features=512, out_features=512, bias=True)
+          (proj_drop): Dropout(p=0.0, inplace=False)
+        )
+        (ls1): Identity()
+        (drop_path1): DropPath(drop_prob=0.100)
+        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Mlp(
+          (fc1): Linear(in_features=512, out_features=1024, bias=True)
+          (act): GELU(approximate='none')
+          (drop1): Dropout(p=0.0, inplace=False)
+          (norm): Identity()
+          (fc2): Linear(in_features=1024, out_features=512, bias=True)
+          (drop2): Dropout(p=0.0, inplace=False)
+        )
+        (ls2): Identity()
+        (drop_path2): DropPath(drop_prob=0.100)
+      )
+    )
+    (embed_timestep): TimestepEmbedder(
+      (sequence_pos_encoder): PositionalEncoding(
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (time_embed): Sequential(
+        (0): Linear(in_features=512, out_features=512, bias=True)
+        (1): SiLU()
+        (2): Linear(in_features=512, out_features=512, bias=True)
+      )
+    )
+    (embed_style): Linear(in_features=6, out_features=64, bias=True)
+    (embed_text): Linear(in_features=6144, out_features=512, bias=True)
+    (output_process): OutputProcess(
+      (poseFinal): Linear(in_features=512, out_features=1536, bias=True)
+    )
+    (rel_pos): SinusoidalEmbeddings()
+    (input_process): InputProcess(
+      (poseEmbedding): Linear(in_features=1536, out_features=512, bias=True)
+    )
+    (input_process2): Linear(in_features=1280, out_features=512, bias=True)
+    (mix_audio_text): Linear(in_features=512, out_features=256, bias=True)
+  )
+)
+ 10-02 16:51:25 | init MDM success
+ 10-02 16:51:26 | load self-pretrained checkpoints for VAESKConv
+ 10-02 16:51:26 | load self-pretrained checkpoints for VAESKConv
+ 10-02 16:51:26 | VAESKConv(
+  (encoder): LocalEncoder(
+    (layers): ModuleList(
+      (0): Sequential(
+        (0): SkeletonResidual(
+          (residual): Sequential(
+            (0): SkeletonConv()
+            (1): GroupNorm(10, 330, eps=1e-05, affine=True)
+          )
+          (shortcut): SkeletonConv()
+          (common): Sequential(
+            (0): SkeletonPool()
+            (1): Tanh()
+          )
+        )
+      )
+      (1): Sequential(
+        (0): SkeletonResidual(
+          (residual): Sequential(
+            (0): SkeletonConv()
+            (1): GroupNorm(10, 210, eps=1e-05, affine=True)
+          )
+          (shortcut): SkeletonConv()
+          (common): Sequential(
+            (0): SkeletonPool()
+            (1): Tanh()
+          )
+        )
+      )
+      (2-3): 2 x Sequential(
+        (0): SkeletonResidual(
+          (residual): Sequential(
+            (0): SkeletonConv()
+            (1): GroupNorm(10, 240, eps=1e-05, affine=True)
+          )
+          (shortcut): SkeletonConv()
+          (common): Sequential(
+            (0): Tanh()
+          )
+        )
+      )
+    )
+  )
+  (decoder): VQDecoderV3(
+    (main): Sequential(
+      (0): ResBlock(
+        (model): Sequential(
+          (0): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+          (1): LeakyReLU(negative_slope=0.2, inplace=True)
+          (2): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+        )
+      )
+      (1): ResBlock(
+        (model): Sequential(
+          (0): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+          (1): LeakyReLU(negative_slope=0.2, inplace=True)
+          (2): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+        )
+      )
+      (2): Upsample(scale_factor=2.0, mode='nearest')
+      (3): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+      (4): LeakyReLU(negative_slope=0.2, inplace=True)
+      (5): Upsample(scale_factor=2.0, mode='nearest')
+      (6): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+      (7): LeakyReLU(negative_slope=0.2, inplace=True)
+      (8): Upsample(scale_factor=2.0, mode='nearest')
+      (9): Conv1d(240, 240, kernel_size=(3,), stride=(1,), padding=(1,))
+      (10): LeakyReLU(negative_slope=0.2, inplace=True)
+      (11): Upsample(scale_factor=2.0, mode='nearest')
+      (12): Conv1d(240, 330, kernel_size=(3,), stride=(1,), padding=(1,))
+      (13): LeakyReLU(negative_slope=0.2, inplace=True)
+      (14): Conv1d(330, 330, kernel_size=(3,), stride=(1,), padding=(1,))
+    )
+  )
+  (fc_mu): Linear(in_features=240, out_features=240, bias=True)
+  (fc_logvar): Linear(in_features=240, out_features=240, bias=True)
+)
+ 10-02 16:51:26 | init VAESKConv success
+ 10-02 16:51:26 | load self-pretrained checkpoints for VAESKConv
+ 10-02 16:51:27 | load self-pretrained checkpoints for MDM
+ 10-02 17:20:08 | l2 loss: 0.0
+ 10-02 17:20:08 | lvel loss: 0.0
+ 10-02 17:20:09 | fid score: 0.46520193405646815
+ 10-02 17:20:09 | align score: 0.7362512849757596
+ 10-02 17:20:09 | l1div score: 12.308517456054688
+ 10-02 17:20:09 | total inference time: 1721 s for 945 s motion

ckpt/beatx2_cospeech_diffusion/1002_165120_diffusion_rvqvae_128.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+{a_encoder: null, a_fix_pre: false, a_pre_encoder: null, acc: 1, acc_weight: 0.0,
+  additional_data: false, adv_weight: 20.0, ali_weight: 0.0, amsgrad: false, apex: false,
+  asmr: 0.0, atcont: 0.0, atmr: 0.0, aud_prob: 1.0, audio_dims: 1, audio_f: 256, audio_fps: 16000,
+  audio_norm: false, audio_rep: onset+amplitude, audio_sr: 16000, batch_size: 40,
+  beat_align: true, benchmark: true, cache_only: false, cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/,
+  cf: 0.0, ch: 1.0, cl: 1.0, clean_final_seconds: 0, clean_first_seconds: 0, commit: 0.02,
+  config: configs/diffusion_rvqvae_128.yaml, csv_name: a2g_0, cu: 1.0, cudnn_enabled: true,
+  d_lr_weight: 0.2, d_name: null, data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/,
+  data_path_1: ./datasets/hub/, dataset: beat_sep_lower, ddp: false, debug: false,
+  decay_epochs: 500, decay_rate: 0.1, decode_fusion: null, depth: 3, deterministic: true,
+  dilation_growth_rate: 3, disable_filtering: false, div_reg_weight: 0.0, downs_t: [
+    3], dropout_prob: 0.3, e_name: VAESKConv, e_path: weights/AESKConv_240_100.bin,
+  emb_width: 512, emo_rep: null, emotion_dims: 8, emotion_f: 0, epoch_stage: 0, epochs: 2000,
+  eval_model: motion_representation, f_encoder: 'null', f_fix_pre: false, f_pre_encoder: 'null',
+  fac_prob: 1.0, facial_dims: 100, facial_f: 0, facial_fps: 15, facial_norm: false,
+  facial_rep: smplxflame_30, fid_weight: 0.0, finger_net: original, freeze_wordembed: false,
+  fsmr: 0.0, ftmr: 0.0, fusion_mode: sum, g_name: MDM, gap_weight: 0.0, gpus: [0],
+  grad_norm: 0.99, hidden_size: 768, hvqvae_multipliers: [1], id_rep: onehot, input_context: both,
+  is_train: true, ita_weight: 0.0, iwa_weight: 0.0, joint_channel: 3, kld_aud_weight: 0.0,
+  kld_fac_weight: 0.0, kld_weight: 0.0, l: 4, l_bins: 512, l_mu: 0.99, levels: 1,
+  lf: 3.0, lh: 3.0, ll: 3.0, loader_workers: 0, log_period: 10, loss_contrastive_neg_weight: 0.005,
+  loss_contrastive_pos_weight: 0.2, loss_gan_weight: 5.0, loss_kld_weight: 0.1, loss_physical_weight: 0.0,
+  loss_reg_weight: 0.05, loss_regression_weight: 70.0, lr_base: 5.0e-05, lr_min: 1.0e-07,
+  lr_policy: step, lu: 3.0, m_conv: 1.0, m_decoder: null, m_encoder: 'null', m_fix_pre: false,
+  m_pre_encoder: 'null', mean_pose_path: ./mean_std/beatx_2_330_mean.npy, mean_trans_path: ./mean_std/beatx_2_trans_mean.npy,
+  model: denoiser, momentum: 0.8, motion_f: 256, msmr: 0.0, mtmr: 0.0, multi_length_training: [
+    1.0], n_layer: 1, n_poses: 34, n_pre_poses: 4, name: 1002_165120_diffusion_rvqvae_128,
+  nesterov: true, new_cache: false, no_adv_epoch: 999, notes: '', opt: adam, opt_betas: [
+    0.5, 0.999], ori_joints: beat_smplx_joints, out_path: ./outputs/audio2pose/, pos_encoding_type: sin,
+  pos_prob: 1.0, pose_dims: 330, pose_fps: 30, pose_length: 128, pose_norm: true,
+  pose_rep: smplxflame_30, pre_frames: 4, pre_type: zero, pretrain: false, project: s2g,
+  queue_size: 1024, random_seed: 2021, rec_aud_weight: 0.0, rec_fac_weight: 0.0, rec_pos_weight: 0.0,
+  rec_txt_weight: 0.0, rec_ver_weight: 0.0, rec_weight: 1.0, root_path: ./, root_weight: 1.0,
+  rot6d: true, sample_length: 34, sem_rep: null, sparse: 1, speaker_dims: 4, speaker_f: 0,
+  speaker_id: onehot, stat: ts, std_pose_path: ./mean_std/beatx_2_330_std.npy, std_trans_path: ./mean_std/beatx_2_trans_std.npy,
+  stride: 20, strides_t: [2], t_encoder: 'null', t_fix_pre: false, t_pre_encoder: fasttext,
+  tar_joints: beat_smplx_full, test_ckpt: ./ckpt/beatx2_cospeech_diffusion/last_500.bin,
+  test_data_path: /datasets/trinity/test/, test_length: 128, test_period: 20, train_data_path: /datasets/trinity/train/,
+  train_trans: true, trainer: diffusion_rvqvae, training_speakers: [2], tsmr: 0.0,
+  ttmr: 0.0, txt_prob: 1.0, use_amass: false, use_aug: false, use_bottleneck: true,
+  use_motionclip: false, use_trans: true, vae_codebook_size: 256, vae_grow: [1, 1,
+    2, 1], vae_layer: 4, vae_length: 240, vae_quantizer_lambda: 1.0, vae_test_dim: 330,
+  vae_test_len: 32, vae_test_stride: 20, val_data_path: /datasets/trinity/val/, variational: false,
+  vel: 1, vel_weight: 0.0, vqvae_ckpt: null, vqvae_hands_path: ./ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth,
+  vqvae_latent_scale: 5.0, vqvae_lower_path: ./ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth,
+  vqvae_lower_trans_path: ./ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth,
+  vqvae_reverse_decoder_dilation: true, vqvae_squeeze_scale: 4, vqvae_type: rvqvae,
+  vqvae_upper_path: ./ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth, warmup_epochs: 0,
+  warmup_lr: 0.0005, wei_weight: 0.0, weight_decay: 0.0, width: 512, word_cache: false,
+  word_dims: 300, word_f: 256, word_index_num: 11195, word_rep: textgrid, z_type: speaker}

ckpt/beatx2_cospeech_diffusion/last_500.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d79b6fd3e412f7e3cb61eb6795ff686f6cdf80d32ce2bf941cd985d8cae24cc1
+size 128770342

ckpt/beatx2_rvqvae/RVQVAE_hands/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4eb84ff69009be0b3e68419c5382aa10443b73739dfe2e2928b046e2db59a8b
+size 83048747

ckpt/beatx2_rvqvae/RVQVAE_hands/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx2_rvqvae/RVQVAE_lower/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a29217af4f33b7b50ae9aebfdfc2bf2c0e80bed48316ab218cdc40043bb03d20
+size 81499947

ckpt/beatx2_rvqvae/RVQVAE_lower/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx2_rvqvae/RVQVAE_lower_trans/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb81af9ebd6c34b473db39e4c343e76fb3b30e4dbbab60d56460544f9cea7f6f
+size 81536811

ckpt/beatx2_rvqvae/RVQVAE_lower_trans/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx2_rvqvae/RVQVAE_upper/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:959d066138b293455a98fb0175b1fe2fcc31da9a1de83c9bfbf093ffea746a0e
+size 81794923

ckpt/beatx2_rvqvae/RVQVAE_upper/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx_1-30_amass_h3d_diffusion/0402_095910_diffusion_h3d_new_main-tmr-h3d623.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+{a_encoder: null, a_fix_pre: false, a_pre_encoder: null, acc: 1, acc_weight: 0.0,
+  additional_data: false, adv_weight: 20.0, ali_weight: 0.0, amsgrad: false, apex: false,
+  asmr: 0.0, atcont: 0.0, atmr: 0.0, aud_prob: 1.0, audio_dims: 1, audio_f: 256, audio_fps: 16000,
+  audio_norm: false, audio_rep: onset+amplitude, audio_scale: 1.0, audio_sr: 16000,
+  batch_size: 200, beat_align: true, benchmark: true, cache_only: false, cache_path: datasets/beat_cache/h3d623_smplx_en_emage/,
+  cf: 0.0, ch: 1.0, cl: 1.0, clean_final_seconds: 0, clean_first_seconds: 0, commit: 0.02,
+  config: configs/diffusion_h3d_new.yaml, csv_name: a2g_0, cu: 1.0, cudnn_enabled: true,
+  d_lr_weight: 0.2, d_name: null, data_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/,
+  data_path_1: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/datasets/hub/,
+  dataset: beat_sep_lower_new, ddp: false, debug: false, decay_epochs: 200, decay_rate: 0.1,
+  decode_fusion: null, depth: 3, deterministic: true, dilation_growth_rate: 3, disable_filtering: false,
+  div_reg_weight: 0.0, downs_t: [3], dropout_prob: 0.3, e_name: VAESKConv, e_path: weights/AESKConv_240_100.bin,
+  emb_width: 512, emo_rep: null, emotion_dims: 8, emotion_f: 0, epoch_stage: 0, epochs: 2000,
+  eval: false, eval_model: motion_representation, f_encoder: 'null', f_fix_pre: false,
+  f_pre_encoder: 'null', fac_prob: 1.0, facial_dims: 100, facial_f: 0, facial_fps: 15,
+  facial_norm: false, facial_rep: smplxflame_30, fid_weight: 0.0, finger_net: original,
+  freeze_wordembed: false, fsmr: 0.0, ftmr: 0.0, fusion_mode: sum, g_name: MDM, gap_weight: 0.0,
+  gpus: [0], grad_norm: 0.99, hidden_size: 768, hvqvae_multipliers: [1], id_rep: onehot,
+  input_context: both, is_train: true, ita_weight: 0.0, iwa_weight: 0.0, joint_channel: 3,
+  kld_aud_weight: 0.0, kld_fac_weight: 0.0, kld_weight: 0.0, l: 4, l_bins: 512, l_mu: 0.99,
+  levels: 1, lf: 3.0, lh: 3.0, ll: 3.0, loader_workers: 0, log_period: 10, loss_contrastive_neg_weight: 0.005,
+  loss_contrastive_pos_weight: 0.2, loss_gan_weight: 5.0, loss_kld_weight: 0.1, loss_physical_weight: 0.0,
+  loss_reg_weight: 0.05, loss_regression_weight: 70.0, lr_base: 5.0e-05, lr_min: 1.0e-07,
+  lr_policy: step, lu: 3.0, m_conv: 1.0, m_decoder: null, m_encoder: 'null', m_fix_pre: false,
+  m_pre_encoder: 'null', mean_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Mean_new.npy,
+  model: denoiser, momentum: 0.8, motion_f: 256, motionclip_path: /mnt/fu09a/chenbohong/gdc/MotionCLIP/exps/my-paper-model-smplx-64-BEATX/checkpoint_0080.pth.tar,
+  msmr: 0.0, mtmr: 0.0, multi_length_training: [1.0], n_layer: 1, n_poses: 34, n_pre_poses: 4,
+  name: 0402_095910_diffusion_h3d_new_main-tmr-h3d623, nesterov: true, new_cache: true,
+  no_adv_epoch: 999, notes: '', only_data: 0, opt: adam, opt_betas: [0.5, 0.999],
+  ori_joints: beat_smplx_joints, out_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/,
+  pos_encoding_type: sin, pos_prob: 1.0, pose_dims: 330, pose_fps: 30, pose_length: 128,
+  pose_norm: true, pose_rep: h3d623, pre_frames: 4, pre_type: zero, pretrain: false,
+  project: s2g, prompt_scale: 1.0, queue_size: 1024, random_seed: 2021, rec_aud_weight: 0.0,
+  rec_fac_weight: 0.0, rec_pos_weight: 0.0, rec_txt_weight: 0.0, rec_ver_weight: 0.0,
+  rec_weight: 1.0, root_path: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/,
+  root_weight: 1.0, rot6d: true, sample_length: 34, sem_rep: null, sparse: 1, speaker_dims: 4,
+  speaker_f: 0, speaker_id: onehot, stat: ts, std_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Std_new.npy,
+  stride: 20, strides_t: [2], t_encoder: 'null', t_fix_pre: false, t_pre_encoder: fasttext,
+  tar_joints: beat_smplx_full, test_ckpt: /mnt/fu09a/chenbohong/PantoMatrix/scripts/EMAGE_2024/outputs/audio2pose/custom/0329_195328_diffusion_h3d_main-tmr-h3d623/last_400.bin,
+  test_data_path: /datasets/trinity/test/, test_length: 64, test_period: 20, text_sample_stride: 0,
+  tmr_mean_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Mean.npy,
+  tmr_std_pose_path: /mnt/fu09a/chenbohong/gdc/momask-codes/dataset/BEAT_HumanML3D/Std.npy,
+  train_data_path: /datasets/trinity/train/, train_trans: true, trainer: h3d_diffusion,
+  training_speakers: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], tsmr: 0.0, ttmr: 0.0, txt_prob: 1.0,
+  use_amass: true, use_aug: false, use_bottleneck: true, vae_codebook_size: 256, vae_grow: [
+    1, 1, 2, 1], vae_layer: 4, vae_length: 240, vae_quantizer_lambda: 1.0, vae_test_dim: 330,
+  vae_test_len: 32, vae_test_stride: 20, val_data_path: /datasets/trinity/val/, variational: false,
+  vel: 1, vel_weight: 0.0, vqvae_ckpt: null, vqvae_hands_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_new_HB/RVQVAE_hands/net_300000.pth,
+  vqvae_latent_scale: 10.0, vqvae_lower_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_new_HB/RVQVAE_lower/net_300000.pth,
+  vqvae_reverse_decoder_dilation: true, vqvae_upper_path: /mnt/fu09a/chenbohong/gdc/T2M-GPT/output_new_HB/RVQVAE_upper/net_300000.pth,
+  warmup_epochs: 0, warmup_lr: 0.0005, wei_weight: 0.0, weight_decay: 0.0, width: 512,
+  word_cache: false, word_dims: 300, word_f: 256, word_index_num: 11195, word_rep: textgrid,
+  z_type: speaker}

ckpt/beatx_1-30_amass_h3d_diffusion/events.out.tfevents.1712023151.FU09.51754.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c3039924ad15287a632b3e0f919ed7e652db2a1dffb64f262beb85914a183b3
+size 3750982

ckpt/beatx_1-30_amass_h3d_diffusion/last_600.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ebef33ac1f131420de05565cd9927313446eadc3bd5a2ab4cc436a09da368bf
+size 130348642

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/events.out.tfevents.1711936445.FU09.1750416.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffb969ab2e40916e0d6cf2a3b015f89a4c714ea4f5215031485bf30b688ca7c8
+size 232345

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19cc2cbb68371a91c4107dce073d25feda56af1acc2b8ce938120738d423c8d6
+size 85261291

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_hands/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/events.out.tfevents.1711936412.FU09.1747951.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6511cf3be93e318cc01d3995ed69636250907f1cccd705babccac92964a6b2b9
+size 232345

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8de38c6be9c927a5d6df1c4e03489d704b961cf6f4777dcf31379381192cfcfe
+size 82151403

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_lower/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/events.out.tfevents.1711936499.FU09.1757898.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f636dde1b675a489ac993ab9f9eb1d2c36ac0da05dfcb804e77de4b9a23dd1a
+size 232345

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/net_300000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25657855dace8008f769fd6675fdd595a7a3464a0b4ea1def294ed171bc55ca2
+size 82753707

ckpt/beatx_1-30_amass_h3d_rvqvae/RVQVAE_upper/run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/beatx_1-30_amass_h3d_tmr/log ADDED Viewed

	@@ -0,0 +1,44 @@

+**protocol A**
+T2M
+|   Metrics   |  Recall @1 |  Recall @2 |  Recall @3 |  Recall @5 |  Recall @10 |
+|-------------|  --------- |  --------- |  --------- |  --------- |  --------- |
+|epoch_99 |  4.135   |7.804   |10.679   |16.183   |27.245   |
+|epoch_299 |  4.929   |8.653   |12.103   |17.798   |29.189   |
+|epoch_399 |  4.765   |8.516   |12.432   |18.784   |29.217   |
+M2T
+|   Metrics   |  Recall @1 |  Recall @2 |  Recall @3 |  Recall @5 |  Recall @10 |
+|-------------|  --------- |  --------- |  --------- |  --------- |  --------- |
+|epoch_99 |  4.436   |8.762   |11.829   |17.579   |27.656   |
+|epoch_299 |  5.285   |9.912   |12.870   |19.168   |29.874   |
+|epoch_399 |  5.449   |9.365   |13.226   |19.387   |30.038   |
+**protocol B**
+T2M
+|   Metrics   |  Recall @1 |  Recall @2 |  Recall @3 |  Recall @5 |  Recall @10 |
+|-------------|  --------- |  --------- |  --------- |  --------- |  --------- |
+|epoch_99 |  7.996   |13.472   |17.662   |24.644   |36.692   |
+|epoch_299 |  8.899   |14.321   |18.894   |25.575   |37.158   |
+|epoch_399 |  9.337   |14.047   |19.250   |26.643   |37.295   |
+M2T
+|   Metrics   |  Recall @1 |  Recall @2 |  Recall @3 |  Recall @5 |  Recall @10 |
+|-------------|  --------- |  --------- |  --------- |  --------- |  --------- |
+|epoch_99 |  7.996   |12.623   |16.238   |22.344   |32.421   |
+|epoch_299 |  9.721   |14.266   |17.470   |23.768   |34.858   |
+|epoch_399 |  9.775   |14.485   |18.264   |24.288   |34.995   |
+**protocol D**
+T2M
+|   Metrics   |  Recall @1 |  Recall @2 |  Recall @3 |  Recall @5 |  Recall @10 |
+|-------------|  --------- |  --------- |  --------- |  --------- |  --------- |
+|epoch_99 |  62.815   |79.409   |85.433   |91.375   |96.933   |
+|epoch_299 |  62.240   |77.026   |83.571   |88.828   |94.003   |
+|epoch_399 |  60.570   |76.369   |82.558   |87.760   |93.182   |
+M2T
+|   Metrics   |  Recall @1 |  Recall @2 |  Recall @3 |  Recall @5 |  Recall @10 |
+|-------------|  --------- |  --------- |  --------- |  --------- |  --------- |
+|epoch_99 |  65.307   |79.463   |86.884   |92.223   |96.742   |
+|epoch_299 |  62.924   |77.820   |83.708   |89.677   |93.894   |
+|epoch_399 |  62.240   |76.670   |82.284   |87.952   |93.127   |

ckpt/beatx_1-30_amass_h3d_tmr/motion_epoch=299.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcd2b8a7dca0c7855b14029407d5c9102ec91ebd71170bb7cfdb8f4e06671141
+size 18416565

ckpt/beatx_1-30_amass_h3d_tmr/text_epoch=299.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8ac085ed2edf47ed6f20b3ae5ec8c89b98e90237315663f8c17c9f69934b3ed
+size 284055884

ckpt/distilbert-base-uncased/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_name_or_path": "distilbert-base-uncased",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertModel"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.29.1",
+  "vocab_size": 30522
+}

ckpt/distilbert-base-uncased/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1df15ef6ca103a52df977eec51dd1058d5f6a2fdf5b3ae5d2e7fc225e9801143
+size 265483293

ckpt/distilbert-base-uncased/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

ckpt/distilbert-base-uncased/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpt/distilbert-base-uncased/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

ckpt/distilbert-base-uncased/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff