import torch
import torch.nn.functional as F
from espnet.nets.pytorch_backend.conformer.encoder import Encoder
from torch import nn
from visualizr import logger
from visualizr.model.base import BaseModule


class LSTM(nn.Module):
    """Stacked LSTM followed by a linear projection applied at every timestep.

    Attribute names (``lstm``, ``fc``) are part of the ``state_dict`` layout
    and must not be renamed.
    """

    def __init__(self, motion_dim, output_dim, num_layers=2, hidden_dim=128):
        """Build the recurrent stack and the output projection.

        Args:
            motion_dim: Size of the last dimension of the input sequence.
            output_dim: Size of the last dimension of the output sequence.
            num_layers: Number of stacked LSTM layers.
            hidden_dim: Hidden size of each LSTM layer.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=motion_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run ``x`` through the LSTM and project every timestep.

        Args:
            x: Batch-first input sequence whose last dimension is
                ``motion_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_dim``.
        """
        # The final hidden/cell states are not needed; only per-step outputs.
        x, _ = self.lstm(x)
        return self.fc(x)


class DiffusionPredictor(BaseModule):
    """Speech-conditioned predictor of motion features for a diffusion model.

    Speech features are downsampled (and, for HuBERT input, encoded by a
    conformer), optionally adjusted by pose / location / scale residuals,
    concatenated with conditioning features and decoded by a conformer into
    ``conf.motion_dim`` outputs per frame.

    The behaviour is selected by ``conf.infer_type``:

    * a ``"mfcc"`` or ``"hubert"`` prefix selects the speech front end;
    * any value other than ``"hubert_audio_only"`` enables pose adjustment;
    * a value containing ``"full_control"`` additionally enables location
      and face-scale adjustment.
    """

    def __init__(self, conf):
        """Create all sub-modules from ``conf``.

        Args:
            conf: Configuration object; this class reads ``infer_type``,
                ``decoder_layers`` and ``motion_dim`` from it.
        """
        super().__init__()
        self.infer_type = conf.infer_type
        self.initialize_layers(conf)
        logger.info(f"infer_type: {self.infer_type}")

    def create_conformer_encoder(self, attention_dim, num_blocks):
        """Build an ESPnet conformer ``Encoder`` with this model's fixed settings.

        Args:
            attention_dim: Model width; also used for the feed-forward width
                (``linear_units``).
            num_blocks: Number of conformer blocks.

        Returns:
            The constructed ``Encoder``.
        """
        # NOTE(review): ``idim=0`` with ``input_layer=None`` presumably means
        # the encoder consumes already-embedded features of width
        # ``attention_dim`` -- confirm against the ESPnet Encoder docs.
        return Encoder(
            idim=0,
            attention_dim=attention_dim,
            attention_heads=2,
            linear_units=attention_dim,
            num_blocks=num_blocks,
            input_layer=None,
            dropout_rate=0.2,
            positional_dropout_rate=0.2,
            attention_dropout_rate=0.2,
            normalize_before=False,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=3,
            macaron_style=True,
            pos_enc_layer_type="rel_pos",
            selfattention_layer_type="rel_selfattn",
            use_cnn_module=True,
            cnn_module_kernel=13,
        )

    def initialize_layers(
        self,
        conf,
        mfcc_dim=39,
        hubert_dim=1024,
        speech_layers=4,
        speech_dim=512,
        decoder_dim=1024,
        motion_start_dim=512,
        HAL_layers=25,
    ):
        """Instantiate every layer used by :meth:`forward`.

        Args:
            conf: Configuration object (``decoder_layers`` and ``motion_dim``
                are read here).
            mfcc_dim: Channel count of MFCC input features.
            hubert_dim: Channel count of HuBERT input features.
            speech_layers: Number of conformer blocks in the speech encoder.
            speech_dim: Width of the downsampled speech features.
            decoder_dim: Width of the decoder input. Must equal
                ``speech_dim + 4 * 128`` because :meth:`combine_features`
                concatenates the speech features with four 128-wide
                conditioning features.
            motion_start_dim: Width of ``initial_code``.
            HAL_layers: Number of HuBERT layers mixed by the learnable
                softmax weights.
        """
        self.conf = conf

        # Speech downsampling
        if self.infer_type.startswith("mfcc"):
            # from 100 hz to 25 hz
            self.down_sample1 = nn.Conv1d(
                mfcc_dim, 256, kernel_size=3, stride=2, padding=1
            )
            self.down_sample2 = nn.Conv1d(
                256, speech_dim, kernel_size=3, stride=2, padding=1
            )
        elif self.infer_type.startswith("hubert"):
            # from 50 hz to 25 hz
            self.down_sample1 = nn.Conv1d(
                hubert_dim, speech_dim, kernel_size=3, stride=2, padding=1
            )
            # One learnable mixing weight per HuBERT layer (softmaxed in
            # forward); zeros give a uniform mix at initialisation.
            self.weights = nn.Parameter(torch.zeros(HAL_layers))
            self.speech_encoder = self.create_conformer_encoder(
                speech_dim, speech_layers
            )
        else:
            # ``logger.error`` rather than ``logger.exception``: there is no
            # active exception here. Construction stays non-fatal for
            # backward compatibility; ``forward`` raises ``ValueError``.
            logger.error("infer_type not supported")

        # Encoders & Decoders
        self.coarse_decoder = self.create_conformer_encoder(
            decoder_dim, conf.decoder_layers
        )

        # LSTM predictors for Variance Adapter
        if self.infer_type != "hubert_audio_only":
            self.pose_predictor = LSTM(speech_dim, 3)
            self.pose_encoder = LSTM(3, speech_dim)

        if "full_control" in self.infer_type:
            self.location_predictor = LSTM(speech_dim, 1)
            self.location_encoder = LSTM(1, speech_dim)
            self.face_scale_predictor = LSTM(speech_dim, 1)
            self.face_scale_encoder = LSTM(1, speech_dim)

        # Linear transformations
        # The single-layer ``nn.Sequential`` wrappers are kept on purpose:
        # removing them would rename the parameters in the ``state_dict``.
        self.init_code_proj = nn.Sequential(nn.Linear(motion_start_dim, 128))
        self.noisy_encoder = nn.Sequential(nn.Linear(conf.motion_dim, 128))
        self.t_encoder = nn.Sequential(nn.Linear(1, 128))
        self.encoder_direction_code = nn.Linear(conf.motion_dim, 128)

        self.out_proj = nn.Linear(decoder_dim, conf.motion_dim)

    def forward(
        self,
        initial_code,
        direction_code,
        seq_input_vector,
        face_location,
        face_scale,
        yaw_pitch_roll,
        noisy_x,
        t_emb,
        control_flag=False,
    ):
        """Predict motion features from speech and conditioning inputs.

        Args:
            initial_code: Per-sample code of width ``motion_start_dim``.
            direction_code: Per-sample code of width ``conf.motion_dim``.
            seq_input_vector: Speech features. For ``mfcc*`` the last
                dimension is the MFCC channel count; for ``hubert*`` dimension
                1 indexes the HuBERT layers that are mixed -- presumably
                ``(batch, layers, time, hubert_dim)``, confirm with caller.
            face_location: Location control signal (last dimension 1).
            face_scale: Face-scale control signal (last dimension 1).
            yaw_pitch_roll: Pose control signal (last dimension 3).
            noisy_x: Noisy motion sequence of width ``conf.motion_dim``.
            t_emb: Diffusion timestep, one value per sample.
            control_flag: When true, the supplied control signals are used
                instead of the values predicted from speech.

        Returns:
            Tuple ``(outputs, predicted_location, predicted_scale,
            predicted_pose)``. For ``"hubert_audio_only"`` the last three are
            the inputs passed through unchanged.

        Raises:
            ValueError: If ``infer_type`` starts with neither ``"mfcc"`` nor
                ``"hubert"``.
        """
        # ``x`` is deliberately a local: storing it as a module global would
        # share tensors between instances/threads and keep them alive.
        if self.infer_type.startswith("mfcc"):
            x = self.mfcc_speech_downsample(seq_input_vector)
        elif self.infer_type.startswith("hubert"):
            # Softmax-normalised weighted sum over the HuBERT layer axis.
            norm_weights = F.softmax(self.weights, dim=-1)
            weighted_feature = (
                norm_weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                * seq_input_vector
            ).sum(dim=1)
            # Conv1d expects channels before time, hence the transposes.
            x = self.down_sample1(weighted_feature.transpose(1, 2)).transpose(1, 2)
            x, _ = self.speech_encoder(x, masks=None)
        else:
            raise ValueError(f"infer_type not supported: {self.infer_type!r}")

        predicted_location, predicted_scale, predicted_pose = (
            face_location,
            face_scale,
            yaw_pitch_roll,
        )
        if self.infer_type != "hubert_audio_only":
            logger.info(f"pose controllable. control_flag: {control_flag}")
            x, predicted_location, predicted_scale, predicted_pose = (
                self.adjust_features(
                    x, face_location, face_scale, yaw_pitch_roll, control_flag
                )
            )

        # Variable initial_code and direction_code serve as a motion guide
        # extracted from the reference image.
        # This aims to tell the model what the starting motion should be.
        concatenated_features = self.combine_features(
            x, initial_code, direction_code, noisy_x, t_emb
        )
        outputs = self.decode_features(concatenated_features)
        return outputs, predicted_location, predicted_scale, predicted_pose

    def mfcc_speech_downsample(self, seq_input_vector):
        """Downsample MFCC features with two stride-2 convolutions.

        Args:
            seq_input_vector: MFCC features with channels in the last
                dimension.

        Returns:
            Downsampled features with channels back in the last dimension.
        """
        x = self.down_sample1(seq_input_vector.transpose(1, 2))
        return self.down_sample2(x).transpose(1, 2)

    def adjust_features(
        self, x, face_location, face_scale, yaw_pitch_roll, control_flag
    ):
        """Add location, scale and pose residuals to the speech features.

        Location and scale residuals are applied only when ``infer_type``
        contains ``"full_control"``; the pose residual is always applied.
        Each residual is computed from the already-updated ``x``, so the
        order location -> scale -> pose matters.

        Returns:
            Tuple ``(x, predicted_location, predicted_scale, predicted_pose)``.
            ``predicted_location`` and ``predicted_scale`` are the integer
            ``0`` when full control is not enabled.
        """
        predicted_location, predicted_scale = 0, 0
        if "full_control" in self.infer_type:
            logger.info(f"full controllable. control_flag: {control_flag}")
            x_residual, predicted_location = self.adjust_location(
                x, face_location, control_flag
            )
            x = x + x_residual

            x_residual, predicted_scale = self.adjust_scale(
                x, face_scale, control_flag
            )
            x = x + x_residual

        x_residual, predicted_pose = self.adjust_pose(x, yaw_pitch_roll, control_flag)
        x = x + x_residual
        return x, predicted_location, predicted_scale, predicted_pose

    def adjust_location(self, x, face_location, control_flag):
        """Return the encoded location residual and the location used.

        The location is ``face_location`` when ``control_flag`` is true,
        otherwise it is predicted from ``x``.
        """
        if control_flag:
            predicted_location = face_location
        else:
            predicted_location = self.location_predictor(x)
        return self.location_encoder(predicted_location), predicted_location

    def adjust_scale(self, x, face_scale, control_flag):
        """Return the encoded face-scale residual and the scale used.

        The scale is ``face_scale`` when ``control_flag`` is true, otherwise
        it is predicted from ``x``.
        """
        if control_flag:
            predicted_face_scale = face_scale
        else:
            predicted_face_scale = self.face_scale_predictor(x)
        return self.face_scale_encoder(predicted_face_scale), predicted_face_scale

    def adjust_pose(self, x, yaw_pitch_roll, control_flag):
        """Return the encoded pose residual and the pose used.

        The pose is ``yaw_pitch_roll`` when ``control_flag`` is true,
        otherwise it is predicted from ``x``.
        """
        if control_flag:
            predicted_pose = yaw_pitch_roll
        else:
            predicted_pose = self.pose_predictor(x)
        return self.pose_encoder(predicted_pose), predicted_pose

    def combine_features(self, x, initial_code, direction_code, noisy_x, t_emb):
        """Concatenate speech features with the four conditioning features.

        The per-sample conditioning vectors (``initial_code``,
        ``direction_code``, ``t_emb``) are projected to 128 dimensions and
        repeated along the time axis to match ``x``; ``noisy_x`` is projected
        per frame.

        Returns:
            Tensor whose last dimension is ``x.size(-1) + 4 * 128``, ordered
            as (speech, direction code, initial code, noisy input, timestep).
        """
        seq_len = x.size(1)
        init_code_proj = (
            self.init_code_proj(initial_code).unsqueeze(1).repeat(1, seq_len, 1)
        )
        noisy_feature = self.noisy_encoder(noisy_x)
        # assumes t_emb holds one scalar per sample -- TODO confirm; it is
        # reshaped to a width-1 feature for the Linear(1, 128) encoder.
        t_emb_feature = (
            self.t_encoder(t_emb.unsqueeze(1).float())
            .unsqueeze(1)
            .repeat(1, seq_len, 1)
        )
        direction_code_feature = (
            self.encoder_direction_code(direction_code)
            .unsqueeze(1)
            .repeat(1, seq_len, 1)
        )
        return torch.cat(
            (x, direction_code_feature, init_code_proj, noisy_feature, t_emb_feature),
            dim=-1,
        )

    def decode_features(self, concatenated_features):
        """Decode the concatenated features into motion outputs.

        Returns:
            Tensor whose last dimension is ``conf.motion_dim``.
        """
        outputs, _ = self.coarse_decoder(concatenated_features, masks=None)
        return self.out_proj(outputs)