multimodalart (HF Staff) committed
Commit 7758cff · verified · 1 Parent(s): cfcc2fd

Upload 247 files

This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +8 -0
  2. configs/audio2motion/inference/inference.yaml +35 -0
  3. configs/audio2motion/model/audio_processer_config.yaml +36 -0
  4. configs/audio2motion/model/config.yaml +59 -0
  5. configs/audio2motion/model/crop_config.yaml +21 -0
  6. configs/audio2motion/model/liveportrait_config.yaml +59 -0
  7. configs/audio2motion/model/models.yaml +43 -0
  8. requirements.txt +45 -0
  9. src/datasets/mean.pt +3 -0
  10. src/datasets/preprocess/__pycache__/flow_filter.cpython-310.pyc +0 -0
  11. src/datasets/preprocess/__pycache__/video_crop.cpython-310.pyc +0 -0
  12. src/datasets/preprocess/__pycache__/visualize.cpython-310.pyc +0 -0
  13. src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-310.pyc +0 -0
  14. src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-312.pyc +0 -0
  15. src/datasets/preprocess/extract_features/__pycache__/feature_extractor_pipeline.cpython-310.pyc +0 -0
  16. src/datasets/preprocess/extract_features/__pycache__/motion_processer.cpython-310.pyc +0 -0
  17. src/datasets/preprocess/extract_features/__pycache__/test_processer.cpython-310.pyc +0 -0
  18. src/datasets/preprocess/extract_features/audio_processer.py +471 -0
  19. src/datasets/preprocess/extract_features/face_segmentation/__init__.py +88 -0
  20. src/datasets/preprocess/extract_features/face_segmentation/__pycache__/__init__.cpython-310.pyc +0 -0
  21. src/datasets/preprocess/extract_features/face_segmentation/__pycache__/bisenet.cpython-310.pyc +0 -0
  22. src/datasets/preprocess/extract_features/face_segmentation/__pycache__/resnet.cpython-310.pyc +0 -0
  23. src/datasets/preprocess/extract_features/face_segmentation/bisenet.py +285 -0
  24. src/datasets/preprocess/extract_features/face_segmentation/resnet.py +113 -0
  25. src/datasets/preprocess/extract_features/motion_processer.py +1420 -0
  26. src/examples/driving_audios/10.wav +3 -0
  27. src/examples/driving_audios/5.wav +3 -0
  28. src/examples/driving_audios/6.wav +3 -0
  29. src/examples/driving_audios/tmp_5.wav +3 -0
  30. src/examples/reference_images/1.jpg +3 -0
  31. src/examples/reference_images/2.jpg +0 -0
  32. src/examples/reference_images/3.jpg +0 -0
  33. src/examples/reference_images/4.jpg +0 -0
  34. src/examples/reference_images/5.jpg +0 -0
  35. src/examples/reference_images/6.jpg +0 -0
  36. src/examples/reference_images/7.jpg +3 -0
  37. src/examples/silent-audio.wav +3 -0
  38. src/models/audio/__pycache__/audio_processer.cpython-310.pyc +0 -0
  39. src/models/audio/__pycache__/audio_proj.cpython-310.pyc +0 -0
  40. src/models/audio/__pycache__/hubert.cpython-310.pyc +0 -0
  41. src/models/audio/__pycache__/wav2vec.cpython-310.pyc +0 -0
  42. src/models/audio/__pycache__/wav2vec2.cpython-310.pyc +0 -0
  43. src/models/audio/__pycache__/wav2vec_modified.cpython-310.pyc +0 -0
  44. src/models/audio/audio_processer.py +407 -0
  45. src/models/audio/audio_proj.py +124 -0
  46. src/models/audio/hubert.py +120 -0
  47. src/models/audio/hubert2.py +120 -0
  48. src/models/audio/wav2vec.py +210 -0
  49. src/models/audio/wav2vec2.py +123 -0
  50. src/models/audio/wav2vec_modified.py +223 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ src/examples/driving_audios/10.wav filter=lfs diff=lfs merge=lfs -text
+ src/examples/driving_audios/5.wav filter=lfs diff=lfs merge=lfs -text
+ src/examples/driving_audios/6.wav filter=lfs diff=lfs merge=lfs -text
+ src/examples/driving_audios/tmp_5.wav filter=lfs diff=lfs merge=lfs -text
+ src/examples/reference_images/1.jpg filter=lfs diff=lfs merge=lfs -text
+ src/examples/reference_images/7.jpg filter=lfs diff=lfs merge=lfs -text
+ src/examples/silent-audio.wav filter=lfs diff=lfs merge=lfs -text
+ src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/t1.jpg filter=lfs diff=lfs merge=lfs -text
configs/audio2motion/inference/inference.yaml ADDED
@@ -0,0 +1,35 @@
+
+ output_fps: 25
+ ## appearance and motion feature extractor
+ appearance_feature_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/appearance_feature_extractor.pth
+ motion_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/motion_extractor.pth
+ ## SPADEGenerator
+ spade_generator_path: pretrain_weights/decode/v1/first_stage/base_models/spade_generator.pth
+ warping_module_path: pretrain_weights/decode/v1/first_stage/base_models/warping_module.pth
+ ## stitching retargeting module
+ stitching_retargeting_module_path: pretrain_weights/decode/v1/first_stage/retargeting_models/stitching_retargeting_module.pth
+ #
+
+ # audio processer config
+ audio_model_config: configs/audio2motion/model/audio_processer_config.yaml
+
+ # motion processer config
+ motion_processer_config: configs/audio2motion/model/liveportrait_config.yaml
+
+ # motion generator model
+ motion_models_config: configs/audio2motion/model/config.yaml
+ use_ref_kp: False
+ motion_generator_path: pretrain_weights/moda/net-200.pth
+ need_normalized: True
+
+ # other configs
+ device_id: 0
+ batch_size: 100
+
+ source_max_dim: 1280 # the max dim of height and width of source image or video
+ source_division: 2 # make sure the height and width of source image or video can be divided by this number
+ input_height: 256
+ input_width: 256
+ source_fps: 25
+ min_video_length: 50
+ max_video_length: 500
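
Editor's note: the configuration files in this commit are plain YAML and are read with OmegaConf elsewhere in the diff (see AudioProcessor.__init__ in audio_processer.py below). A minimal loading sketch; the printed values simply echo the entries above:

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("configs/audio2motion/inference/inference.yaml")
    print(cfg.output_fps)             # 25
    print(cfg.motion_generator_path)  # pretrain_weights/moda/net-200.pth
    print(cfg.batch_size)             # 100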
configs/audio2motion/model/audio_processer_config.yaml ADDED
@@ -0,0 +1,36 @@
+ # models settings
+ model_params:
+   model_name: hubert # wav2vec or hubert
+   model_type: base # base large
+   is_chinese: True
+   is_original: True
+   only_last_features: False
+   use_audio_separator: False
+   audio_separator_name: Kim_Vocal_2.onnx
+
+ # model weights
+ model_weights:
+   audio_separator_path: pretrain_weights/audio/audio_separator
+   hubert_path:
+     chinese:
+       base: pretrain_weights/audio/chinese-hubert-base
+ # data settings
+ data_params:
+   sample_rate: 16000
+   max_length: 60 # seconds
+   sub_clip_length: 3000 # samples
+   fps: 25
+   sample_strategy: "presample"
+   audio_pad_mode: replicate # pad mode for audio, replicate or zero
+   save_to_cpu: True # saving gpu memory
+
+ # device settings
+ device_params:
+   device_id: 0
+   flag_force_cpu: False
+   flag_use_half_precision: False
+
+ cache_dir: preprocessed/HDTF/vocals
+ tmp_dir: src/tmp
+
+
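
Editor's note: for orientation, the timing values above relate to each other as follows (a small sketch; in audio_processer.py below, sub_clip_length is compared against a frame count, so it appears to be measured in frames rather than raw samples):

    sample_rate, fps = 16000, 25
    audio_unit = sample_rate / fps      # 640.0 audio samples per video frame
    max_length = 60                     # audio is first cut into <= 60 s pieces
    sub_clip_length = 3000              # 3000 frames -> 3000 / fps = 120 s per sub-clip
    print(audio_unit, max_length, sub_clip_length / fps)   # 640.0 60 120.0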
configs/audio2motion/model/config.yaml ADDED
@@ -0,0 +1,59 @@
+ model_name: TalkingHeadDiT-B
+ audio_projector:
+   type: MLP
+   pretrained_model_path: None
+   device: cuda
+   params:
+     model_name: MLP-S-3
+     sequence_length: 1
+     blocks: 12
+     audio_feat_dim: 768
+     keypoint_dim: 63
+     feature_dim: 512
+     output_dim: 256
+     context_tokens: 1
+     audio_embedder_type: simple
+     audio_cond_dim: 63
+ motion_generator:
+   type: DiT
+   pretrained_model_path: None
+   device: cuda
+   params:
+     model_name: DiT-S-8-8
+     architecture: decoder
+     use_emo: True
+     input_dim: 70
+     output_dim: 70
+     exp_dim: 63
+     n_prev_frames: 1
+     n_pred_frames: 80
+     use_indicator: False
+     feature_dim: 256
+     n_heads: 8
+     n_layers: 8
+     mlp_ratio: 4
+     no_use_learnable_pe: True
+     norm_type: rms_norm # [rms_norm|layer_norm]
+     qk_norm: rms_norm # [rms_norm|layer_norm|null]
+     steps: 1000
+ noise_scheduler:
+   type: flow_matching
+   sample_mode: sample
+   device: cuda
+   params:
+     time_shifting: True
+     num_train_timesteps: 1000
+     num_inference_steps: 10
+     eta: 0.2
+     beta_start: 0.0001
+     beta_end: 0.02
+     s: 0.008
+     mode: cosine
+ train:
+   audio_drop_prob: 0.3
+   cond_drop_prob: 0.2
+   motion_drop_prob: 0.3
+   audio_drop_ratio : 0.2
+   motion_drop_ratio: 0.1
+   pre_drop_ratio : 0.0
+   device_specific: True
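
Editor's note: the train block lists independent drop probabilities for the audio, condition, and motion streams. Values like these are typically consumed as classifier-free-guidance-style conditioning dropout; the sketch below only illustrates that pattern, and the drop_condition helper is hypothetical (the actual training loop is not part of this commit):

    import torch

    def drop_condition(cond: torch.Tensor, drop_prob: float) -> torch.Tensor:
        # Zero the conditioning for a random subset of the batch so the model
        # also learns an unconditional branch (illustrative helper only).
        keep = (torch.rand(cond.shape[0], device=cond.device) >= drop_prob).float()
        return cond * keep.view(-1, *([1] * (cond.ndim - 1)))

    audio_feat = torch.randn(4, 80, 256)           # (batch, n_pred_frames, feature_dim)
    audio_feat = drop_condition(audio_feat, 0.3)   # audio_drop_prob from the config above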
configs/audio2motion/model/crop_config.yaml ADDED
@@ -0,0 +1,21 @@
+ insightface_root: pretrain_weights/decode/v1/insightface
+ landmark_ckpt_path: pretrain_weights/decode/v1/first_stage/landmark.onnx
+ xpose_config_file_path: src/utils/UniPose_SwinT.py
+ device_id: 0 # gpu device id
+ flag_force_cpu: False # force cpu inference, WIP
+ det_thresh: 0.15 # detection threshold
+ ########## source image or video cropping option ##########
+ dsize: 512 # crop size
+ scale: 2.3 # scale factor
+ vx_ratio: 0 # vx ratio
+ vy_ratio: -0.125 # vy ratio +up, -down
+ max_face_num: 0 # max face number, 0 mean no limit
+ flag_do_rot: True # whether to conduct the rotation when flag_do_crop is True
+ animal_face_type: animal_face_9 # animal_face_68 -> 68 landmark points, animal_face_9 -> 9 landmarks
+ ########## driving video auto cropping option ##########
+ scale_crop_driving_video: 2.2 # 2.0 # scale factor for cropping driving video
+ vx_ratio_crop_driving_video: 0.0 # adjust x offset
+ vy_ratio_crop_driving_video: -0.1 # adjust y offset
+ direction: large-small # direction of cropping
+ source_max_dim: 1280
+ source_division: 2
configs/audio2motion/model/liveportrait_config.yaml ADDED
@@ -0,0 +1,59 @@
+ # model config
+ models_config: configs/audio2motion/model/models.yaml
+
+ # 1. face appearance feature
+ appearance_feature_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/appearance_feature_extractor.pth
+
+ # 2. motion feature
+ motion_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/motion_extractor.pth
+
+ # 3. stitching retargeting module
+ stitching_retargeting_module_path: pretrain_weights/decode/v1/first_stage/retargeting_models/stitching_retargeting_module.pth
+
+ # 4. feature warper
+ warping_module_path: pretrain_weights/decode/v1/first_stage/base_models/warping_module.pth
+
+ # 5. SPADEGenerator
+ spade_generator_path: pretrain_weights/decode/v1/first_stage/base_models/spade_generator.pth
+
+ # 6. cropper
+ crop_cfg: "configs/audio2motion/model/crop_config.yaml"
+
+ # 7. face parser
+ face_parser_weight_path: "pretrain_weights/face/face-parsing/79999_iter.pth"
+ resnet_weight_path: "pretrain_weights/face/face-parsing/resnet18-5c106cde.pth"
+
+ # motion template
+ need_normalized: True
+
+ # others
+ batch_size: 100
+ source_max_dim: 1920 # the max dim of height and width of source image or video
+ source_division: 2 # make sure the height and width of source image or video can be divided by this number
+ input_height: 256
+ input_width: 256
+ output_height: 512
+ output_width: 512
+ output_fps: 25
+
+ # driving params
+ flag_do_torch_compile: False
+ flag_use_half_precision: True
+ flag_relative_motion: False
+ flag_normalize_lip: False
+ flag_source_video_eye_retargeting: False
+ flag_eye_retargeting: False
+ flag_lip_retargeting: False
+ flag_stitching: True
+
+ lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
+ source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
+ anchor_frame: 0 # TO IMPLEMENT
+
+ driving_option: "expression-friendly" # "expression-friendly" or "pose-friendly"
+ driving_multiplier: 1.0 # be used only when driving_option is "expression-friendly"
+ lib_multiplier: 1.0
+ driving_smooth_observation_variance: 3e-7 # the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+ animation_region: "all" #["exp", "pose", "lip", "eyes", "all"], the region where the animation was performed, "exp" means the expression, "pose" means the head pose
+ mask_crop: src/utils/resources/mask_template.png
+ lip_array: src/utils/resources/lip_array.pkl
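
Editor's note: this config points to crop_config.yaml through the crop_cfg key, so the cropper settings can be resolved with a second OmegaConf load. A minimal sketch under that assumption (the actual loading code lives in motion_processer.py and the LivePortrait cropper and may differ):

    from omegaconf import OmegaConf

    lp_cfg = OmegaConf.load("configs/audio2motion/model/liveportrait_config.yaml")
    crop_cfg = OmegaConf.load(lp_cfg.crop_cfg)        # -> configs/audio2motion/model/crop_config.yaml
    print(lp_cfg.output_height, lp_cfg.output_width)  # 512 512
    print(crop_cfg.dsize, crop_cfg.scale)             # 512 2.3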
configs/audio2motion/model/models.yaml ADDED
@@ -0,0 +1,43 @@
+ model_params:
+   appearance_feature_extractor_params: # the F in the paper
+     image_channel: 3
+     block_expansion: 64
+     num_down_blocks: 2
+     max_features: 512
+     reshape_channel: 32
+     reshape_depth: 16
+     num_resblocks: 6
+   motion_extractor_params: # the M in the paper
+     num_kp: 21
+     backbone: convnextv2_tiny
+   warping_module_params: # the W in the paper
+     num_kp: 21
+     block_expansion: 64
+     max_features: 512
+     num_down_blocks: 2
+     reshape_channel: 32
+     estimate_occlusion_map: True
+     dense_motion_params:
+       block_expansion: 32
+       max_features: 1024
+       num_blocks: 5
+       reshape_depth: 16
+       compress: 4
+   spade_generator_params: # the G in the paper
+     upscale: 2 # represents upsample factor 256x256 -> 512x512
+     block_expansion: 64
+     max_features: 512
+     num_down_blocks: 2
+   stitching_retargeting_module_params: # the S in the paper
+     stitching:
+       input_size: 126 # (21*3)*2
+       hidden_sizes: [128, 128, 64]
+       output_size: 65 # (21*3)+2(tx,ty)
+     lip:
+       input_size: 65 # (21*3)+2
+       hidden_sizes: [128, 128, 64]
+       output_size: 63 # (21*3)
+     eye:
+       input_size: 66 # (21*3)+3
+       hidden_sizes: [256, 256, 128, 128, 64]
+       output_size: 63 # (21*3)
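
Editor's note: the size comments above describe MLPs that map flattened keypoint vectors (21 keypoints × 3 coordinates). The snippet below is illustrative only, an MLP with the stitching dimensions listed above (126 -> 128 -> 128 -> 64 -> 65); the real retargeting networks live in the LivePortrait third-party code, not in this commit:

    import torch
    import torch.nn as nn

    def make_mlp(input_size, hidden_sizes, output_size):
        # Plain fully connected stack with ReLU between hidden layers.
        dims = [input_size, *hidden_sizes]
        layers = []
        for i in range(len(dims) - 1):
            layers += [nn.Linear(dims[i], dims[i + 1]), nn.ReLU()]
        layers.append(nn.Linear(dims[-1], output_size))
        return nn.Sequential(*layers)

    stitching = make_mlp(126, [128, 128, 64], 65)
    out = stitching(torch.randn(1, 126))  # source + driving keypoints, 2 * 21 * 3
    print(out.shape)                      # torch.Size([1, 65]) -> 21*3 deltas + (tx, ty)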
requirements.txt ADDED
@@ -0,0 +1,45 @@
+ --find-links https://download.pytorch.org/whl/torch_stable.html
+
+ accelerate==0.28.0
+ audio-separator==0.17.2
+ av==12.1.0
+ bitsandbytes==0.43.1
+ decord==0.6.0
+ diffusers==0.27.2
+ einops==0.8.0
+ huggingface==0.0.1
+ huggingface-hub==0.25.1
+ insightface==0.7.3
+ librosa==0.10.2.post1
+ mediapipe[vision]==0.10.14
+ mlflow==2.13.1
+ moviepy==1.0.3
+ numpy==1.26.4
+ omegaconf==2.3.0
+ onnx2torch==1.5.14
+ onnx==1.16.1
+ onnxruntime-gpu==1.18.0
+ opencv-python==4.10.0.84
+ pillow==10.3.0
+ pyyaml==6.0.1
+ setuptools==70.0.0
+ torch==2.2.2+cu121
+ torchaudio==2.2.2
+ torchvision==0.17.2+cu121
+ transformers==4.39.2
+ xformers==0.0.25.post1
+ isort==5.13.2
+ pre-commit==3.7.1
+ scipy==1.13.1
+ imageio==2.34.2
+ lmdb==1.4.1
+ rich==13.7.1
+ ffmpeg-python==0.2.0
+ scikit-image==0.24.0
+ albumentations==1.4.10
+ matplotlib==3.9.0
+ imageio-ffmpeg==0.5.1
+ tyro==0.8.5
+ gradio==5.1.0
+ pykalman==0.9.7
+ tensorboardX==2.6.2.2
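
Editor's note: the +cu121 wheel suffixes together with the torch_stable --find-links line pin CUDA 12.1 builds of torch and torchvision. A quick post-install sanity check, assuming a CUDA-capable machine:

    import torch, torchvision

    print(torch.__version__)          # expected 2.2.2+cu121 per the pins above
    print(torchvision.__version__)    # expected 0.17.2+cu121
    print(torch.cuda.is_available())  # True on a working CUDA setup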
src/datasets/mean.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db742e76a39bbf81fb5b09fcc488bad0cbab9355df509d8e91967b58d02c6dfc
+ size 2582
src/datasets/preprocess/__pycache__/flow_filter.cpython-310.pyc ADDED
Binary file (7.38 kB).
 
src/datasets/preprocess/__pycache__/video_crop.cpython-310.pyc ADDED
Binary file (7.24 kB).
 
src/datasets/preprocess/__pycache__/visualize.cpython-310.pyc ADDED
Binary file (867 Bytes).
 
src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-310.pyc ADDED
Binary file (13.2 kB).
 
src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-312.pyc ADDED
Binary file (25.6 kB).
 
src/datasets/preprocess/extract_features/__pycache__/feature_extractor_pipeline.cpython-310.pyc ADDED
Binary file (20.1 kB).
 
src/datasets/preprocess/extract_features/__pycache__/motion_processer.cpython-310.pyc ADDED
Binary file (36.8 kB).
 
src/datasets/preprocess/extract_features/__pycache__/test_processer.cpython-310.pyc ADDED
Binary file (6.32 kB).
 
src/datasets/preprocess/extract_features/audio_processer.py ADDED
@@ -0,0 +1,471 @@
1
+
2
+ import os
3
+ from posixpath import isfile
4
+ from re import A
5
+ import sys
6
+ import os.path as osp
7
+
8
+ from typing import List, Dict, Tuple, Optional, Union, Any
9
+
10
+ import yaml
11
+ from omegaconf import OmegaConf
12
+
13
+ import math
14
+ import librosa
15
+ import soundfile
16
+ import numpy as np
17
+
18
+ from einops import rearrange
19
+
20
+ import torch
21
+ import torch.nn.functional as F
22
+
23
+ from pydub import AudioSegment
24
+ from audio_separator.separator import Separator
25
+
26
+ from transformers import Wav2Vec2FeatureExtractor, HubertModel
27
+
28
+ from src.utils.rprint import rlog as log
29
+ from src.utils.util import resample_audio
30
+
31
+ from src.models.audio.wav2vec_modified import Wav2VecModel
32
+ from src.models.audio.hubert import HubertModel_ as HubertModel
33
+
34
+
35
+ def pad_audio(audio, audio_unit=320, pad_threshold=80):
36
+ batch_size, audio_len = audio.shape
37
+ n_units = audio_len // audio_unit
38
+ side_len = math.ceil((audio_unit * n_units + pad_threshold - audio_len) / 2)
39
+ if side_len >= 0:
40
+ reflect_len = side_len // 2
41
+ replicate_len = side_len % 2
42
+ if reflect_len > 0:
43
+ audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
44
+ audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
45
+ if replicate_len > 0:
46
+ audio = F.pad(audio, (1, 1), mode='replicate')
47
+
48
+ return audio
49
+
50
+
51
+ def cut_audio(audio_path: str, save_dir: str, length=60) -> List[str]:
52
+ """Cut audio into sub-divisions and return subfile paths. Supports wav format.
53
+
54
+ Args:
55
+ audio_path (str): the source audio file path
56
+ save_dir (str): the save directory of sub-divisions
57
+ length (int, optional): The max length of each sub-division. Defaults to 60 secs.
58
+
59
+ Returns:
60
+ List[str]: the subfile paths
61
+ """
62
+ audio_name = osp.basename(audio_path).split('.')[0]
63
+ audio = AudioSegment.from_wav(audio_path)
64
+ segment_length = length * 1000. # pydub uses milliseconds
65
+ num_segments = math.ceil(len(audio) / segment_length)
66
+
67
+ os.makedirs(save_dir, exist_ok=True)
68
+ audio_list = []
69
+
70
+ if num_segments > 1:
71
+ for i in range(num_segments):
72
+ start_time = i * segment_length
73
+ end_time = min((i + 1) * segment_length, len(audio))
74
+ segment = audio[start_time:end_time]
75
+
76
+ path = osp.join(save_dir, f"{audio_name}_segment_{i+1}.wav")
77
+ audio_list.append(path)
78
+ segment.export(path, format="wav")
79
+ else:
80
+ audio_list = [audio_path]
81
+ return audio_list
82
+
83
+
84
+ class AudioProcessor(object):
85
+ def __init__(self, cfg_path: str, is_training: bool = False, device_id=0) -> None:
86
+ cfg = OmegaConf.load(cfg_path)
87
+ self.cfg = cfg
88
+ self.is_training = is_training
89
+ log("========================================= Audio Processer =========================================")
90
+ log(OmegaConf.to_yaml(cfg))
91
+
92
+ # setting device
93
+ self.device_id = device_id
94
+ self.use_half = cfg.device_params.flag_use_half_precision
95
+ if cfg.device_params.flag_force_cpu:
96
+ self.device = 'cpu'
97
+ else:
98
+ try:
99
+ if torch.backends.mps.is_available():
100
+ self.device = 'mps'
101
+ else:
102
+ self.device = 'cuda:' + str(self.device_id)
103
+ except:
104
+ self.device = 'cuda:' + str(self.device_id)
105
+
106
+ # init audio separator
107
+ self.audio_separator = None
108
+ self.cache_dir = cfg.cache_dir
109
+ self.tmp_dir = cfg.tmp_dir
110
+ self.use_audio_separator = cfg.model_params.use_audio_separator
111
+ self.audio_separator_name = cfg.model_params.audio_separator_name
112
+ self.audio_separator_path = cfg.model_weights.audio_separator_path
113
+ self.set_audio_separator(cfg.cache_dir)
114
+
115
+ # load audio encoder, wav2vec or hubert
116
+ self.model_name = cfg.model_params.model_name
117
+ self.is_chinese = cfg.model_params.is_chinese
118
+ self.audio_encoder, self.feature_extractor = self.load_model(
119
+ model_name = cfg.model_params.model_name,
120
+ model_type = cfg.model_params.model_type,
121
+ is_chinese = cfg.model_params.is_chinese,
122
+ )
123
+ self.only_last_features = cfg.model_params.only_last_features
124
+ if cfg.model_params.only_last_features:
125
+ self.feature_shape = (1, 768)
126
+ else:
127
+ self.feature_shape = (12, 768) # features of 12 blocks
128
+
129
+ # init data params
130
+ self.sample_strategy = cfg.data_params.sample_strategy
131
+ self.sample_rate = cfg.data_params.sample_rate
132
+ self.fps = cfg.data_params.fps
133
+ self.audio_unit = cfg.data_params.sample_rate / cfg.data_params.fps # num of audio samples per frame
134
+ self.max_length = cfg.data_params.max_length
135
+ self.subclip_len = cfg.data_params.sub_clip_length
136
+ self.save_to_cpu = cfg.data_params.save_to_cpu
137
+ self.pad_mode = cfg.data_params.audio_pad_mode
138
+
139
+ log("========================================= Audio Processer: Done =========================================")
140
+
141
+ def load_model(self, model_name: str="wav2vec", model_type: str="base", is_chinese: bool = False):
142
+ assert model_name in ["wav2vec", "hubert"], f"Unknown audio model {model_name}, only support wav2vec or hubert"
143
+ assert model_type in ["base", "large"], f"Unknown audio model type {model_type}, only support base or large"
144
+
145
+ if model_name == "wav2vec":
146
+ # load wav2vec model weights
147
+ if is_chinese:
148
+ if model_type == "base":
149
+ model_weight_path = self.cfg.model_weights.wav2vec_path.chinese.base
150
+ else:
151
+ model_weight_path = self.cfg.model_weights.wav2vec_path.chinese.large
152
+ else:
153
+ if model_type == "base":
154
+ model_weight_path = self.cfg.model_weights.wav2vec_path.default.base
155
+ else:
156
+ model_weight_path = self.cfg.model_weights.wav2vec_path.default.large
157
+ if model_weight_path is None:
158
+ raise ValueError(f"model_weight_path is None")
159
+ audio_encoder = Wav2VecModel.from_pretrained(model_weight_path, local_files_only=True).to(device=self.device)
160
+ else:
161
+ if is_chinese:
162
+ if model_type == "base":
163
+ model_weight_path = self.cfg.model_weights.hubert_path.chinese.base
164
+ else:
165
+ model_weight_path = self.cfg.model_weights.hubert_path.chinese.large
166
+ else:
167
+ if model_type == "base":
168
+ model_weight_path = self.cfg.model_weights.hubert_path.default.base
169
+ else:
170
+ model_weight_path = self.cfg.model_weights.hubert_path.default.large
171
+ if model_weight_path is None:
172
+ raise ValueError(f"model_weight_path is None")
173
+ audio_encoder = HubertModel.from_pretrained(model_weight_path, local_files_only=True).to(device=self.device)
174
+
175
+ log(f"{model_name}-{model_type}-chinese-{is_chinese} model has beed loaded from {model_weight_path}")
176
+ total_params = sum(p.numel() for p in audio_encoder.parameters())
177
+ print('Number of parameter: % .4fM' % (total_params / 1e6))
178
+
179
+ # weights initialization
180
+ audio_encoder.feature_extractor._freeze_parameters()
181
+ if not self.cfg.model_params.is_original:
182
+ frozen_layers = [0, 1]
183
+ for name, param in audio_encoder.named_parameters():
184
+ if name.startswith("feature_projection"):
185
+ param.requires_grad = False
186
+ if name.startswith("encoder.layers"):
187
+ layer = int(name.split(".")[2])
188
+ if layer in frozen_layers:
189
+ param.requires_grad = False
190
+
191
+ audio_encoder = audio_encoder.to(self.device)
192
+ if self.use_half:
193
+ audio_encoder = audio_encoder.half()
194
+ audio_encoder.eval()
195
+
196
+ # feature extractor
197
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_weight_path)
198
+
199
+ return audio_encoder, feature_extractor
200
+
201
+ def set_audio_separator(self, output_dir: str) -> None:
202
+ del self.audio_separator
203
+
204
+ if self.audio_separator_name is not None and self.use_audio_separator:
205
+ try:
206
+ os.makedirs(output_dir, exist_ok=True)
207
+ except OSError as _:
208
+ print("Fail to create the output cache dir.")
209
+ self.audio_separator = Separator(
210
+ output_dir=output_dir,
211
+ output_single_stem="vocals",
212
+ model_file_dir=self.audio_separator_path,
213
+ )
214
+ self.audio_separator.load_model(self.audio_separator_name)
215
+ assert self.audio_separator.model_instance is not None, "Fail to load audio separate model."
216
+ else:
217
+ self.audio_separator=None
218
+ log("Use audio directly without vocals seperator.")
219
+
220
+ def seperate_audio(self, audio_path: str, output_dir: Union[str, None] = None) -> str:
221
+ if output_dir is not None:
222
+ if output_dir != self.cache_dir:
223
+ # reload audio separator
224
+ self.set_audio_separator(output_dir)
225
+
226
+ if self.audio_separator is not None:
227
+ # 1. separate vocals
228
+ # TODO: process in memory
229
+ try:
230
+ outputs = self.audio_separator.separate(audio_path)
231
+ if len(outputs) <= 0:
232
+ raise RuntimeError("Audio separate failed.")
233
+
234
+ vocal_audio_file = outputs[0]
235
+ vocal_audio_name, _ = os.path.splitext(vocal_audio_file)
236
+ vocal_audio_file = os.path.join(self.audio_separator.output_dir, vocal_audio_file)
237
+ vocal_audio_file = resample_audio(vocal_audio_file, os.path.join(self.audio_separator.output_dir, f"{vocal_audio_name}-16k.wav"), self.sample_rate)
238
+ except Exception as e:
239
+ log(f"Fail to separate vocals from {audio_path}, error info [{e}]")
240
+ vocal_audio_file=audio_path
241
+ else:
242
+ vocal_audio_file=audio_path
243
+
244
+ return vocal_audio_file
245
+
246
+ def load_audio(self, audio_path: str, mono: bool = True, duration: Optional[float] = None) -> Any:
247
+ try:
248
+ audio_data, sampling_rate = librosa.load(audio_path, sr=self.sample_rate, mono=mono, duration=duration)
249
+ except Exception as e:
250
+ raise RuntimeError(f"Fail to load audio from {audio_path}, error info [{e}]")
251
+ return audio_data, sampling_rate
252
+
253
+ def prepare_audio_data(self, audio_data: Union[np.ndarray, torch.Tensor], n_frames: Optional[int]=None) -> Tuple[List[Any], int]:
254
+ """Prepare audio data for processing.
255
+ """
256
+ #print(f"==========> Using Wav2Vec2FeatureExtractor to extract audio features")
257
+ audio_data = np.squeeze(self.feature_extractor(audio_data, sampling_rate=self.sample_rate).input_values)
258
+
259
+ clip_len = int(len(audio_data) / self.audio_unit)
260
+ if n_frames is not None:
261
+ if abs(n_frames - clip_len) > 7:
262
+ log(f"The number of frames must be close to the clip length (in 280ms), got {n_frames} and {clip_len}")
263
+ return [], n_frames
264
+ clip_len = n_frames
265
+ else:
266
+ n_frames = clip_len
267
+
268
+ if isinstance(audio_data, np.ndarray):
269
+ audio_data = torch.from_numpy(audio_data).float().to(self.device)
270
+ assert audio_data.ndim == 1, 'Audio must be 1D tensor.'
271
+
272
+ # padding
273
+ # padding audio to fit the clip length
274
+ n_audio_samples = round(self.audio_unit * clip_len)
275
+ n_padding_audio_samples = n_audio_samples - len(audio_data)
276
+ n_padding_frames = math.ceil(n_padding_audio_samples / self.audio_unit)
277
+ if n_padding_audio_samples > 0:
278
+ if self.pad_mode == 'zero':
279
+ padding_value = 0
280
+ elif self.pad_mode == 'replicate':
281
+ padding_value = float(audio_data[-1])
282
+ else:
283
+ raise ValueError(f'Unknown pad mode: {self.pad_mode}')
284
+ audio_data = F.pad(audio_data, (0, n_padding_audio_samples), value=padding_value)
285
+
286
+ # divide audio into sub-divisions to save GPU memory
287
+ audio_segments = []
288
+ if clip_len <= self.subclip_len:
289
+ n_subdivision = 1
290
+ subclip_len = clip_len
291
+ else:
292
+ n_subdivision = math.ceil(clip_len / self.subclip_len)
293
+ subclip_len = self.subclip_len
294
+
295
+ for i in range(0, n_subdivision):
296
+ start_idx = i * subclip_len
297
+ end_idx = min(start_idx + subclip_len, clip_len)
298
+ # debug
299
+ #log(f"[{i+1}/{n_subdivision}] data index [{round(start_idx * self.audio_unit)}, {round(end_idx * self.audio_unit)})")
300
+ audio_segments.append(
301
+ {
302
+ "data": audio_data[round(start_idx * self.audio_unit):round(end_idx * self.audio_unit)].unsqueeze(0),
303
+ "start_idx": start_idx,
304
+ "end_idx": end_idx,
305
+ "length": end_idx - start_idx
306
+ }
307
+ )
308
+ return audio_segments, n_frames
309
+
310
+ def get_audio_embedding(self, audio, clip_len: int) -> torch.Tensor:
311
+ if audio.ndim == 2:
312
+ # Extract audio features
313
+ assert audio.shape[1] == 16000 * clip_len / self.fps, \
314
+ f'Incorrect audio length {audio.shape[1]}'
315
+
316
+ # Extract audio features
317
+ if self.use_half:
318
+ audio = audio.half()
319
+ embeddings = self.audio_encoder(
320
+ pad_audio(audio), seq_len=clip_len, sample_strategy=self.sample_strategy, output_hidden_states=True
321
+ ) # (N, L, 768)
322
+ assert len(embeddings) > 0, "Fail to extract audio embedding"
323
+
324
+ if self.only_last_features:
325
+ audio_emb = embeddings.last_hidden_state.squeeze(0)
326
+ else:
327
+ audio_emb = torch.stack(
328
+ embeddings.hidden_states[1:], dim=1
329
+ ).squeeze(0)
330
+ audio_emb = rearrange(audio_emb, "b s d -> s b d")
331
+
332
+ elif audio.ndim == 3:
333
+ assert audio.shape[1] == clip_len, f'Incorrect audio feature length {audio.shape[1]}'
334
+ audio_emb = audio
335
+ else:
336
+ raise ValueError(f'Incorrect audio input shape {audio.shape}')
337
+
338
+ return audio_emb
339
+
340
+ def get_audio_embeddings(self, audio_segments: List[Any]) -> Optional[torch.Tensor]:
341
+ audio_embs = []
342
+ for audio_segment in audio_segments:
343
+ if self.is_training:
344
+ audio_emb = self.get_audio_embedding(audio_segment["data"], audio_segment["length"])
345
+ else:
346
+ with torch.no_grad():
347
+ audio_emb = self.get_audio_embedding(audio_segment["data"], audio_segment["length"])
348
+
349
+ audio_emb = audio_emb.cpu() if self.save_to_cpu else audio_emb
350
+ audio_embs.append(audio_emb)
351
+ #log(f"audio segment [{audio_segment['start_idx']}, {audio_segment['end_idx']}) has been processed.")
352
+
353
+ if len(audio_embs) == 0:
354
+ return None
355
+
356
+ audio_emb = torch.cat(audio_embs, dim=0)
357
+
358
+ return audio_emb
359
+
360
+ def preprocess(
361
+ self,
362
+ audio_path: str,
363
+ n_frames: Optional[int] = None,
364
+ duration: Optional[float] = None,
365
+ need_seperate: bool = False
366
+ ):
367
+ """ Preprocess a WAV audio file by separating the vocals from the background and resampling it to a 16 kHz sample rate.
368
+ The separated vocal track is then converted into wav2vec2 for further processing or analysis.
369
+ """
370
+ if need_seperate:
371
+ vocal_audio_file = self.seperate_audio(audio_path)
372
+ else:
373
+ vocal_audio_file = audio_path
374
+
375
+ audio_data, sampling_rate = self.load_audio(vocal_audio_file, duration=duration)
376
+
377
+ assert sampling_rate == 16000, "The sample rate of audio must be 16000"
378
+ audio_segments, n_frames = self.prepare_audio_data(audio_data, n_frames)
379
+ audio_emb = self.get_audio_embeddings(audio_segments)
380
+ if audio_emb is None:
381
+ log(f"{audio_path} has been processed, but no audio embedding, set as 'None'.")
382
+ #else:
383
+ #log(f"{audio_path} has been processed, audio embedding shape {audio_emb.shape}.")
384
+ return audio_emb, n_frames
385
+
386
+ def preprocess_long(
387
+ self,
388
+ audio_path: str,
389
+ need_seperate: bool = False
390
+ ):
391
+ audio_list = cut_audio(audio_path, self.tmp_dir, length=self.max_length)
392
+ audio_emb_list = []
393
+ l = 0
394
+
395
+ for idx, audio_path in enumerate(audio_list):
396
+ padding = (idx+1) == len(audio_list)
397
+ emb, length = self.preprocess(audio_path, need_seperate=need_seperate)
398
+ audio_emb_list.append(emb)
399
+ log(f"Processing audio {idx+1}/{len(audio_list)}, path: {audio_path} length: {length}")
400
+ l += length
401
+
402
+ audio_emb = torch.cat(audio_emb_list)
403
+ audio_length = l
404
+
405
+ # remove tmp file
406
+ if len(audio_list) > 1:
407
+ for audio_path in audio_list:
408
+ os.remove(audio_path)
409
+
410
+ return audio_emb, audio_length
411
+
412
+ def add_silent_audio(self, audio_path: str, silent_audio_path: Optional[str] = None, add_duration: float = 1., linear_fusion=False, mode="post"):
413
+ # mode, pre, post, both
414
+ assert mode in ["pre", "post", "both"], f"Unknown mode: {mode}, only support pre, post, both"
415
+ if silent_audio_path is None:
416
+ return audio_path, 0
417
+ else:
418
+ audio_dir = osp.dirname(audio_path)
419
+ audio_name = osp.basename(audio_path)
420
+ temp_audio_path = osp.join(audio_dir, f"tmp_{audio_name}")
421
+ if osp.isfile(temp_audio_path):
422
+ os.remove(temp_audio_path)
423
+
424
+ audio, sr1 = librosa.load(audio_path, mono=True, sr=16000)
425
+ # denoise
426
+ audio = librosa.effects.preemphasis(audio) # enhance voice
427
+ # load silent audio
428
+ silent_audio, sr2 = librosa.load(silent_audio_path, mono=True, sr=16000)
429
+ silent_audio = silent_audio[:int(add_duration*sr2)]
430
+
431
+ if linear_fusion:
432
+ short_len = min(len(audio), len(silent_audio))
433
+ fusion_ratio = np.linspace(0, 1.0, num=short_len)
434
+ # get pre padding audio
435
+ pre_pad_audio = fusion_ratio * silent_audio[:short_len] + (1 - fusion_ratio) * audio[:short_len]
436
+ if short_len < len(silent_audio):
437
+ pre_pad_audio = np.hstack((pre_pad_audio, silent_audio[short_len:]))
438
+ pre_pad_audio = np.flip(pre_pad_audio, axis=0)
439
+
440
+ # get post padding audio
441
+ post_pad_audio = (1 - fusion_ratio) * silent_audio[-short_len:] + fusion_ratio * audio[-short_len:]
442
+ if short_len < len(silent_audio):
443
+ post_pad_audio = np.hstack((silent_audio[:-short_len], post_pad_audio))
444
+ post_pad_audio = np.flip(post_pad_audio, axis=0)
445
+ else:
446
+ pre_pad_audio = silent_audio
447
+ post_pad_audio = silent_audio
448
+
449
+ # padding audio
450
+ if mode == "both":
451
+ combined_audio = np.hstack((pre_pad_audio, audio, post_pad_audio))
452
+ elif mode == "pre":
453
+ combined_audio = np.hstack((pre_pad_audio, audio))
454
+ else:
455
+ combined_audio = np.hstack((audio, post_pad_audio))
456
+
457
+ add_nframes = math.floor(add_duration * sr2 / self.audio_unit)
458
+ #print(f"audio length: {len(audio)}, pre_pad_audio length: {len(pre_pad_audio)}, post_pad_audio length: {len(post_pad_audio)}, combined_length: {len(combined_audio)}, total add {add_nframes*2} frames")
459
+ #print(f"audio duration: {librosa.get_duration(audio, sr=sr1)}, silent duration: {librosa.get_duration(silent_audio, sr=sr2)}, combined duration: {librosa.get_duration(combined_audio, sr=sr2)}")
460
+ soundfile.write(temp_audio_path, combined_audio, sr2)
461
+
462
+ return temp_audio_path, add_nframes
463
+
464
+ def get_long_audio_emb(self, audio_path: str) -> torch.Tensor:
465
+ audio_emb, length = self.preprocess_long(audio_path)
466
+ log(f"Load audio from {osp.realpath(audio_path)} done, audio_emb shape: {audio_emb.shape}.")
467
+ return audio_emb
468
+
469
+ def __enter__(self):
470
+ return self
471
+
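
Editor's note: a minimal usage sketch for the AudioProcessor defined above, assuming the repository root is on PYTHONPATH and the chinese-hubert-base weights referenced in audio_processer_config.yaml have been downloaded under pretrain_weights/audio/:

    from src.datasets.preprocess.extract_features.audio_processer import AudioProcessor

    processor = AudioProcessor("configs/audio2motion/model/audio_processer_config.yaml", device_id=0)
    audio_emb, n_frames = processor.preprocess("src/examples/driving_audios/5.wav")
    # With only_last_features: False, audio_emb stacks 12 hidden-state blocks,
    # giving roughly (n_frames, 12, 768) at 25 fps.
    print(audio_emb.shape, n_frames)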
src/datasets/preprocess/extract_features/face_segmentation/__init__.py ADDED
@@ -0,0 +1,88 @@
1
+ import cv2
2
+ import numpy as np
3
+
4
+ import torch
5
+ from torchvision import transforms
6
+
7
+ from .bisenet import BiSeNet
8
+
9
+
10
+ def vis_parsing_maps(im, parsing_anno, stride, save_im=False, save_path='parsing_map_on_im2.jpg'):
11
+ # Colors for all 20 parts
12
+ part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
13
+ [255, 0, 85], [255, 0, 170],
14
+ [0, 255, 0], [85, 255, 0], [170, 255, 0],
15
+ [0, 255, 85], [0, 255, 170],
16
+ [0, 0, 255], [85, 0, 255], [170, 0, 255],
17
+ [0, 85, 255], [0, 170, 255],
18
+ [255, 255, 0], [255, 255, 85], [255, 255, 170],
19
+ [255, 0, 255], [255, 85, 255], [255, 170, 255],
20
+ [0, 255, 255], [85, 255, 255], [170, 255, 255]]
21
+
22
+ im = np.array(im)
23
+ vis_im = im.copy().astype(np.uint8)
24
+ vis_parsing_anno = parsing_anno.copy().astype(np.uint8)
25
+ vis_parsing_anno = cv2.resize(vis_parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)
26
+ vis_parsing_anno_color = np.zeros((vis_parsing_anno.shape[0], vis_parsing_anno.shape[1], 3)) + 255
27
+
28
+ num_of_class = np.max(vis_parsing_anno)
29
+
30
+ for pi in range(1, num_of_class + 1):
31
+ index = np.where(vis_parsing_anno == pi)
32
+ vis_parsing_anno_color[index[0], index[1], :] = part_colors[pi]
33
+
34
+ vis_parsing_anno_color = vis_parsing_anno_color.astype(np.uint8)
35
+ # print(vis_parsing_anno_color.shape, vis_im.shape)
36
+ vis_im = cv2.addWeighted(cv2.cvtColor(vis_im, cv2.COLOR_RGB2BGR), 0.4, vis_parsing_anno_color, 0.6, 0)
37
+
38
+ # Save result or not
39
+ if save_im:
40
+ cv2.imwrite(save_path[:-4] +'.png', vis_parsing_anno)
41
+ cv2.imwrite(save_path, vis_im, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
42
+
43
+ # return vis_im
44
+
45
+ def get_face_mask(face_parser, images, batch_size=128):
46
+ # images: Bx3xHxW
47
+ kernel = np.ones((13, 13), np.float32)
48
+ face_masks = []
49
+ for i in range(0, images.shape[0], batch_size):
50
+ images_batch = images[i:i+batch_size]
51
+ with torch.no_grad():
52
+ out = face_parser(images_batch)[0]
53
+ parsing = out.cpu().numpy().argmax(1)
54
+ masks = np.zeros_like(parsing, np.float32)
55
+ for idx in range(1, 14):
56
+ masks[parsing == idx] = 1
57
+
58
+ for mask in masks:
59
+ mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
60
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)
61
+ mask = cv2.dilate(mask, kernel, iterations=3)
62
+ face_masks.append(mask)
63
+
64
+ return face_masks
65
+
66
+
67
+ def build_face_parser(weight_path, resnet_weight_path, n_classes=19, device_id=0):
68
+ model_state_dict = torch.load(weight_path, weights_only=False)
69
+ bisenet = BiSeNet(n_classes, resnet_weight_path=resnet_weight_path)
70
+ # load model
71
+ #bisenet.load_state_dict(model_state_dict, strict=True)
72
+ bisenet_state_dict = bisenet.state_dict()
73
+ for k, v in model_state_dict.items():
74
+ if 'fc' in k: continue
75
+ bisenet_state_dict.update({k: v})
76
+ bisenet.load_state_dict(bisenet_state_dict)
77
+ bisenet.to(f"cuda:{device_id}")
78
+
79
+ to_tensor = transforms.Compose([
80
+ transforms.ToTensor(),
81
+ transforms.Resize((512, 512)),
82
+ transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
83
+ ])
84
+
85
+ return bisenet.eval(), to_tensor
86
+
87
+
88
+
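
Editor's note: a usage sketch for the helpers above. The weight paths follow liveportrait_config.yaml and must exist locally, and a CUDA device is assumed since build_face_parser moves the network to cuda:<device_id>:

    import cv2
    from src.datasets.preprocess.extract_features.face_segmentation import build_face_parser, get_face_mask

    parser, to_tensor = build_face_parser(
        "pretrain_weights/face/face-parsing/79999_iter.pth",
        "pretrain_weights/face/face-parsing/resnet18-5c106cde.pth",
        device_id=0,
    )
    img = cv2.cvtColor(cv2.imread("src/examples/reference_images/2.jpg"), cv2.COLOR_BGR2RGB)
    batch = to_tensor(img).unsqueeze(0).to("cuda:0")  # 1x3x512x512 after the Resize transform
    masks = get_face_mask(parser, batch)              # list of 512x512 float masks (face region == 1)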
src/datasets/preprocess/extract_features/face_segmentation/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (3.07 kB).
 
src/datasets/preprocess/extract_features/face_segmentation/__pycache__/bisenet.cpython-310.pyc ADDED
Binary file (8.38 kB).
 
src/datasets/preprocess/extract_features/face_segmentation/__pycache__/resnet.cpython-310.pyc ADDED
Binary file (3.77 kB).
 
src/datasets/preprocess/extract_features/face_segmentation/bisenet.py ADDED
@@ -0,0 +1,285 @@
1
+ #!/usr/bin/python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torchvision
9
+
10
+ from .resnet import Resnet18
11
+ # from modules.bn import InPlaceABNSync as BatchNorm2d
12
+
13
+
14
+ class ConvBNReLU(nn.Module):
15
+ def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1, *args, **kwargs):
16
+ super(ConvBNReLU, self).__init__()
17
+ self.conv = nn.Conv2d(in_chan,
18
+ out_chan,
19
+ kernel_size = ks,
20
+ stride = stride,
21
+ padding = padding,
22
+ bias = False)
23
+ self.bn = nn.BatchNorm2d(out_chan)
24
+ self.init_weight()
25
+
26
+ def forward(self, x):
27
+ x = self.conv(x)
28
+ x = F.relu(self.bn(x))
29
+ return x
30
+
31
+ def init_weight(self):
32
+ for ly in self.children():
33
+ if isinstance(ly, nn.Conv2d):
34
+ nn.init.kaiming_normal_(ly.weight, a=1)
35
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
36
+
37
+ class BiSeNetOutput(nn.Module):
38
+ def __init__(self, in_chan, mid_chan, n_classes, *args, **kwargs):
39
+ super(BiSeNetOutput, self).__init__()
40
+ self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1)
41
+ self.conv_out = nn.Conv2d(mid_chan, n_classes, kernel_size=1, bias=False)
42
+ self.init_weight()
43
+
44
+ def forward(self, x):
45
+ x = self.conv(x)
46
+ x = self.conv_out(x)
47
+ return x
48
+
49
+ def init_weight(self):
50
+ for ly in self.children():
51
+ if isinstance(ly, nn.Conv2d):
52
+ nn.init.kaiming_normal_(ly.weight, a=1)
53
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
54
+
55
+ def get_params(self):
56
+ wd_params, nowd_params = [], []
57
+ for name, module in self.named_modules():
58
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
59
+ wd_params.append(module.weight)
60
+ if not module.bias is None:
61
+ nowd_params.append(module.bias)
62
+ elif isinstance(module, nn.BatchNorm2d):
63
+ nowd_params += list(module.parameters())
64
+ return wd_params, nowd_params
65
+
66
+
67
+ class AttentionRefinementModule(nn.Module):
68
+ def __init__(self, in_chan, out_chan, *args, **kwargs):
69
+ super(AttentionRefinementModule, self).__init__()
70
+ self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1)
71
+ self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size= 1, bias=False)
72
+ self.bn_atten = nn.BatchNorm2d(out_chan)
73
+ self.sigmoid_atten = nn.Sigmoid()
74
+ self.init_weight()
75
+
76
+ def forward(self, x):
77
+ feat = self.conv(x)
78
+ atten = F.avg_pool2d(feat, feat.size()[2:])
79
+ atten = self.conv_atten(atten)
80
+ atten = self.bn_atten(atten)
81
+ atten = self.sigmoid_atten(atten)
82
+ out = torch.mul(feat, atten)
83
+ return out
84
+
85
+ def init_weight(self):
86
+ for ly in self.children():
87
+ if isinstance(ly, nn.Conv2d):
88
+ nn.init.kaiming_normal_(ly.weight, a=1)
89
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
90
+
91
+
92
+ class ContextPath(nn.Module):
93
+ def __init__(self, *args, **kwargs):
94
+ super(ContextPath, self).__init__()
95
+ backbone_weight_path = kwargs.get("resnet_weight_path", None)
96
+ self.resnet = Resnet18(backbone_weight_path)
97
+ self.arm16 = AttentionRefinementModule(256, 128)
98
+ self.arm32 = AttentionRefinementModule(512, 128)
99
+ self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
100
+ self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
101
+ self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0)
102
+
103
+ self.init_weight()
104
+
105
+ def forward(self, x):
106
+ H0, W0 = x.size()[2:]
107
+ feat8, feat16, feat32 = self.resnet(x)
108
+ H8, W8 = feat8.size()[2:]
109
+ H16, W16 = feat16.size()[2:]
110
+ H32, W32 = feat32.size()[2:]
111
+
112
+ avg = F.avg_pool2d(feat32, feat32.size()[2:])
113
+ avg = self.conv_avg(avg)
114
+ avg_up = F.interpolate(avg, (H32, W32), mode='nearest')
115
+
116
+ feat32_arm = self.arm32(feat32)
117
+ feat32_sum = feat32_arm + avg_up
118
+ feat32_up = F.interpolate(feat32_sum, (H16, W16), mode='nearest')
119
+ feat32_up = self.conv_head32(feat32_up)
120
+
121
+ feat16_arm = self.arm16(feat16)
122
+ feat16_sum = feat16_arm + feat32_up
123
+ feat16_up = F.interpolate(feat16_sum, (H8, W8), mode='nearest')
124
+ feat16_up = self.conv_head16(feat16_up)
125
+
126
+ return feat8, feat16_up, feat32_up # x8, x8, x16
127
+
128
+ def init_weight(self):
129
+ for ly in self.children():
130
+ if isinstance(ly, nn.Conv2d):
131
+ nn.init.kaiming_normal_(ly.weight, a=1)
132
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
133
+
134
+ def get_params(self):
135
+ wd_params, nowd_params = [], []
136
+ for name, module in self.named_modules():
137
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
138
+ wd_params.append(module.weight)
139
+ if not module.bias is None:
140
+ nowd_params.append(module.bias)
141
+ elif isinstance(module, nn.BatchNorm2d):
142
+ nowd_params += list(module.parameters())
143
+ return wd_params, nowd_params
144
+
145
+
146
+ ### This is not used, since I replace this with the resnet feature with the same size
147
+ class SpatialPath(nn.Module):
148
+ def __init__(self, *args, **kwargs):
149
+ super(SpatialPath, self).__init__()
150
+ self.conv1 = ConvBNReLU(3, 64, ks=7, stride=2, padding=3)
151
+ self.conv2 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
152
+ self.conv3 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
153
+ self.conv_out = ConvBNReLU(64, 128, ks=1, stride=1, padding=0)
154
+ self.init_weight()
155
+
156
+ def forward(self, x):
157
+ feat = self.conv1(x)
158
+ feat = self.conv2(feat)
159
+ feat = self.conv3(feat)
160
+ feat = self.conv_out(feat)
161
+ return feat
162
+
163
+ def init_weight(self):
164
+ for ly in self.children():
165
+ if isinstance(ly, nn.Conv2d):
166
+ nn.init.kaiming_normal_(ly.weight, a=1)
167
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
168
+
169
+ def get_params(self):
170
+ wd_params, nowd_params = [], []
171
+ for name, module in self.named_modules():
172
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
173
+ wd_params.append(module.weight)
174
+ if not module.bias is None:
175
+ nowd_params.append(module.bias)
176
+ elif isinstance(module, nn.BatchNorm2d):
177
+ nowd_params += list(module.parameters())
178
+ return wd_params, nowd_params
179
+
180
+
181
+ class FeatureFusionModule(nn.Module):
182
+ def __init__(self, in_chan, out_chan, *args, **kwargs):
183
+ super(FeatureFusionModule, self).__init__()
184
+ self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0)
185
+ self.conv1 = nn.Conv2d(out_chan,
186
+ out_chan//4,
187
+ kernel_size = 1,
188
+ stride = 1,
189
+ padding = 0,
190
+ bias = False)
191
+ self.conv2 = nn.Conv2d(out_chan//4,
192
+ out_chan,
193
+ kernel_size = 1,
194
+ stride = 1,
195
+ padding = 0,
196
+ bias = False)
197
+ self.relu = nn.ReLU(inplace=True)
198
+ self.sigmoid = nn.Sigmoid()
199
+ self.init_weight()
200
+
201
+ def forward(self, fsp, fcp):
202
+ fcat = torch.cat([fsp, fcp], dim=1)
203
+ feat = self.convblk(fcat)
204
+ atten = F.avg_pool2d(feat, feat.size()[2:])
205
+ atten = self.conv1(atten)
206
+ atten = self.relu(atten)
207
+ atten = self.conv2(atten)
208
+ atten = self.sigmoid(atten)
209
+ feat_atten = torch.mul(feat, atten)
210
+ feat_out = feat_atten + feat
211
+ return feat_out
212
+
213
+ def init_weight(self):
214
+ for ly in self.children():
215
+ if isinstance(ly, nn.Conv2d):
216
+ nn.init.kaiming_normal_(ly.weight, a=1)
217
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
218
+
219
+ def get_params(self):
220
+ wd_params, nowd_params = [], []
221
+ for name, module in self.named_modules():
222
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
223
+ wd_params.append(module.weight)
224
+ if not module.bias is None:
225
+ nowd_params.append(module.bias)
226
+ elif isinstance(module, nn.BatchNorm2d):
227
+ nowd_params += list(module.parameters())
228
+ return wd_params, nowd_params
229
+
230
+
231
+ class BiSeNet(nn.Module):
232
+ def __init__(self, n_classes, *args, **kwargs):
233
+ super(BiSeNet, self).__init__()
234
+ backbone_weight_path = kwargs.get("resnet_weight_path", None)
235
+ self.cp = ContextPath(resnet_weight_path=backbone_weight_path)
236
+ ## here self.sp is deleted
237
+ self.ffm = FeatureFusionModule(256, 256)
238
+ self.conv_out = BiSeNetOutput(256, 256, n_classes)
239
+ self.conv_out16 = BiSeNetOutput(128, 64, n_classes)
240
+ self.conv_out32 = BiSeNetOutput(128, 64, n_classes)
241
+ self.init_weight()
242
+
243
+ def forward(self, x):
244
+ H, W = x.size()[2:]
245
+ feat_res8, feat_cp8, feat_cp16 = self.cp(x) # here return res3b1 feature
246
+ feat_sp = feat_res8 # use res3b1 feature to replace spatial path feature
247
+ feat_fuse = self.ffm(feat_sp, feat_cp8)
248
+
249
+ feat_out = self.conv_out(feat_fuse)
250
+ feat_out16 = self.conv_out16(feat_cp8)
251
+ feat_out32 = self.conv_out32(feat_cp16)
252
+
253
+ feat_out = F.interpolate(feat_out, (H, W), mode='bilinear', align_corners=True)
254
+ feat_out16 = F.interpolate(feat_out16, (H, W), mode='bilinear', align_corners=True)
255
+ feat_out32 = F.interpolate(feat_out32, (H, W), mode='bilinear', align_corners=True)
256
+ return feat_out, feat_out16, feat_out32
257
+
258
+ def init_weight(self):
259
+ for ly in self.children():
260
+ if isinstance(ly, nn.Conv2d):
261
+ nn.init.kaiming_normal_(ly.weight, a=1)
262
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
263
+
264
+ def get_params(self):
265
+ wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params = [], [], [], []
266
+ for name, child in self.named_children():
267
+ child_wd_params, child_nowd_params = child.get_params()
268
+ if isinstance(child, FeatureFusionModule) or isinstance(child, BiSeNetOutput):
269
+ lr_mul_wd_params += child_wd_params
270
+ lr_mul_nowd_params += child_nowd_params
271
+ else:
272
+ wd_params += child_wd_params
273
+ nowd_params += child_nowd_params
274
+ return wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params
275
+
276
+
277
+ if __name__ == "__main__":
278
+ net = BiSeNet(19)
279
+ net.cuda()
280
+ net.eval()
281
+ in_ten = torch.randn(16, 3, 640, 480).cuda()
282
+ out, out16, out32 = net(in_ten)
283
+ print(out.shape)
284
+
285
+ net.get_params()
src/datasets/preprocess/extract_features/face_segmentation/resnet.py ADDED
@@ -0,0 +1,113 @@
1
+
2
+ #!/usr/bin/python
3
+ # -*- encoding: utf-8 -*-
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torch.utils.model_zoo as modelzoo
9
+
10
+ # from modules.bn import InPlaceABNSync as BatchNorm2d
11
+
12
+ resnet18_url = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
13
+
14
+
15
+ def conv3x3(in_planes, out_planes, stride=1):
16
+ """3x3 convolution with padding"""
17
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
18
+ padding=1, bias=False)
19
+
20
+
21
+ class BasicBlock(nn.Module):
22
+ def __init__(self, in_chan, out_chan, stride=1):
23
+ super(BasicBlock, self).__init__()
24
+ self.conv1 = conv3x3(in_chan, out_chan, stride)
25
+ self.bn1 = nn.BatchNorm2d(out_chan)
26
+ self.conv2 = conv3x3(out_chan, out_chan)
27
+ self.bn2 = nn.BatchNorm2d(out_chan)
28
+ self.relu = nn.ReLU(inplace=True)
29
+ self.downsample = None
30
+ if in_chan != out_chan or stride != 1:
31
+ self.downsample = nn.Sequential(
32
+ nn.Conv2d(in_chan, out_chan,
33
+ kernel_size=1, stride=stride, bias=False),
34
+ nn.BatchNorm2d(out_chan),
35
+ )
36
+
37
+ def forward(self, x):
38
+ residual = self.conv1(x)
39
+ residual = F.relu(self.bn1(residual))
40
+ residual = self.conv2(residual)
41
+ residual = self.bn2(residual)
42
+
43
+ shortcut = x
44
+ if self.downsample is not None:
45
+ shortcut = self.downsample(x)
46
+
47
+ out = shortcut + residual
48
+ out = self.relu(out)
49
+ return out
50
+
51
+
52
+ def create_layer_basic(in_chan, out_chan, bnum, stride=1):
53
+ layers = [BasicBlock(in_chan, out_chan, stride=stride)]
54
+ for i in range(bnum-1):
55
+ layers.append(BasicBlock(out_chan, out_chan, stride=1))
56
+ return nn.Sequential(*layers)
57
+
58
+
59
+ class Resnet18(nn.Module):
60
+ def __init__(self, backbone_weight_path=None):
61
+ super(Resnet18, self).__init__()
62
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
63
+ bias=False)
64
+ self.bn1 = nn.BatchNorm2d(64)
65
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
66
+ self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1)
67
+ self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2)
68
+ self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2)
69
+ self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2)
70
+ self.init_weight(backbone_weight_path)
71
+
72
+ def forward(self, x):
73
+ x = self.conv1(x)
74
+ x = F.relu(self.bn1(x))
75
+ x = self.maxpool(x)
76
+
77
+ x = self.layer1(x)
78
+ feat8 = self.layer2(x) # 1/8
79
+ feat16 = self.layer3(feat8) # 1/16
80
+ feat32 = self.layer4(feat16) # 1/32
81
+ return feat8, feat16, feat32
82
+
83
+ def init_weight(self, backbone_weight_path=None):
84
+ if backbone_weight_path is None:
85
+ state_dict = modelzoo.load_url(resnet18_url)
86
+ else:
87
+ state_dict = torch.load(backbone_weight_path, weights_only=False)
88
+ self_state_dict = self.state_dict()
89
+ for k, v in state_dict.items():
90
+ if 'fc' in k: continue
91
+ self_state_dict.update({k: v})
92
+ self.load_state_dict(self_state_dict)
93
+
94
+ def get_params(self):
95
+ wd_params, nowd_params = [], []
96
+ for name, module in self.named_modules():
97
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
98
+ wd_params.append(module.weight)
99
+ if not module.bias is None:
100
+ nowd_params.append(module.bias)
101
+ elif isinstance(module, nn.BatchNorm2d):
102
+ nowd_params += list(module.parameters())
103
+ return wd_params, nowd_params
104
+
105
+
106
+ if __name__ == "__main__":
107
+ net = Resnet18()
108
+ x = torch.randn(16, 3, 224, 224)
109
+ out = net(x)
110
+ print(out[0].size())
111
+ print(out[1].size())
112
+ print(out[2].size())
113
+ net.get_params()
src/datasets/preprocess/extract_features/motion_processer.py ADDED
@@ -0,0 +1,1420 @@
1
+ """
2
+ Motion feature extractor
3
+ """
4
+ import os
5
+ import os.path as osp
6
+ import sys
7
+ import pickle
8
+ from omegaconf import OmegaConf
9
+
10
+ import torch
11
+
12
+ from PIL import Image
13
+ import numpy as np
14
+ import cv2
15
+ import imageio
16
+ import pickle
17
+ import time
18
+ from decord import VideoReader # must after import torch
19
+
20
+ from rich.progress import track
21
+
22
+
23
+
24
+
25
+ sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__)))))))
26
+ from src.datasets.preprocess.extract_features.face_segmentation import build_face_parser, get_face_mask, vis_parsing_maps
27
+ from src.thirdparty.liveportrait.src.utils.helper import load_model, concat_feat
28
+ from src.thirdparty.liveportrait.src.utils.io import load_image_rgb, resize_to_limit, load_video
29
+ from src.thirdparty.liveportrait.src.utils.video import get_fps, images2video, add_audio_to_video
30
+ from src.thirdparty.liveportrait.src.utils.camera import headpose_pred_to_degree, get_rotation_matrix
31
+
32
+ from src.thirdparty.liveportrait.src.utils.cropper import Cropper
33
+ from src.thirdparty.liveportrait.src.utils.crop import prepare_paste_back, paste_back, paste_back_with_face_mask
34
+ from src.thirdparty.liveportrait.src.utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
35
+ from src.thirdparty.liveportrait.src.utils.helper import mkdir, basename, dct2device, is_image, calc_motion_multiplier
36
+ from src.utils.filter import smooth as ksmooth
37
+ from src.utils.filter import smooth_
38
+
39
+ from skimage.metrics import peak_signal_noise_ratio
40
+ import warnings
41
+
42
+
43
+ def psnr(imgs1, imgs2):
44
+ psnrs = []
45
+ for img1, img2 in zip(imgs1, imgs2):
46
+ psnr = peak_signal_noise_ratio(img1, img2, data_range=255)
47
+ psnrs.append(psnr)
48
+ return psnrs
49
+
50
+
51
+ def suffix(filename):
52
+ """a.jpg -> jpg"""
53
+ pos = filename.rfind(".")
54
+ if pos == -1:
55
+ return ""
56
+ return filename[pos + 1:]
57
+
58
+ def dump(wfp, obj):
59
+ wd = osp.split(wfp)[0]
60
+ if wd != "" and not osp.exists(wd):
61
+ mkdir(wd)
62
+
63
+ _suffix = suffix(wfp)
64
+ if _suffix == "npy":
65
+ np.save(wfp, obj)
66
+ elif _suffix == "pkl":
67
+ pickle.dump(obj, open(wfp, "wb"))
68
+ else:
69
+ raise Exception("Unknown type: {}".format(_suffix))
70
+
71
+ def load(fp):
72
+ suffix_ = suffix(fp)
73
+
74
+ if suffix_ == "npy":
75
+ return np.load(fp)
76
+ elif suffix_ == "pkl":
77
+ return pickle.load(open(fp, "rb"))
78
+ else:
79
+ raise Exception(f"Unknown type: {suffix}")
80
+
81
+
82
+ def remove_suffix(filepath):
83
+ """a/b/c.jpg -> a/b/c"""
84
+ return osp.join(osp.dirname(filepath), basename(filepath))
85
+
86
+
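A quick round-trip sketch for the `dump`/`load` helpers above (paths are placeholders; `.npy` and `.pkl` are the only supported suffixes, and `dump` creates the parent directory if it is missing):

```python
import numpy as np

# Round-trip a motion dict through .pkl and an array through .npy.
motion = {"exp": np.zeros((1, 21, 3), dtype=np.float32)}
dump("tmp/motion.pkl", motion)
assert np.allclose(load("tmp/motion.pkl")["exp"], motion["exp"])

arr = np.arange(10)
dump("tmp/arr.npy", arr)
assert (load("tmp/arr.npy") == arr).all()
```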
87
+ class MotionProcesser(object):
88
+ def __init__(self, cfg_path, device_id=0) -> None:
89
+ device = f"cuda:{device_id}"
90
+ cfg = OmegaConf.load(cfg_path)
91
+ print(f"Load cfg from {osp.realpath(cfg_path)} done.")
92
+ print(f"=============================== Driven CFG ===============================")
93
+ print(OmegaConf.to_yaml(cfg))
94
+ print(f"=============================== ========== ===============================")
95
+ models_config = OmegaConf.load(cfg.models_config)
96
+
97
+ # 1. init appearance feature extractor
98
+ self.appearance_feature_extractor = load_model(
99
+ cfg.appearance_feature_extractor_path,
100
+ models_config,
101
+ device,
102
+ 'appearance_feature_extractor'
103
+ )
104
+ print(f'1. Load appearance_feature_extractor from {osp.realpath(cfg.appearance_feature_extractor_path)} done.')
105
+
106
+ # 2. init motion extractor
107
+ self.motion_extractor = load_model(
108
+ cfg.motion_extractor_path,
109
+ models_config,
110
+ device,
111
+ 'motion_extractor'
112
+ )
113
+ print(f'2. Load motion_extractor from {osp.realpath(cfg.motion_extractor_path)} done.')
114
+
115
+ # 3. init S and R
116
+ if cfg.stitching_retargeting_module_path is not None and osp.exists(cfg.stitching_retargeting_module_path):
117
+ self.stitching_retargeting_module = load_model(
118
+ cfg.stitching_retargeting_module_path,
119
+ models_config,
120
+ device,
121
+ 'stitching_retargeting_module'
122
+ )
123
+ print(f'3. Load stitching_retargeting_module from {osp.realpath(cfg.stitching_retargeting_module_path)} done.')
124
+ else:
125
+ self.stitching_retargeting_module = None
126
+
127
+ # 4. init motion warper
128
+ self.warping_module = load_model(
129
+ cfg.warping_module_path,
130
+ models_config,
131
+ device,
132
+ 'warping_module'
133
+ )
134
+ print(f"4. Load warping_module from {osp.realpath(cfg.warping_module_path)} done.")
135
+
136
+ # 5. init decoder
137
+ self.spade_generator = load_model(
138
+ cfg.spade_generator_path,
139
+ models_config,
140
+ device,
141
+ 'spade_generator'
142
+ )
143
+ print(f"Load generator from {osp.realpath(cfg.spade_generator_path)} done.")
144
+
145
+ # # Optimize for inference
146
+ self.compile = cfg.flag_do_torch_compile
147
+ if self.compile:
148
+ torch._dynamo.config.suppress_errors = True # Suppress errors and fall back to eager execution
149
+ self.warping_module = torch.compile(self.warping_module, mode='max-autotune')
150
+ self.spade_generator = torch.compile(self.spade_generator, mode='max-autotune')
151
+
152
+ # 6. init cropper
153
+ crop_cfg = OmegaConf.load(cfg.crop_cfg)
154
+ self.cropper = Cropper(crop_cfg=crop_cfg, image_type="human_face", device_id=device_id)
155
+
156
+ self.cfg = cfg
157
+ self.models_config = models_config
158
+ self.device = device
159
+
160
+
161
+ # 7. load crop mask
162
+ self.mask_crop = cv2.imread(cfg.mask_crop, cv2.IMREAD_COLOR)
163
+ # 8. load lip array
164
+ with open(cfg.lip_array, 'rb') as f:
165
+ self.lip_array = pickle.load(f)
166
+
167
+ # 9. load face parser
168
+ self.face_parser, self.to_tensor = build_face_parser(weight_path=cfg.face_parser_weight_path, resnet_weight_path=cfg.resnet_weight_path, device_id=device_id)
169
+
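Construction sketch (the YAML path is a placeholder; the config is expected to provide the checkpoint, crop-config, mask, lip-array and face-parser paths consumed above):

```python
# Hypothetical instantiation; all model loading happens in __init__.
mp = MotionProcesser("path/to/driven_config.yaml", device_id=0)
```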
170
+ def inference_ctx(self):
171
+ ctx = torch.autocast(device_type=self.device[:4], dtype=torch.float16,
172
+ enabled=self.cfg.flag_use_half_precision)
173
+ return ctx
174
+
175
+ @torch.no_grad()
176
+ def extract_feature_3d(self, x: torch.Tensor) -> torch.Tensor:
177
+ """ get the appearance feature of the image by F
178
+ x: Bx3xHxW, normalized to 0~1
179
+ """
180
+ with self.inference_ctx():
181
+ feature_3d = self.appearance_feature_extractor(x)
182
+
183
+ return feature_3d.float()
184
+
185
+ @torch.no_grad()
186
+ def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:
187
+ """ get the implicit keypoint information
188
+ x: Bx3xHxW, normalized to 0~1
189
+ flag_refine_info: whether to transform the pose to degrees and reshape the output dimensions
190
+ return: A dict contains keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
191
+ """
192
+ with self.inference_ctx():
193
+ kp_info = self.motion_extractor(x)
194
+
195
+ if self.cfg.flag_use_half_precision:
196
+ # float the dict
197
+ for k, v in kp_info.items():
198
+ if isinstance(v, torch.Tensor):
199
+ kp_info[k] = v.float()
200
+
201
+ return kp_info
202
+
203
+ @torch.no_grad()
204
+ def refine_kp(self, kp_info):
205
+ bs = kp_info['exp'].shape[0]
206
+ kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None] # Bx1
207
+ kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None] # Bx1
208
+ kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None] # Bx1
209
+ kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3) # BxNx3
210
+ if 'kp' in kp_info.keys():
211
+ kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3) # BxNx3
212
+
213
+ return kp_info
214
+
215
+ @torch.no_grad()
216
+ def transform_keypoint(self, kp_info: dict):
217
+ """
218
+ transform the implicit keypoints with the pose, shift, and expression deformation
219
+ kp: BxNx3
220
+ """
221
+ kp = kp_info['kp'] # (bs, k, 3)
222
+ pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']
223
+
224
+ t, exp = kp_info['t'], kp_info['exp']
225
+ scale = kp_info['scale']
226
+
227
+ pitch = headpose_pred_to_degree(pitch)
228
+ yaw = headpose_pred_to_degree(yaw)
229
+ roll = headpose_pred_to_degree(roll)
230
+
231
+ bs = kp.shape[0]
232
+ if kp.ndim == 2:
233
+ num_kp = kp.shape[1] // 3 # Bx(num_kpx3)
234
+ else:
235
+ num_kp = kp.shape[1] # Bxnum_kpx3
236
+
237
+ rot_mat = get_rotation_matrix(pitch, yaw, roll) # (bs, 3, 3)
238
+
239
+ # Eqn.2: s * (R * x_c,s + exp) + t
240
+ kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
241
+ kp_transformed *= scale[..., None] # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
242
+ kp_transformed[:, :, 0:2] += t[:, None, 0:2] # remove z, only apply tx ty
243
+
244
+ return kp_transformed
245
+
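For reference, the transform implemented above can be written, in the row-vector convention the code actually uses (note the comment's `R * x_c,s` order is reversed relative to the matmul in the code), as:

$$
x_d = s \cdot \bigl(x_{c,s}\,R + \delta\bigr) + t, \qquad \text{with only } (t_x, t_y) \text{ applied (} t_z \text{ dropped)}.
$$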
246
+ @torch.no_grad()
247
+ def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
248
+ """ conduct the stitching
249
+ kp_source: Bxnum_kpx3
250
+ kp_driving: Bxnum_kpx3
251
+ """
252
+
253
+ if self.stitching_retargeting_module is not None:
254
+ bs, num_kp = kp_source.shape[:2]
255
+ kp_driving_new = kp_driving.clone()
256
+ # stitch
257
+ feat_stiching = concat_feat(kp_source, kp_driving_new)
258
+ delta = self.stitching_retargeting_module['stitching'](feat_stiching) # Bxnum_kpx3
259
+
260
+ delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3) # 1x20x3
261
+ delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2) # 1x1x2
262
+
263
+ kp_driving_new += delta_exp
264
+ kp_driving_new[..., :2] += delta_tx_ty
265
+
266
+ return kp_driving_new
267
+
268
+ return kp_driving
269
+
270
+ @torch.no_grad()
271
+ def warp_decode(self, feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> dict[str, torch.Tensor]:
272
+ """ get the image after the warping of the implicit keypoints
273
+ feature_3d: Bx32x16x64x64, feature volume
274
+ kp_source: BxNx3
275
+ kp_driving: BxNx3
276
+ """
277
+ # The line 18 in Algorithm 1: D(W(f_s; x_s, x′_d,i))
278
+ with self.inference_ctx():
279
+ if self.compile:
280
+ # Mark the beginning of a new CUDA Graph step
281
+ torch.compiler.cudagraph_mark_step_begin()
282
+ # get decoder input
283
+ ret_dct = self.warping_module(feature_3d, kp_source=kp_source, kp_driving=kp_driving)
284
+
285
+ # print(f"=============================================================================")
286
+ # for out_key, out_value in ret_dct.items():
287
+ # if isinstance(out_value, str) or isinstance(out_value, int) or isinstance(out_value, float):
288
+ # print(f"{out_key}: {out_value}")
289
+ # elif isinstance(out_value, torch.Tensor):
290
+ # print(f"{out_key}: tensor shape {out_value.shape}, min: {torch.min(out_value)}, max: {torch.max(out_value)}, mean: {torch.mean(out_value)}, std: {torch.std(out_value)}")
291
+ # else:
292
+ # print(f"{out_key}: data type {type(out_value)}")
293
+ # decode
294
+ ret_dct['out'] = self.spade_generator(feature=ret_dct['out'])
295
+
296
+ # float the dict
297
+ if self.cfg.flag_use_half_precision:
298
+ for k, v in ret_dct.items():
299
+ if isinstance(v, torch.Tensor):
300
+ ret_dct[k] = v.float()
301
+
302
+ return ret_dct
303
+
304
+ def parse_output(self, out: torch.Tensor) -> np.ndarray:
305
+ """ construct the output as standard
306
+ return: 1xHxWx3, uint8
307
+ """
308
+ out = np.transpose(out.cpu().numpy(), [0, 2, 3, 1]) # 1x3xHxW -> 1xHxWx3
309
+ out = np.clip(out, 0, 1) # clip to 0~1
310
+ out = np.clip(out * 255, 0, 255).astype(np.uint8) # 0~1 -> 0~255
311
+
312
+ return out
313
+
314
+ @torch.no_grad()
315
+ def calc_combined_eye_ratio(self, c_d_eyes_i, source_lmk):
316
+ c_s_eyes = calc_eye_close_ratio(source_lmk[None])
317
+ c_s_eyes_tensor = torch.from_numpy(c_s_eyes).float().to(self.device)
318
+ c_d_eyes_i_tensor = torch.Tensor([c_d_eyes_i[0][0]]).reshape(1, 1).to(self.device)
319
+ # [c_s,eyes, c_d,eyes,i]
320
+ combined_eye_ratio_tensor = torch.cat([c_s_eyes_tensor, c_d_eyes_i_tensor], dim=1)
321
+ return combined_eye_ratio_tensor
322
+
323
+ @torch.no_grad()
324
+ def calc_combined_lip_ratio(self, c_d_lip_i, source_lmk):
325
+ c_s_lip = calc_lip_close_ratio(source_lmk[None])
326
+ c_s_lip_tensor = torch.from_numpy(c_s_lip).float().to(self.device)
327
+ c_d_lip_i_tensor = torch.Tensor([c_d_lip_i[0]]).to(self.device).reshape(1, 1) # 1x1
328
+ # [c_s,lip, c_d,lip,i]
329
+ combined_lip_ratio_tensor = torch.cat([c_s_lip_tensor, c_d_lip_i_tensor], dim=1) # 1x2
330
+ return combined_lip_ratio_tensor
331
+
332
+ def calc_ratio(self, lmk_lst):
333
+ input_eye_ratio_lst = []
334
+ input_lip_ratio_lst = []
335
+ for lmk in lmk_lst:
336
+ # for eyes retargeting
337
+ input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))
338
+ # for lip retargeting
339
+ input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))
340
+ return input_eye_ratio_lst, input_lip_ratio_lst
341
+
342
+ @torch.no_grad()
343
+ def retarget_lip(self, kp_source: torch.Tensor, lip_close_ratio: torch.Tensor) -> torch.Tensor:
344
+ """
345
+ kp_source: BxNx3
346
+ lip_close_ratio: Bx2
347
+ Return: BxNx3
348
+ """
349
+ feat_lip = concat_feat(kp_source, lip_close_ratio)
350
+
351
+ delta = self.stitching_retargeting_module['lip'](feat_lip)
352
+
353
+ return delta.reshape(-1, kp_source.shape[1], 3)
354
+
355
+ @torch.no_grad()
356
+ def retarget_eye(self, kp_source: torch.Tensor, eye_close_ratio: torch.Tensor) -> torch.Tensor:
357
+ """
358
+ kp_source: BxNx3
359
+ eye_close_ratio: Bx3
360
+ Return: BxNx3
361
+ """
362
+ feat_eye = concat_feat(kp_source, eye_close_ratio)
363
+
364
+ delta = self.stitching_retargeting_module['eye'](feat_eye)
365
+
366
+ return delta.reshape(-1, kp_source.shape[1], 3)
367
+
368
+ def crop_image(self, img, do_crop=False):
369
+ ######## process source info ########
370
+ if do_crop:
371
+ crop_info = self.cropper.crop_source_image(img, self.cropper.crop_cfg)
372
+ if crop_info is None:
373
+ raise Exception("No face detected in the source image!")
374
+ lmk = crop_info['lmk_crop']
375
+ img_crop_256x256 = crop_info['img_crop_256x256']
376
+ else:
377
+ crop_info = None
378
+ lmk = self.cropper.calc_lmk_from_cropped_image(img)
379
+ img_crop_256x256 = cv2.resize(img, (256, 256)) # force to resize to 256x256
380
+ return img_crop_256x256, lmk, crop_info
381
+
382
+ def crop_source_video(self, img_lst, do_crop=False):
383
+ if do_crop:
384
+ ret_s = self.cropper.crop_source_video(img_lst, self.cropper.crop_cfg)
385
+ print(f'Source video is cropped, {len(ret_s["frame_crop_lst"])} frames are processed.')
386
+ img_crop_256x256_lst, lmk_crop_lst, M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']
387
+ else:
388
+ M_c2o_lst = None
389
+ lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(img_lst)
390
+ img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in img_lst] # force to resize to 256x256
391
+ return img_crop_256x256_lst, lmk_crop_lst, M_c2o_lst
392
+
393
+ def crop_driving_videos(self, img_lst, do_crop=False):
394
+ if do_crop:
395
+ ret_d = self.cropper.crop_driving_video(img_lst)
396
+ print(f'Driving video is cropped, {len(ret_d["frame_crop_lst"])} frames are processed.')
397
+ img_crop_lst, lmk_crop_lst = ret_d['frame_crop_lst'], ret_d['lmk_crop_lst']
398
+ img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in img_crop_lst]  # resize the cropped frames, not the originals
399
+ else:
400
+ lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(img_lst)
401
+ img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in img_lst] # force to resize to 256x256
402
+ return img_crop_256x256_lst, lmk_crop_lst
403
+
404
+ def prepare_source(self, src_img):
405
+ """ construct the input as standard
406
+ img: HxWx3, uint8, 256x256
407
+ """
408
+ # processing source image to tensor
409
+ h, w = src_img.shape[:2]
410
+ if h != self.cfg.input_height or w != self.cfg.input_width:
411
+ x = cv2.resize(src_img, (self.cfg.input_width, self.cfg.input_height))
412
+ else:
413
+ x = src_img.copy()
414
+
415
+ if x.ndim == 3:
416
+ x = x[np.newaxis].astype(np.float32) / 255. # HxWx3 -> 1xHxWx3, normalized to 0~1
417
+ elif x.ndim == 4:
418
+ x = x.astype(np.float32) / 255. # BxHxWx3, normalized to 0~1
419
+ else:
420
+ raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
421
+
422
+ x = np.clip(x, 0, 1) # clip to 0~1
423
+ x = torch.from_numpy(x).permute(0, 3, 1, 2) # 1xHxWx3 -> 1x3xHxW
424
+ x = x.to(self.device)
425
+
426
+ # extract features
427
+ I_s = x
428
+ f_s = self.extract_feature_3d(I_s)
429
+ x_s_info = self.get_kp_info(I_s)
430
+
431
+ return f_s, x_s_info
432
+
433
+ def process_clips(self, clips):
434
+ """ construct the input as standard
435
+ clips: NxBxHxWx3, uint8
436
+ """
437
+ # resize to 256 x 256
438
+ imgs = []
439
+ for img in clips:
440
+ h, w = img.shape[:2]
441
+ if h != self.cfg.input_height or w != self.cfg.input_width:
442
+ img = cv2.resize(img, (self.cfg.input_width, self.cfg.input_height))
443
+ else:
444
+ img = img.copy()
445
+ imgs.append(img)
446
+
447
+ # processing video frames to tensor
448
+ if isinstance(imgs, list):
449
+ _imgs = np.array(imgs)[..., np.newaxis] # TxHxWx3x1
450
+ elif isinstance(imgs, np.ndarray):
451
+ _imgs = imgs
452
+ else:
453
+ raise ValueError(f'imgs type error: {type(imgs)}')
454
+
455
+ y = _imgs.astype(np.float32) / 255.
456
+ y = np.clip(y, 0, 1) # clip to 0~1
457
+ y = torch.from_numpy(y).permute(0, 4, 3, 1, 2) # TxHxWx3x1 -> Tx1x3xHxW
458
+ y = y.to(self.device)
459
+
460
+ return y
461
+
462
+ def prepare_driving_videos(self, vid_frames, feat_type="tensor"):
463
+ """ get driving kp infos
464
+ vid_frames: image list of HxWx3, uint8
465
+ """
466
+ # extract features
467
+ total_len = len(vid_frames)
468
+ kp_infos = {"pitch": [], "yaw": [], "roll": [], "t": [], "exp": [], "scale": [], "kp": []}
469
+ for start_idx in range(0, total_len, self.cfg.batch_size):
470
+ frames = vid_frames[start_idx: min(start_idx + self.cfg.batch_size, total_len)]
471
+ frames = self.process_clips(frames).squeeze(1)
472
+ kp_info = self.get_kp_info(frames)
473
+
474
+ for k, v in kp_info.items():
475
+ kp_infos[k].append(v)
476
+
477
+ # combine the kp_infos
478
+ for k, v in kp_infos.items():
479
+ kp_infos[k] = torch.cat(v, dim=0)
480
+
481
+ if feat_type == "np":
482
+ for k, v in kp_infos.items():
483
+ kp_infos[k] = v.cpu().numpy()
484
+
485
+ return kp_infos
486
+
487
+ def get_driving_template(self, kp_infos, smooth=False, dtype="pt_tensor"):
488
+ kp_infos = self.refine_kp(kp_infos)
489
+ motion_list = []
490
+ n_frames = len(kp_infos["exp"])
491
+ for idx in range(n_frames):
492
+ exp = kp_infos["exp"][idx]
493
+ scale = kp_infos["scale"][idx]
494
+ t = kp_infos["t"][idx]
495
+ pitch = kp_infos["pitch"][idx]
496
+ yaw = kp_infos["yaw"][idx]
497
+ roll = kp_infos["roll"][idx]
498
+
499
+ R = get_rotation_matrix(pitch, yaw, roll)
500
+ R = R.reshape(1, 3, 3)
501
+
502
+ exp = exp.reshape(1, 21, 3)
503
+ scale = scale.reshape(1, 1)
504
+ t = t.reshape(1, 3)
505
+ pitch = pitch.reshape(1, 1)
506
+ yaw = yaw.reshape(1, 1)
507
+ roll = roll.reshape(1, 1)
508
+
509
+ if dtype == "np":
510
+ R = R.cpu().numpy().astype(np.float32)
511
+ exp = exp.cpu().numpy().astype(np.float32)
512
+ scale = scale.cpu().numpy().astype(np.float32)
513
+ t = t.cpu().numpy().astype(np.float32)
514
+ pitch = pitch.cpu().numpy().astype(np.float32)
515
+ yaw = yaw.cpu().numpy().astype(np.float32)
516
+ roll = roll.cpu().numpy().astype(np.float32)
517
+
518
+ motion_list.append(
519
+ {"exp": exp, "scale": scale, "R": R, "t": t, "pitch": pitch, "yaw": yaw, "roll": roll}
520
+ )
521
+ tgt_motion = {'n_frames': n_frames, 'output_fps': 25, 'motion': motion_list}
522
+
523
+ if smooth:
524
+ print("Smoothing motion sequence...")
525
+ tgt_motion = smooth_(tgt_motion, method="ema")
526
+ return tgt_motion
527
+
528
+ @torch.no_grad()
529
+ def update_delta_new_eyeball_direction(self, eyeball_direction_x, eyeball_direction_y, delta_new, **kwargs):
530
+ if eyeball_direction_x > 0:
531
+ delta_new[0, 11, 0] += eyeball_direction_x * 0.0007
532
+ delta_new[0, 15, 0] += eyeball_direction_x * 0.001
533
+ else:
534
+ delta_new[0, 11, 0] += eyeball_direction_x * 0.001
535
+ delta_new[0, 15, 0] += eyeball_direction_x * 0.0007
536
+
537
+ delta_new[0, 11, 1] += eyeball_direction_y * -0.001
538
+ delta_new[0, 15, 1] += eyeball_direction_y * -0.001
539
+ blink = -eyeball_direction_y / 2.
540
+
541
+ delta_new[0, 11, 1] += blink * -0.001
542
+ delta_new[0, 13, 1] += blink * 0.0003
543
+ delta_new[0, 15, 1] += blink * -0.001
544
+ delta_new[0, 16, 1] += blink * 0.0003
545
+
546
+ return delta_new
547
+
548
+ def driven(self, f_s, x_s_info, s_lmk, c_s_eyes_lst, kp_infos, c_d_eyes_lst=None, c_d_lip_lst=None, smooth=False):
549
+ # source kp info
550
+ x_d_i_news=[]
551
+ x_ss=[]
552
+ f_ss=[]
553
+ x_s_info = self.refine_kp(x_s_info)
554
+ R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
555
+ x_s = self.transform_keypoint(x_s_info)
556
+ x_c_s = x_s_info["kp"]
557
+
558
+ # driving kp infos
559
+ driving_template_dct = self.get_driving_template(kp_infos, smooth)
560
+ n_frames = driving_template_dct['n_frames']
561
+
562
+ # driving params
563
+ flag_normalize_lip = self.cfg.flag_normalize_lip
564
+ flag_relative_motion = self.cfg.flag_relative_motion
565
+ flag_source_video_eye_retargeting = self.cfg.flag_source_video_eye_retargeting
566
+ lip_normalize_threshold = self.cfg.lip_normalize_threshold
567
+ source_video_eye_retargeting_threshold = self.cfg.source_video_eye_retargeting_threshold
568
+ animation_region = self.cfg.animation_region
569
+ driving_option = self.cfg.driving_option
570
+ flag_stitching = self.cfg.flag_stitching
571
+ flag_eye_retargeting = self.cfg.flag_eye_retargeting
572
+ flag_lip_retargeting = self.cfg.flag_lip_retargeting
573
+ driving_multiplier = self.cfg.driving_multiplier
574
+ lib_multiplier = self.cfg.lib_multiplier
575
+
576
+ # set the lip-open scalar to 0 before animation
577
+ lip_delta_before_animation, eye_delta_before_animation = None, None
578
+ if flag_normalize_lip and flag_relative_motion and s_lmk is not None:
579
+ c_d_lip_before_animation = [0.]
580
+ combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(c_d_lip_before_animation, s_lmk)
581
+ if combined_lip_ratio_tensor_before_animation[0][0] >= lip_normalize_threshold:
582
+ lip_delta_before_animation = self.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
583
+
584
+ # keep the eye-open scalar equal to the first frame's value if that frame has open eyes
585
+ if flag_source_video_eye_retargeting and s_lmk is not None:
586
+ combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
587
+ c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
588
+ if c_d_eye_before_animation_frame_zero[0][0] < source_video_eye_retargeting_threshold:
589
+ c_d_eye_before_animation_frame_zero = [[0.39]]
590
+ combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, s_lmk)
591
+ eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
592
+
593
+ # animate
594
+ I_p_lst = []
595
+ for i in range(n_frames):
596
+ x_d_i_info = driving_template_dct['motion'][i]
597
+ x_d_i_info = dct2device(x_d_i_info, self.device)
598
+ # R
599
+ R_d_i = x_d_i_info['R']
600
+ if i == 0: # cache the first frame
601
+ R_d_0 = R_d_i
602
+ x_d_0_info = x_d_i_info.copy()
603
+
604
+ # enhance lip
605
+ # if i > 0:
606
+ # for lip_idx in [6, 12, 14, 17, 19, 20]:
607
+ # x_d_i_info['exp'][:, lip_idx, :] = x_d_0_info['exp'][:, lip_idx, :] + (x_d_i_info['exp'][:, lip_idx, :] - x_d_0_info['exp'][:, lip_idx, :]) * lib_multiplier
608
+
609
+ # normalize eye_ball, TODO
610
+ x_d_i_info['exp'] = self.update_delta_new_eyeball_direction(0, -5, x_d_i_info['exp'])
611
+
612
+ # debug
613
+ #print(f"frame {i:03d}, src scale {x_s_info['scale']}, 0 scale {x_d_0_info['scale']}, i scale {x_d_i_info['scale']}")
614
+ # delta
615
+ delta_new = x_s_info['exp'].clone()
616
+ if flag_relative_motion:
617
+ # R
618
+ if animation_region == "all" or animation_region == "pose":
619
+ R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
620
+ else:
621
+ R_new = R_s
622
+
623
+ # exp
624
+ if animation_region == "all" or animation_region == "exp":
625
+ delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
626
+ elif animation_region == "lip":
627
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
628
+ delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]
629
+ elif animation_region == "eyes":
630
+ for eyes_idx in [11, 13, 15, 16, 18]:
631
+ delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]
632
+
633
+ # scale
634
+ if animation_region == "all":
635
+ scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
636
+ else:
637
+ scale_new = x_s_info['scale']
638
+
639
+ # translation
640
+ if animation_region == "all" or animation_region == "pose":
641
+ t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
642
+ else:
643
+ t_new = x_s_info['t']
644
+ else:
645
+ # R
646
+ if animation_region == "all" or animation_region == "pose":
647
+ R_new = R_d_i
648
+ else:
649
+ R_new = R_s
650
+
651
+ # exp
652
+ if animation_region == "all" or animation_region == "exp":
653
+ EYE_IDX=[1,2,6,11,12,13,14,15,16,17,18,19,20]
654
+ delta_new[:, EYE_IDX, :] = x_d_i_info['exp'][:, EYE_IDX, :]
655
+ # for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
656
+ # delta_new[:, idx, :] = x_d_i_info['exp'][:, idx, :]
657
+ delta_new[:, 3:5, 1] = x_d_i_info['exp'][:, 3:5, 1]
658
+ delta_new[:, 5, 2] = x_d_i_info['exp'][:, 5, 2]
659
+ delta_new[:, 8, 2] = x_d_i_info['exp'][:, 8, 2]
660
+ delta_new[:, 9, 1:] = x_d_i_info['exp'][:, 9, 1:]
661
+ elif animation_region == "lip":
662
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
663
+ delta_new[:, lip_idx, :] = x_d_i_info['exp'][:, lip_idx, :]
664
+ elif animation_region == "eyes":
665
+ for eyes_idx in [11, 13, 15, 16, 18]:
666
+ delta_new[:, eyes_idx, :] = x_d_i_info['exp'][:, eyes_idx, :]
667
+
668
+ # scale
669
+ scale_new = x_s_info['scale']
670
+
671
+ # translation
672
+ if animation_region == "all" or animation_region == "pose":
673
+ t_new = x_d_i_info['t']
674
+ else:
675
+ t_new = x_s_info['t']
676
+
677
+ t_new[..., 2].fill_(0) # zero tz
678
+
679
+ x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
680
+
681
+ if flag_relative_motion and driving_option == "expression-friendly":
682
+ if i == 0:
683
+ x_d_0_new = x_d_i_new
684
+ motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
685
+ x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
686
+ x_d_i_new = x_d_diff + x_s
687
+
688
+ # Algorithm 1 in Liveportrait:
689
+ if not flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
690
+ # without stitching or retargeting
691
+ if flag_normalize_lip and lip_delta_before_animation is not None:
692
+ x_d_i_new += lip_delta_before_animation
693
+ if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
694
+ x_d_i_new += eye_delta_before_animation
695
+ else:
696
+ pass
697
+ elif flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
698
+ # with stitching and without retargeting
699
+ if flag_normalize_lip and lip_delta_before_animation is not None:
700
+ x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation
701
+ else:
702
+ x_d_i_new = self.stitching(x_s, x_d_i_new)
703
+ if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
704
+ x_d_i_new += eye_delta_before_animation
705
+ else:
706
+ eyes_delta, lip_delta = None, None
707
+ if flag_eye_retargeting and s_lmk is not None and c_d_eyes_lst is not None:
708
+ c_d_eyes_i = c_d_eyes_lst[i]
709
+ combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i, s_lmk)
710
+ eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
711
+
712
+ if flag_lip_retargeting and s_lmk is not None and c_d_lip_lst is not None:
713
+ c_d_lip_i = c_d_lip_lst[i]
714
+ combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, s_lmk)
715
+ # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
716
+ lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
717
+
718
+ if flag_relative_motion: # use x_s
719
+ x_d_i_new = x_s + \
720
+ (eyes_delta if eyes_delta is not None else 0) + \
721
+ (lip_delta if lip_delta is not None else 0)
722
+ else: # use x_d,i
723
+ x_d_i_new = x_d_i_new + \
724
+ (eyes_delta if eyes_delta is not None else 0) + \
725
+ (lip_delta if lip_delta is not None else 0)
726
+
727
+ if flag_stitching:
728
+ x_d_i_new = self.stitching(x_s, x_d_i_new)
729
+
730
+ x_d_i_new = x_s + (x_d_i_new - x_s) * driving_multiplier
731
+ x_d_i_news.append(x_d_i_new)
732
+ f_s_s = f_s.expand(n_frames, *f_s.shape[1:])
733
+ x_s_s = x_s.expand(n_frames, *x_s.shape[1:])
734
+ x_d_i_new = torch.cat(x_d_i_news, dim=0)
735
+ for start in range(0, n_frames, 100):
736
+ end = min(start + 100, n_frames)
737
+ with torch.no_grad(), torch.autocast('cuda'):
738
+ out = self.warp_decode(f_s_s[start:end], x_s_s[start:end], x_d_i_new[start:end])
739
+ I_p_lst.append(out['out'])
740
+ I_p = torch.cat(I_p_lst, dim=0)
741
+ I_p_i = self.parse_output(I_p)
742
+ return I_p_i
743
+
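For readability, the per-frame relative-motion update in the loop above (with `flag_relative_motion` set and `animation_region == "all"`) can be summarized as:

$$
R_{\text{new}} = \bigl(R_{d,i}\,R_{d,0}^{\top}\bigr) R_s,\quad
\delta_{\text{new}} = \delta_s + (\delta_{d,i} - \delta_{d,0}),\quad
s_{\text{new}} = s_s \cdot \frac{s_{d,i}}{s_{d,0}},\quad
t_{\text{new}} = t_s + (t_{d,i} - t_{d,0}),\ t_z = 0,
$$

$$
x_{d,i} = s_{\text{new}}\bigl(x_{c,s}\,R_{\text{new}} + \delta_{\text{new}}\bigr) + t_{\text{new}},
$$

followed by optional stitching and the final blend $x_{d,i} \leftarrow x_s + (x_{d,i} - x_s)\cdot\text{driving\_multiplier}$.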
744
+ def driven_debug(self, f_s, x_s_info, s_lmk, c_s_eyes_lst, driving_template_dct, c_d_eyes_lst=None, c_d_lip_lst=None):
745
+ # source kp info
746
+ x_s_info = self.refine_kp(x_s_info)
747
+ x_c_s = x_s_info["kp"]
748
+ R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
749
+ x_s = self.transform_keypoint(x_s_info)
750
+
751
+ n_frames = driving_template_dct['n_frames']
752
+
753
+ # driving params
754
+ flag_normalize_lip = self.cfg.flag_normalize_lip
755
+ flag_relative_motion = self.cfg.flag_relative_motion
756
+ flag_source_video_eye_retargeting = self.cfg.flag_source_video_eye_retargeting
757
+ lip_normalize_threshold = self.cfg.lip_normalize_threshold
758
+ source_video_eye_retargeting_threshold = self.cfg.source_video_eye_retargeting_threshold
759
+ animation_region = self.cfg.animation_region
760
+ driving_option = self.cfg.driving_option
761
+ flag_stitching = self.cfg.flag_stitching
762
+ flag_eye_retargeting = self.cfg.flag_eye_retargeting
763
+ flag_lip_retargeting = self.cfg.flag_lip_retargeting
764
+ driving_multiplier = self.cfg.driving_multiplier
765
+
766
+ # set the lip-open scalar to 0 before animation
767
+ lip_delta_before_animation, eye_delta_before_animation = None, None
768
+ if flag_normalize_lip and flag_relative_motion and s_lmk is not None:
769
+ c_d_lip_before_animation = [0.]
770
+ combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(c_d_lip_before_animation, s_lmk)
771
+ if combined_lip_ratio_tensor_before_animation[0][0] >= lip_normalize_threshold:
772
+ lip_delta_before_animation = self.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
773
+
774
+ # keep the eye-open scalar equal to the first frame's value if that frame has open eyes
775
+ if flag_source_video_eye_retargeting and s_lmk is not None:
776
+ combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
777
+ c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
778
+ if c_d_eye_before_animation_frame_zero[0][0] < source_video_eye_retargeting_threshold:
779
+ c_d_eye_before_animation_frame_zero = [[0.39]]
780
+ combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, s_lmk)
781
+ eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
782
+
783
+ # animate
784
+ I_p_lst = []
785
+ for i in range(n_frames):
786
+ x_d_i_info = driving_template_dct['motion'][i]
787
+ x_d_i_info = dct2device(x_d_i_info, self.device)
788
+ # R
789
+ R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d'] # compatible with previous keys
790
+ if i == 0: # cache the first frame
791
+ R_d_0 = R_d_i
792
+ x_d_0_info = x_d_i_info.copy()
793
+
794
+ # debug
795
+ #print(f"frame {i:03d}, src scale {x_s_info['scale']}, 0 scale {x_d_0_info['scale']}, i scale {x_d_i_info['scale']}")
796
+ # delta
797
+ delta_new = x_s_info['exp'].clone()
798
+ if flag_relative_motion:
799
+ # R
800
+ if animation_region == "all" or animation_region == "pose":
801
+ R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
802
+ else:
803
+ R_new = R_s
804
+
805
+ # exp
806
+ if animation_region == "all" or animation_region == "exp":
807
+ delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
808
+ elif animation_region == "lip":
809
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
810
+ delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]
811
+ elif animation_region == "eyes":
812
+ for eyes_idx in [11, 13, 15, 16, 18]:
813
+ delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]
814
+
815
+ # scale
816
+ if animation_region == "all":
817
+ scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
818
+ else:
819
+ scale_new = x_s_info['scale']
820
+
821
+ # translation
822
+ if animation_region == "all" or animation_region == "pose":
823
+ t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
824
+ else:
825
+ t_new = x_s_info['t']
826
+ else:
827
+ # R
828
+ if animation_region == "all" or animation_region == "pose":
829
+ R_new = R_d_i
830
+ else:
831
+ R_new = R_s
832
+
833
+ # exp
834
+ if animation_region == "all" or animation_region == "exp":
835
+ for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
836
+ delta_new[:, idx, :] = x_d_i_info['exp'][:, idx, :]
837
+ delta_new[:, 3:5, 1] = x_d_i_info['exp'][:, 3:5, 1]
838
+ delta_new[:, 5, 2] = x_d_i_info['exp'][:, 5, 2]
839
+ delta_new[:, 8, 2] = x_d_i_info['exp'][:, 8, 2]
840
+ delta_new[:, 9, 1:] = x_d_i_info['exp'][:, 9, 1:]
841
+ elif animation_region == "lip":
842
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
843
+ delta_new[:, lip_idx, :] = x_d_i_info['exp'][:, lip_idx, :]
844
+ elif animation_region == "eyes":
845
+ for eyes_idx in [11, 13, 15, 16, 18]:
846
+ delta_new[:, eyes_idx, :] = x_d_i_info['exp'][:, eyes_idx, :]
847
+
848
+ # scale
849
+ scale_new = x_s_info['scale']
850
+
851
+ # translation
852
+ if animation_region == "all" or animation_region == "pose":
853
+ t_new = x_d_i_info['t']
854
+ else:
855
+ t_new = x_s_info['t']
856
+
857
+ t_new[..., 2].fill_(0) # zero tz
858
+
859
+ x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
860
+
861
+ if flag_relative_motion and driving_option == "expression-friendly":
862
+ if i == 0:
863
+ x_d_0_new = x_d_i_new
864
+ motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
865
+ x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
866
+ x_d_i_new = x_d_diff + x_s
867
+
868
+ # Algorithm 1 in Liveportrait:
869
+ if not flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
870
+ # without stitching or retargeting
871
+ if flag_normalize_lip and lip_delta_before_animation is not None:
872
+ x_d_i_new += lip_delta_before_animation
873
+ if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
874
+ x_d_i_new += eye_delta_before_animation
875
+ else:
876
+ pass
877
+ elif flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
878
+ # with stitching and without retargeting
879
+ if flag_normalize_lip and lip_delta_before_animation is not None:
880
+ x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation
881
+ else:
882
+ x_d_i_new = self.stitching(x_s, x_d_i_new)
883
+ if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
884
+ x_d_i_new += eye_delta_before_animation
885
+ else:
886
+ eyes_delta, lip_delta = None, None
887
+ if flag_eye_retargeting and s_lmk is not None and c_d_eyes_lst is not None:
888
+ c_d_eyes_i = c_d_eyes_lst[i]
889
+ combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i, s_lmk)
890
+ eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
891
+
892
+ if flag_lip_retargeting and s_lmk is not None and c_d_lip_lst is not None:
893
+ c_d_lip_i = c_d_lip_lst[i]
894
+ combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, s_lmk)
895
+ # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
896
+ lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
897
+
898
+ if flag_relative_motion: # use x_s
899
+ x_d_i_new = x_s + \
900
+ (eyes_delta if eyes_delta is not None else 0) + \
901
+ (lip_delta if lip_delta is not None else 0)
902
+ else: # use x_d,i
903
+ x_d_i_new = x_d_i_new + \
904
+ (eyes_delta if eyes_delta is not None else 0) + \
905
+ (lip_delta if lip_delta is not None else 0)
906
+
907
+ if flag_stitching:
908
+ x_d_i_new = self.stitching(x_s, x_d_i_new)
909
+
910
+ x_d_i_new = x_s + (x_d_i_new - x_s) * driving_multiplier
911
+ out = self.warp_decode(f_s, x_s, x_d_i_new)
912
+ I_p_i = self.parse_output(out['out'])[0]
913
+ I_p_lst.append(I_p_i)
914
+
915
+ return I_p_lst
916
+
917
+ def read_image(self, image_path: str) -> list:
918
+ img_rgb = load_image_rgb(image_path)
919
+ img_rgb = resize_to_limit(img_rgb, self.cfg.source_max_dim, self.cfg.source_division)
920
+ source_rgb_list = [img_rgb]
921
+ print(f"Load image from {osp.realpath(image_path)} done.")
922
+ return source_rgb_list
923
+
924
+ def read_video(self, video_path: str, interval=None) -> list:
925
+ vr = VideoReader(video_path)
926
+ if interval is not None:
927
+ video_frames = vr.get_batch(np.arange(0, len(vr), interval)).numpy()
928
+ else:
929
+ video_frames = [vr[0].numpy(), vr[len(vr) // 2].numpy(), vr[-1].numpy()]
930
+ vr.seek(0)
931
+ driving_rgb_list = []
932
+ for video_frame in video_frames:
933
+ # h, w = video_frame.shape[:2]
934
+ # if h != self.cfg.output_height or w != self.cfg.output_width:
935
+ # video_frame = cv2.resize(video_frame, (self.cfg.output_height, self.cfg.output_width))
936
+ driving_rgb_list.append(video_frame)
937
+
938
+ return driving_rgb_list
939
+
940
+ def prepare_videos(self, imgs) -> torch.Tensor:
941
+ """ construct the input as standard
942
+ imgs: NxBxHxWx3, uint8
943
+ """
944
+ if isinstance(imgs, list):
945
+ _imgs = np.array(imgs)[..., np.newaxis] # TxHxWx3x1
946
+ elif isinstance(imgs, np.ndarray):
947
+ _imgs = imgs
948
+ else:
949
+ raise ValueError(f'imgs type error: {type(imgs)}')
950
+
951
+ y = _imgs.astype(np.float32) / 255.
952
+ y = np.clip(y, 0, 1) # clip to 0~1
953
+ y = torch.from_numpy(y).permute(0, 4, 3, 1, 2) # TxHxWx3x1 -> Tx1x3xHxW
954
+ y = y.to(self.device)
955
+
956
+ return y
957
+
958
+ def make_motion_template(self, I_lst, c_eyes_lst, c_lip_lst, **kwargs):
959
+ n_frames = I_lst.shape[0]
960
+ template_dct = {
961
+ 'n_frames': n_frames,
962
+ 'output_fps': kwargs.get('output_fps', 25),
963
+ 'motion': [],
964
+ 'c_eyes_lst': [],
965
+ 'c_lip_lst': [],
966
+ }
967
+
968
+ for i in track(range(n_frames), description='Making motion templates...', total=n_frames):
969
+ # collect s, R, δ and t for inference
970
+ I_i = I_lst[i]
971
+ x_i_info = self.refine_kp(self.get_kp_info(I_i))
972
+ x_s = self.transform_keypoint(x_i_info)
973
+ R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])
974
+
975
+ item_dct = {
976
+ 'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),
977
+ 'R': R_i.cpu().numpy().astype(np.float32),
978
+ 'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),
979
+ 't': x_i_info['t'].cpu().numpy().astype(np.float32),
980
+ 'kp': x_i_info['kp'].cpu().numpy().astype(np.float32),
981
+ 'x_s': x_s.cpu().numpy().astype(np.float32),
982
+ }
983
+
984
+ template_dct['motion'].append(item_dct)
985
+
986
+ c_eyes = c_eyes_lst[i].astype(np.float32)
987
+ template_dct['c_eyes_lst'].append(c_eyes)
988
+
989
+ c_lip = c_lip_lst[i].astype(np.float32)
990
+ template_dct['c_lip_lst'].append(c_lip)
991
+
992
+ return template_dct
993
+
994
+ def load_template(self, wfp_template):
995
+ print(f"Load from template: {wfp_template}, NOT the video, so the cropping video and audio are both NULL.")
996
+ driving_template_dct = load(wfp_template)
997
+ c_d_eyes_lst = driving_template_dct['c_eyes_lst'] if 'c_eyes_lst' in driving_template_dct.keys() else driving_template_dct['c_d_eyes_lst'] # compatible with previous keys
998
+ c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst']
999
+ driving_n_frames = driving_template_dct['n_frames']
1000
+ flag_is_driving_video = True if driving_n_frames > 1 else False
1001
+ n_frames = driving_n_frames
1002
+
1003
+ # set output_fps
1004
+ output_fps = driving_template_dct.get('output_fps', 25)
1005
+ print(f'The FPS of template: {output_fps}')
1006
+ return driving_template_dct
1007
+
1008
+ def reconstruction(self, src_img, dst_imgs, video_path="template"):
1009
+ # prepare source
1010
+ src_img_256x256, s_lmk, _ = self.crop_image(src_img, do_crop=False)
1011
+ #c_s_eyes_lst, c_s_lip_lst = self.calc_ratio([s_lmk])
1012
+ c_s_eyes_lst = None
1013
+ f_s, x_s_info = self.prepare_source(src_img_256x256)
1014
+
1015
+ # prepare driving videos
1016
+ dst_imgs_256x256, d_lmk_lst = self.crop_driving_videos(dst_imgs, do_crop=False)
1017
+ c_d_eyes_lst, c_d_lip_lst = self.calc_ratio(d_lmk_lst)
1018
+ kp_infos = self.prepare_driving_videos(dst_imgs_256x256)
1019
+
1020
+
1021
+ recs = self.driven(f_s, x_s_info, s_lmk, c_s_eyes_lst, kp_infos, c_d_eyes_lst, c_d_lip_lst)
1022
+ return recs
1023
+
1024
+ def save_results(self, results, save_path, audio_path=None):
1025
+ save_dir = osp.dirname(save_path)
1026
+ save_name = osp.basename(save_path)
1027
+ final_video = osp.join(save_dir, f'final_{save_name}')
1028
+
1029
+ images2video(results, wfp=save_path, fps=self.cfg.output_fps)
1030
+
1031
+ if audio_path is not None:
1032
+ add_audio_to_video(save_path, audio_path, final_video)
1033
+ os.remove(save_path)
1034
+
1035
+ def rec_score(self, video_path: str, interval=None, save_path=None):
1036
+ video_frames = self.read_video(video_path, interval=interval)
1037
+ #print(f"len frames: {len(video_frames)}, shape: {video_frames[0].shape}")
1038
+ recs = self.reconstruction(video_frames[0], video_frames[1:], video_path)
1039
+ if save_path is not None:
1040
+ self.save_results(recs, save_path)
1041
+ #print(f"len rec: {len(recs)}, shape: {recs[0].shape}")
1042
+ psnrs = psnr(video_frames[1:], recs)
1043
+ psnrs_np = np.array(psnrs)
1044
+ psnr_mean, psnr_std = np.mean(psnrs_np), np.std(psnrs_np)
1045
+ rec_score = {"mean": psnr_mean, "std": psnr_std}
1046
+ return rec_score
1047
+
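Usage sketch: `rec_score` can act as a self-reconstruction quality filter during dataset preprocessing (the 30 dB threshold is illustrative, not a value defined in this repo):

```python
score = mp.rec_score("path/to/clip.mp4", interval=30)
print(score)                        # {'mean': ..., 'std': ...}, PSNR in dB
keep_clip = score["mean"] > 30.0    # hypothetical acceptance threshold
```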
1048
+ @torch.no_grad()
1049
+ def paste_back_by_face_mask(self, result, crop_info, src_img, crop_src_image, use_laplacian=False):
1050
+ """
1051
+ paste back the result to the original image with face mask
1052
+ """
1053
+ # detect src mask
1054
+ crop_src_tensor = self.to_tensor(crop_src_image).unsqueeze(0).to(self.device)
1055
+ src_msks = get_face_mask(self.face_parser, crop_src_tensor)
1056
+ result_tensor = self.to_tensor(result).unsqueeze(0).to(self.device)
1057
+ result_msks = get_face_mask(self.face_parser, result_tensor)
1058
+ # combine masks
1059
+ masks = []
1060
+ for src_msk, result_msk in zip(src_msks, result_msks):
1061
+ mask = np.clip(src_msk + result_msk, 0, 1)
1062
+ masks.append(mask)
1063
+ result = paste_back_with_face_mask(result, crop_info, src_img, masks[0], use_laplacian=use_laplacian)
1064
+ return result
1065
+
1066
+ def driven_by_audio(self, src_img, kp_infos, save_path, audio_path=None, smooth=False):
1067
+ # prepare source
1068
+
1069
+ src_img_256x256, s_lmk, crop_info = self.crop_image(src_img, do_crop=True)
1070
+ #c_s_eyes_lst, c_s_lip_lst = self.calc_ratio([s_lmk])
1071
+ c_s_eyes_lst = None
1072
+ f_s, x_s_info = self.prepare_source(src_img_256x256)
1073
+
1074
+ mask_ori_float = prepare_paste_back(self.mask_crop, crop_info['M_c2o'], dsize=(src_img.shape[1], src_img.shape[0]))
1075
+
1076
+ # prepare driving videos
1077
+ results = self.driven(f_s, x_s_info, s_lmk, c_s_eyes_lst, kp_infos, smooth=smooth)
1078
+ frames=results.shape[0]
1079
+ results = [paste_back(results[i], crop_info['M_c2o'], src_img, mask_ori_float) for i in range(frames)]
1080
+ self.save_results(results, save_path, audio_path)
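End-to-end sketch for the audio-driven path; `kp_infos` is assumed to come from the audio-to-motion model elsewhere in this repo, and all paths are placeholders:

```python
src_img = mp.read_image("reference.jpg")[0]   # read_image returns a one-element list
mp.driven_by_audio(
    src_img,
    kp_infos,                      # dict of pitch/yaw/roll/t/exp/scale tensors
    save_path="outputs/demo.mp4",
    audio_path="speech.wav",
    smooth=True,
)
# With audio_path set, the muxed result is written as outputs/final_demo.mp4.
```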
1081
+ def mix_kp_infos(self, emo_kp_infos, lip_kp_infos, smooth=False, dtype="pt_tensor"):
1082
+ driving_emo_template_dct = self.get_driving_template(emo_kp_infos, smooth=False, dtype=dtype)
1083
+ if lip_kp_infos is not None:
1084
+ driving_lip_template_dct = self.get_driving_template(lip_kp_infos, smooth=smooth, dtype=dtype)
1085
+ driving_template_dct = {**driving_emo_template_dct}
1086
+ n_frames = min(driving_emo_template_dct['n_frames'], driving_lip_template_dct['n_frames'])
1087
+ driving_template_dct['n_frames'] = n_frames
1088
+ for i in range(n_frames):
1089
+ emo_motion = driving_emo_template_dct['motion'][i]['exp']
1090
+ lib_motion = driving_lip_template_dct['motion'][i]['exp']
1091
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
1092
+ emo_motion[:, lip_idx, :] = lib_motion[:, lip_idx, :]
1093
+ driving_template_dct['motion'][i]['exp'] = emo_motion
1094
+ else:
1095
+ driving_template_dct = driving_emo_template_dct
1096
+
1097
+ return driving_template_dct
1098
+
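Sketch: mixing emotion motion from a driving video with audio-predicted lip motion (both arguments are kp_info dicts in the format produced by `prepare_driving_videos`):

```python
mixed = mp.mix_kp_infos(emo_kp_infos, lip_kp_infos, smooth=True)
# For every frame, exp rows [6, 12, 14, 17, 19, 20] (the lip keypoints) are
# taken from lip_kp_infos; all other motion comes from emo_kp_infos.
```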
1099
+ def driven_by_mix(self, src_img, driving_video_path, kp_infos, save_path, audio_path=None, smooth=False):
1100
+ # prepare source
1101
+ src_img_256x256, s_lmk, crop_info = self.crop_image(src_img, do_crop=True)
1102
+ c_s_eyes_lst, c_s_lip_lst = self.calc_ratio([s_lmk])
1103
+ f_s, x_s_info = self.prepare_source(src_img_256x256)
1104
+ mask_ori_float = prepare_paste_back(self.mask_crop, crop_info['M_c2o'], dsize=(src_img.shape[1], src_img.shape[0]))
1105
+ # prepare driving videos
1106
+ driving_imgs = self.read_video(driving_video_path, interval=1)
1107
+ dst_imgs_256x256, d_lmk_lst = self.crop_driving_videos(driving_imgs, do_crop=True)
1108
+ c_d_eyes_lst, c_d_lip_lst = self.calc_ratio(d_lmk_lst)
1109
+ emo_kp_infos = self.prepare_driving_videos(dst_imgs_256x256)
1110
+ # mix kp_infos
1111
+ driving_template_dct = self.mix_kp_infos(emo_kp_infos, kp_infos, smooth=smooth)
1112
+ # driven
1113
+ results = self.driven_debug(f_s, x_s_info, s_lmk, c_s_eyes_lst, driving_template_dct, c_d_eyes_lst=c_d_eyes_lst, c_d_lip_lst=c_d_lip_lst)
1114
+ results = [paste_back(result, crop_info['M_c2o'], src_img, mask_ori_float) for result in results]
1115
+ print(f"{len(results)} frames generated.")
1116
+ self.save_results(results, save_path, audio_path)
1117
+
1118
+ def drive_video_by_mix(self, video_path, driving_video_path, kp_infos, save_path, audio_path):
1119
+ # prepare driving videos
1120
+ driving_imgs = self.read_video(driving_video_path, interval=1)
1121
+ dst_imgs_256x256, d_lmk_lst = self.crop_driving_videos(driving_imgs, do_crop=True)
1122
+ emo_kp_infos = self.prepare_driving_videos(dst_imgs_256x256)
1123
+ # mix kp_infos
1124
+ #driving_template_dct = self.get_driving_template(emo_kp_infos, smooth=True, dtype="np")
1125
+ driving_template_dct = self.mix_kp_infos(emo_kp_infos, kp_infos, smooth=True, dtype="np")
1126
+ # driven
1127
+ self.video_lip_retargeting(
1128
+ video_path, None,
1129
+ save_path, audio_path,
1130
+ driving_template_dct=driving_template_dct, retargeting_ragion="exp"
1131
+ )
1132
+
1133
+ def load_source_video(self, video_info, n_frames=-1):
1134
+ reader = imageio.get_reader(video_info, "ffmpeg")
1135
+
1136
+ ret = []
1137
+ for idx, frame_rgb in enumerate(reader):
1138
+ if n_frames > 0 and idx >= n_frames:
1139
+ break
1140
+ ret.append(frame_rgb)
1141
+
1142
+ reader.close()
1143
+
1144
+ return ret
1145
+
1146
+ def video_lip_retargeting(self, video_path, kp_infos, save_path, audio_path, c_d_eyes_lst=None, c_d_lip_lst=None, smooth=False, driving_template_dct=None, retargeting_ragion="exp"):
1147
+ # 0. process source motion template
1148
+ source_rgb_lst = load_video(video_path)
1149
+ source_rgb_lst = [resize_to_limit(img, self.cfg.source_max_dim, self.cfg.source_division) for img in source_rgb_lst]
1150
+ img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = self.crop_source_video(source_rgb_lst, do_crop=True)
1151
+ c_s_eyes_lst, c_s_lip_lst = self.calc_ratio(source_lmk_crop_lst)
1152
+ I_s_lst = self.prepare_videos(img_crop_256x256_lst)
1153
+ source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=25)
1154
+ # 1. prepare driving template
1155
+ if driving_template_dct is None:
1156
+ driving_template_dct = self.get_driving_template(kp_infos, smooth=smooth, dtype="np")
1157
+ # 2. driving
1158
+ n_frames = min(source_template_dct['n_frames'], driving_template_dct['n_frames'])
1159
+ # driving params
1160
+ I_p_lst = []
1161
+ I_p_pstbk_lst = []
1162
+ R_d_0, x_d_0_info = None, None
1163
+ flag_normalize_lip = self.cfg.flag_normalize_lip
1164
+ flag_relative_motion = True #self.cfg.flag_relative_motion
1165
+ flag_source_video_eye_retargeting = self.cfg.flag_source_video_eye_retargeting
1166
+ lip_normalize_threshold = self.cfg.lip_normalize_threshold
1167
+ source_video_eye_retargeting_threshold = self.cfg.source_video_eye_retargeting_threshold
1168
+ animation_region = 'lip' #self.cfg.animation_region
1169
+ driving_option = self.cfg.driving_option
1170
+ flag_stitching = self.cfg.flag_stitching
1171
+ flag_eye_retargeting = self.cfg.flag_eye_retargeting
1172
+ flag_lip_retargeting = self.cfg.flag_lip_retargeting
1173
+ driving_multiplier = self.cfg.driving_multiplier
1174
+ driving_smooth_observation_variance = self.cfg.driving_smooth_observation_variance
1175
+
1176
+ key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'
1177
+ if flag_relative_motion:
1178
+ x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
1179
+ for i in range(n_frames):
1180
+ for idx in [6, 12, 14, 17, 19, 20]:
1181
+ # lip motion use abs motion
1182
+ x_d_exp_lst[i][:, idx, :] = driving_template_dct['motion'][i]['exp'][:, idx, :]
1183
+ x_d_exp_lst_smooth = ksmooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, self.device, driving_smooth_observation_variance)
1184
+
1185
+ if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
1186
+ x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
1187
+ x_d_r_lst_smooth = ksmooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, self.device, driving_smooth_observation_variance)
1188
+ else:
1189
+ x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]
1190
+ x_d_exp_lst_smooth = ksmooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, self.device, driving_smooth_observation_variance)
1191
+
1192
+ if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
1193
+ x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]
1194
+ x_d_r_lst_smooth = ksmooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, self.device, driving_smooth_observation_variance)
1195
+
1196
+ # driving all
1197
+ for i in track(range(n_frames), description='🚀Retargeting...', total=n_frames):
1198
+ x_s_info = source_template_dct['motion'][i]
1199
+ x_s_info = dct2device(x_s_info, self.device)
1200
+
1201
+ source_lmk = source_lmk_crop_lst[i]
1202
+ img_crop_256x256 = img_crop_256x256_lst[i]
1203
+ I_s = I_s_lst[i]
1204
+ f_s = self.extract_feature_3d(I_s)
1205
+
1206
+ x_c_s = x_s_info['kp']
1207
+ R_s = x_s_info['R']
1208
+ x_s = x_s_info['x_s']
1209
+
1210
+ # set the lip-open scalar to 0 before animation if the input is a video
1211
+ lip_delta_before_animation = None
1212
+ if flag_normalize_lip and flag_relative_motion and source_lmk is not None:
1213
+ c_d_lip_before_animation = [0.]
1214
+ combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
1215
+ if combined_lip_ratio_tensor_before_animation[0][0] >= lip_normalize_threshold:
1216
+ lip_delta_before_animation = self.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
1217
+ else:
1218
+ lip_delta_before_animation = None
1219
+
1220
+ # keep the eye-open scalar equal to the first frame's value if that frame has open eyes
1221
+ eye_delta_before_animation = None
1222
+ if flag_source_video_eye_retargeting and source_lmk is not None:
1223
+ if i == 0:
1224
+ combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
1225
+ c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
1226
+ if c_d_eye_before_animation_frame_zero[0][0] < source_video_eye_retargeting_threshold:
1227
+ c_d_eye_before_animation_frame_zero = [[0.39]]
1228
+ combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, source_lmk)
1229
+ eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
1230
+
1231
+ if flag_stitching: # prepare for paste back
1232
+ mask_ori_float = prepare_paste_back(self.mask_crop, source_M_c2o_lst[i], dsize=(source_rgb_lst[i].shape[1], source_rgb_lst[i].shape[0]))
1233
+
1234
+ x_d_i_info = driving_template_dct['motion'][i]
1235
+ x_d_i_info = dct2device(x_d_i_info, self.device)
1236
+ R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d'] # compatible with previous keys
1237
+
1238
+ if i == 0: # cache the first frame
1239
+ R_d_0 = R_d_i
1240
+ x_d_0_info = x_d_i_info.copy()
1241
+
1242
+ delta_new = x_s_info['exp'].clone()
1243
+ if flag_relative_motion:
1244
+ if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
1245
+ R_new = x_d_r_lst_smooth[i]
1246
+ else:
1247
+ R_new = R_s
1248
+ if animation_region == "all" or animation_region == "exp":
1249
+ for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
1250
+ delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
1251
+ delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
1252
+ delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
1253
+ delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
1254
+ delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
1255
+ elif animation_region == "all_wo_lip" or animation_region == "exp_wo_lip":
1256
+ for idx in [1, 2, 11, 13, 15, 16, 18]:
1257
+ delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
1258
+ delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
1259
+ delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
1260
+ delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
1261
+ delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
1262
+ elif animation_region == "lip":
1263
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
1264
+ delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :]
1265
+ elif animation_region == "eyes":
1266
+ for eyes_idx in [11, 13, 15, 16, 18]:
1267
+ delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :]
1268
+
1269
+ scale_new = x_s_info['scale']
1270
+ t_new = x_s_info['t']
1271
+ else:
1272
+ if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
1273
+ R_new = x_d_r_lst_smooth[i]
1274
+ else:
1275
+ R_new = R_s
1276
+ if animation_region == "all" or animation_region == "exp":
1277
+ for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
1278
+ delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
1279
+ delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
1280
+ delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
1281
+ delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
1282
+ delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
1283
+ elif animation_region == "all_wo_lip" or animation_region == "exp_wo_lip":
1284
+ for idx in [1, 2, 11, 13, 15, 16, 18]:
1285
+ delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
1286
+ delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
1287
+ delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
1288
+ delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
1289
+ delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
1290
+ elif animation_region == "lip":
1291
+ for lip_idx in [6, 12, 14, 17, 19, 20]:
1292
+ delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :]
1293
+ elif animation_region == "eyes":
1294
+ for eyes_idx in [11, 13, 15, 16, 18]:
1295
+ delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :]
1296
+ scale_new = x_s_info['scale']
1297
+ if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
1298
+ t_new = x_d_i_info['t']
1299
+ else:
1300
+ t_new = x_s_info['t']
1301
+
1302
+ t_new[..., 2].fill_(0) # zero tz
1303
+ x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
1304
+
1305
+ # Algorithm 1:
1306
+ if not flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
1307
+ # without stitching or retargeting
1308
+ if flag_normalize_lip and lip_delta_before_animation is not None:
1309
+ x_d_i_new += lip_delta_before_animation
1310
+ if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
1311
+ x_d_i_new += eye_delta_before_animation
1312
+ else:
1313
+ pass
1314
+ elif flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
1315
+ # with stitching and without retargeting
1316
+ if flag_normalize_lip and lip_delta_before_animation is not None:
1317
+ x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation
1318
+ else:
1319
+ x_d_i_new = self.stitching(x_s, x_d_i_new)
1320
+ if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
1321
+ x_d_i_new += eye_delta_before_animation
1322
+ else:
1323
+ eyes_delta, lip_delta = None, None
1324
+ if flag_eye_retargeting and source_lmk is not None and c_d_eyes_lst is not None:
1325
+ c_d_eyes_i = c_d_eyes_lst[i]
1326
+ combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)
1327
+ # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
1328
+ eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
1329
+ if flag_lip_retargeting and source_lmk is not None and c_d_lip_lst is not None:
1330
+ c_d_lip_i = c_d_lip_lst[i]
1331
+ combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
1332
+ # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
1333
+ lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
1334
+
1335
+ if flag_relative_motion: # use x_s
1336
+ x_d_i_new = x_s + \
1337
+ (eyes_delta if eyes_delta is not None else 0) + \
1338
+ (lip_delta if lip_delta is not None else 0)
1339
+ else: # use x_d,i
1340
+ x_d_i_new = x_d_i_new + \
1341
+ (eyes_delta if eyes_delta is not None else 0) + \
1342
+ (lip_delta if lip_delta is not None else 0)
1343
+
1344
+ if flag_stitching:
1345
+ x_d_i_new = self.stitching(x_s, x_d_i_new)
1346
+
1347
+ x_d_i_new = x_s + (x_d_i_new - x_s) * driving_multiplier
1348
+ out = self.warp_decode(f_s, x_s, x_d_i_new)
1349
+ I_p_i = self.parse_output(out['out'])[0]
1350
+ I_p_lst.append(I_p_i)
1351
+
1352
+ if flag_stitching:
1353
+ # TODO: the paste-back procedure is slow; consider optimizing it with multi-threading or the GPU
1354
+ #I_p_pstbk = self.paste_back_by_face_mask(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], img_crop_256x256, use_laplacian=True)
1355
+ I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_float, use_laplacian=True)
1356
+ I_p_pstbk_lst.append(I_p_pstbk)
1357
+
1358
+ if len(I_p_pstbk_lst) > 0:
1359
+ self.save_results(I_p_pstbk_lst, save_path, audio_path)
1360
+ else:
1361
+ self.save_results(I_p_lst, save_path, audio_path)
1362
+
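For reference, a minimal, self-contained sketch of the keypoint composition applied in the loop above — x_d = scale · (x_c · R + δ) + t, followed by the driving-multiplier blend toward the source keypoints. All tensors below are random or trivial placeholders and assume the 21 implicit keypoints (indices 0-20) referenced by the code; nothing here is part of the repository.

import torch

x_c_s = torch.randn(1, 21, 3)        # canonical source keypoints (illustrative values)
R_new = torch.eye(3)                 # per-frame head rotation (identity here)
delta_new = torch.zeros(1, 21, 3)    # per-keypoint expression offsets
scale_new, t_new = 1.0, torch.zeros(1, 3)
x_s = scale_new * (x_c_s @ torch.eye(3)) + t_new              # source keypoints without driving offsets
x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new   # driven keypoints, composed as above
x_d_i_new = x_s + (x_d_i_new - x_s) * 1.0                     # driving_multiplier blend (1.0 = full driving)
print(x_d_i_new.shape)                                        # torch.Size([1, 21, 3])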
1363
+ @torch.no_grad()
1364
+ def video_reconstruction_test(self, video_tensor, xs, save_path):
1365
+ # video_tensor, (1, F, C, H, W), [-1, 1]
1366
+ # xs, (1, F, 63)
1367
+ result_lst = []
1368
+ #ori_videos = []
1369
+ video_tensor = video_tensor[0:1] * 0.5 + 0.5 # [-1, 1] -> [0, 1], 1xTx3xHxW
1370
+ video_tensor = torch.clip(video_tensor, 0, 1)
1371
+ video_tensor = video_tensor.permute(1, 0, 2, 3, 4) # 1xTx3xHxW -> Tx1x3xHxW
1372
+ video = video_tensor.to(self.device)
1373
+ xs = xs[0:1].permute(1, 0, 2) # 1xTx63 -> Tx1x63
1374
+ xs = xs.reshape(-1, 1, 21, 3)
1375
+ xs = xs.to(self.device)
1376
+
1377
+ x_s_0 = xs[0]
1378
+ I_s_0 = torch.nn.functional.interpolate(video[0], size=(256, 256), mode='bilinear')
1379
+ f_s_0 = self.extract_feature_3d(I_s_0)
1380
+
1381
+ for i in range(video_tensor.shape[0]):
1382
+ #I_s = video[i] # 1x3xHxW
1383
+ #ori_videos.append((I_s.squeeze(0).squeeze(0).permute(1, 2, 0).cpu().numpy()*255).astype(np.uint8))
1384
+ x_s = self.stitching(x_s_0, xs[i])
1385
+ out = self.warp_decode(f_s_0, x_s_0, x_s)
1386
+ I_p_i = self.parse_output(out['out'])[0]
1387
+ result_lst.append(I_p_i)
1388
+
1389
+ #save_dir = osp.dirname(save_path)
1390
+ #ori_path = osp.join(save_dir, "ori.mp4")
1391
+ #save_path = osp.join(save_dir, "rec.mp4")
1392
+ self.save_results(result_lst, save_path, audio_path=None)
1393
+ #self.save_results(ori_videos, ori_path, audio_path=None)
1394
+
1395
+ @torch.no_grad()
1396
+ def self_driven(self, image_tensor, xs, save_path, length):
1397
+ result_lst = []
1398
+ image_tensor = image_tensor[0:1] * 0.5 + 0.5 # [-1, 1] -> [0, 1], 1x3xHxW
1399
+ image_tensor = torch.clip(image_tensor, 0, 1)
1400
+ image = image_tensor.to(self.device)
1401
+ I_s_0 = torch.nn.functional.interpolate(image, size=(256, 256), mode='bilinear')
1402
+
1403
+ xs = xs[0:1].permute(1, 0, 2) # 1xTx63 -> Tx1x63
1404
+ xs = xs.reshape(-1, 1, 21, 3)
1405
+ xs = xs.to(self.device)
1406
+
1407
+ x_s_0 = xs[0]
1408
+ f_s_0 = self.extract_feature_3d(I_s_0)
1409
+
1410
+ for i in range(xs.shape[0]):
1411
+ x_d = self.stitching(x_s_0, xs[i])
1412
+ out = self.warp_decode(f_s_0, x_s_0, x_d)
1413
+ I_p_i = self.parse_output(out['out'])[0]
1414
+ result_lst.append(I_p_i)
1415
+
1416
+ assert len(result_lst) == length, f"length of result_lst is {len(result_lst)}, but length is {length}"
1417
+
1418
+ self.save_results(result_lst, save_path, audio_path=None)
1419
+
1420
+
src/examples/driving_audios/10.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b53cbd91ebd7756b51f4d388a769b461a247f26acae5c362ca326e27c23626
3
+ size 2880078
src/examples/driving_audios/5.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b53cbd91ebd7756b51f4d388a769b461a247f26acae5c362ca326e27c23626
3
+ size 2880078
src/examples/driving_audios/6.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90be6ae092eaa9be4e74e0bed56ef343a825bc2c899d2868e0e3aee494c86a04
3
+ size 1323078
src/examples/driving_audios/tmp_5.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2f615328211bb938ab7f6b603631695106d2e23ceaa4dfcd4f491bc5dc2faca
3
+ size 544044
src/examples/reference_images/1.jpg ADDED

Git LFS Details

  • SHA256: 362a14590bbfa4517e00338941f87f51fa9d6da0beaa827f6ba28a0e490888d4
  • Pointer size: 131 Bytes
  • Size of remote file: 225 kB
src/examples/reference_images/2.jpg ADDED
src/examples/reference_images/3.jpg ADDED
src/examples/reference_images/4.jpg ADDED
src/examples/reference_images/5.jpg ADDED
src/examples/reference_images/6.jpg ADDED
src/examples/reference_images/7.jpg ADDED

Git LFS Details

  • SHA256: 9f03a04f1de9055c626aa09c471115da0365d9d6c25a62c227e8eb3dfba53993
  • Pointer size: 131 Bytes
  • Size of remote file: 774 kB
src/examples/silent-audio.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:231cedffe295d0f5c8ea8569af9edc2471c262410689190bb705fb0adb62f63f
3
+ size 352878
src/models/audio/__pycache__/audio_processer.cpython-310.pyc ADDED
Binary file (12.1 kB). View file
 
src/models/audio/__pycache__/audio_proj.cpython-310.pyc ADDED
Binary file (4.66 kB). View file
 
src/models/audio/__pycache__/hubert.cpython-310.pyc ADDED
Binary file (3.45 kB). View file
 
src/models/audio/__pycache__/wav2vec.cpython-310.pyc ADDED
Binary file (6 kB). View file
 
src/models/audio/__pycache__/wav2vec2.cpython-310.pyc ADDED
Binary file (4.32 kB). View file
 
src/models/audio/__pycache__/wav2vec_modified.cpython-310.pyc ADDED
Binary file (6.78 kB). View file
 
src/models/audio/audio_processer.py ADDED
@@ -0,0 +1,407 @@
1
+ """Audio processer for talking data.
2
+ Author: linzhihui.lzh
3
+ Date: 2024-12-12
4
+ """
5
+ import os
6
+ from re import A
7
+ import sys
8
+ import os.path as osp
9
+
10
+ from typing import List, Dict, Tuple, Optional, Union, Any
11
+
12
+ import yaml
13
+ from omegaconf import OmegaConf
14
+
15
+ import math
16
+ import librosa
17
+ import numpy as np
18
+
19
+ from einops import rearrange
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+
24
+ from pydub import AudioSegment
25
+ # from audio_separator.separator import Separator
26
+
27
+ sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__))))))
28
+ from src.utils.rprint import rlog as log
29
+ from src.utils.util import resample_audio
30
+
31
+ from src.models.audio.wav2vec_modified import Wav2VecModel
32
+ from src.models.audio.hubert import HubertModel
33
+
34
+
35
+ def pad_audio(audio, audio_unit=320, pad_threshold=80):
36
+ batch_size, audio_len = audio.shape
37
+ n_units = audio_len // audio_unit
38
+ side_len = math.ceil((audio_unit * n_units + pad_threshold - audio_len) / 2)
39
+ if side_len >= 0:
40
+ reflect_len = side_len // 2
41
+ replicate_len = side_len % 2
42
+ if reflect_len > 0:
43
+ audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
44
+ audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
45
+ if replicate_len > 0:
46
+ audio = F.pad(audio, (1, 1), mode='replicate')
47
+
48
+ return audio
49
+
50
+
51
+ def cut_audio(audio_path: str, save_dir: str, length=60) -> List[str]:
52
+ """Cut audio into sub-divisions and return subfile paths. Supports wav format.
53
+
54
+ Args:
55
+ audio_path (str): the source audio file path
56
+ save_dir (str): the save directory of sub-divisions
57
+ length (int, optional): The max length of each sub-division. Defaults to 60 secs.
58
+
59
+ Returns:
60
+ List[str]: the subfile paths
61
+ """
62
+ audio_name = osp.basename(audio_path).split('.')[0]
63
+ audio = AudioSegment.from_wav(audio_path)
64
+ segment_length = length * 1000. # pydub uses milliseconds
65
+ num_segments = math.ceil(len(audio) / segment_length)
66
+
67
+ os.makedirs(save_dir, exist_ok=True)
68
+ audio_list = []
69
+
70
+ for i in range(num_segments):
71
+ start_time = i * segment_length
72
+ end_time = min((i + 1) * segment_length, len(audio))
73
+ segment = audio[start_time:end_time]
74
+
75
+ path = osp.join(save_dir, f"{audio_name}_segment_{i+1}.wav")
76
+ audio_list.append(path)
77
+ segment.export(path, format="wav")
78
+ return audio_list
79
+
80
+
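A one-line usage sketch of cut_audio; the output directory name below is illustrative, not a repository default.

segment_paths = cut_audio("src/examples/driving_audios/10.wav", save_dir="tmp_segments", length=60)
# Each returned path points to a <=60 s chunk named "<name>_segment_<i>.wav" inside save_dir.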
81
+ class AudioProcessor(object):
82
+ def __init__(self, cfg_path: str, is_training: bool = False) -> None:
83
+ cfg = OmegaConf.load(cfg_path)
84
+ self.cfg = cfg
85
+ self.is_training = is_training
86
+ log("========================================= Audio Processer =========================================")
87
+ log(OmegaConf.to_yaml(cfg))
88
+
89
+ # setting device
90
+ self.device_id = cfg.device_params.device_id
91
+ self.use_half = cfg.device_params.flag_use_half_precision
92
+ if cfg.device_params.flag_force_cpu:
93
+ self.device = 'cpu'
94
+ else:
95
+ try:
96
+ if torch.backends.mps.is_available():
97
+ self.device = 'mps'
98
+ else:
99
+ self.device = 'cuda:' + str(self.device_id)
100
+ except:
101
+ self.device = 'cuda:' + str(self.device_id)
102
+
103
+ # init audio separator
104
+ self.audio_separator = None
105
+ self.cache_dir = cfg.cache_dir
106
+ self.tmp_dir = cfg.tmp_dir
107
+ self.use_audio_separator = cfg.model_params.use_audio_separator
108
+ self.audio_separator_name = cfg.model_params.audio_separator_name
109
+ self.audio_separator_path = cfg.model_weights.audio_separator_path
110
+ self.set_audio_separator(cfg.cache_dir)
111
+
112
+ # load audio encoder, wav2vec or hubert
113
+ self.model_name = cfg.model_params.model_name
114
+ self.is_chinese = cfg.model_params.is_chinese
115
+ self.audio_encoder = self.load_model(
116
+ model_name = cfg.model_params.model_name,
117
+ model_type = cfg.model_params.model_type,
118
+ is_chinese = cfg.model_params.is_chinese,
119
+ )
120
+ self.only_last_features = cfg.model_params.only_last_features
121
+ if cfg.model_params.only_last_features:
122
+ self.feature_shape = (1, 768)
123
+ else:
124
+ self.feature_shape = (12, 768) # features of 12 blocks
125
+
126
+ # init data params
127
+ self.sample_strategy = cfg.data_params.sample_strategy
128
+ self.sample_rate = cfg.data_params.sample_rate
129
+ self.fps = cfg.data_params.fps
130
+ self.audio_unit = cfg.data_params.sample_rate / cfg.data_params.fps # num of audio samples per frame
131
+ self.max_length = cfg.data_params.max_length
132
+ self.subclip_len = cfg.data_params.sub_clip_length
133
+ self.save_to_cpu = cfg.data_params.save_to_cpu
134
+ self.pad_mode = cfg.data_params.audio_pad_mode
135
+
136
+ log("========================================= Audio Processer: Done =========================================")
137
+
138
+ def load_model(self, model_name: str="wav2vec", model_type: str="base", is_chinese: bool = False):
139
+ assert model_name in ["wav2vec", "hubert"], f"Unknown audio model {model_name}, only support wav2vec or hubert"
140
+ assert model_type in ["base", "large"], f"Unknown audio model type {model_type}, only support base or large"
141
+
142
+ if model_name == "wav2vec":
143
+ # load wav2vec model weights
144
+ if is_chinese:
145
+ if model_type == "base":
146
+ model_weight_path = self.cfg.model_weights.wav2vec_path.chinese.base
147
+ else:
148
+ model_weight_path = self.cfg.model_weights.wav2vec_path.chinese.large
149
+ else:
150
+ if model_type == "base":
151
+ model_weight_path = self.cfg.model_weights.wav2vec_path.default.base
152
+ else:
153
+ model_weight_path = self.cfg.model_weights.wav2vec_path.default.large
154
+ if model_weight_path is None:
155
+ raise ValueError(f"model_weight_path is None")
156
+ audio_encoder = Wav2VecModel.from_pretrained(model_weight_path, local_files_only=True).to(device=self.device)
157
+ else:
158
+ if is_chinese:
159
+ if model_type == "base":
160
+ model_weight_path = self.cfg.model_weights.hubert_path.chinese.base
161
+ else:
162
+ model_weight_path = self.cfg.model_weights.hubert_path.chinese.large
163
+ else:
164
+ if model_type == "base":
165
+ model_weight_path = self.cfg.model_weights.hubert_path.default.base
166
+ else:
167
+ model_weight_path = self.cfg.model_weights.hubert_path.default.large
168
+ if model_weight_path is None:
169
+ raise ValueError(f"model_weight_path is None")
170
+ audio_encoder = HubertModel.from_pretrained(model_weight_path, local_files_only=True).to(device=self.device)
171
+
172
+ log(f"{model_name}-{model_type}-chinese-{is_chinese} model has beed loaded from {model_weight_path}")
173
+ total_params = sum(p.numel() for p in audio_encoder.parameters())
174
+ print('Number of parameters: %.4fM' % (total_params / 1e6))
175
+
176
+ # freeze the CNN feature extractor and, optionally, the feature projection and early encoder layers
177
+ audio_encoder.feature_extractor._freeze_parameters()
178
+ if not self.cfg.model_params.is_original:
179
+ frozen_layers = [0, 1]
180
+ for name, param in audio_encoder.named_parameters():
181
+ if name.startswith("feature_projection"):
182
+ param.requires_grad = False
183
+ if name.startswith("encoder.layers"):
184
+ layer = int(name.split(".")[2])
185
+ if layer in frozen_layers:
186
+ param.requires_grad = False
187
+
188
+ audio_encoder = audio_encoder.to(self.device)
189
+ if self.use_half:
190
+ audio_encoder = audio_encoder.half()
191
+ audio_encoder.eval()
192
+ return audio_encoder
193
+
194
+ def set_audio_separator(self, output_dir: str) -> None:
195
+ del self.audio_separator
196
+
197
+ if self.audio_separator_name is not None and self.use_audio_separator:
198
+ try:
199
+ os.makedirs(output_dir, exist_ok=True)
200
+ except OSError as _:
201
+ print("Fail to create the output cache dir.")
202
+ self.audio_separator = Separator(
203
+ output_dir=output_dir,
204
+ output_single_stem="vocals",
205
+ model_file_dir=self.audio_separator_path,
206
+ )
207
+ self.audio_separator.load_model(self.audio_separator_name)
208
+ assert self.audio_separator.model_instance is not None, "Failed to load the audio separator model."
209
+ else:
210
+ self.audio_separator=None
211
+ log("Use audio directly without vocals seperator.")
212
+
213
+ def seperate_audio(self, audio_path: str, output_dir: Union[str, None] = None) -> str:
214
+ if output_dir is not None:
215
+ if output_dir != self.cache_dir:
216
+ # reload audio separator
217
+ self.set_audio_separator(output_dir)
218
+
219
+ if self.audio_separator is not None:
220
+ # 1. separate vocals
221
+ # TODO: process in memory
222
+ try:
223
+ outputs = self.audio_separator.separate(audio_path)
224
+ if len(outputs) <= 0:
225
+ raise RuntimeError("Audio separate failed.")
226
+
227
+ vocal_audio_file = outputs[0]
228
+ vocal_audio_name, _ = os.path.splitext(vocal_audio_file)
229
+ vocal_audio_file = os.path.join(self.audio_separator.output_dir, vocal_audio_file)
230
+ vocal_audio_file = resample_audio(vocal_audio_file, os.path.join(self.audio_separator.output_dir, f"{vocal_audio_name}-16k.wav"), self.sample_rate)
231
+ except Exception as e:
232
+ log(f"Fail to separate vocals from {audio_path}, error info [{e}]")
233
+ vocal_audio_file=audio_path
234
+ else:
235
+ vocal_audio_file=audio_path
236
+
237
+ return vocal_audio_file
238
+
239
+ def load_audio(self, audio_path: str, mono: bool = True, duration: Optional[float] = None) -> Any:
240
+ try:
241
+ audio_data, sampling_rate = librosa.load(audio_path, sr=self.sample_rate, mono=mono, duration=duration)
242
+ except Exception as e:
243
+ raise RuntimeError(f"Fail to load audio from {audio_path}, error info [{e}]")
244
+ return audio_data, sampling_rate
245
+
246
+ def prepare_audio_data(self, audio_data: Union[np.ndarray, torch.Tensor], n_frames: Optional[int]=None) -> Tuple[List[Any], int]:
247
+ """Prepare audio data for processing.
248
+ """
249
+ clip_len = int(len(audio_data) / self.audio_unit)
250
+ if n_frames is not None:
251
+ if abs(n_frames - clip_len) > 2:
252
+ log(f"The number of frames must be close to the clip length (in 80ms), got {n_frames} and {clip_len}")
253
+ return [], n_frames
254
+ clip_len = n_frames
255
+ else:
256
+ n_frames = clip_len
257
+
258
+ # normalize audio, replace Wav2Vec2FeatureExtractor
259
+ if isinstance(audio_data, np.ndarray):
260
+ audio_data = torch.from_numpy(audio_data).to(self.device)
261
+ assert audio_data.ndim == 1, 'Audio must be 1D tensor.'
262
+ audio_data = (audio_data - torch.mean(audio_data)) / (torch.std(audio_data) + 1e-7)
263
+ #log(f"audio loaded! {audio_data.shape}")
264
+
265
+ # padding
266
+ # padding audio to fit the clip length
267
+ n_audio_samples = round(self.audio_unit * clip_len)
268
+ n_padding_audio_samples = n_audio_samples - len(audio_data)
269
+ n_padding_frames = math.ceil(n_padding_audio_samples / self.audio_unit)
270
+ if n_padding_audio_samples > 0:
271
+ if self.pad_mode == 'zero':
272
+ padding_value = 0
273
+ elif self.pad_mode == 'replicate':
274
+ padding_value = float(audio_data[-1])
275
+ else:
276
+ raise ValueError(f'Unknown pad mode: {self.pad_mode}')
277
+ audio_data = F.pad(audio_data, (0, n_padding_audio_samples), value=padding_value)
278
+
279
+ # divide the audio into sub-clips to save GPU memory
280
+ audio_segments = []
281
+ if clip_len <= self.subclip_len:
282
+ n_subdivision = 1
283
+ subclip_len = clip_len
284
+ else:
285
+ n_subdivision = math.ceil(clip_len / self.subclip_len)
286
+ subclip_len = self.subclip_len
287
+
288
+ for i in range(0, n_subdivision):
289
+ start_idx = i * subclip_len
290
+ end_idx = min(start_idx + subclip_len, clip_len)
291
+ # debug
292
+ #log(f"[{i+1}/{n_subdivision}] data index [{round(start_idx * self.audio_unit)}, {round(end_idx * self.audio_unit)})")
293
+ audio_segments.append(
294
+ {
295
+ "data": audio_data[round(start_idx * self.audio_unit):round(end_idx * self.audio_unit)].unsqueeze(0),
296
+ "start_idx": start_idx,
297
+ "end_idx": end_idx,
298
+ "length": end_idx - start_idx
299
+ }
300
+ )
301
+ return audio_segments, n_frames
302
+
303
+ def get_audio_embedding(self, audio, clip_len: int) -> torch.Tensor:
304
+ if audio.ndim == 2:
305
+ # Extract audio features
306
+ assert audio.shape[1] == 16000 * clip_len / self.fps, \
307
+ f'Incorrect audio length {audio.shape[1]}'
308
+
309
+ # Extract audio features
310
+ if self.use_half:
311
+ audio = audio.half()
312
+ embeddings = self.audio_encoder(
313
+ pad_audio(audio), seq_len=clip_len, sample_strategy=self.sample_strategy, output_hidden_states=True
314
+ ) # (N, L, 768)
315
+ assert len(embeddings) > 0, "Fail to extract audio embedding"
316
+
317
+ if self.only_last_features:
318
+ audio_emb = embeddings.last_hidden_state.squeeze(0)
319
+ else:
320
+ audio_emb = torch.stack(
321
+ embeddings.hidden_states[1:], dim=1
322
+ ).squeeze(0)
323
+ audio_emb = rearrange(audio_emb, "b s d -> s b d")
324
+
325
+ elif audio.ndim == 3:
326
+ assert audio.shape[1] == clip_len, f'Incorrect audio feature length {audio.shape[1]}'
327
+ audio_emb = audio
328
+ else:
329
+ raise ValueError(f'Incorrect audio input shape {audio.shape}')
330
+
331
+ return audio_emb
332
+
333
+ def get_audio_embeddings(self, audio_segments: List[Any]) -> Optional[torch.Tensor]:
334
+ audio_embs = []
335
+ for audio_segment in audio_segments:
336
+ if self.is_training:
337
+ audio_emb = self.get_audio_embedding(audio_segment["data"], audio_segment["length"])
338
+ else:
339
+ with torch.no_grad():
340
+ audio_emb = self.get_audio_embedding(audio_segment["data"], audio_segment["length"])
341
+
342
+ audio_emb = audio_emb.cpu() if self.save_to_cpu else audio_emb
343
+ audio_embs.append(audio_emb)
344
+ #log(f"audio segment [{audio_segment['start_idx']}, {audio_segment['end_idx']}) has been processed.")
345
+
346
+ if len(audio_embs) == 0:
347
+ return None
348
+
349
+ audio_emb = torch.cat(audio_embs, dim=0)
350
+
351
+ return audio_emb
352
+
353
+ def preprocess(
354
+ self,
355
+ audio_path: str,
356
+ n_frames: Optional[int] = None,
357
+ duration: Optional[float] = None,
358
+ need_seperate: bool = False
359
+ ):
360
+ """ Preprocess a WAV audio file by separating the vocals from the background and resampling it to a 16 kHz sample rate.
361
+ The separated vocal track is then converted into wav2vec2 embeddings for further processing or analysis.
362
+ """
363
+ if need_seperate:
364
+ vocal_audio_file = self.seperate_audio(audio_path)
365
+ else:
366
+ vocal_audio_file = audio_path
367
+
368
+ audio_data, sampling_rate = self.load_audio(vocal_audio_file, duration=duration)
369
+
370
+ assert sampling_rate == 16000, "The sample rate of audio must be 16000"
371
+ audio_segments, n_frames = self.prepare_audio_data(audio_data, n_frames)
372
+ audio_emb = self.get_audio_embeddings(audio_segments)
373
+ if audio_emb is None:
374
+ log(f"{audio_path} has been processed, but no audio embedding, set as 'None'.")
375
+ #else:
376
+ #log(f"{audio_path} has been processed, audio embedding shape {audio_emb.shape}.")
377
+ return audio_emb, n_frames
378
+
379
+ def preprocess_long(
380
+ self,
381
+ audio_path: str,
382
+ need_seperate: bool = False
383
+ ):
384
+ audio_list = cut_audio(audio_path, self.tmp_dir, length=self.max_length)
385
+ audio_emb_list = []
386
+ l = 0
387
+
388
+ for idx, audio_path in enumerate(audio_list):
389
+ padding = (idx+1) == len(audio_list)
390
+ emb, length = self.preprocess(audio_path, need_seperate=need_seperate)
391
+ audio_emb_list.append(emb)
392
+ log(f"Processing audio {idx+1}/{len(audio_list)}, path: {audio_path} length: {length}")
393
+ l += length
394
+
395
+ audio_emb = torch.cat(audio_emb_list)
396
+ audio_length = l
397
+
398
+ # remove tmp file
399
+ for audio_path in audio_list:
400
+ os.remove(audio_path)
401
+
402
+ return audio_emb, audio_length
403
+
404
+ def __enter__(self):
405
+ return self
406
+
407
+
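A minimal usage sketch of the class above; the config path matches the YAML shipped in this commit, but the flags it enables depend on that file, so treat the snippet as illustrative rather than canonical.

processor = AudioProcessor("configs/audio2motion/model/audio_processer_config.yaml")
audio_emb, n_frames = processor.preprocess("src/examples/driving_audios/5.wav")
# With only_last_features disabled, audio_emb has shape (n_frames, 12, 768): one 12-block
# hidden-state stack per video frame at the configured fps. preprocess_long handles clips
# longer than max_length by cutting, encoding, and concatenating the segments.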
src/models/audio/audio_proj.py ADDED
@@ -0,0 +1,124 @@
1
+ """
2
+ This module provides the implementation of an Audio Projection Model, which is designed for
3
+ audio processing tasks. The model takes audio embeddings as input and outputs context tokens
4
+ that can be used for various downstream applications, such as audio analysis or synthesis.
5
+
6
+ The AudioProjModel class is based on the ModelMixin class from the diffusers library, which
7
+ provides a foundation for building custom models. This implementation includes multiple linear
8
+ layers with ReLU activation functions and a LayerNorm for normalization.
9
+
10
+ Key Features:
11
+ - Audio embedding input with flexible sequence length and block structure.
12
+ - Multiple linear layers for feature transformation.
13
+ - ReLU activation for non-linear transformation.
14
+ - LayerNorm for stabilizing and speeding up training.
15
+ - Rearrangement of input embeddings to match the model's expected input shape.
16
+ - Customizable number of blocks, channels, and context tokens for adaptability.
17
+
18
+ The module is structured to be easily integrated into larger systems or used as a standalone
19
+ component for audio feature extraction and processing.
20
+
21
+ Classes:
22
+ - AudioProjModel: A class representing the audio projection model with configurable parameters.
23
+
24
+ Functions:
25
+ - (none)
26
+
27
+ Dependencies:
28
+ - torch: For tensor operations and neural network components.
29
+ - diffusers: For the ModelMixin base class.
30
+ - einops: For tensor rearrangement operations.
31
+
32
+ """
33
+
34
+ import torch
35
+ from diffusers import ModelMixin
36
+ from einops import rearrange
37
+ from torch import nn
38
+
39
+
40
+ class AudioProjModel(ModelMixin):
41
+ """Audio Projection Model
42
+
43
+ This class defines an audio projection model that takes audio embeddings as input
44
+ and produces context tokens as output. The model is based on the ModelMixin class
45
+ and consists of multiple linear layers and activation functions. It can be used
46
+ for various audio processing tasks.
47
+
48
+ Attributes:
49
+ seq_len (int): The length of the audio sequence.
50
+ blocks (int): The number of blocks in the audio projection model.
51
+ channels (int): The number of channels in the audio projection model.
52
+ intermediate_dim (int): The intermediate dimension of the model.
53
+ context_tokens (int): The number of context tokens in the output.
54
+ output_dim (int): The output dimension of the context tokens.
55
+
56
+ Methods:
57
+ __init__(self, seq_len=5, blocks=12, channels=768, intermediate_dim=512, context_tokens=32, output_dim=768):
58
+ Initializes the AudioProjModel with the given parameters.
59
+ forward(self, audio_embeds):
60
+ Defines the forward pass for the AudioProjModel.
61
+ Parameters:
62
+ audio_embeds (torch.Tensor): The input audio embeddings with shape (batch_size, video_length, blocks, channels).
63
+ Returns:
64
+ context_tokens (torch.Tensor): The output context tokens with shape (batch_size, video_length, context_tokens, output_dim).
65
+
66
+ """
67
+
68
+ def __init__(
69
+ self,
70
+ seq_len=5,
71
+ blocks=12, # add a new parameter blocks
72
+ channels=768, # add a new parameter channels
73
+ intermediate_dim=512,
74
+ output_dim=768,
75
+ context_tokens=32,
76
+ ):
77
+ super().__init__()
78
+
79
+ self.seq_len = seq_len
80
+ self.blocks = blocks
81
+ self.channels = channels
82
+ self.input_dim = (
83
+ seq_len * blocks * channels
84
+ ) # update input_dim to be the product of blocks and channels.
85
+ self.intermediate_dim = intermediate_dim
86
+ self.context_tokens = context_tokens
87
+ self.output_dim = output_dim
88
+
89
+ # define multiple linear layers
90
+ self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
91
+ self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
92
+ self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
93
+
94
+ self.norm = nn.LayerNorm(output_dim)
95
+
96
+ def forward(self, audio_embeds):
97
+ """
98
+ Defines the forward pass for the AudioProjModel.
99
+
100
+ Parameters:
101
+ audio_embeds (torch.Tensor): The input audio embeddings with shape (batch_size, video_length, blocks, channels).
102
+
103
+ Returns:
104
+ context_tokens (torch.Tensor): The output context tokens with shape (batch_size, video_length, context_tokens, output_dim).
105
+ """
106
+ # merge
107
+ video_length = audio_embeds.shape[1]
108
+ audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
109
+ batch_size, window_size, blocks, channels = audio_embeds.shape
110
+ audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
111
+
112
+ audio_embeds = torch.relu(self.proj1(audio_embeds))
113
+ audio_embeds = torch.relu(self.proj2(audio_embeds))
114
+
115
+ context_tokens = self.proj3(audio_embeds).reshape(
116
+ batch_size, self.context_tokens, self.output_dim
117
+ )
118
+
119
+ context_tokens = self.norm(context_tokens)
120
+ context_tokens = rearrange(
121
+ context_tokens, "(bz f) m c -> bz f m c", f=video_length
122
+ )
123
+
124
+ return context_tokens
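A shape trace for the projection above, using the default constructor arguments; the tensors are random and purely illustrative.

import torch

model = AudioProjModel(seq_len=5, blocks=12, channels=768, intermediate_dim=512, output_dim=768, context_tokens=32)
audio_embeds = torch.randn(2, 25, 5, 12, 768)   # (batch, video_length, window, blocks, channels)
context_tokens = model(audio_embeds)
print(context_tokens.shape)                     # torch.Size([2, 25, 32, 768])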
src/models/audio/hubert.py ADDED
@@ -0,0 +1,120 @@
1
+ from typing import Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from transformers import HubertModel
7
+ from transformers.modeling_outputs import BaseModelOutput
8
+
9
+
10
+ _CONFIG_FOR_DOC = 'HubertConfig'
11
+
12
+
13
+ def linear_interpolation(features, seq_len):
14
+ """
15
+ Transpose the features to interpolate linearly.
16
+
17
+ Args:
18
+ features (torch.Tensor): The extracted features to be interpolated.
19
+ seq_len (torch.Tensor): The sequence lengths of the features.
20
+
21
+ Returns:
22
+ torch.Tensor: The interpolated features.
23
+ """
24
+ features = features.transpose(1, 2)
25
+ output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
26
+ return output_features.transpose(1, 2)
27
+
28
+
29
+ class HubertModel_(HubertModel):
30
+ def __init__(self, config):
31
+ super().__init__(config)
32
+
33
+ def forward(
34
+ self,
35
+ input_values: Optional[torch.Tensor],
36
+ seq_len: Optional[int],
37
+ sample_strategy: Optional[str] = "presample",
38
+ attention_mask: Optional[torch.LongTensor] = None,
39
+ mask_time_indices: Optional[torch.FloatTensor] = None,
40
+ output_attentions: Optional[bool] = None,
41
+ output_hidden_states: Optional[bool] = None,
42
+ return_dict: Optional[bool] = None,
43
+ ):
44
+ """
45
+ Forward pass of the HuBERT model.
46
+
47
+ Args:
48
+ self: The instance of the model.
49
+ input_values: The input values (waveform) to the model.
50
+ seq_len: The sequence length of the input values.
51
+ sample_strategy: The sample strategy to align features and seq_len, supports ['presample', 'postsample'].
52
+ attention_mask: Attention mask to be used for the model.
53
+ mask_time_indices: Mask indices to be used for the model.
54
+ output_attentions: If set to True, returns attentions.
55
+ output_hidden_states: If set to True, returns hidden states.
56
+ return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
57
+
58
+ Returns:
59
+ The output of the HuBERT model.
60
+ """
61
+ # output_fps=25,
62
+ # attention_mask=None,
63
+ # output_attentions=None,
64
+ # output_hidden_states=None,
65
+ # return_dict=None,
66
+ # frame_num=None
67
+ assert sample_strategy in ["presample", "postsample"], "sample_strategy must be one of ['presample', 'postsample']"
68
+ self.config.output_attentions = True
69
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
70
+
71
+ output_hidden_states = (
72
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
73
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
74
+
75
+ extract_features = self.feature_extractor(input_values) # (N, C, L)
76
+ extract_features = extract_features.transpose(1, 2)
77
+ if sample_strategy == "presample":
78
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
79
+
80
+ # # Resample the audio feature @ 50 fps to `output_fps`.
81
+ # if frame_num is not None:
82
+ # extract_features_len = round(frame_num * 50 / output_fps)
83
+ # extract_features = extract_features[:, :, :extract_features_len]
84
+ # extract_features = linear_interpolation(extract_features, 50, output_fps, output_len=frame_num)
85
+ # extract_features = extract_features.transpose(1, 2) # (N, L, C)
86
+
87
+ if attention_mask is not None:
88
+ # compute reduced attention_mask corresponding to feature vectors
89
+ attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
90
+
91
+ hidden_states = self.feature_projection(extract_features)
92
+ hidden_states = self._mask_hidden_states(
93
+ hidden_states,
94
+ mask_time_indices=mask_time_indices,
95
+ attention_mask=attention_mask
96
+ )
97
+
98
+ encoder_outputs = self.encoder(
99
+ hidden_states,
100
+ attention_mask=attention_mask,
101
+ output_attentions=output_attentions,
102
+ output_hidden_states=output_hidden_states,
103
+ return_dict=return_dict,
104
+ )
105
+
106
+ hidden_states = encoder_outputs[0]
107
+
108
+ if sample_strategy == "postsample":
109
+ hidden_states = linear_interpolation(hidden_states, seq_len=seq_len)
110
+ for i in range(len(encoder_outputs.hidden_states)):
111
+ encoder_outputs.hidden_states[i] = linear_interpolation(encoder_outputs.hidden_states[i], seq_len=seq_len)
112
+
113
+ if not return_dict:
114
+ return (hidden_states,) + encoder_outputs[1:]
115
+
116
+ return BaseModelOutput(
117
+ last_hidden_state=hidden_states,
118
+ hidden_states=encoder_outputs.hidden_states,
119
+ attentions=encoder_outputs.attentions,
120
+ )
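A minimal sketch of the frame-alignment step shared by the 'presample' and 'postsample' strategies: 50 Hz HuBERT features are linearly resampled so that one feature row lines up with each video frame. The sizes are illustrative.

import torch

feats_50hz = torch.randn(1, 100, 768)                       # (N, L, C): ~2 s of encoder features at 50 Hz
feats_25fps = linear_interpolation(feats_50hz, seq_len=50)  # -> (1, 50, 768), one row per frame at 25 fps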
src/models/audio/hubert2.py ADDED
@@ -0,0 +1,120 @@
1
+ from typing import Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from transformers import HubertModel
7
+ from transformers.modeling_outputs import BaseModelOutput
8
+
9
+
10
+ _CONFIG_FOR_DOC = 'HubertConfig'
11
+
12
+
13
+ def linear_interpolation(features, seq_len):
14
+ """
15
+ Transpose the features to interpolate linearly.
16
+
17
+ Args:
18
+ features (torch.Tensor): The extracted features to be interpolated.
19
+ seq_len (torch.Tensor): The sequence lengths of the features.
20
+
21
+ Returns:
22
+ torch.Tensor: The interpolated features.
23
+ """
24
+ features = features.transpose(1, 2)
25
+ output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
26
+ return output_features.transpose(1, 2)
27
+
28
+
29
+ class HubertModel(HubertModel):
30
+ def __init__(self, config):
31
+ super().__init__(config)
32
+
33
+ def forward(
34
+ self,
35
+ input_values: Optional[torch.Tensor],
36
+ seq_len: Optional[int],
37
+ sample_strategy: Optional[str] = "presample",
38
+ attention_mask: Optional[torch.LongTensor] = None,
39
+ mask_time_indices: Optional[torch.FloatTensor] = None,
40
+ output_attentions: Optional[bool] = None,
41
+ output_hidden_states: Optional[bool] = None,
42
+ return_dict: Optional[bool] = None,
43
+ ):
44
+ """
45
+ Forward pass of the HuBERT model.
46
+
47
+ Args:
48
+ self: The instance of the model.
49
+ input_values: The input values (waveform) to the model.
50
+ seq_len: The sequence length of the input values.
51
+ sample_strategy: The sample strategy to align features and seq_len, supports ['presample', 'postsample'].
52
+ attention_mask: Attention mask to be used for the model.
53
+ mask_time_indices: Mask indices to be used for the model.
54
+ output_attentions: If set to True, returns attentions.
55
+ output_hidden_states: If set to True, returns hidden states.
56
+ return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
57
+
58
+ Returns:
59
+ The output of the HuBERT model.
60
+ """
61
+ # output_fps=25,
62
+ # attention_mask=None,
63
+ # output_attentions=None,
64
+ # output_hidden_states=None,
65
+ # return_dict=None,
66
+ # frame_num=None
67
+ assert sample_strategy in ["presample", "postsample"], "sample_strategy must be one of ['presample', 'postsample']"
68
+ self.config.output_attentions = True
69
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
70
+
71
+ output_hidden_states = (
72
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
73
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
74
+
75
+ extract_features = self.feature_extractor(input_values) # (N, C, L)
76
+ extract_features = extract_features.transpose(1, 2)
77
+ if sample_strategy == "presample":
78
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
79
+
80
+ # # Resample the audio feature @ 50 fps to `output_fps`.
81
+ # if frame_num is not None:
82
+ # extract_features_len = round(frame_num * 50 / output_fps)
83
+ # extract_features = extract_features[:, :, :extract_features_len]
84
+ # extract_features = linear_interpolation(extract_features, 50, output_fps, output_len=frame_num)
85
+ # extract_features = extract_features.transpose(1, 2) # (N, L, C)
86
+
87
+ if attention_mask is not None:
88
+ # compute reduced attention_mask corresponding to feature vectors
89
+ attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
90
+
91
+ hidden_states = self.feature_projection(extract_features)
92
+ hidden_states = self._mask_hidden_states(
93
+ hidden_states,
94
+ mask_time_indices=mask_time_indices,
95
+ attention_mask=attention_mask
96
+ )
97
+
98
+ encoder_outputs = self.encoder(
99
+ hidden_states,
100
+ attention_mask=attention_mask,
101
+ output_attentions=output_attentions,
102
+ output_hidden_states=output_hidden_states,
103
+ return_dict=return_dict,
104
+ )
105
+
106
+ hidden_states = encoder_outputs[0]
107
+
108
+ if sample_strategy == "postsample":
109
+ hidden_states = linear_interpolation(hidden_states, seq_len=seq_len)
110
+ for i in range(len(encoder_outputs.hidden_states)):
111
+ encoder_outputs.hidden_states[i] = linear_interpolation(encoder_outputs.hidden_states[i], seq_len=seq_len)
112
+
113
+ if not return_dict:
114
+ return (hidden_states,) + encoder_outputs[1:]
115
+
116
+ return BaseModelOutput(
117
+ last_hidden_state=hidden_states,
118
+ hidden_states=encoder_outputs.hidden_states,
119
+ attentions=encoder_outputs.attentions,
120
+ )
src/models/audio/wav2vec.py ADDED
@@ -0,0 +1,210 @@
1
+
2
+
3
+ """
4
+ This module defines the Wav2Vec model, which is a pre-trained model for speech recognition and understanding.
5
+ It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionalities
6
+ such as feature extraction and encoding.
7
+
8
+ Classes:
9
+ Wav2VecModel: Inherits from Wav2Vec2Model and adds additional methods for feature extraction and encoding.
10
+
11
+ Functions:
12
+ linear_interpolation: Interpolates the features based on the sequence length.
13
+ """
14
+
15
+ from typing import Optional, Tuple, Union
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from transformers import Wav2Vec2Model
19
+ from transformers.modeling_outputs import BaseModelOutput
20
+
21
+
22
+ class Wav2VecModel(Wav2Vec2Model):
23
+ """
24
+ Wav2VecModel is a custom model class that extends the Wav2Vec2Model class from the transformers library.
25
+ It inherits all the functionality of the Wav2Vec2Model and adds additional methods for feature extraction and encoding.
26
+ ...
27
+
28
+ Attributes:
29
+ base_model (Wav2Vec2Model): The base Wav2Vec2Model object.
30
+
31
+ Methods:
32
+ forward(input_values, seq_len, attention_mask=None, mask_time_indices=None
33
+ , output_attentions=None, output_hidden_states=None, return_dict=None):
34
+ Forward pass of the Wav2VecModel.
35
+ It takes input_values, seq_len, and other optional parameters as input and returns the output of the base model.
36
+
37
+ feature_extract(input_values, seq_len):
38
+ Extracts features from the input_values using the base model.
39
+
40
+ encode(extract_features, attention_mask=None, mask_time_indices=None, output_attentions=None, output_hidden_states=None, return_dict=None):
41
+ Encodes the extracted features using the base model and returns the encoded features.
42
+ """
43
+ def forward(
44
+ self,
45
+ input_values,
46
+ seq_len,
47
+ attention_mask=None,
48
+ mask_time_indices=None,
49
+ output_attentions=None,
50
+ output_hidden_states=None,
51
+ return_dict=None,
52
+ ):
53
+ """
54
+ Forward pass of the Wav2Vec model.
55
+
56
+ Args:
57
+ self: The instance of the model.
58
+ input_values: The input values (waveform) to the model.
59
+ seq_len: The sequence length of the input values.
60
+ attention_mask: Attention mask to be used for the model.
61
+ mask_time_indices: Mask indices to be used for the model.
62
+ output_attentions: If set to True, returns attentions.
63
+ output_hidden_states: If set to True, returns hidden states.
64
+ return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
65
+
66
+ Returns:
67
+ The output of the Wav2Vec model.
68
+ """
69
+ self.config.output_attentions = True
70
+
71
+ output_hidden_states = (
72
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
73
+ )
74
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
75
+
76
+ extract_features = self.feature_extractor(input_values)
77
+ extract_features = extract_features.transpose(1, 2)
78
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
79
+
80
+ if attention_mask is not None:
81
+ # compute reduced attention_mask corresponding to feature vectors
82
+ attention_mask = self._get_feature_vector_attention_mask(
83
+ extract_features.shape[1], attention_mask, add_adapter=False
84
+ )
85
+
86
+ hidden_states, extract_features = self.feature_projection(extract_features)
87
+ hidden_states = self._mask_hidden_states(
88
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
89
+ )
90
+
91
+ encoder_outputs = self.encoder(
92
+ hidden_states,
93
+ attention_mask=attention_mask,
94
+ output_attentions=output_attentions,
95
+ output_hidden_states=output_hidden_states,
96
+ return_dict=return_dict,
97
+ )
98
+
99
+ hidden_states = encoder_outputs[0]
100
+
101
+ if self.adapter is not None:
102
+ hidden_states = self.adapter(hidden_states)
103
+
104
+ if not return_dict:
105
+ return (hidden_states, ) + encoder_outputs[1:]
106
+ return BaseModelOutput(
107
+ last_hidden_state=hidden_states,
108
+ hidden_states=encoder_outputs.hidden_states,
109
+ attentions=encoder_outputs.attentions,
110
+ )
111
+
112
+
113
+ def feature_extract(
114
+ self,
115
+ input_values,
116
+ seq_len,
117
+ ):
118
+ """
119
+ Extracts features from the input values and returns the extracted features.
120
+
121
+ Parameters:
122
+ input_values (torch.Tensor): The input values to be processed.
123
+ seq_len (torch.Tensor): The sequence lengths of the input values.
124
+
125
+ Returns:
126
+ extracted_features (torch.Tensor): The extracted features from the input values.
127
+ """
128
+ extract_features = self.feature_extractor(input_values)
129
+ extract_features = extract_features.transpose(1, 2)
130
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
131
+
132
+ return extract_features
133
+
134
+ def encode(
135
+ self,
136
+ extract_features,
137
+ attention_mask=None,
138
+ mask_time_indices=None,
139
+ output_attentions=None,
140
+ output_hidden_states=None,
141
+ return_dict=None,
142
+ ):
143
+ """
144
+ Encodes the input features into the output space.
145
+
146
+ Args:
147
+ extract_features (torch.Tensor): The extracted features from the audio signal.
148
+ attention_mask (torch.Tensor, optional): Attention mask to be used for padding.
149
+ mask_time_indices (torch.Tensor, optional): Masked indices for the time dimension.
150
+ output_attentions (bool, optional): If set to True, returns the attention weights.
151
+ output_hidden_states (bool, optional): If set to True, returns all hidden states.
152
+ return_dict (bool, optional): If set to True, returns a BaseModelOutput instead of the tuple.
153
+
154
+ Returns:
155
+ The encoded output features.
156
+ """
157
+ self.config.output_attentions = True
158
+
159
+ output_hidden_states = (
160
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
161
+ )
162
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
163
+
164
+ if attention_mask is not None:
165
+ # compute reduced attention_mask corresponding to feature vectors
166
+ attention_mask = self._get_feature_vector_attention_mask(
167
+ extract_features.shape[1], attention_mask, add_adapter=False
168
+ )
169
+
170
+ hidden_states, extract_features = self.feature_projection(extract_features)
171
+ hidden_states = self._mask_hidden_states(
172
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
173
+ )
174
+
175
+ encoder_outputs = self.encoder(
176
+ hidden_states,
177
+ attention_mask=attention_mask,
178
+ output_attentions=output_attentions,
179
+ output_hidden_states=output_hidden_states,
180
+ return_dict=return_dict,
181
+ )
182
+
183
+ hidden_states = encoder_outputs[0]
184
+
185
+ if self.adapter is not None:
186
+ hidden_states = self.adapter(hidden_states)
187
+
188
+ if not return_dict:
189
+ return (hidden_states, ) + encoder_outputs[1:]
190
+ return BaseModelOutput(
191
+ last_hidden_state=hidden_states,
192
+ hidden_states=encoder_outputs.hidden_states,
193
+ attentions=encoder_outputs.attentions,
194
+ )
195
+
196
+
197
+ def linear_interpolation(features, seq_len):
198
+ """
199
+ Transpose the features to interpolate linearly.
200
+
201
+ Args:
202
+ features (torch.Tensor): The extracted features to be interpolated.
203
+ seq_len (torch.Tensor): The sequence lengths of the features.
204
+
205
+ Returns:
206
+ torch.Tensor: The interpolated features.
207
+ """
208
+ features = features.transpose(1, 2)
209
+ output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
210
+ return output_features.transpose(1, 2)
src/models/audio/wav2vec2.py ADDED
@@ -0,0 +1,123 @@
1
+ from packaging import version
2
+ from typing import Optional, Tuple
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import transformers
8
+ from transformers import Wav2Vec2Model
9
+ from transformers.modeling_outputs import BaseModelOutput
10
+
11
+ _CONFIG_FOR_DOC = 'Wav2Vec2Config'
12
+
13
+
14
+ # the implementation of Wav2Vec2Model is borrowed from
15
+ # https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
16
+ # initialize our encoder with the pre-trained wav2vec 2.0 weights.
17
+ def _compute_mask_indices(shape: Tuple[int, int], mask_prob: float, mask_length: int,
18
+ attention_mask: Optional[torch.Tensor] = None, min_masks: int = 0, ) -> np.ndarray:
19
+ bsz, all_sz = shape
20
+ mask = np.full((bsz, all_sz), False)
21
+
22
+ all_num_mask = int(mask_prob * all_sz / float(mask_length) + np.random.rand())
23
+ all_num_mask = max(min_masks, all_num_mask)
24
+ mask_idcs = []
25
+ padding_mask = attention_mask.ne(1) if attention_mask is not None else None
26
+ for i in range(bsz):
27
+ if padding_mask is not None:
28
+ sz = all_sz - padding_mask[i].long().sum().item()
29
+ num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
30
+ num_mask = max(min_masks, num_mask)
31
+ else:
32
+ sz = all_sz
33
+ num_mask = all_num_mask
34
+
35
+ lengths = np.full(num_mask, mask_length)
36
+
37
+ if sum(lengths) == 0:
38
+ lengths[0] = min(mask_length, sz - 1)
39
+
40
+ min_len = min(lengths)
41
+ if sz - min_len <= num_mask:
42
+ min_len = sz - num_mask - 1
43
+
44
+ mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
45
+ mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
46
+ mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
47
+
48
+ min_len = min([len(m) for m in mask_idcs])
49
+ for i, mask_idc in enumerate(mask_idcs):
50
+ if len(mask_idc) > min_len:
51
+ mask_idc = np.random.choice(mask_idc, min_len, replace=False)
52
+ mask[i, mask_idc] = True
53
+ return mask
54
+
55
+
56
+ # linear interpolation layer
57
+ def linear_interpolation(features, input_fps, output_fps, output_len=None):
58
+ # features: (N, C, L)
59
+ seq_len = features.shape[2] / float(input_fps)
60
+ if output_len is None:
61
+ output_len = int(seq_len * output_fps)
62
+ output_features = F.interpolate(features, size=output_len, align_corners=False, mode='linear')
63
+ return output_features
64
+
65
+
66
+ class Wav2Vec2Model(Wav2Vec2Model):
67
+ def __init__(self, config):
68
+ super().__init__(config)
69
+ self.is_old_version = version.parse(transformers.__version__) < version.parse('4.7.0')
70
+
71
+ def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
72
+ output_hidden_states=None, return_dict=None, frame_num=None):
73
+ self.config.output_attentions = True
74
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
75
+ output_hidden_states = (
76
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
77
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
78
+ print(f"data shape before feature extractor: {input_values.shape}")
79
+ hidden_states = self.feature_extractor(input_values) # (N, C, L)
80
+ print(f"data shape after feature extractor: {hidden_states.shape}")
81
+ # Resample the audio feature @ 50 fps to `output_fps`.
82
+ if frame_num is not None:
83
+ hidden_states_len = round(frame_num * 50 / output_fps)
84
+ hidden_states = hidden_states[:, :, :hidden_states_len]
85
+ hidden_states = linear_interpolation(hidden_states, 50, output_fps, output_len=frame_num)
86
+ hidden_states = hidden_states.transpose(1, 2) # (N, L, C)
87
+
88
+ if attention_mask is not None:
89
+ output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
90
+ attention_mask = torch.zeros(hidden_states.shape[:2], dtype=hidden_states.dtype,
91
+ device=hidden_states.device)
92
+ attention_mask[(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)] = 1
93
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
94
+
95
+ if self.is_old_version:
96
+ hidden_states = self.feature_projection(hidden_states)
97
+ else:
98
+ hidden_states = self.feature_projection(hidden_states)[0]
99
+
100
+ if self.config.apply_spec_augment and self.training:
101
+ batch_size, sequence_length, hidden_size = hidden_states.size()
102
+ if self.config.mask_time_prob > 0:
103
+ mask_time_indices = _compute_mask_indices((batch_size, sequence_length), self.config.mask_time_prob,
104
+ self.config.mask_time_length, attention_mask=attention_mask,
105
+ min_masks=2, )
106
+ hidden_states[torch.from_numpy(mask_time_indices)] = self.masked_spec_embed.to(hidden_states.dtype)
107
+ if self.config.mask_feature_prob > 0:
108
+ mask_feature_indices = _compute_mask_indices((batch_size, hidden_size), self.config.mask_feature_prob,
109
+ self.config.mask_feature_length, )
110
+ mask_feature_indices = torch.from_numpy(mask_feature_indices).to(hidden_states.device)
111
+ hidden_states[mask_feature_indices[:, None].expand(-1, sequence_length, -1)] = 0
112
+ encoder_outputs = self.encoder(hidden_states, attention_mask=attention_mask,
113
+ output_attentions=output_attentions, output_hidden_states=output_hidden_states,
114
+ return_dict=return_dict, )
115
+ hidden_states = encoder_outputs[0]
116
+ if not return_dict:
117
+ return (hidden_states,) + encoder_outputs[1:]
118
+
119
+ for i in range(len(encoder_outputs.hidden_states)):
120
+ print(f"hidden states {i} after encoder: {encoder_outputs.hidden_states[i].shape}")
121
+
122
+ return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
123
+ attentions=encoder_outputs.attentions, )
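A minimal sketch of the 50 Hz-to-output_fps resampling performed in this forward pass, using the module-level linear_interpolation defined above; the tensor sizes are illustrative.

import torch

feats = torch.randn(1, 512, 210)                  # (N, C, L): CNN feature-extractor output (~4.2 s at 50 Hz; channel count depends on the model size)
frame_num, output_fps = 100, 25                   # 4 s of video at 25 fps
feats = feats[:, :, :round(frame_num * 50 / output_fps)]                   # truncate to 200 steps
feats = linear_interpolation(feats, 50, output_fps, output_len=frame_num)  # -> (1, 512, 100)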
src/models/audio/wav2vec_modified.py ADDED
@@ -0,0 +1,223 @@
+"""
+This module defines the Wav2Vec model, a pre-trained model for speech recognition and understanding.
+It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionality
+such as feature extraction and encoding.
+
+Classes:
+    Wav2VecModel: Inherits from Wav2Vec2Model and adds additional methods for feature extraction and encoding.
+
+Functions:
+    linear_interpolation: Interpolates the features to a target sequence length.
+"""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import Wav2Vec2Model
+from transformers.modeling_outputs import BaseModelOutput, Wav2Vec2BaseModelOutput
+
+
+class Wav2VecModel(Wav2Vec2Model):
+    """
+    Wav2VecModel is a custom model class that extends the Wav2Vec2Model class from the transformers library.
+    It inherits all the functionality of Wav2Vec2Model and adds methods for feature extraction and encoding.
+
+    Attributes:
+        Inherits all attributes of Wav2Vec2Model; no new attributes are defined.
+
+    Methods:
+        forward(input_values, seq_len, sample_strategy="presample", attention_mask=None, mask_time_indices=None,
+                output_attentions=None, output_hidden_states=None, return_dict=None):
+            Forward pass of the Wav2VecModel. Takes input_values, seq_len, and other optional parameters
+            and returns the output of the base model.
+
+        feature_extract(input_values, seq_len):
+            Extracts features from the input_values using the base model.
+
+        encode(extract_features, attention_mask=None, mask_time_indices=None, output_attentions=None,
+               output_hidden_states=None, return_dict=None):
+            Encodes the extracted features using the base model and returns the encoded features.
+    """
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        seq_len: Optional[int],
+        sample_strategy: Optional[str] = "presample",
+        attention_mask: Optional[torch.LongTensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+        """
+        Forward pass of the Wav2Vec model.
+
+        Args:
+            input_values: The input values (waveform) to the model.
+            seq_len: The target sequence length the audio features are aligned to.
+            sample_strategy: When to align the features to seq_len; one of ['presample', 'postsample'].
+            attention_mask: Attention mask to be used for the model.
+            mask_time_indices: Mask indices to be used for the model.
+            output_attentions: If set to True, returns attentions.
+            output_hidden_states: If set to True, returns hidden states.
+            return_dict: If set to True, returns a Wav2Vec2BaseModelOutput instead of a tuple.
+
+        Returns:
+            The output of the Wav2Vec model.
+        """
+        assert sample_strategy in ["presample", "postsample"], "sample_strategy must be in ['presample', 'postsample']"
+
+        self.config.output_attentions = True
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+        if sample_strategy == "presample":
+            # Align the CNN features to seq_len before they enter the transformer encoder.
+            extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+
+        if sample_strategy == "postsample":
+            # Align the encoder outputs to seq_len after the transformer encoder.
+            hidden_states = linear_interpolation(hidden_states, seq_len=seq_len)
+            if return_dict and encoder_outputs.hidden_states is not None:
+                # hidden_states is a tuple, so rebuild it rather than assigning to its items.
+                encoder_outputs.hidden_states = tuple(
+                    linear_interpolation(h, seq_len=seq_len) for h in encoder_outputs.hidden_states
+                )
+
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+
+        return Wav2Vec2BaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def feature_extract(
+        self,
+        input_values,
+        seq_len,
+    ):
+        """
+        Extracts features from the input values and returns the extracted features.
+
+        Parameters:
+            input_values (torch.Tensor): The input values (waveform) to be processed.
+            seq_len (int): The target sequence length the features are interpolated to.
+
+        Returns:
+            extract_features (torch.Tensor): The extracted features from the input values.
+        """
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+        extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+        return extract_features
+
+    def encode(
+        self,
+        extract_features,
+        attention_mask=None,
+        mask_time_indices=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        """
+        Encodes the input features into the output space.
+
+        Args:
+            extract_features (torch.Tensor): The extracted features from the audio signal.
+            attention_mask (torch.Tensor, optional): Attention mask to be used for padding.
+            mask_time_indices (torch.Tensor, optional): Masked indices for the time dimension.
+            output_attentions (bool, optional): If set to True, returns the attention weights.
+            output_hidden_states (bool, optional): If set to True, returns all hidden states.
+            return_dict (bool, optional): If set to True, returns a BaseModelOutput instead of a tuple.
+
+        Returns:
+            The encoded output features.
+        """
+        self.config.output_attentions = True
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+def linear_interpolation(features, seq_len):
+    """
+    Linearly interpolate the features along the time dimension to the target sequence length.
+
+    Args:
+        features (torch.Tensor): Features of shape (batch, time, channels) to be interpolated.
+        seq_len (int): The target sequence length.
+
+    Returns:
+        torch.Tensor: The interpolated features of shape (batch, seq_len, channels).
+    """
+    features = features.transpose(1, 2)
+    output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
+    return output_features.transpose(1, 2)
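Taken together, the forward pass above aligns the roughly 50 Hz wav2vec 2.0 feature rate to an arbitrary target length, either before the encoder (`presample`) or after it (`postsample`). Below is a minimal usage sketch, assuming the package is importable as `src.models.audio.wav2vec_modified` and using the public `facebook/wav2vec2-base-960h` checkpoint as a stand-in for whatever weights this repository actually loads; the 25-frame target is likewise illustrative.

# Usage sketch only: checkpoint name, import path, and the 25-frame target are
# illustrative assumptions, not taken from this repository's configs.
import torch
from src.models.audio.wav2vec_modified import Wav2VecModel

model = Wav2VecModel.from_pretrained("facebook/wav2vec2-base-960h")
model.eval()

waveform = torch.randn(1, 16000)   # 1 s of 16 kHz audio (already preprocessed/normalized)
num_frames = 25                    # e.g. one second of video at 25 fps

with torch.no_grad():
    # "presample": interpolate the CNN features to num_frames before the transformer encoder.
    out_pre = model(waveform, seq_len=num_frames, sample_strategy="presample", output_hidden_states=True)
    # "postsample": run the encoder at its native ~50 Hz rate, then interpolate its outputs.
    out_post = model(waveform, seq_len=num_frames, sample_strategy="postsample", output_hidden_states=True)

print(out_pre.last_hidden_state.shape)    # torch.Size([1, 25, 768])
print(out_post.last_hidden_state.shape)   # torch.Size([1, 25, 768])
print(len(out_pre.hidden_states))         # 13 per-layer states for the base model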