diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..2c0365400a28a16ae2a220475ac274b58d8a2195 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/examples/driving_audios/10.wav filter=lfs diff=lfs merge=lfs -text
+src/examples/driving_audios/5.wav filter=lfs diff=lfs merge=lfs -text
+src/examples/driving_audios/6.wav filter=lfs diff=lfs merge=lfs -text
+src/examples/driving_audios/tmp_5.wav filter=lfs diff=lfs merge=lfs -text
+src/examples/reference_images/1.jpg filter=lfs diff=lfs merge=lfs -text
+src/examples/reference_images/7.jpg filter=lfs diff=lfs merge=lfs -text
+src/examples/silent-audio.wav filter=lfs diff=lfs merge=lfs -text
+src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/t1.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/configs/audio2motion/inference/inference.yaml b/configs/audio2motion/inference/inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bc61f2b433f4fe5082dc23cbf5dfba4f28f50de3
--- /dev/null
+++ b/configs/audio2motion/inference/inference.yaml
@@ -0,0 +1,35 @@
+
+output_fps: 25
+## appearance and motion feature extractor
+appearance_feature_extractor_path:  pretrain_weights/decode/v1/first_stage/base_models/appearance_feature_extractor.pth
+motion_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/motion_extractor.pth
+## SPADEGenerator
+spade_generator_path: pretrain_weights/decode/v1/first_stage/base_models/spade_generator.pth
+warping_module_path:  pretrain_weights/decode/v1/first_stage/base_models/warping_module.pth
+## stitching retargeting module
+stitching_retargeting_module_path:  pretrain_weights/decode/v1/first_stage/retargeting_models/stitching_retargeting_module.pth
+#
+
+# audio processer config
+audio_model_config: configs/audio2motion/model/audio_processer_config.yaml
+
+# motion processer config
+motion_processer_config: configs/audio2motion/model/liveportrait_config.yaml
+
+# motion generator model
+motion_models_config: configs/audio2motion/model/config.yaml
+use_ref_kp: False
+motion_generator_path: pretrain_weights/moda/net-200.pth
+need_normalized: True
+
+# other configs
+device_id: 0
+batch_size: 100
+
+source_max_dim: 1280 # the max dim of height and width of source image or video
+source_division: 2 # make sure the height and width of source image or video can be divided by this number
+input_height: 256
+input_width: 256
+source_fps: 25
+min_video_length: 50
+max_video_length: 500
\ No newline at end of file
diff --git a/configs/audio2motion/model/audio_processer_config.yaml b/configs/audio2motion/model/audio_processer_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bf46183e496211790e9b3ad1d080f66d357f2a7c
--- /dev/null
+++ b/configs/audio2motion/model/audio_processer_config.yaml
@@ -0,0 +1,36 @@
+# models settings
+model_params:
+  model_name: hubert   # wav2vec or hubert
+  model_type: base   # base large
+  is_chinese: True
+  is_original: True
+  only_last_features: False
+  use_audio_separator: False
+  audio_separator_name: Kim_Vocal_2.onnx
+
+# model weights
+model_weights:
+  audio_separator_path: pretrain_weights/audio/audio_separator
+  hubert_path:
+    chinese:
+      base: pretrain_weights/audio/chinese-hubert-base
+# data settings
+data_params:
+  sample_rate: 16000
+  max_length: 60         # seconds
+  sub_clip_length: 3000  # samples
+  fps: 25
+  sample_strategy: "presample"
+  audio_pad_mode: replicate  # pad mode for audio, replicate or zero
+  save_to_cpu: True    # saving gpu memory
+
+# device settings
+device_params:
+  device_id: 0
+  flag_force_cpu: False
+  flag_use_half_precision: False
+  
+cache_dir: preprocessed/HDTF/vocals
+tmp_dir: src/tmp
+
+
diff --git a/configs/audio2motion/model/config.yaml b/configs/audio2motion/model/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..206c5068d49153733c41ed37de11db60cd80a567
--- /dev/null
+++ b/configs/audio2motion/model/config.yaml
@@ -0,0 +1,59 @@
+model_name: TalkingHeadDiT-B
+audio_projector:
+  type: MLP
+  pretrained_model_path: None
+  device: cuda
+  params:
+    model_name: MLP-S-3
+    sequence_length: 1
+    blocks: 12 
+    audio_feat_dim: 768 
+    keypoint_dim: 63
+    feature_dim: 512
+    output_dim: 256
+    context_tokens: 1
+    audio_embedder_type: simple
+    audio_cond_dim: 63
+motion_generator:
+  type: DiT
+  pretrained_model_path: None
+  device: cuda
+  params:
+    model_name: DiT-S-8-8
+    architecture: decoder
+    use_emo: True
+    input_dim: 70
+    output_dim: 70
+    exp_dim: 63
+    n_prev_frames: 1
+    n_pred_frames: 80
+    use_indicator: False
+    feature_dim: 256
+    n_heads: 8
+    n_layers: 8
+    mlp_ratio: 4
+    no_use_learnable_pe: True
+    norm_type: rms_norm  # [rms_norm|layer_norm]
+    qk_norm: rms_norm    # [rms_norm|layer_norm|null]
+    steps: 1000
+noise_scheduler:
+  type: flow_matching
+  sample_mode: sample
+  device: cuda
+  params:
+    time_shifting: True
+    num_train_timesteps: 1000
+    num_inference_steps: 10
+    eta: 0.2
+    beta_start: 0.0001
+    beta_end: 0.02
+    s: 0.008
+    mode: cosine
+train:
+  audio_drop_prob: 0.3
+  cond_drop_prob: 0.2
+  motion_drop_prob: 0.3
+  audio_drop_ratio : 0.2
+  motion_drop_ratio: 0.1
+  pre_drop_ratio : 0.0
+device_specific: True
\ No newline at end of file
diff --git a/configs/audio2motion/model/crop_config.yaml b/configs/audio2motion/model/crop_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7bc2dae73944edcd684baee9831585fa37990555
--- /dev/null
+++ b/configs/audio2motion/model/crop_config.yaml
@@ -0,0 +1,21 @@
+insightface_root: pretrain_weights/decode/v1/insightface
+landmark_ckpt_path: pretrain_weights/decode/v1/first_stage/landmark.onnx
+xpose_config_file_path: src/utils/UniPose_SwinT.py
+device_id: 0                         # gpu device id
+flag_force_cpu: False                # force cpu inference, WIP
+det_thresh: 0.15                      # detection threshold
+########## source image or video cropping option ##########
+dsize: 512                           # crop size
+scale: 2.3                           # scale factor
+vx_ratio: 0                          # vx ratio
+vy_ratio: -0.125                     # vy ratio +up, -down
+max_face_num: 0                      # max face number, 0 means no limit
+flag_do_rot: True                    # whether to conduct the rotation when flag_do_crop is True
+animal_face_type: animal_face_9      # animal_face_68 -> 68 landmark points, animal_face_9 -> 9 landmarks
+########## driving video auto cropping option ##########
+scale_crop_driving_video: 2.2        # 2.0 # scale factor for cropping driving video
+vx_ratio_crop_driving_video: 0.0     # adjust x offset
+vy_ratio_crop_driving_video: -0.1  # adjust y offset
+direction: large-small               # direction of cropping
+source_max_dim: 1280
+source_division: 2
\ No newline at end of file
diff --git a/configs/audio2motion/model/liveportrait_config.yaml b/configs/audio2motion/model/liveportrait_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8701b84b6e68c49781995aea9d8d2c73f4c0b294
--- /dev/null
+++ b/configs/audio2motion/model/liveportrait_config.yaml
@@ -0,0 +1,59 @@
+# model config
+models_config: configs/audio2motion/model/models.yaml
+
+# 1. face appearance feature
+appearance_feature_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/appearance_feature_extractor.pth
+
+# 2. motion feature
+motion_extractor_path: pretrain_weights/decode/v1/first_stage/base_models/motion_extractor.pth
+
+# 3. stitching retargeting module
+stitching_retargeting_module_path: pretrain_weights/decode/v1/first_stage/retargeting_models/stitching_retargeting_module.pth
+
+# 4. feature warper
+warping_module_path: pretrain_weights/decode/v1/first_stage/base_models/warping_module.pth
+
+# 5. SPADEGenerator
+spade_generator_path: pretrain_weights/decode/v1/first_stage/base_models/spade_generator.pth
+
+# 6. cropper
+crop_cfg: "configs/audio2motion/model/crop_config.yaml"
+
+# 7. face parser
+face_parser_weight_path: "pretrain_weights/face/face-parsing/79999_iter.pth"
+resnet_weight_path: "pretrain_weights/face/face-parsing/resnet18-5c106cde.pth"
+
+# motion template
+need_normalized: True
+
+# others
+batch_size: 100
+source_max_dim: 1920 # the max dim of height and width of source image or video
+source_division: 2 # make sure the height and width of source image or video can be divided by this number
+input_height: 256
+input_width: 256
+output_height: 512
+output_width: 512
+output_fps: 25
+
+# driving params
+flag_do_torch_compile: False
+flag_use_half_precision: True
+flag_relative_motion: False
+flag_normalize_lip: False
+flag_source_video_eye_retargeting: False
+flag_eye_retargeting: False
+flag_lip_retargeting: False
+flag_stitching: True
+
+lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
+source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
+anchor_frame: 0 # TO IMPLEMENT
+
+driving_option: "expression-friendly" # "expression-friendly" or "pose-friendly"
+driving_multiplier: 1.0 # used only when driving_option is "expression-friendly"
+lib_multiplier: 1.0
+driving_smooth_observation_variance: 3e-7 # the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+animation_region: "all" #["exp", "pose", "lip", "eyes", "all"], the region where the animation was performed, "exp" means the expression, "pose" means the head pose
+mask_crop: src/utils/resources/mask_template.png
+lip_array: src/utils/resources/lip_array.pkl
\ No newline at end of file
diff --git a/configs/audio2motion/model/models.yaml b/configs/audio2motion/model/models.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..131d1c65025c31e37af9239e211ea14454128a2e
--- /dev/null
+++ b/configs/audio2motion/model/models.yaml
@@ -0,0 +1,43 @@
+model_params:
+  appearance_feature_extractor_params: # the F in the paper
+    image_channel: 3
+    block_expansion: 64
+    num_down_blocks: 2
+    max_features: 512
+    reshape_channel: 32
+    reshape_depth: 16
+    num_resblocks: 6
+  motion_extractor_params: # the M in the paper
+    num_kp: 21
+    backbone: convnextv2_tiny
+  warping_module_params: # the W in the paper
+    num_kp: 21
+    block_expansion: 64
+    max_features: 512
+    num_down_blocks: 2
+    reshape_channel: 32
+    estimate_occlusion_map: True
+    dense_motion_params:
+      block_expansion: 32
+      max_features: 1024
+      num_blocks: 5
+      reshape_depth: 16
+      compress: 4
+  spade_generator_params: # the G in the paper
+    upscale: 2 # represents upsample factor 256x256 -> 512x512
+    block_expansion: 64
+    max_features: 512
+    num_down_blocks: 2
+  stitching_retargeting_module_params: # the S in the paper
+    stitching:
+      input_size: 126 # (21*3)*2
+      hidden_sizes: [128, 128, 64]
+      output_size: 65 # (21*3)+2(tx,ty)
+    lip:
+      input_size: 65 # (21*3)+2
+      hidden_sizes: [128, 128, 64]
+      output_size: 63 # (21*3)
+    eye:
+      input_size: 66 # (21*3)+3
+      hidden_sizes: [256, 256, 128, 128, 64]
+      output_size: 63 # (21*3)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..790aa163b25ba86c8ebd487239b1a1bb9c8bed53
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,45 @@
+--find-links https://download.pytorch.org/whl/torch_stable.html
+
+accelerate==0.28.0
+audio-separator==0.17.2
+av==12.1.0
+bitsandbytes==0.43.1
+decord==0.6.0
+diffusers==0.27.2
+einops==0.8.0
+huggingface==0.0.1
+huggingface-hub==0.25.1
+insightface==0.7.3
+librosa==0.10.2.post1
+mediapipe[vision]==0.10.14
+mlflow==2.13.1
+moviepy==1.0.3
+numpy==1.26.4
+omegaconf==2.3.0
+onnx2torch==1.5.14
+onnx==1.16.1
+onnxruntime-gpu==1.18.0
+opencv-python==4.10.0.84
+pillow==10.3.0
+pyyaml==6.0.1
+setuptools==70.0.0
+torch==2.2.2+cu121
+torchaudio==2.2.2
+torchvision==0.17.2+cu121
+transformers==4.39.2
+xformers==0.0.25.post1
+isort==5.13.2
+pre-commit==3.7.1
+scipy==1.13.1
+imageio==2.34.2
+lmdb==1.4.1
+rich==13.7.1
+ffmpeg-python==0.2.0
+scikit-image==0.24.0
+albumentations==1.4.10
+matplotlib==3.9.0
+imageio-ffmpeg==0.5.1
+tyro==0.8.5
+gradio==5.1.0
+pykalman==0.9.7
+tensorboardX==2.6.2.2
\ No newline at end of file
diff --git a/src/datasets/mean.pt b/src/datasets/mean.pt
new file mode 100644
index 0000000000000000000000000000000000000000..122e4f5fb17d8d240615b45e07948dc68e21121d
--- /dev/null
+++ b/src/datasets/mean.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db742e76a39bbf81fb5b09fcc488bad0cbab9355df509d8e91967b58d02c6dfc
+size 2582
diff --git a/src/datasets/preprocess/__pycache__/flow_filter.cpython-310.pyc b/src/datasets/preprocess/__pycache__/flow_filter.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c690a4b941e7b2815ef71318d4fdd2575fdf8b4
Binary files /dev/null and b/src/datasets/preprocess/__pycache__/flow_filter.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/__pycache__/video_crop.cpython-310.pyc b/src/datasets/preprocess/__pycache__/video_crop.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61f702026478654336e7280c19b642c919ecbdf8
Binary files /dev/null and b/src/datasets/preprocess/__pycache__/video_crop.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/__pycache__/visualize.cpython-310.pyc b/src/datasets/preprocess/__pycache__/visualize.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4d2f9bf9dbcade810da70a879d9353b371ad94d
Binary files /dev/null and b/src/datasets/preprocess/__pycache__/visualize.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-310.pyc b/src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f8675d07ef3e63b6681bec63edf6be6b81e4e34
Binary files /dev/null and b/src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-312.pyc b/src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f3ff2cc75d8fa9ffa1200864cb933cf952a03ef
Binary files /dev/null and b/src/datasets/preprocess/extract_features/__pycache__/audio_processer.cpython-312.pyc differ
diff --git a/src/datasets/preprocess/extract_features/__pycache__/feature_extractor_pipeline.cpython-310.pyc b/src/datasets/preprocess/extract_features/__pycache__/feature_extractor_pipeline.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fe75c1e748ee302871ef0fe68dc280748fc6ff8
Binary files /dev/null and b/src/datasets/preprocess/extract_features/__pycache__/feature_extractor_pipeline.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/__pycache__/motion_processer.cpython-310.pyc b/src/datasets/preprocess/extract_features/__pycache__/motion_processer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..294a488eee8f3a434289e1702d04a8edd91da55e
Binary files /dev/null and b/src/datasets/preprocess/extract_features/__pycache__/motion_processer.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/__pycache__/test_processer.cpython-310.pyc b/src/datasets/preprocess/extract_features/__pycache__/test_processer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a55c771d02843c56cd6457c35f68ae84afcf66cb
Binary files /dev/null and b/src/datasets/preprocess/extract_features/__pycache__/test_processer.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/audio_processer.py b/src/datasets/preprocess/extract_features/audio_processer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d112ae8fd0b5c0edb87e540a638e061e876b6848
--- /dev/null
+++ b/src/datasets/preprocess/extract_features/audio_processer.py
@@ -0,0 +1,471 @@
+
+import os
+from posixpath import isfile
+from re import A
+import sys
+import os.path as osp
+
+from typing import List, Dict, Tuple, Optional, Union, Any
+
+import yaml
+from omegaconf import OmegaConf
+
+import math
+import librosa
+import soundfile
+import numpy as np
+
+from einops import rearrange
+
+import torch
+import torch.nn.functional as F
+
+from pydub import AudioSegment
+from audio_separator.separator import Separator
+
+from transformers import Wav2Vec2FeatureExtractor, HubertModel
+
+from src.utils.rprint import rlog as log
+from src.utils.util import resample_audio
+
+from src.models.audio.wav2vec_modified import Wav2VecModel
+from src.models.audio.hubert import HubertModel_ as HubertModel
+
+
+def pad_audio(audio, audio_unit=320, pad_threshold=80):
+    """Pad both ends of a (batch, samples) waveform to whole units plus `pad_threshold`."""
+    _, n_samples = audio.shape
+    target_len = audio_unit * (n_samples // audio_unit) + pad_threshold
+    per_side = math.ceil((target_len - n_samples) / 2)
+    if per_side < 0:
+        return audio
+    half, odd = divmod(per_side, 2)
+    # reflect-pad by `half` twice, then replicate one sample when `per_side` is odd
+    for _ in range(2 if half > 0 else 0):
+        audio = F.pad(audio, (half, half), mode='reflect')
+    if odd > 0:
+        audio = F.pad(audio, (1, 1), mode='replicate')
+    return audio
+
+
+def cut_audio(audio_path: str, save_dir: str, length=60) -> List[str]:
+    """Cut a WAV file into consecutive segments and return the segment paths.
+
+    Args:
+        audio_path (str): the source audio file path (read with AudioSegment.from_wav)
+        save_dir (str): directory that receives the segment files; created if missing
+        length (int, optional): The max length of each segment. Defaults to 60 secs.
+
+    Returns:
+        List[str]: the segment paths, or [audio_path] when one segment is enough
+    """
+    # splitext keeps dots inside the stem ("a.b.wav" -> "a.b"), so segment names
+    # of differently named sources cannot collide the way split('.')[0] allowed
+    audio_name = osp.splitext(osp.basename(audio_path))[0]
+    audio = AudioSegment.from_wav(audio_path)
+    segment_length = length * 1000. # pydub uses milliseconds
+    num_segments = math.ceil(len(audio) / segment_length)
+
+    os.makedirs(save_dir, exist_ok=True)
+    if num_segments <= 1:
+        # nothing to split: hand back the original file untouched
+        return [audio_path]
+
+    audio_list = []
+    for i in range(num_segments):
+        start_time = i * segment_length
+        end_time = min((i + 1) * segment_length, len(audio))
+        path = osp.join(save_dir, f"{audio_name}_segment_{i+1}.wav")
+        audio[start_time:end_time].export(path, format="wav")
+        audio_list.append(path)
+    return audio_list
+    
+    
+class AudioProcessor(object):
+    def __init__(self, cfg_path: str, is_training: bool = False, device_id=0) -> None:
+        """Load the config at `cfg_path` and set up device, separator, encoder and data params.
+
+        Args:
+            cfg_path: path of the audio processer config, read with OmegaConf.load.
+            is_training: when False, embeddings are extracted under torch.no_grad().
+            device_id: CUDA device index used unless CPU is forced or MPS is available.
+        """
+        cfg = OmegaConf.load(cfg_path)
+        self.cfg = cfg
+        self.is_training = is_training
+        log("========================================= Audio Processer =========================================")
+        log(OmegaConf.to_yaml(cfg))
+
+        # setting device
+        self.device_id = device_id
+        self.use_half = cfg.device_params.flag_use_half_precision
+        if cfg.device_params.flag_force_cpu:
+            self.device = 'cpu'
+        else:
+            try:
+                use_mps = torch.backends.mps.is_available()
+            except Exception:  # e.g. torch builds without an mps backend; no longer a bare except
+                use_mps = False
+            self.device = 'mps' if use_mps else 'cuda:' + str(self.device_id)
+
+        # init audio separator
+        self.audio_separator = None
+        self.cache_dir = cfg.cache_dir
+        self.tmp_dir = cfg.tmp_dir
+        self.use_audio_separator = cfg.model_params.use_audio_separator
+        self.audio_separator_name = cfg.model_params.audio_separator_name
+        self.audio_separator_path = cfg.model_weights.audio_separator_path
+        self.set_audio_separator(cfg.cache_dir)
+
+        # load audio encoder, wav2vec or hubert
+        self.model_name = cfg.model_params.model_name
+        self.is_chinese = cfg.model_params.is_chinese
+        self.audio_encoder, self.feature_extractor = self.load_model(
+            model_name=self.model_name, model_type=cfg.model_params.model_type, is_chinese=self.is_chinese,
+        )
+        self.only_last_features = cfg.model_params.only_last_features
+        # one 768-dim row for the last layer only, otherwise one row per each of the 12 blocks
+        self.feature_shape = (1, 768) if self.only_last_features else (12, 768)
+
+        # init data params
+        self.sample_strategy = cfg.data_params.sample_strategy
+        self.sample_rate = cfg.data_params.sample_rate
+        self.fps = cfg.data_params.fps
+        self.audio_unit = cfg.data_params.sample_rate / cfg.data_params.fps   # num of audio samples per frame
+        self.max_length = cfg.data_params.max_length
+        self.subclip_len = cfg.data_params.sub_clip_length
+        self.save_to_cpu = cfg.data_params.save_to_cpu
+        self.pad_mode = cfg.data_params.audio_pad_mode
+        log("========================================= Audio Processer: Done =========================================")
+        
+    def load_model(self, model_name: str="wav2vec", model_type: str="base", is_chinese: bool = False):
+        """Load the configured audio encoder and the feature extractor stored with its weights.
+
+        Args:
+            model_name: "wav2vec" or "hubert"; anything else fails the assertion.
+            model_type: "base" or "large"; anything else fails the assertion.
+            is_chinese: take the weight path from the `chinese` config entry
+                instead of the `default` one.
+
+        Returns:
+            (audio_encoder, feature_extractor). The encoder is on self.device,
+            cast to half precision when self.use_half is set, and in eval mode.
+
+        Raises:
+            ValueError: if the configured weight path is None.
+        """
+        assert model_name in ["wav2vec", "hubert"], f"Unknown audio model {model_name}, only support wav2vec or hubert"
+        assert model_type in ["base", "large"], f"Unknown audio model type {model_type}, only support base or large"
+
+        # the weight path is read from
+        # cfg.model_weights.<wav2vec_path|hubert_path>.<chinese|default>.<base|large>
+        if model_name == "wav2vec":
+            weights_cfg, encoder_cls = self.cfg.model_weights.wav2vec_path, Wav2VecModel
+        else:
+            weights_cfg, encoder_cls = self.cfg.model_weights.hubert_path, HubertModel
+        language_cfg = weights_cfg.chinese if is_chinese else weights_cfg.default
+        model_weight_path = language_cfg.base if model_type == "base" else language_cfg.large
+        if model_weight_path is None:
+            raise ValueError(f"model_weight_path is None")
+        audio_encoder = encoder_cls.from_pretrained(model_weight_path, local_files_only=True).to(device=self.device)
+
+        log(f"{model_name}-{model_type}-chinese-{is_chinese} model has beed loaded from {model_weight_path}")
+        # report the parameter count in millions
+        total_params = sum(p.numel() for p in audio_encoder.parameters())
+        print('Number of parameter: % .4fM' % (total_params / 1e6))
+
+        # weights initialization
+        audio_encoder.feature_extractor._freeze_parameters()
+        if not self.cfg.model_params.is_original:
+            # additionally freeze feature_projection and encoder layers 0 and 1
+            frozen_layers = (0, 1)
+            for param_name, param in audio_encoder.named_parameters():
+                in_frozen_layer = (
+                    param_name.startswith("encoder.layers")
+                    and int(param_name.split(".")[2]) in frozen_layers
+                )
+                if param_name.startswith("feature_projection") or in_frozen_layer:
+                    param.requires_grad = False
+
+        # second move to self.device, kept from the original flow
+        audio_encoder = audio_encoder.to(self.device)
+        if self.use_half:
+            audio_encoder = audio_encoder.half()
+        audio_encoder.eval()
+
+        # feature extractor
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_weight_path)
+
+        return audio_encoder, feature_extractor
+
+    def set_audio_separator(self, output_dir: str) -> None:
+        """(Re)build the vocal separator so it writes to `output_dir`, or disable it per config."""
+        del self.audio_separator
+        if self.audio_separator_name is None or not self.use_audio_separator:
+            self.audio_separator = None
+            log("Use audio directly without vocals seperator.")
+            return
+
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+        except OSError:
+            print("Fail to create the output cache dir.")
+        separator = Separator(
+            output_dir=output_dir, output_single_stem="vocals", model_file_dir=self.audio_separator_path,
+        )
+        self.audio_separator = separator
+        separator.load_model(self.audio_separator_name)
+        assert separator.model_instance is not None, "Fail to load audio separate model."
+    
+    def seperate_audio(self, audio_path: str, output_dir: Union[str, None] = None) -> str:
+        """Extract the vocal stem of `audio_path`, falling back to `audio_path` itself.
+
+        A given `output_dir` that differs from self.cache_dir rebuilds the separator first.
+        Returns the resample_audio() result, or `audio_path` when no separator is set
+        or any step of the separation raised.
+        """
+        if output_dir is not None and output_dir != self.cache_dir:
+            self.set_audio_separator(output_dir)
+        if self.audio_separator is None:
+            return audio_path
+
+        # TODO: process in memory
+        try:
+            outputs = self.audio_separator.separate(audio_path)
+            if len(outputs) <= 0:
+                raise RuntimeError("Audio separate failed.")
+            stem_file = outputs[0]
+            stem_name = os.path.splitext(stem_file)[0]
+            out_dir = self.audio_separator.output_dir
+            resampled_path = os.path.join(out_dir, f"{stem_name}-16k.wav")
+            return resample_audio(os.path.join(out_dir, stem_file), resampled_path, self.sample_rate)
+        except Exception as e:
+            log(f"Fail to separate vocals from {audio_path}, error info [{e}]")
+            return audio_path
+    
+    def load_audio(self, audio_path: str, mono: bool = True, duration: Optional[float] = None) -> Any:
+        """Read `audio_path` with librosa at self.sample_rate; returns (audio_data, sampling_rate)."""
+        try:
+            return librosa.load(audio_path, sr=self.sample_rate, mono=mono, duration=duration)
+        except Exception as e:
+            raise RuntimeError(f"Fail to load audio from {audio_path}, error info [{e}]")
+
+    def prepare_audio_data(self, audio_data: Union[np.ndarray, torch.Tensor], n_frames: Optional[int]=None) -> Tuple[List[Any], int]:
+        """Run the feature extractor on a waveform and split it into frame-aligned segments.
+
+        Args:
+            audio_data: waveform passed to self.feature_extractor; must be 1-D after squeezing.
+            n_frames: expected frame count; derived from the audio length when None.
+
+        Returns:
+            (segments, n_frames). Each segment is a dict with "data" (tensor with a
+            leading dim of 1), "start_idx", "end_idx" and "length" in frames.
+            `segments` is empty when `n_frames` is more than 7 frames off the audio.
+        """
+        audio_data = np.squeeze(self.feature_extractor(audio_data, sampling_rate=self.sample_rate).input_values)
+        clip_len = int(len(audio_data) / self.audio_unit)
+        if n_frames is not None:
+            if abs(n_frames - clip_len) > 7:
+                log(f"The number of frames must be close to the clip length (in 280ms), got {n_frames} and {clip_len}")
+                return [], n_frames
+            clip_len = n_frames
+        else:
+            n_frames = clip_len
+
+        if isinstance(audio_data, np.ndarray):
+            audio_data = torch.from_numpy(audio_data).float().to(self.device)
+        assert audio_data.ndim == 1, 'Audio must be 1D tensor.'
+
+        # pad the tail so the audio covers `clip_len` frames
+        n_audio_samples = round(self.audio_unit * clip_len)
+        n_padding_audio_samples = n_audio_samples - len(audio_data)
+        if n_padding_audio_samples > 0:
+            if self.pad_mode == 'zero':
+                padding_value = 0
+            elif self.pad_mode == 'replicate':
+                padding_value = float(audio_data[-1])
+            else:
+                raise ValueError(f'Unknown pad mode: {self.pad_mode}')
+            audio_data = F.pad(audio_data, (0, n_padding_audio_samples), value=padding_value)
+
+        # divide audio into sub-divisions for saving GPU memory
+        audio_segments = []
+        if clip_len <= self.subclip_len:
+            n_subdivision = 1
+            subclip_len = clip_len
+        else:
+            n_subdivision = math.ceil(clip_len / self.subclip_len)
+            subclip_len = self.subclip_len
+        for i in range(0, n_subdivision):
+            start_idx = i * subclip_len
+            end_idx = min(start_idx + subclip_len, clip_len)
+            audio_segments.append({
+                "data": audio_data[round(start_idx * self.audio_unit):round(end_idx * self.audio_unit)].unsqueeze(0),
+                "start_idx": start_idx,
+                "end_idx": end_idx,
+                "length": end_idx - start_idx,
+            })
+        return audio_segments, n_frames
+        
+    def get_audio_embedding(self, audio, clip_len: int) -> torch.Tensor:
+        """Encode one raw-audio segment, or pass through features that are already 3-D.
+
+        Args:
+            audio: 2-D raw samples whose dim 1 is sample_rate * clip_len / fps long, or a
+                3-D feature tensor whose dim 1 equals `clip_len` (returned unchanged).
+            clip_len: number of frames the segment covers.
+
+        Raises:
+            ValueError: if `audio` is neither 2-D nor 3-D.
+        """
+        if audio.ndim == 3:
+            assert audio.shape[1] == clip_len, f'Incorrect audio feature length {audio.shape[1]}'
+            return audio
+        if audio.ndim != 2:
+            raise ValueError(f'Incorrect audio input shape {audio.shape}')
+        # use the configured rate (same basis as self.audio_unit), not a hard-coded 16000
+        assert audio.shape[1] == self.sample_rate * clip_len / self.fps, f'Incorrect audio length {audio.shape[1]}'
+        if self.use_half:
+            audio = audio.half()
+        embeddings = self.audio_encoder(
+            pad_audio(audio), seq_len=clip_len, sample_strategy=self.sample_strategy, output_hidden_states=True
+        )  # (N, L, 768)
+        assert len(embeddings) > 0, "Fail to extract audio embedding"
+        if self.only_last_features:
+            return embeddings.last_hidden_state.squeeze(0)
+        # stack hidden_states[1:] on a new dim 1, drop dim 0, then swap the first two dims
+        audio_emb = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
+        return rearrange(audio_emb, "b s d -> s b d")
+
+    def get_audio_embeddings(self, audio_segments: List[Any]) -> Optional[torch.Tensor]:
+        """Embed every segment and concatenate the results along dim 0.
+
+        Runs under torch.no_grad() unless self.is_training is set; each embedding is
+        moved to the CPU first when self.save_to_cpu is set. Returns None for an empty list.
+        """
+        collected = []
+        for segment in audio_segments:
+            data, length = segment["data"], segment["length"]
+            if not self.is_training:
+                with torch.no_grad():
+                    emb = self.get_audio_embedding(data, length)
+            else:
+                emb = self.get_audio_embedding(data, length)
+            collected.append(emb.cpu() if self.save_to_cpu else emb)
+
+        if not collected:
+            return None
+        return torch.cat(collected, dim=0)
+
+    def preprocess(
+        self,
+        audio_path: str,
+        n_frames: Optional[int] = None,
+        duration: Optional[float] = None,
+        need_seperate: bool = False
+    ):
+        """Load an audio file sampled at 16000 Hz and extract its embedding.
+
+        When ``need_seperate`` is true the file is first passed through
+        ``self.seperate_audio`` and the file it returns is loaded instead.
+        Returns ``(audio_emb, n_frames)``; ``audio_emb`` is ``None`` (and a
+        message is logged) when no embedding was produced.
+        """
+        source_file = self.seperate_audio(audio_path) if need_seperate else audio_path
+        audio_data, sampling_rate = self.load_audio(source_file, duration=duration)
+        assert sampling_rate == 16000, "The sample rate of audio must be 16000"
+
+        # prepare_audio_data hands back the segments and the frame count to report.
+        audio_segments, n_frames = self.prepare_audio_data(audio_data, n_frames)
+        audio_emb = self.get_audio_embeddings(audio_segments)
+        if audio_emb is None:
+            log(f"{audio_path} has been processed, but no audio embedding, set as 'None'.")
+
+        return audio_emb, n_frames
+    
+    def preprocess_long(
+        self,
+        audio_path: str,
+        need_seperate: bool = False
+    ):
+        """Embed a long audio file chunk by chunk.
+
+        ``cut_audio`` splits the file into pieces of ``self.max_length`` under
+        ``self.tmp_dir``; returns the concatenated embeddings and summed lengths.
+        """
+        audio_list = cut_audio(audio_path, self.tmp_dir, length=self.max_length)
+        audio_emb_list, audio_length = [], 0
+        try:
+            for idx, chunk_path in enumerate(audio_list):
+                emb, length = self.preprocess(chunk_path, need_seperate=need_seperate)
+                audio_emb_list.append(emb)
+                log(f"Processing audio {idx+1}/{len(audio_list)}, path: {chunk_path} length: {length}")
+                audio_length += length
+            return torch.cat(audio_emb_list), audio_length
+        finally:
+            # Clean up temporary chunks even on failure; a single entry is kept, as
+            # before (presumably cut_audio returned the input itself - TODO confirm).
+            if len(audio_list) > 1:
+                for chunk_path in audio_list:
+                    os.remove(chunk_path)
+
+    def add_silent_audio(self, audio_path: str, silent_audio_path: Optional[str] = None, add_duration: float = 1., linear_fusion=False, mode="post"):
+        """Pad ``audio_path`` with a clip taken from ``silent_audio_path``.
+
+        ``mode`` selects where the padding goes: "pre", "post" or "both".
+        With ``linear_fusion`` the padding is a linear cross-blend of the silent
+        clip and the neighbouring audio instead of the silent clip itself.
+        Returns ``(path, add_nframes)``: the padded file ``tmp_<name>`` written
+        next to the input, or the untouched input path and 0 when no silent
+        audio is given.
+        """
+        assert mode in ["pre", "post", "both"], f"Unkown mode: {mode}, only support pre, post, both"
+        if silent_audio_path is None:
+            return audio_path, 0
+
+        # Start from a clean output file in the input's directory.
+        audio_name = osp.basename(audio_path)
+        temp_audio_path = osp.join(osp.dirname(audio_path), f"tmp_{audio_name}")
+        if osp.isfile(temp_audio_path):
+            os.remove(temp_audio_path)
+
+        # Both clips are loaded as mono at 16000 Hz; the input gets pre-emphasis.
+        audio, sr1 = librosa.load(audio_path, mono=True, sr=16000)
+        audio = librosa.effects.preemphasis(audio)
+        silent_audio, sr2 = librosa.load(silent_audio_path, mono=True, sr=16000)
+        silent_audio = silent_audio[:int(add_duration*sr2)]
+
+        pre_pad_audio = post_pad_audio = silent_audio
+        if linear_fusion:
+            short_len = min(len(audio), len(silent_audio))
+            fusion_ratio = np.linspace(0, 1.0, num=short_len)
+            # Leading pad: blend the heads of both clips, append the rest of the silence, reverse.
+            pre_pad_audio = fusion_ratio * silent_audio[:short_len] + (1 - fusion_ratio) * audio[:short_len]
+            if short_len < len(silent_audio):
+                pre_pad_audio = np.hstack((pre_pad_audio, silent_audio[short_len:]))
+            pre_pad_audio = np.flip(pre_pad_audio, axis=0)
+
+            # Trailing pad: blend the tails, prepend the rest of the silence, reverse.
+            post_pad_audio = (1 - fusion_ratio) * silent_audio[-short_len:] + fusion_ratio * audio[-short_len:]
+            if short_len < len(silent_audio):
+                post_pad_audio = np.hstack((silent_audio[:-short_len], post_pad_audio))
+            post_pad_audio = np.flip(post_pad_audio, axis=0)
+
+        # Assemble by mode; anything other than "both"/"pre" is handled as "post".
+        pieces = {"both": (pre_pad_audio, audio, post_pad_audio), "pre": (pre_pad_audio, audio)}
+        combined_audio = np.hstack(pieces.get(mode, (audio, post_pad_audio)))
+
+        # add_duration worth of samples divided by self.audio_unit, rounded down.
+        add_nframes = math.floor(add_duration * sr2 / self.audio_unit)
+        soundfile.write(temp_audio_path, combined_audio, sr2)
+
+        return temp_audio_path, add_nframes
+    
+    def get_long_audio_emb(self, audio_path: str) -> torch.Tensor:  # embedding only; the length is dropped
+        audio_emb = self.preprocess_long(audio_path)[0]
+        log(f"Load audio from {osp.realpath(audio_path)} done, audio_emb shape: {audio_emb.shape}.")
+        return audio_emb
+
+    def __enter__(self):  # context-manager entry: `with ... as x` binds this same instance
+        return self
+
diff --git a/src/datasets/preprocess/extract_features/face_segmentation/__init__.py b/src/datasets/preprocess/extract_features/face_segmentation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4f3edf63ece3a81257cc4ceb25b8879accad246
--- /dev/null
+++ b/src/datasets/preprocess/extract_features/face_segmentation/__init__.py
@@ -0,0 +1,88 @@
+import cv2
+import numpy as np
+
+import torch
+from torchvision import transforms
+
+from .bisenet import BiSeNet
+
+
+def vis_parsing_maps(im, parsing_anno, stride, save_im=False, save_path='parsing_map_on_im2.jpg'):
+    """Overlay a colour-coded parsing map on ``im`` and optionally save it.
+
+    ``parsing_anno`` is upscaled by ``stride`` with nearest-neighbour sampling
+    and blended 0.6/0.4 with the image. With ``save_im`` the blend goes to
+    ``save_path`` and the raw label map to the same path with a ``.png``
+    extension. Returns ``None``.
+    """
+    import os  # local import: this module has no top-level `os`
+
+    # Palette indexed by label; label 0 is never painted and stays white.
+    part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
+                   [255, 0, 85], [255, 0, 170],
+                   [0, 255, 0], [85, 255, 0], [170, 255, 0],
+                   [0, 255, 85], [0, 255, 170],
+                   [0, 0, 255], [85, 0, 255], [170, 0, 255],
+                   [0, 85, 255], [0, 170, 255],
+                   [255, 255, 0], [255, 255, 85], [255, 255, 170],
+                   [255, 0, 255], [255, 85, 255], [255, 170, 255],
+                   [0, 255, 255], [85, 255, 255], [170, 255, 255]]
+    vis_im = np.array(im).copy().astype(np.uint8)
+    vis_parsing_anno = parsing_anno.copy().astype(np.uint8)
+    vis_parsing_anno = cv2.resize(vis_parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)
+    vis_parsing_anno_color = np.full(vis_parsing_anno.shape[:2] + (3,), 255, dtype=np.uint8)
+
+    # int() avoids uint8 wrap-around in `+ 1`; labels beyond the palette are skipped.
+    for pi in range(1, min(int(vis_parsing_anno.max()), len(part_colors) - 1) + 1):
+        vis_parsing_anno_color[vis_parsing_anno == pi] = part_colors[pi]
+
+    vis_im = cv2.addWeighted(cv2.cvtColor(vis_im, cv2.COLOR_RGB2BGR), 0.4, vis_parsing_anno_color, 0.6, 0)
+    if save_im:
+        # splitext copes with extensions that are not exactly three characters.
+        cv2.imwrite(os.path.splitext(save_path)[0] + '.png', vis_parsing_anno)
+        cv2.imwrite(save_path, vis_im, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
+
+def get_face_mask(face_parser, images, batch_size=128):
+    """Run ``face_parser`` over ``images`` in batches and return one mask per image.
+
+    Pixels whose arg-max class is 1..13 are set to 1; each mask is then
+    opened, closed and dilated with a 13x13 kernel. ``images`` is expected
+    as Bx3xHxW (per the original note). Returns a list of float32 arrays.
+    """
+    kernel = np.ones((13, 13), np.float32)
+    face_masks = []
+    for start in range(0, images.shape[0], batch_size):
+        with torch.no_grad():
+            logits = face_parser(images[start:start + batch_size])[0]
+        parsing = logits.cpu().numpy().argmax(1)
+        batch_masks = ((parsing >= 1) & (parsing <= 13)).astype(np.float32)
+
+        for mask in batch_masks:
+            mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=2)
+            mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)
+            face_masks.append(cv2.dilate(mask, kernel, iterations=3))
+    return face_masks
+
+
+def build_face_parser(weight_path, resnet_weight_path, n_classes=19, device_id=0):
+    """Build an eval-mode BiSeNet on ``cuda:<device_id>`` plus its input transform.
+
+    Checkpoint entries whose key contains 'fc' are ignored. Returns
+    ``(model, to_tensor)``; ``to_tensor`` resizes to 512x512 and normalises.
+    """
+    # map_location="cpu": do not depend on the device the checkpoint was saved from.
+    # NOTE: weights_only=False unpickles arbitrary objects - only load trusted files.
+    model_state_dict = torch.load(weight_path, map_location="cpu", weights_only=False)
+    bisenet = BiSeNet(n_classes, resnet_weight_path=resnet_weight_path)
+    bisenet_state_dict = bisenet.state_dict()
+    bisenet_state_dict.update({k: v for k, v in model_state_dict.items() if 'fc' not in k})
+    bisenet.load_state_dict(bisenet_state_dict)
+    bisenet.to(f"cuda:{device_id}")
+    to_tensor = transforms.Compose([
+        transforms.ToTensor(), transforms.Resize((512, 512)),
+        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ])
+    return bisenet.eval(), to_tensor
+
+
+
diff --git a/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/__init__.cpython-310.pyc b/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b985936ef8eb99b81e298d44b2d9c8752db262e
Binary files /dev/null and b/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/bisenet.cpython-310.pyc b/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/bisenet.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..429aa1db90dd2d33d8ed5febcc6e330c6f906dd1
Binary files /dev/null and b/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/bisenet.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/resnet.cpython-310.pyc b/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/resnet.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe3c1dd763cda94f20e53bc4ac987807777f3dc7
Binary files /dev/null and b/src/datasets/preprocess/extract_features/face_segmentation/__pycache__/resnet.cpython-310.pyc differ
diff --git a/src/datasets/preprocess/extract_features/face_segmentation/bisenet.py b/src/datasets/preprocess/extract_features/face_segmentation/bisenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a733b8038f835d4d241dec55edba106fda6749a
--- /dev/null
+++ b/src/datasets/preprocess/extract_features/face_segmentation/bisenet.py
@@ -0,0 +1,285 @@
+#!/usr/bin/python
+# -*- encoding: utf-8 -*-
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+
+from .resnet import Resnet18
+# from modules.bn import InPlaceABNSync as BatchNorm2d
+
+
+class ConvBNReLU(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d and ReLU."""
+
+    def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1, *args, **kwargs):
+        super().__init__()
+        self.conv = nn.Conv2d(in_chan, out_chan, kernel_size=ks, stride=stride,
+                              padding=padding, bias=False)
+        self.bn = nn.BatchNorm2d(out_chan)
+        self.init_weight()
+
+    def forward(self, x):
+        """Apply convolution, batch norm and ReLU in that order."""
+        return F.relu(self.bn(self.conv(x)))
+
+    def init_weight(self):
+        """Kaiming-initialise direct Conv2d children (a=1) and zero their biases."""
+        for layer in self.children():
+            if not isinstance(layer, nn.Conv2d):
+                continue
+            nn.init.kaiming_normal_(layer.weight, a=1)
+            if layer.bias is not None:
+                nn.init.constant_(layer.bias, 0)
+
+class BiSeNetOutput(nn.Module):
+    """Output head: a 3x3 ConvBNReLU followed by a bias-free 1x1 conv to ``n_classes`` channels."""
+    def __init__(self, in_chan, mid_chan, n_classes, *args, **kwargs):
+        super().__init__()
+        self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1)
+        self.conv_out = nn.Conv2d(mid_chan, n_classes, kernel_size=1, bias=False)
+        self.init_weight()
+
+    def forward(self, x):
+        return self.conv_out(self.conv(x))
+
+    def init_weight(self):
+        for layer in self.children():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, a=1)
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+
+    def get_params(self):  # -> (weights of Linear/Conv2d, biases + BatchNorm parameters)
+        wd_params, nowd_params = [], []
+        for module in self.modules():
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                wd_params.append(module.weight)
+                if module.bias is not None:
+                    nowd_params.append(module.bias)
+            elif isinstance(module, nn.BatchNorm2d):
+                nowd_params.extend(module.parameters())
+        return wd_params, nowd_params
+
+
+class AttentionRefinementModule(nn.Module):
+    """Scale a ConvBNReLU feature map by a sigmoid gate computed from its global average."""
+    def __init__(self, in_chan, out_chan, *args, **kwargs):
+        super().__init__()
+        self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1)
+        self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size=1, bias=False)
+        self.bn_atten = nn.BatchNorm2d(out_chan)
+        self.sigmoid_atten = nn.Sigmoid()
+        self.init_weight()
+
+    def forward(self, x):
+        feat = self.conv(x)
+        pooled = F.avg_pool2d(feat, feat.size()[2:])  # kernel covers the whole map
+        gate = self.sigmoid_atten(self.bn_atten(self.conv_atten(pooled)))
+        return torch.mul(feat, gate)
+
+    def init_weight(self):
+        """Kaiming-initialise direct Conv2d children (a=1) and zero their biases."""
+        for layer in self.children():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, a=1)
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+
+
+class ContextPath(nn.Module):
+    """ResNet-18 backbone with attention refinement and a top-down merge of its 1/16 and 1/32 features."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        # Optional local ResNet-18 weights; with None, Resnet18 downloads them.
+        self.resnet = Resnet18(kwargs.get("resnet_weight_path", None))
+        self.arm16 = AttentionRefinementModule(256, 128)
+        self.arm32 = AttentionRefinementModule(512, 128)
+        self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
+        self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
+        self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0)
+        self.init_weight()
+
+    def forward(self, x):
+        """Return (feat8, feat16_up at feat8's size, feat32_up at feat16's size)."""
+        feat8, feat16, feat32 = self.resnet(x)
+        size8, size16, size32 = feat8.size()[2:], feat16.size()[2:], feat32.size()[2:]
+
+        # Global context: pool feat32 over its whole extent, project, resize back.
+        avg = self.conv_avg(F.avg_pool2d(feat32, size32))
+        avg_up = F.interpolate(avg, size32, mode='nearest')
+
+        # Merge the 1/32 branch into the 1/16 resolution.
+        feat32_sum = self.arm32(feat32) + avg_up
+        feat32_up = self.conv_head32(F.interpolate(feat32_sum, size16, mode='nearest'))
+
+        # Merge the 1/16 branch into the 1/8 resolution.
+        feat16_sum = self.arm16(feat16) + feat32_up
+        feat16_up = self.conv_head16(F.interpolate(feat16_sum, size8, mode='nearest'))
+
+        return feat8, feat16_up, feat32_up  # x8, x8, x16
+
+    def init_weight(self):
+        """Kaiming-initialise direct Conv2d children; no direct child here is a bare Conv2d."""
+        for layer in self.children():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, a=1)
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+
+    def get_params(self):
+        """Return ``(wd_params, nowd_params)``: Linear/Conv2d weights vs. biases and BatchNorm parameters."""
+        wd_params, nowd_params = [], []
+        for module in self.modules():
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                wd_params.append(module.weight)
+                if module.bias is not None:
+                    nowd_params.append(module.bias)
+            elif isinstance(module, nn.BatchNorm2d):
+                nowd_params.extend(module.parameters())
+        return wd_params, nowd_params
+
+
+### Unused: the spatial-path feature is replaced by the ResNet feature of the same size.
+class SpatialPath(nn.Module):
+    """Three stride-2 ConvBNReLU stages (3 -> 64 channels) and a 1x1 ConvBNReLU to 128 channels."""
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.conv1 = ConvBNReLU(3, 64, ks=7, stride=2, padding=3)
+        self.conv2 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
+        self.conv3 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
+        self.conv_out = ConvBNReLU(64, 128, ks=1, stride=1, padding=0)
+        self.init_weight()
+
+    def forward(self, x):
+        return self.conv_out(self.conv3(self.conv2(self.conv1(x))))
+
+    def init_weight(self):
+        """Kaiming-initialise direct Conv2d children (a=1) and zero their biases."""
+        for layer in self.children():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, a=1)
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+
+    def get_params(self):
+        """Return ``(wd_params, nowd_params)``: Linear/Conv2d weights vs. biases and BatchNorm parameters."""
+        wd_params, nowd_params = [], []
+        for module in self.modules():
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                wd_params.append(module.weight)
+                if module.bias is not None:
+                    nowd_params.append(module.bias)
+            elif isinstance(module, nn.BatchNorm2d):
+                nowd_params.extend(module.parameters())
+        return wd_params, nowd_params
+
+
+class FeatureFusionModule(nn.Module):
+    """Fuse two feature maps by concatenation, a 1x1 ConvBNReLU and channel re-weighting.
+
+    The re-weighting gate is computed from the globally averaged fused map and
+    is applied residually (``feat * gate + feat``).
+    """
+
+    def __init__(self, in_chan, out_chan, *args, **kwargs):
+        super().__init__()
+        self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0)
+        # Bottleneck pair for the gate: out_chan -> out_chan // 4 -> out_chan.
+        self.conv1 = nn.Conv2d(out_chan, out_chan // 4,
+                               kernel_size=1, stride=1, padding=0, bias=False)
+        self.conv2 = nn.Conv2d(out_chan // 4, out_chan,
+                               kernel_size=1, stride=1, padding=0, bias=False)
+        self.relu = nn.ReLU(inplace=True)
+        self.sigmoid = nn.Sigmoid()
+        self.init_weight()
+
+    def forward(self, fsp, fcp):
+        """Concatenate ``fsp`` and ``fcp`` on dim 1 and return the gated fusion."""
+        feat = self.convblk(torch.cat([fsp, fcp], dim=1))
+        # Average over the whole map, then bottleneck and sigmoid: one weight per channel.
+        gate = F.avg_pool2d(feat, feat.size()[2:])
+        gate = self.relu(self.conv1(gate))
+        gate = self.sigmoid(self.conv2(gate))
+
+        return torch.mul(feat, gate) + feat
+
+    def init_weight(self):
+        """Kaiming-initialise direct Conv2d children (a=1) and zero their biases."""
+        for layer in self.children():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, a=1)
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+
+    def get_params(self):
+        """Return ``(wd_params, nowd_params)``: Linear/Conv2d weights vs. biases and BatchNorm parameters."""
+        wd_params, nowd_params = [], []
+        for module in self.modules():
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                wd_params.append(module.weight)
+                if module.bias is not None:
+                    nowd_params.append(module.bias)
+            elif isinstance(module, nn.BatchNorm2d):
+                nowd_params.extend(module.parameters())
+        return wd_params, nowd_params
+
+
+class BiSeNet(nn.Module):
+    """Segmentation network: a ContextPath, a FeatureFusionModule and three BiSeNetOutput heads."""
+
+    def __init__(self, n_classes, *args, **kwargs):
+        super().__init__()
+        self.cp = ContextPath(resnet_weight_path=kwargs.get("resnet_weight_path", None))
+        # There is no separate spatial path; see forward() for what replaces it.
+        self.ffm = FeatureFusionModule(256, 256)
+        self.conv_out = BiSeNetOutput(256, 256, n_classes)
+        self.conv_out16 = BiSeNetOutput(128, 64, n_classes)
+        self.conv_out32 = BiSeNetOutput(128, 64, n_classes)
+        self.init_weight()
+
+    def forward(self, x):
+        """Return the three head outputs, each resized bilinearly to the input's H x W."""
+        out_size = x.size()[2:]
+        feat_res8, feat_cp8, feat_cp16 = self.cp(x)  # first item is the res3b1 feature
+        feat_fuse = self.ffm(feat_res8, feat_cp8)  # res3b1 stands in for the spatial path
+
+        heads = (self.conv_out(feat_fuse), self.conv_out16(feat_cp8), self.conv_out32(feat_cp16))
+        return tuple(
+            F.interpolate(logits, out_size, mode='bilinear', align_corners=True) for logits in heads
+        )
+
+    def init_weight(self):
+        """Kaiming-initialise direct Conv2d children (a=1) and zero their biases."""
+        for layer in self.children():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, a=1)
+                if layer.bias is not None:
+                    nn.init.constant_(layer.bias, 0)
+
+    def get_params(self):
+        """Group child parameters; fusion and output-head children go to the ``lr_mul_*`` lists."""
+        wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params = [], [], [], []
+        for child in self.children():
+            child_wd_params, child_nowd_params = child.get_params()
+            if isinstance(child, (FeatureFusionModule, BiSeNetOutput)):
+                lr_mul_wd_params += child_wd_params
+                lr_mul_nowd_params += child_nowd_params
+            else:
+                wd_params += child_wd_params
+                nowd_params += child_nowd_params
+        return wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params
+
+
+if __name__ == "__main__":
+    # Smoke test (needs CUDA): one forward pass, then the parameter grouping.
+    model = BiSeNet(19)
+    model.cuda()
+    model.eval()
+    dummy = torch.randn(16, 3, 640, 480).cuda()
+    out, out16, out32 = model(dummy)
+    print(out.shape)
+    model.get_params()
\ No newline at end of file
diff --git a/src/datasets/preprocess/extract_features/face_segmentation/resnet.py b/src/datasets/preprocess/extract_features/face_segmentation/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..17aeac72fda5e2b19fe3d78a49971c193ff948e7
--- /dev/null
+++ b/src/datasets/preprocess/extract_features/face_segmentation/resnet.py
@@ -0,0 +1,113 @@
+
+#!/usr/bin/python
+# -*- encoding: utf-8 -*-
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as modelzoo
+
+# from modules.bn import InPlaceABNSync as BatchNorm2d
+
+resnet18_url = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
+    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 conv + BatchNorm layers.
+
+    The shortcut is the identity unless the channel count or the stride
+    changes, in which case it is a 1x1 conv + BatchNorm projection.
+    """
+
+    def __init__(self, in_chan, out_chan, stride=1):
+        super().__init__()
+        self.conv1 = conv3x3(in_chan, out_chan, stride)
+        self.bn1 = nn.BatchNorm2d(out_chan)
+        self.conv2 = conv3x3(out_chan, out_chan)
+        self.bn2 = nn.BatchNorm2d(out_chan)
+        self.relu = nn.ReLU(inplace=True)
+        if in_chan == out_chan and stride == 1:
+            self.downsample = None
+        else:
+            self.downsample = nn.Sequential(
+                nn.Conv2d(in_chan, out_chan, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_chan),
+            )
+
+    def forward(self, x):
+        """Return ``relu(shortcut(x) + residual(x))``."""
+        residual = F.relu(self.bn1(self.conv1(x)))
+        residual = self.bn2(self.conv2(residual))
+        # The input is projected only when a projection was built in __init__.
+        shortcut = x if self.downsample is None else self.downsample(x)
+        return self.relu(shortcut + residual)
+
+
+def create_layer_basic(in_chan, out_chan, bnum, stride=1):
+    """Stack ``bnum`` BasicBlocks; only the first changes channels and applies ``stride``."""
+    head = BasicBlock(in_chan, out_chan, stride=stride)
+    tail = [BasicBlock(out_chan, out_chan, stride=1) for _ in range(bnum - 1)]
+    return nn.Sequential(head, *tail)
+
+
+class Resnet18(nn.Module):
+    """ResNet-18 trunk without the classifier; yields the 1/8, 1/16 and 1/32 feature maps."""
+
+    def __init__(self, backbone_weight_path=None):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1)
+        self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2)
+        self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2)
+        self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2)
+        self.init_weight(backbone_weight_path)
+
+    def forward(self, x):
+        """Return ``(feat8, feat16, feat32)`` from layer2, layer3 and layer4."""
+        x = self.maxpool(F.relu(self.bn1(self.conv1(x))))
+        feat8 = self.layer2(self.layer1(x))  # 1/8
+        feat16 = self.layer3(feat8)  # 1/16
+        feat32 = self.layer4(feat16)  # 1/32
+        return feat8, feat16, feat32
+
+    def init_weight(self, backbone_weight_path=None):
+        """Load weights from the given file (or ``resnet18_url`` when None), skipping keys containing 'fc'."""
+        if backbone_weight_path is None:
+            state_dict = modelzoo.load_url(resnet18_url)
+        else:
+            # map_location="cpu": do not depend on the device the file was saved from.
+            # NOTE: weights_only=False unpickles arbitrary objects - trusted files only.
+            state_dict = torch.load(backbone_weight_path, map_location="cpu", weights_only=False)
+        self_state_dict = self.state_dict()
+        self_state_dict.update({k: v for k, v in state_dict.items() if 'fc' not in k})
+        self.load_state_dict(self_state_dict)
+
+    def get_params(self):
+        """Return ``(wd_params, nowd_params)``: Linear/Conv2d weights vs. biases and BatchNorm parameters."""
+        wd_params, nowd_params = [], []
+        for module in self.modules():
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                wd_params.append(module.weight)
+                if module.bias is not None:
+                    nowd_params.append(module.bias)
+            elif isinstance(module, nn.BatchNorm2d):
+                nowd_params.extend(module.parameters())
+        return wd_params, nowd_params
+
+
+if __name__ == "__main__":
+    # Smoke test: print the three feature-map sizes for a random batch.
+    model = Resnet18()
+    feats = model(torch.randn(16, 3, 224, 224))
+    print(feats[0].size())
+    print(feats[1].size())
+    print(feats[2].size())
+    model.get_params()
diff --git a/src/datasets/preprocess/extract_features/motion_processer.py b/src/datasets/preprocess/extract_features/motion_processer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac01ba46ba3a76224f42eee606fef99ff6ab72b6
--- /dev/null
+++ b/src/datasets/preprocess/extract_features/motion_processer.py
@@ -0,0 +1,1420 @@
+"""
+Motion feature extractor 
+"""
+import os
+import os.path as osp
+import sys
+import pickle
+from omegaconf import OmegaConf
+
+import torch
+
+from PIL import Image
+import numpy as np
+import cv2
+import imageio
+import pickle
+import time
+from decord import VideoReader # must after import torch
+
+from rich.progress import track
+
+
+
+
+sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__)))))))
+from src.datasets.preprocess.extract_features.face_segmentation import build_face_parser, get_face_mask, vis_parsing_maps
+from src.thirdparty.liveportrait.src.utils.helper import load_model, concat_feat
+from src.thirdparty.liveportrait.src.utils.io import load_image_rgb, resize_to_limit, load_video
+from src.thirdparty.liveportrait.src.utils.video import get_fps, images2video, add_audio_to_video
+from src.thirdparty.liveportrait.src.utils.camera import headpose_pred_to_degree, get_rotation_matrix
+
+from src.thirdparty.liveportrait.src.utils.cropper import Cropper
+from src.thirdparty.liveportrait.src.utils.crop import prepare_paste_back, paste_back, paste_back_with_face_mask
+from src.thirdparty.liveportrait.src.utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
+from src.thirdparty.liveportrait.src.utils.helper import mkdir, basename, dct2device, is_image, calc_motion_multiplier
+from src.utils.filter import smooth as ksmooth
+from src.utils.filter import smooth_
+
+from skimage.metrics import peak_signal_noise_ratio
+import warnings
+
+
+def psnr(imgs1, imgs2):
+    psnrs = []
+    for img1, img2 in zip(imgs1, imgs2):
+        psnr = peak_signal_noise_ratio(img1, img2, data_range=255)
+        psnrs.append(psnr)
+    return psnrs
+
+
+def suffix(filename):
+    """a.jpg -> jpg"""
+    pos = filename.rfind(".")
+    if pos == -1:
+        return ""
+    return filename[pos + 1:]
+
+def dump(wfp, obj):
+    wd = osp.split(wfp)[0]
+    if wd != "" and not osp.exists(wd):
+        mkdir(wd)
+
+    _suffix = suffix(wfp)
+    if _suffix == "npy":
+        np.save(wfp, obj)
+    elif _suffix == "pkl":
+        pickle.dump(obj, open(wfp, "wb"))
+    else:
+        raise Exception("Unknown type: {}".format(_suffix))
+
+def load(fp):
+    suffix_ = suffix(fp)
+
+    if suffix_ == "npy":
+        return np.load(fp)
+    elif suffix_ == "pkl":
+        return pickle.load(open(fp, "rb"))
+    else:
+        raise Exception(f"Unknown type: {suffix}")
+
+
+def remove_suffix(filepath):
+    """a/b/c.jpg -> a/b/c"""
+    return osp.join(osp.dirname(filepath), basename(filepath))
+    
+
+class MotionProcesser(object):
+    def __init__(self, cfg_path, device_id=0) -> None:
+        # Build every model and helper the processor needs from the YAML file
+        # at cfg_path. All models are placed on the CUDA device selected by
+        # device_id.
+        device = f"cuda:{device_id}"
+        cfg = OmegaConf.load(cfg_path)
+        print(f"Load cfg from {osp.realpath(cfg_path)} done.")
+        print(f"=============================== Driven CFG ===============================")
+        print(OmegaConf.to_yaml(cfg))
+        print(f"=============================== ========== ===============================")
+        # Model definitions live in a second YAML file referenced by the main config.
+        models_config = OmegaConf.load(cfg.models_config)
+
+        # 1. init appearance feature extractor
+        self.appearance_feature_extractor = load_model(
+            cfg.appearance_feature_extractor_path, 
+            models_config, 
+            device, 
+            'appearance_feature_extractor'
+        )
+        print(f'1. Load appearance_feature_extractor from {osp.realpath(cfg.appearance_feature_extractor_path)} done.')
+
+        # 2. init motion extractor
+        self.motion_extractor = load_model(
+            cfg.motion_extractor_path, 
+            models_config, 
+            device, 
+            'motion_extractor'
+        )
+        print(f'2. Load motion_extractor from {osp.realpath(cfg.motion_extractor_path)} done.')
+
+        # 3. init S and R (stitching / retargeting); optional, left as None
+        #    when no path is configured or the file does not exist
+        if cfg.stitching_retargeting_module_path is not None and osp.exists(cfg.stitching_retargeting_module_path):
+            self.stitching_retargeting_module = load_model(
+                cfg.stitching_retargeting_module_path, 
+                models_config, 
+                device, 
+                'stitching_retargeting_module'
+            )
+            print(f'3. Load stitching_retargeting_module from {osp.realpath(cfg.stitching_retargeting_module_path)} done.')
+        else:
+            self.stitching_retargeting_module = None
+
+        # 4. init motion warper
+        self.warping_module = load_model(
+            cfg.warping_module_path, 
+            models_config, 
+            device, 
+            'warping_module'
+        )
+        print(f"4. Load warping_module from {osp.realpath(cfg.warping_module_path)} done.")
+
+        # 5. init decoder
+        self.spade_generator = load_model(
+            cfg.spade_generator_path, 
+            models_config, 
+            device, 
+            'spade_generator'
+        )
+        print(f"Load generator from {osp.realpath(cfg.spade_generator_path)} done.")
+
+        # Optimize for inference: optionally compile the warper and decoder.
+        # NOTE: suppress_errors is a process-wide torch._dynamo setting.
+        self.compile = cfg.flag_do_torch_compile
+        if self.compile:
+            torch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution
+            self.warping_module = torch.compile(self.warping_module, mode='max-autotune')
+            self.spade_generator = torch.compile(self.spade_generator, mode='max-autotune')
+
+        # 6. init cropper
+        crop_cfg = OmegaConf.load(cfg.crop_cfg)
+        self.cropper = Cropper(crop_cfg=crop_cfg, image_type="human_face", device_id=device_id)
+
+        self.cfg = cfg
+        self.models_config = models_config
+        self.device = device
+
+
+        # 7. load crop mask
+        # NOTE(review): cv2.imread returns None instead of raising when the
+        # file cannot be read; the result is not checked here.
+        self.mask_crop = cv2.imread(cfg.mask_crop, cv2.IMREAD_COLOR)
+        # 8. load lip array
+        # NOTE(security): pickle.load executes arbitrary code; the file must be trusted.
+        with open(cfg.lip_array, 'rb') as f:
+            self.lip_array = pickle.load(f)
+
+        # 9. load face parser
+        self.face_parser, self.to_tensor = build_face_parser(weight_path=cfg.face_parser_weight_path, resnet_weight_path=cfg.resnet_weight_path, device_id=device_id)
+
+    def inference_ctx(self):    
+        ctx = torch.autocast(device_type=self.device[:4], dtype=torch.float16,
+                                 enabled=self.cfg.flag_use_half_precision)
+        return ctx
+
+    @torch.no_grad()
+    def extract_feature_3d(self, x: torch.Tensor) -> torch.Tensor:
+        """ get the appearance feature of the image by F
+        x: Bx3xHxW, normalized to 0~1
+        """
+        with self.inference_ctx():
+            feature_3d = self.appearance_feature_extractor(x)
+
+        return feature_3d.float()
+
+    @torch.no_grad()
+    def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:
+        """ get the implicit keypoint information
+        x: Bx3xHxW, normalized to 0~1
+        flag_refine_info: whether to trandform the pose to degrees and the dimention of the reshape
+        return: A dict contains keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
+        """
+        with self.inference_ctx():
+            kp_info = self.motion_extractor(x)
+
+            if self.cfg.flag_use_half_precision:
+                # float the dict
+                for k, v in kp_info.items():
+                    if isinstance(v, torch.Tensor):
+                        kp_info[k] = v.float()
+
+        return kp_info
+
+    @torch.no_grad()
+    def refine_kp(self, kp_info):
+        bs = kp_info['exp'].shape[0]
+        kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None]  # Bx1
+        kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None]  # Bx1
+        kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None]  # Bx1
+        kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3)  # BxNx3
+        if 'kp' in kp_info.keys():
+            kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3)  # BxNx3
+
+        return kp_info
+
+    @torch.no_grad()
+    def transform_keypoint(self, kp_info: dict):
+        """
+        transform the implicit keypoints with the pose, shift, and expression deformation
+        kp: BxNx3
+        """
+        kp = kp_info['kp']    # (bs, k, 3)
+        pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']
+
+        t, exp = kp_info['t'], kp_info['exp']
+        scale = kp_info['scale']
+
+        pitch = headpose_pred_to_degree(pitch)
+        yaw = headpose_pred_to_degree(yaw)
+        roll = headpose_pred_to_degree(roll)
+
+        bs = kp.shape[0]
+        if kp.ndim == 2:
+            num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
+        else:
+            num_kp = kp.shape[1]  # Bxnum_kpx3
+
+        rot_mat = get_rotation_matrix(pitch, yaw, roll)    # (bs, 3, 3)
+
+        # Eqn.2: s * (R * x_c,s + exp) + t
+        kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
+        kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
+        kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty
+
+        return kp_transformed
+
+    @torch.no_grad()
+    def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+        """ conduct the stitching
+        kp_source: Bxnum_kpx3
+        kp_driving: Bxnum_kpx3
+        """
+
+        if self.stitching_retargeting_module is not None:
+            bs, num_kp = kp_source.shape[:2]
+            kp_driving_new = kp_driving.clone()
+            # stich
+            feat_stiching = concat_feat(kp_source, kp_driving_new)
+            delta = self.stitching_retargeting_module['stitching'](feat_stiching) # Bxnum_kpx3
+
+            delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3)  # 1x20x3
+            delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2)  # 1x1x2
+
+            kp_driving_new += delta_exp
+            kp_driving_new[..., :2] += delta_tx_ty
+
+            return kp_driving_new
+
+        return kp_driving
+
+    @torch.no_grad()
+    def warp_decode(self, feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> dict[str, torch.Tensor]:
+        """ get the image after the warping of the implicit keypoints
+        feature_3d: Bx32x16x64x64, feature volume
+        kp_source: BxNx3
+        kp_driving: BxNx3
+        """
+        # The line 18 in Algorithm 1: D(W(f_s; x_s, x′_d,i)）
+        with self.inference_ctx():
+            if self.compile:
+                # Mark the beginning of a new CUDA Graph step
+                torch.compiler.cudagraph_mark_step_begin()
+            # get decoder input
+            ret_dct = self.warping_module(feature_3d, kp_source=kp_source, kp_driving=kp_driving)
+
+            # print(f"=============================================================================")
+            # for out_key, out_value in ret_dct.items():
+            #     if isinstance(out_value, str) or isinstance(out_value, int) or isinstance(out_value, float):
+            #         print(f"{out_key}: {out_value}")
+            #     elif isinstance(out_value, torch.Tensor):
+            #         print(f"{out_key}: tensor shape {out_value.shape}, min: {torch.min(out_value)}, max: {torch.max(out_value)}, mean: {torch.mean(out_value)}, std: {torch.std(out_value)}")
+            #     else:
+            #         print(f"{out_key}: data type {type(out_value)}")
+            # decode
+            ret_dct['out'] = self.spade_generator(feature=ret_dct['out'])
+
+            # float the dict
+            if self.cfg.flag_use_half_precision:
+                for k, v in ret_dct.items():
+                    if isinstance(v, torch.Tensor):
+                        ret_dct[k] = v.float()
+
+        return ret_dct
+    
+    def parse_output(self, out: torch.Tensor) -> np.ndarray:
+        """ construct the output as standard
+        return: 1xHxWx3, uint8
+        """
+        out = np.transpose(out.cpu().numpy(), [0, 2, 3, 1])  # 1x3xHxW -> 1xHxWx3
+        out = np.clip(out, 0, 1)  # clip to 0~1
+        out = np.clip(out * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255
+
+        return out
+
+    @torch.no_grad()
+    def calc_combined_eye_ratio(self, c_d_eyes_i, source_lmk):
+        c_s_eyes = calc_eye_close_ratio(source_lmk[None])
+        c_s_eyes_tensor = torch.from_numpy(c_s_eyes).float().to(self.device)
+        c_d_eyes_i_tensor = torch.Tensor([c_d_eyes_i[0][0]]).reshape(1, 1).to(self.device)
+        # [c_s,eyes, c_d,eyes,i]
+        combined_eye_ratio_tensor = torch.cat([c_s_eyes_tensor, c_d_eyes_i_tensor], dim=1)
+        return combined_eye_ratio_tensor
+
+    @torch.no_grad()
+    def calc_combined_lip_ratio(self, c_d_lip_i, source_lmk):
+        c_s_lip = calc_lip_close_ratio(source_lmk[None])
+        c_s_lip_tensor = torch.from_numpy(c_s_lip).float().to(self.device)
+        c_d_lip_i_tensor = torch.Tensor([c_d_lip_i[0]]).to(self.device).reshape(1, 1) # 1x1
+        # [c_s,lip, c_d,lip,i]
+        combined_lip_ratio_tensor = torch.cat([c_s_lip_tensor, c_d_lip_i_tensor], dim=1) # 1x2
+        return combined_lip_ratio_tensor
+
+    def calc_ratio(self, lmk_lst):
+        input_eye_ratio_lst = []
+        input_lip_ratio_lst = []
+        for lmk in lmk_lst:
+            # for eyes retargeting
+            input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))
+            # for lip retargeting
+            input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))
+        return input_eye_ratio_lst, input_lip_ratio_lst
+
+    @torch.no_grad()
+    def retarget_lip(self, kp_source: torch.Tensor, lip_close_ratio: torch.Tensor) -> torch.Tensor:
+        """
+        kp_source: BxNx3
+        lip_close_ratio: Bx2
+        Return: Bx(3*num_kp)
+        """
+        feat_lip = concat_feat(kp_source, lip_close_ratio)
+
+        delta = self.stitching_retargeting_module['lip'](feat_lip)
+
+        return delta.reshape(-1, kp_source.shape[1], 3)
+
+    @torch.no_grad()
+    def retarget_eye(self, kp_source: torch.Tensor, eye_close_ratio: torch.Tensor) -> torch.Tensor:
+        """
+        kp_source: BxNx3
+        eye_close_ratio: Bx3
+        Return: Bx(3*num_kp)
+        """
+        feat_eye = concat_feat(kp_source, eye_close_ratio)
+
+        delta = self.stitching_retargeting_module['eye'](feat_eye)
+
+        return delta.reshape(-1, kp_source.shape[1], 3)
+
+    def crop_image(self, img, do_crop=False):
+        ######## process source info ########
+        if do_crop:
+            crop_info = self.cropper.crop_source_image(img, self.cropper.crop_cfg)
+            if crop_info is None:
+                raise Exception("No face detected in the source image!")
+            lmk = crop_info['lmk_crop']
+            img_crop_256x256 = crop_info['img_crop_256x256']
+        else:
+            crop_info = None
+            lmk = self.cropper.calc_lmk_from_cropped_image(img)
+            img_crop_256x256 = cv2.resize(img, (256, 256))  # force to resize to 256x256
+        return img_crop_256x256, lmk, crop_info
+
+    def crop_source_video(self, img_lst, do_crop=False):
+        if do_crop:
+            ret_s = self.cropper.crop_source_video(img_lst, self.cropper.crop_cfg)
+            print(f'Source video is cropped, {len(ret_s["frame_crop_lst"])} frames are processed.')
+            img_crop_256x256_lst, lmk_crop_lst, M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']
+        else:
+            M_c2o_lst = None
+            lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(img_lst)
+            img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in img_lst]  # force to resize to 256x256
+        return img_crop_256x256_lst, lmk_crop_lst, M_c2o_lst
+    
+    def crop_driving_videos(self, img_lst, do_crop=False):
+        if do_crop:
+            ret_d = self.cropper.crop_driving_video(img_lst)
+            print(f'Driving video is cropped, {len(ret_d["frame_crop_lst"])} frames are processed.')
+            img_crop_lst, lmk_crop_lst = ret_d['frame_crop_lst'], ret_d['lmk_crop_lst']
+            img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in img_lst]
+        else:
+            lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(img_lst)
+            img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in img_lst]  # force to resize to 256x256
+        return img_crop_256x256_lst, lmk_crop_lst
+
+    def prepare_source(self, src_img):
+        """ construct the input as standard
+        img: HxWx3, uint8, 256x256
+        """
+        # processing source image to tensor
+        h, w = src_img.shape[:2]
+        if h != self.cfg.input_height or w != self.cfg.input_width:
+            x = cv2.resize(src_img, (self.cfg.input_width, self.cfg.input_height))
+        else:
+            x = src_img.copy()
+        
+        if x.ndim == 3:
+            x = x[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1
+        elif x.ndim == 4:
+            x = x.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1
+        else:
+            raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
+        
+        x = np.clip(x, 0, 1)  # clip to 0~1
+        x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW
+        x = x.to(self.device)
+
+        # extract features
+        I_s = x
+        f_s = self.extract_feature_3d(I_s)
+        x_s_info = self.get_kp_info(I_s)
+        
+        return f_s, x_s_info
+    
+    def process_clips(self, clips):
+        """ construct the input as standard
+        clips: NxBxHxWx3, uint8
+        """
+        # resize to 256 x 256
+        imgs = []
+        for img in clips:
+            h, w = img.shape[:2]
+            if h != self.cfg.input_height or w != self.cfg.input_width:
+                img = cv2.resize(img, (self.cfg.input_width, self.cfg.input_height))
+            else:
+                img = img.copy()
+            imgs.append(img)
+
+        # processing video frames to tensor
+        if isinstance(imgs, list):
+            _imgs = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1
+        elif isinstance(imgs, np.ndarray):
+            _imgs = imgs
+        else:
+            raise ValueError(f'imgs type error: {type(imgs)}')
+
+        y = _imgs.astype(np.float32) / 255.
+        y = np.clip(y, 0, 1)  # clip to 0~1
+        y = torch.from_numpy(y).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW
+        y = y.to(self.device)
+
+        return y
+
+    def prepare_driving_videos(self, vid_frames, feat_type="tensor"):
+        """ get driving kp infos
+        vid_frames: image list of HxWx3, uint8
+        """
+        # extract features
+        total_len = len(vid_frames)
+        kp_infos = {"pitch": [], "yaw": [], "roll": [], "t": [], "exp": [], "scale": [], "kp": []}
+        for start_idx in range(0, total_len, self.cfg.batch_size):
+            frames = vid_frames[start_idx: min(start_idx + self.cfg.batch_size, total_len)]
+            frames = self.process_clips(frames).squeeze(1)
+            kp_info = self.get_kp_info(frames)
+
+            for k, v in kp_info.items():
+                kp_infos[k].append(v)
+
+        # combine the kp_infos
+        for k, v in kp_infos.items():
+            kp_infos[k] = torch.cat(v, dim=0)
+
+        if feat_type == "np":
+            for k, v in kp_infos.items():
+                kp_infos[k] = v.cpu().numpy()
+
+        return kp_infos
+
+    def get_driving_template(self, kp_infos, smooth=False, dtype="pt_tensor"):
+        kp_infos = self.refine_kp(kp_infos)
+        motion_list = []
+        n_frames = len(kp_infos["exp"])
+        for idx in range(n_frames):
+            exp = kp_infos["exp"][idx]
+            scale = kp_infos["scale"][idx]
+            t = kp_infos["t"][idx]
+            pitch = kp_infos["pitch"][idx]
+            yaw = kp_infos["yaw"][idx]
+            roll = kp_infos["roll"][idx]
+            
+            R = get_rotation_matrix(pitch, yaw, roll)
+            R = R.reshape(1, 3, 3)    
+            
+            exp = exp.reshape(1, 21, 3)
+            scale = scale.reshape(1, 1)
+            t = t.reshape(1, 3)
+            pitch = pitch.reshape(1, 1)
+            yaw = yaw.reshape(1, 1)
+            roll = roll.reshape(1, 1)
+
+            if dtype == "np":
+                R = R.cpu().numpy().astype(np.float32)
+                exp = exp.cpu().numpy().astype(np.float32)
+                scale = scale.cpu().numpy().astype(np.float32)
+                t = t.cpu().numpy().astype(np.float32)
+                pitch = pitch.cpu().numpy().astype(np.float32)
+                yaw = yaw.cpu().numpy().astype(np.float32)
+                roll = roll.cpu().numpy().astype(np.float32)
+            
+            motion_list.append(
+                {"exp": exp, "scale": scale, "R": R, "t": t, "pitch": pitch, "yaw": yaw, "roll": roll}
+            )
+        tgt_motion = {'n_frames': n_frames, 'output_fps': 25, 'motion': motion_list}
+
+        if smooth:
+            print("Smoothing motion sequence...")
+            tgt_motion = smooth_(tgt_motion, method="ema")
+        return tgt_motion
+
+    @torch.no_grad()
+    def update_delta_new_eyeball_direction(self, eyeball_direction_x, eyeball_direction_y, delta_new, **kwargs):
+        if eyeball_direction_x > 0:
+                delta_new[0, 11, 0] += eyeball_direction_x * 0.0007
+                delta_new[0, 15, 0] += eyeball_direction_x * 0.001
+        else:
+            delta_new[0, 11, 0] += eyeball_direction_x * 0.001
+            delta_new[0, 15, 0] += eyeball_direction_x * 0.0007
+
+        delta_new[0, 11, 1] += eyeball_direction_y * -0.001
+        delta_new[0, 15, 1] += eyeball_direction_y * -0.001
+        blink = -eyeball_direction_y / 2.
+
+        delta_new[0, 11, 1] += blink * -0.001
+        delta_new[0, 13, 1] += blink * 0.0003
+        delta_new[0, 15, 1] += blink * -0.001
+        delta_new[0, 16, 1] += blink * 0.0003
+
+        return delta_new
+
+    def driven(self, f_s, x_s_info, s_lmk, c_s_eyes_lst, kp_infos, c_d_eyes_lst=None, c_d_lip_lst=None, smooth=False):
+        # Animate the source with a driving keypoint sequence and return the
+        # rendered frames as produced by parse_output.
+        #   f_s:          source feature volume, passed to warp_decode
+        #   x_s_info:     source keypoint info dict (refined in place below)
+        #   s_lmk:        source landmarks for retargeting; may be None
+        #   c_s_eyes_lst: source eye ratios; only element 0 is read
+        #   kp_infos:     driving keypoint infos, turned into a motion template
+        #   c_d_eyes_lst / c_d_lip_lst: optional per-frame driving ratios
+        #   smooth:       forwarded to get_driving_template
+        # source kp info
+        # NOTE(review): x_ss and f_ss are never used after this point.
+        x_d_i_news=[]
+        x_ss=[]
+        f_ss=[]
+        x_s_info = self.refine_kp(x_s_info)
+        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+        x_s = self.transform_keypoint(x_s_info)
+        x_c_s = x_s_info["kp"]
+
+        # driving kp infos
+        driving_template_dct = self.get_driving_template(kp_infos, smooth)
+        n_frames = driving_template_dct['n_frames']
+
+        # driving params
+        flag_normalize_lip = self.cfg.flag_normalize_lip
+        flag_relative_motion = self.cfg.flag_relative_motion
+        flag_source_video_eye_retargeting = self.cfg.flag_source_video_eye_retargeting
+        lip_normalize_threshold = self.cfg.lip_normalize_threshold
+        source_video_eye_retargeting_threshold = self.cfg.source_video_eye_retargeting_threshold
+        animation_region = self.cfg.animation_region
+        driving_option = self.cfg.driving_option
+        flag_stitching = self.cfg.flag_stitching
+        flag_eye_retargeting = self.cfg.flag_eye_retargeting
+        flag_lip_retargeting = self.cfg.flag_lip_retargeting
+        driving_multiplier = self.cfg.driving_multiplier
+        # NOTE(review): lib_multiplier is only referenced by the commented-out
+        # "enhance lip" code inside the frame loop.
+        lib_multiplier = self.cfg.lib_multiplier
+
+        # let lip-open scalar to be 0 at first
+        lip_delta_before_animation, eye_delta_before_animation = None, None
+        if flag_normalize_lip and flag_relative_motion and s_lmk is not None:
+            c_d_lip_before_animation = [0.]
+            combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(c_d_lip_before_animation, s_lmk)
+            if combined_lip_ratio_tensor_before_animation[0][0] >= lip_normalize_threshold:
+                lip_delta_before_animation = self.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+
+        # let eye-open scalar to be the same as the first frame if the latter is eye-open state
+        if flag_source_video_eye_retargeting and s_lmk is not None:
+            combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
+            c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
+            # below the threshold the ratio is replaced by the fixed value 0.39
+            if c_d_eye_before_animation_frame_zero[0][0] < source_video_eye_retargeting_threshold:
+                c_d_eye_before_animation_frame_zero = [[0.39]]
+            combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, s_lmk)
+            eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
+
+        # animate: compute the driving keypoints of every frame first; the
+        # images are decoded in chunks after the loop
+        I_p_lst = []
+        for i in range(n_frames):
+            x_d_i_info = driving_template_dct['motion'][i]
+            x_d_i_info = dct2device(x_d_i_info, self.device)
+            # R
+            R_d_i = x_d_i_info['R']
+            if i == 0:  # cache the first frame
+                R_d_0 = R_d_i
+                # shallow copy: the cached dict shares its tensors with frame 0
+                x_d_0_info = x_d_i_info.copy()
+
+            # enhance lip
+            # if i > 0:
+            #     for lip_idx in [6, 12, 14, 17, 19, 20]:
+            #         x_d_i_info['exp'][:, lip_idx, :] = x_d_0_info['exp'][:, lip_idx, :] + (x_d_i_info['exp'][:, lip_idx, :] - x_d_0_info['exp'][:, lip_idx, :]) * lib_multiplier
+
+            # normalize eye_ball, TODO
+            # the helper modifies the 'exp' tensor in place with fixed (0, -5) directions
+            x_d_i_info['exp'] = self.update_delta_new_eyeball_direction(0, -5, x_d_i_info['exp'])
+
+            # debug
+            #print(f"frame {i:03d}, src scale {x_s_info['scale']}, 0 scale {x_d_0_info['scale']}, i scale {x_d_i_info['scale']}")
+            # delta
+            delta_new = x_s_info['exp'].clone()
+            if flag_relative_motion:
+                # relative mode: apply the change of the driving frame with
+                # respect to driving frame 0 on top of the source values
+                # R
+                if animation_region == "all" or animation_region == "pose":
+                    R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
+                else:
+                    R_new = R_s
+
+                # exp
+                if animation_region == "all" or animation_region == "exp":
+                    delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
+                elif animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]
+                elif animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]
+
+                # scale
+                if animation_region == "all":
+                    scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
+                else:
+                    scale_new = x_s_info['scale']
+
+                # translation
+                if animation_region == "all" or animation_region == "pose":
+                    t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
+                else:
+                    t_new = x_s_info['t']
+            else:
+                # absolute mode: take the driving values directly
+                # R
+                if animation_region == "all" or animation_region == "pose":
+                    R_new = R_d_i
+                else:
+                    R_new = R_s
+
+                # exp
+                if animation_region == "all" or animation_region == "exp":
+                    # only the listed keypoints / components are copied from
+                    # the driving expression; the rest keep the source values
+                    EYE_IDX=[1,2,6,11,12,13,14,15,16,17,18,19,20]
+                    delta_new[:, EYE_IDX, :] = x_d_i_info['exp'][:, EYE_IDX, :]
+                    # for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                    #     delta_new[:, idx, :] = x_d_i_info['exp'][:, idx, :]
+                    delta_new[:, 3:5, 1] = x_d_i_info['exp'][:, 3:5, 1]
+                    delta_new[:, 5, 2] = x_d_i_info['exp'][:, 5, 2]
+                    delta_new[:, 8, 2] = x_d_i_info['exp'][:, 8, 2]
+                    delta_new[:, 9, 1:] = x_d_i_info['exp'][:, 9, 1:]
+                elif animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = x_d_i_info['exp'][:, lip_idx, :]
+                elif animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = x_d_i_info['exp'][:, eyes_idx, :]
+
+                # scale
+                scale_new = x_s_info['scale']
+
+                # translation
+                if animation_region == "all" or animation_region == "pose":
+                    t_new = x_d_i_info['t']
+                else:
+                    t_new = x_s_info['t']
+
+            # NOTE(review): fill_ is in place; in the branches where t_new is
+            # x_s_info['t'] or x_d_i_info['t'] it also zeroes that tensor's tz.
+            t_new[..., 2].fill_(0)  # zero tz
+
+            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+
+            if flag_relative_motion and driving_option == "expression-friendly":
+                if i == 0:
+                    x_d_0_new = x_d_i_new
+                    motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
+                x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
+                x_d_i_new = x_d_diff + x_s
+
+            # Algorithm 1 in Liveportrait:
+            if not flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
+                # without stitching or retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new += lip_delta_before_animation
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+                else:
+                    pass
+            elif flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
+                # with stitching and without retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation
+                else:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+            else:
+                # eye and/or lip retargeting is enabled
+                eyes_delta, lip_delta = None, None
+                if flag_eye_retargeting and s_lmk is not None and c_d_eyes_lst is not None:
+                    c_d_eyes_i = c_d_eyes_lst[i]
+                    combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i, s_lmk)
+                    eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
+
+                if flag_lip_retargeting and s_lmk is not None and c_d_lip_lst is not None:
+                    c_d_lip_i = c_d_lip_lst[i]
+                    combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, s_lmk)
+                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
+                    lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
+
+                if flag_relative_motion:  # use x_s
+                    x_d_i_new = x_s + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+                else:  # use x_d,i
+                    x_d_i_new = x_d_i_new + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+
+                if flag_stitching:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+
+            # scale the displacement from the source keypoints
+            x_d_i_new = x_s + (x_d_i_new - x_s) * driving_multiplier
+            x_d_i_news.append(x_d_i_new)
+        # Repeat the source features / keypoints once per frame (expand
+        # creates views, not copies) and decode in chunks of 100 frames.
+        f_s_s= f_s.expand(n_frames, *f_s.shape[1:]) 
+        x_s_s = x_s.expand(n_frames, *x_s.shape[1:])  
+        x_d_i_new = torch.cat(x_d_i_news, dim=0)        
+        for start in range(0, n_frames, 100):
+            end = min(start + 100,n_frames)
+            # NOTE(review): this autocast('cuda') wraps warp_decode without
+            # consulting cfg.flag_use_half_precision - confirm it is intended.
+            with torch.no_grad(), torch.autocast('cuda'):
+                out = self.warp_decode(f_s_s[start:end], x_s_s[start:end], x_d_i_new[start:end])        
+                I_p_lst.append(out['out'])
+        I_p=torch.cat(I_p_lst, dim=0) 
+        I_p_i = self.parse_output(I_p)
+        return I_p_i 
+
+    def driven_debug(self, f_s, x_s_info, s_lmk, c_s_eyes_lst, driving_template_dct, c_d_eyes_lst=None, c_d_lip_lst=None):
+        """Animate the source one driving frame at a time and decode each frame separately.
+
+        For every frame the driving motion is combined with the source keypoints,
+        optionally stitched / retargeted according to ``self.cfg`` flags, and
+        passed through ``self.warp_decode`` individually.
+
+        Args:
+            f_s: Source feature forwarded unchanged to ``self.warp_decode``.
+            x_s_info: Source keypoint info; after ``self.refine_kp`` the keys
+                'kp', 'pitch', 'yaw', 'roll', 'exp', 'scale' and 't' are read.
+            s_lmk: Source landmarks used by the eye/lip ratio helpers; when
+                ``None`` every retargeting step that needs them is skipped.
+            c_s_eyes_lst: Only ``c_s_eyes_lst[0]`` is read, and only when
+                ``cfg.flag_source_video_eye_retargeting`` is set and ``s_lmk``
+                is not ``None``.
+            driving_template_dct: Mapping with 'n_frames' and 'motion'; each
+                'motion' entry provides 'R' (or the legacy key 'R_d'), 'exp',
+                'scale' and 't'.
+            c_d_eyes_lst: Optional per-frame driving eye ratios, indexed by frame.
+            c_d_lip_lst: Optional per-frame driving lip ratios, indexed by frame.
+
+        Returns:
+            A list with one entry per frame, each being
+            ``self.parse_output(out['out'])[0]``.
+        """
+        # source kp info
+        x_s_info = self.refine_kp(x_s_info)
+        x_c_s = x_s_info["kp"]
+        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+        x_s = self.transform_keypoint(x_s_info)
+        
+        n_frames = driving_template_dct['n_frames']
+
+        # driving params
+        flag_normalize_lip = self.cfg.flag_normalize_lip
+        flag_relative_motion = self.cfg.flag_relative_motion
+        flag_source_video_eye_retargeting = self.cfg.flag_source_video_eye_retargeting
+        lip_normalize_threshold = self.cfg.lip_normalize_threshold
+        source_video_eye_retargeting_threshold = self.cfg.source_video_eye_retargeting_threshold
+        animation_region = self.cfg.animation_region
+        driving_option = self.cfg.driving_option
+        flag_stitching = self.cfg.flag_stitching
+        flag_eye_retargeting = self.cfg.flag_eye_retargeting
+        flag_lip_retargeting = self.cfg.flag_lip_retargeting
+        driving_multiplier = self.cfg.driving_multiplier
+
+        # let lip-open scalar to be 0 at first
+        lip_delta_before_animation, eye_delta_before_animation = None, None
+        if flag_normalize_lip and flag_relative_motion and s_lmk is not None:
+            c_d_lip_before_animation = [0.]
+            combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(c_d_lip_before_animation, s_lmk)
+            # the lip delta is only computed when the combined ratio reaches the configured threshold
+            if combined_lip_ratio_tensor_before_animation[0][0] >= lip_normalize_threshold:
+                lip_delta_before_animation = self.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+
+        # let eye-open scalar to be the same as the first frame if the latter is eye-open state
+        if flag_source_video_eye_retargeting and s_lmk is not None:
+            combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
+            c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
+            # below the threshold the hard-coded ratio 0.39 is used instead
+            # NOTE(review): 0.39 presumably stands for an "open eye" ratio - confirm
+            if c_d_eye_before_animation_frame_zero[0][0] < source_video_eye_retargeting_threshold:
+                c_d_eye_before_animation_frame_zero = [[0.39]]
+            combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, s_lmk)
+            eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
+        
+        # animate 
+        I_p_lst = []
+        for i in range(n_frames):
+            x_d_i_info = driving_template_dct['motion'][i]
+            x_d_i_info = dct2device(x_d_i_info, self.device)
+            # R
+            R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys
+            if i == 0:  # cache the first frame
+                R_d_0 = R_d_i
+                # NOTE(review): presumably a shallow dict copy, so the cached values are shared with frame 0 - confirm
+                x_d_0_info = x_d_i_info.copy()
+            
+            # debug
+            #print(f"frame {i:03d}, src scale {x_s_info['scale']}, 0 scale {x_d_0_info['scale']}, i scale {x_d_i_info['scale']}")
+            # delta
+            # start from the source expression; an animation_region that matches no branch below leaves it untouched
+            delta_new = x_s_info['exp'].clone()
+            if flag_relative_motion:
+                # relative mode: apply the driving change w.r.t. driving frame 0 on top of the source
+                # R
+                if animation_region == "all" or animation_region == "pose":
+                    R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
+                else:
+                    R_new = R_s
+
+                # exp
+                if animation_region == "all" or animation_region == "exp":
+                    delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
+                elif animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]
+                elif animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]
+                
+                # scale
+                if animation_region == "all":
+                    scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
+                else:
+                    scale_new = x_s_info['scale']
+
+                # translation
+                if animation_region == "all" or animation_region == "pose":
+                    t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
+                else:
+                    t_new = x_s_info['t']
+            else:
+                # absolute mode: take the driving values directly
+                # R
+                if animation_region == "all" or animation_region == "pose":
+                    R_new = R_d_i
+                else:
+                    R_new = R_s
+
+                # exp
+                if animation_region == "all" or animation_region == "exp":
+                    # whole rows for the listed indices, then individual coordinates of rows 3, 4, 5, 8 and 9
+                    for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                        delta_new[:, idx, :] = x_d_i_info['exp'][:, idx, :]
+                    delta_new[:, 3:5, 1] = x_d_i_info['exp'][:, 3:5, 1]
+                    delta_new[:, 5, 2] = x_d_i_info['exp'][:, 5, 2]
+                    delta_new[:, 8, 2] = x_d_i_info['exp'][:, 8, 2]
+                    delta_new[:, 9, 1:] = x_d_i_info['exp'][:, 9, 1:]
+                elif animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = x_d_i_info['exp'][:, lip_idx, :]
+                elif animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = x_d_i_info['exp'][:, eyes_idx, :]
+                
+                # scale
+                scale_new = x_s_info['scale']
+
+                # translation
+                if animation_region == "all" or animation_region == "pose":
+                    t_new = x_d_i_info['t']
+                else:
+                    t_new = x_s_info['t']
+
+            # in-place write: when t_new is x_s_info['t'] or x_d_i_info['t'] itself, that stored value is modified too
+            t_new[..., 2].fill_(0)  # zero tz
+
+            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+
+            if flag_relative_motion and driving_option == "expression-friendly":
+                # x_d_0_new and motion_multiplier are set on frame 0 and reused for every later frame
+                if i == 0:
+                    x_d_0_new = x_d_i_new
+                    motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
+                x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
+                x_d_i_new = x_d_diff + x_s
+            
+            # Algorithm 1 in Liveportrait:
+            if not flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
+                # without stitching or retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new += lip_delta_before_animation
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+                else:
+                    pass
+            elif flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
+                # with stitching and without retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation
+                else:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+            else:
+                # at least one of eye / lip retargeting is enabled
+                eyes_delta, lip_delta = None, None
+                if flag_eye_retargeting and s_lmk is not None and c_d_eyes_lst is not None:
+                    c_d_eyes_i = c_d_eyes_lst[i]
+                    combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i, s_lmk)
+                    eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
+
+                if flag_lip_retargeting and s_lmk is not None and c_d_lip_lst is not None:
+                    c_d_lip_i = c_d_lip_lst[i]
+                    combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, s_lmk)
+                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
+                    lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
+
+                # NOTE: in relative mode the motion computed above is discarded here;
+                # the frame is rebuilt from x_s plus the retargeting deltas only
+                if flag_relative_motion:  # use x_s
+                    x_d_i_new = x_s + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+                else:  # use x_d,i
+                    x_d_i_new = x_d_i_new + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+
+                if flag_stitching:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+
+            # scale the offset from the source keypoints by the configured multiplier
+            x_d_i_new = x_s + (x_d_i_new - x_s) * driving_multiplier
+            # one warp_decode call per frame
+            out = self.warp_decode(f_s, x_s, x_d_i_new)
+            I_p_i = self.parse_output(out['out'])[0]
+            I_p_lst.append(I_p_i)
+        
+        return I_p_lst 
+
+    def read_image(self, image_path: str) -> list:
+        img_rgb = load_image_rgb(image_path)
+        img_rgb = resize_to_limit(img_rgb, self.cfg.source_max_dim, self.cfg.source_division)
+        source_rgb_list = [img_rgb]
+        print(f"Load image from {osp.realpath(image_path)} done.")
+        return source_rgb_list
+
+    def read_video(self, video_path: str, interval=None) -> list:
+        vr = VideoReader(video_path)
+        if interval is not None:
+            video_frames = vr.get_batch(np.arange(0, len(vr), interval)).numpy()
+        else:
+            video_frames = [vr[0].numpy(), vr[len(vr) // 2].numpy(), vr[-1].numpy()]
+        vr.seek(0)
+        driving_rgb_list = []
+        for video_frame in video_frames:
+            # h, w = video_frame.shape[:2]
+            # if h != self.cfg.output_height or w != self.cfg.output_width:
+            #     video_frame = cv2.resize(video_frame, (self.cfg.output_height, self.cfg.output_width))
+            driving_rgb_list.append(video_frame)
+
+        return driving_rgb_list
+
+    def prepare_videos(self, imgs) -> torch.Tensor:
+        """ construct the input as standard
+        imgs: NxBxHxWx3, uint8
+        """
+        if isinstance(imgs, list):
+            _imgs = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1
+        elif isinstance(imgs, np.ndarray):
+            _imgs = imgs
+        else:
+            raise ValueError(f'imgs type error: {type(imgs)}')
+
+        y = _imgs.astype(np.float32) / 255.
+        y = np.clip(y, 0, 1)  # clip to 0~1
+        y = torch.from_numpy(y).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW
+        y = y.to(self.device)
+
+        return y
+
+    def make_motion_template(self, I_lst, c_eyes_lst, c_lip_lst, **kwargs):
+        n_frames = I_lst.shape[0]
+        template_dct = {
+            'n_frames': n_frames,
+            'output_fps': kwargs.get('output_fps', 25),
+            'motion': [],
+            'c_eyes_lst': [],
+            'c_lip_lst': [],
+        }
+
+        for i in track(range(n_frames), description='Making motion templates...', total=n_frames):
+            # collect s, R, δ and t for inference
+            I_i = I_lst[i]
+            x_i_info = self.refine_kp(self.get_kp_info(I_i))
+            x_s = self.transform_keypoint(x_i_info)
+            R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])
+
+            item_dct = {
+                'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),
+                'R': R_i.cpu().numpy().astype(np.float32),
+                'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),
+                't': x_i_info['t'].cpu().numpy().astype(np.float32),
+                'kp': x_i_info['kp'].cpu().numpy().astype(np.float32),
+                'x_s': x_s.cpu().numpy().astype(np.float32),
+            }
+
+            template_dct['motion'].append(item_dct)
+
+            c_eyes = c_eyes_lst[i].astype(np.float32)
+            template_dct['c_eyes_lst'].append(c_eyes)
+
+            c_lip = c_lip_lst[i].astype(np.float32)
+            template_dct['c_lip_lst'].append(c_lip)
+
+        return template_dct
+
+    def load_template(self, wfp_template):
+        print(f"Load from template: {wfp_template}, NOT the video, so the cropping video and audio are both NULL.")
+        driving_template_dct = load(wfp_template)
+        c_d_eyes_lst = driving_template_dct['c_eyes_lst'] if 'c_eyes_lst' in driving_template_dct.keys() else driving_template_dct['c_d_eyes_lst'] # compatible with previous keys
+        c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst']
+        driving_n_frames = driving_template_dct['n_frames']
+        flag_is_driving_video = True if driving_n_frames > 1 else False
+        n_frames = driving_n_frames
+
+        # set output_fps
+        output_fps = driving_template_dct.get('output_fps', 25)
+        print(f'The FPS of template: {output_fps}')
+        return driving_template_dct
+
+    def reconstruction(self, src_img, dst_imgs, video_path="template"):
+        # prepare source
+        src_img_256x256, s_lmk, _ = self.crop_image(src_img, do_crop=False)
+        #c_s_eyes_lst, c_s_lip_lst = self.calc_ratio([s_lmk])
+        c_s_eyes_lst = None
+        f_s, x_s_info = self.prepare_source(src_img_256x256)
+        
+        # prepare driving videos
+        dst_imgs_256x256, d_lmk_lst = self.crop_driving_videos(dst_imgs, do_crop=False)
+        c_d_eyes_lst, c_d_lip_lst = self.calc_ratio(d_lmk_lst)
+        kp_infos = self.prepare_driving_videos(dst_imgs_256x256)
+
+
+        recs = self.driven(f_s, x_s_info, s_lmk, c_s_eyes_lst, kp_infos, c_d_eyes_lst, c_d_lip_lst)
+        return recs
+
+    def save_results(self, results, save_path, audio_path=None):
+        save_dir = osp.dirname(save_path)
+        save_name = osp.basename(save_path)
+        final_video = osp.join(save_dir, f'final_{save_name}')
+
+        images2video(results, wfp=save_path, fps=self.cfg.output_fps)
+
+        if audio_path is not None:
+            add_audio_to_video(save_path, audio_path, final_video)
+            os.remove(save_path)
+    
+    def rec_score(self, video_path: str, interval=None, save_path=None):
+        video_frames = self.read_video(video_path, interval=interval)
+        #print(f"len frames: {len(video_frames)}, shape: {video_frames[0].shape}")
+        recs = self.reconstruction(video_frames[0], video_frames[1:], video_path)
+        if save_path is not None:
+            self.save_results(recs, save_path)
+        #print(f"len rec: {len(recs)}, shape: {recs[0].shape}")
+        psnrs = psnr(video_frames[1:], recs)
+        psnrs_np = np.array(psnrs)
+        psnr_mean, psnr_std = np.mean(psnrs_np), np.std(psnrs_np)
+        rec_score = {"mean": psnr_mean, "std": psnr_std}
+        return rec_score
+
+    @torch.no_grad()
+    def paste_back_by_face_mask(self, result, crop_info, src_img, crop_src_image, use_laplacian=False):
+        """
+        paste back the result to the original image with face mask
+        """
+        # detect src mask
+        crop_src_tensor = self.to_tensor(crop_src_image).unsqueeze(0).to(self.device)
+        src_msks = get_face_mask(self.face_parser, crop_src_tensor)
+        result_tensor = self.to_tensor(result).unsqueeze(0).to(self.device)
+        result_msks = get_face_mask(self.face_parser, result_tensor)
+        # combine masks
+        masks = []
+        for src_msk, result_msk in zip(src_msks, result_msks):
+            mask = np.clip(src_msk + result_msk, 0, 1)
+            masks.append(mask)
+        result = paste_back_with_face_mask(result, crop_info, src_img, masks[0], use_laplacian=use_laplacian)
+        return result
+
+    def driven_by_audio(self, src_img, kp_infos, save_path, audio_path=None, smooth=False):
+        # prepare source
+        # prepare source
+        src_img_256x256, s_lmk, crop_info = self.crop_image(src_img, do_crop=True)
+        #c_s_eyes_lst, c_s_lip_lst = self.calc_ratio([s_lmk])
+        c_s_eyes_lst = None
+        f_s, x_s_info = self.prepare_source(src_img_256x256)
+
+        mask_ori_float = prepare_paste_back(self.mask_crop, crop_info['M_c2o'], dsize=(src_img.shape[1], src_img.shape[0]))
+        
+        # prepare driving videos
+        results = self.driven(f_s, x_s_info, s_lmk, c_s_eyes_lst, kp_infos, smooth=smooth)
+        frames=results.shape[0]
+        results = [paste_back(results[i], crop_info['M_c2o'], src_img, mask_ori_float) for i in range(frames)]
+        self.save_results(results, save_path, audio_path)
+    def mix_kp_infos(self, emo_kp_infos, lip_kp_infos, smooth=False, dtype="pt_tensor"):
+        driving_emo_template_dct = self.get_driving_template(emo_kp_infos, smooth=False, dtype=dtype)
+        if lip_kp_infos is not None:
+            driving_lip_template_dct = self.get_driving_template(lip_kp_infos, smooth=smooth, dtype=dtype)
+            driving_template_dct = {**driving_emo_template_dct}
+            n_frames = min(driving_emo_template_dct['n_frames'], driving_lip_template_dct['n_frames'])
+            driving_template_dct['n_frames'] = n_frames
+            for i in range(n_frames):
+                emo_motion = driving_emo_template_dct['motion'][i]['exp']
+                lib_motion = driving_lip_template_dct['motion'][i]['exp']
+                for lip_idx in [6, 12, 14, 17, 19, 20]:
+                    emo_motion[:, lip_idx, :] = lib_motion[:, lip_idx, :]
+                driving_template_dct['motion'][i]['exp'] = emo_motion
+        else:
+            driving_template_dct = driving_emo_template_dct
+        
+        return driving_template_dct
+
+    def driven_by_mix(self, src_img, driving_video_path, kp_infos, save_path, audio_path=None, smooth=False):
+        # prepare source
+        src_img_256x256, s_lmk, crop_info = self.crop_image(src_img, do_crop=True)
+        c_s_eyes_lst, c_s_lip_lst = self.calc_ratio([s_lmk])
+        f_s, x_s_info = self.prepare_source(src_img_256x256)
+        mask_ori_float = prepare_paste_back(self.mask_crop, crop_info['M_c2o'], dsize=(src_img.shape[1], src_img.shape[0]))
+        # prepare driving videos
+        driving_imgs = self.read_video(driving_video_path, interval=1)
+        dst_imgs_256x256, d_lmk_lst = self.crop_driving_videos(driving_imgs, do_crop=True)
+        c_d_eyes_lst, c_d_lip_lst = self.calc_ratio(d_lmk_lst)
+        emo_kp_infos = self.prepare_driving_videos(dst_imgs_256x256)
+        # mix kp_infos
+        driving_template_dct = self.mix_kp_infos(emo_kp_infos, kp_infos, smooth=smooth)
+        # driven 
+        results = self.driven_debug(f_s, x_s_info, s_lmk, c_s_eyes_lst, driving_template_dct, c_d_eyes_lst=c_d_eyes_lst, c_d_lip_lst=c_d_lip_lst)
+        results = [paste_back(result, crop_info['M_c2o'], src_img, mask_ori_float) for result in results]
+        print(results.shape)
+        self.save_results(results, save_path, audio_path)
+    
+    def drive_video_by_mix(self, video_path, driving_video_path, kp_infos, save_path, audio_path):
+        # prepare driving videos
+        driving_imgs = self.read_video(driving_video_path, interval=1)
+        dst_imgs_256x256, d_lmk_lst = self.crop_driving_videos(driving_imgs, do_crop=True)
+        emo_kp_infos = self.prepare_driving_videos(dst_imgs_256x256)
+        # mix kp_infos
+        #driving_template_dct = self.get_driving_template(emo_kp_infos, smooth=True, dtype="np")
+        driving_template_dct = self.mix_kp_infos(emo_kp_infos, kp_infos, smooth=True, dtype="np")
+        # driven
+        self.video_lip_retargeting(
+            video_path, None, 
+            save_path, audio_path, 
+            driving_template_dct=driving_template_dct, retargeting_ragion="exp"
+        )
+
+    def load_source_video(self, video_info, n_frames=-1):
+        reader = imageio.get_reader(video_info, "ffmpeg")
+
+        ret = []
+        for idx, frame_rgb in enumerate(reader):
+            if n_frames > 0 and idx >= n_frames:
+                break
+            ret.append(frame_rgb)
+
+        reader.close()
+        
+        return ret
+
+    def video_lip_retargeting(self, video_path, kp_infos, save_path, audio_path, c_d_eyes_lst=None, c_d_lip_lst=None, smooth=False, driving_template_dct=None, retargeting_ragion="exp"):
+        # 0. process source motion template
+        source_rgb_lst = load_video(video_path)
+        source_rgb_lst = [resize_to_limit(img, self.cfg.source_max_dim, self.cfg.source_division) for img in source_rgb_lst]
+        img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = self.crop_source_video(source_rgb_lst, do_crop=True)
+        c_s_eyes_lst, c_s_lip_lst = self.calc_ratio(source_lmk_crop_lst)
+        I_s_lst = self.prepare_videos(img_crop_256x256_lst)
+        source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=25)
+        # 1. prepare driving template
+        if driving_template_dct is None:
+            driving_template_dct = self.get_driving_template(kp_infos, smooth=smooth, dtype="np")
+        # 2. driving
+        n_frames = min(source_template_dct['n_frames'], driving_template_dct['n_frames'])
+        # driving params
+        I_p_lst = []
+        I_p_pstbk_lst = []
+        R_d_0, x_d_0_info = None, None
+        flag_normalize_lip = self.cfg.flag_normalize_lip
+        flag_relative_motion = True #self.cfg.flag_relative_motion
+        flag_source_video_eye_retargeting = self.cfg.flag_source_video_eye_retargeting
+        lip_normalize_threshold = self.cfg.lip_normalize_threshold
+        source_video_eye_retargeting_threshold = self.cfg.source_video_eye_retargeting_threshold
+        animation_region = 'lip' #self.cfg.animation_region
+        driving_option = self.cfg.driving_option
+        flag_stitching = self.cfg.flag_stitching
+        flag_eye_retargeting = self.cfg.flag_eye_retargeting
+        flag_lip_retargeting = self.cfg.flag_lip_retargeting
+        driving_multiplier = self.cfg.driving_multiplier
+        driving_smooth_observation_variance = self.cfg.driving_smooth_observation_variance
+        
+        key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'
+        if flag_relative_motion:
+            x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
+            for i in range(n_frames):
+                for idx in [6, 12, 14, 17, 19, 20]:
+                    # lip motion use abs motion
+                    x_d_exp_lst[i][:, idx, :] = driving_template_dct['motion'][i]['exp'][:, idx, :]
+            x_d_exp_lst_smooth = ksmooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, self.device, driving_smooth_observation_variance)
+            
+            if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
+                x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
+                x_d_r_lst_smooth = ksmooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, self.device, driving_smooth_observation_variance)
+        else:
+            x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]
+            x_d_exp_lst_smooth = ksmooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, self.device, driving_smooth_observation_variance)
+
+            if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
+                x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]
+                x_d_r_lst_smooth = ksmooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, self.device, driving_smooth_observation_variance)
+        
+        # driving all
+        for i in track(range(n_frames), description='🚀Retargeting...', total=n_frames):
+            x_s_info = source_template_dct['motion'][i]
+            x_s_info = dct2device(x_s_info, self.device)
+
+            source_lmk = source_lmk_crop_lst[i]
+            img_crop_256x256 = img_crop_256x256_lst[i]
+            I_s = I_s_lst[i]
+            f_s = self.extract_feature_3d(I_s)
+
+            x_c_s = x_s_info['kp']
+            R_s = x_s_info['R']
+            x_s =x_s_info['x_s']
+
+            # let lip-open scalar to be 0 at first if the input is a video
+            lip_delta_before_animation = None
+            if flag_normalize_lip and flag_relative_motion and source_lmk is not None:
+                c_d_lip_before_animation = [0.]
+                combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
+                if combined_lip_ratio_tensor_before_animation[0][0] >= lip_normalize_threshold:
+                    lip_delta_before_animation = self.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+                else:
+                    lip_delta_before_animation = None
+
+            # let eye-open scalar to be the same as the first frame if the latter is eye-open state
+            eye_delta_before_animation = None
+            if flag_source_video_eye_retargeting and source_lmk is not None:
+                if i == 0:
+                    combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
+                    c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
+                    if c_d_eye_before_animation_frame_zero[0][0] < source_video_eye_retargeting_threshold:
+                        c_d_eye_before_animation_frame_zero = [[0.39]]
+                combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, source_lmk)
+                eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
+            
+            if flag_stitching:  # prepare for paste back
+                mask_ori_float = prepare_paste_back(self.mask_crop, source_M_c2o_lst[i], dsize=(source_rgb_lst[i].shape[1], source_rgb_lst[i].shape[0]))
+            
+            x_d_i_info = driving_template_dct['motion'][i]
+            x_d_i_info = dct2device(x_d_i_info, self.device)
+            R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys
+
+            if i == 0:  # cache the first frame
+                R_d_0 = R_d_i
+                x_d_0_info = x_d_i_info.copy()
+            
+            delta_new = x_s_info['exp'].clone()
+            if flag_relative_motion:
+                if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
+                    R_new = x_d_r_lst_smooth[i]
+                else:
+                    R_new = R_s
+                if animation_region == "all" or animation_region == "exp":
+                    for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
+                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
+                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
+                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
+                elif animation_region == "all_wo_lip" or animation_region == "exp_wo_lip":
+                    for idx in [1, 2, 11, 13, 15, 16, 18]:
+                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
+                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
+                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
+                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
+                elif animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :]
+                elif animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :]
+                
+                scale_new = x_s_info['scale']
+                t_new = x_s_info['t']
+            else:
+                if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
+                    R_new = x_d_r_lst_smooth[i] 
+                else:
+                    R_new = R_s
+                if animation_region == "all" or animation_region == "exp":
+                    for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1] 
+                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2] 
+                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2] 
+                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
+                elif animation_region == "all_wo_lip" or animation_region == "exp_wo_lip":
+                    for idx in [1, 2, 11, 13, 15, 16, 18]:
+                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
+                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
+                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
+                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
+                elif animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :]
+                elif animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :]
+                scale_new = x_s_info['scale']
+                if animation_region == "all" or animation_region == "pose" or "all" in animation_region:
+                    t_new = x_d_i_info['t']
+                else:
+                    t_new = x_s_info['t']
+
+            t_new[..., 2].fill_(0)  # zero tz
+            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+
+            # Algorithm 1:
+            if not flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
+                # without stitching or retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new += lip_delta_before_animation
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+                else:
+                    pass
+            elif flag_stitching and not flag_eye_retargeting and not flag_lip_retargeting:
+                # with stitching and without retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation
+                else:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+            else:
+                eyes_delta, lip_delta = None, None
+                if flag_eye_retargeting and source_lmk is not None and c_d_eyes_lst is not None:
+                    c_d_eyes_i = c_d_eyes_lst[i]
+                    combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)
+                    # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
+                    eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
+                if flag_lip_retargeting and source_lmk is not None and c_d_lip_lst is not None:
+                    c_d_lip_i = c_d_lip_lst[i]
+                    combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
+                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
+                    lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
+
+                if flag_relative_motion:  # use x_s
+                    x_d_i_new = x_s + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+                else:  # use x_d,i
+                    x_d_i_new = x_d_i_new + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+
+                if flag_stitching:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+
+            x_d_i_new = x_s + (x_d_i_new - x_s) * driving_multiplier
+            out = self.warp_decode(f_s, x_s, x_d_i_new)
+            I_p_i = self.parse_output(out['out'])[0]
+            I_p_lst.append(I_p_i)
+
+            if flag_stitching:
+                # TODO: the paste back procedure is slow, considering optimize it using multi-threading or GPU
+                #I_p_pstbk = self.paste_back_by_face_mask(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], img_crop_256x256, use_laplacian=True)
+                I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_float, use_laplacian=True)
+                I_p_pstbk_lst.append(I_p_pstbk)
+            
+        if len(I_p_pstbk_lst) > 0:
+            self.save_results(I_p_pstbk_lst, save_path, audio_path)
+        else:
+            self.save_results(I_p_lst, save_path, audio_path)
+
+    @torch.no_grad()
+    def video_reconstruction_test(self, video_tensor, xs, save_path):
+        """Re-render a clip from per-frame keypoints using its first frame as the source.
+
+        The first frame supplies the appearance feature; every frame's keypoints
+        are stitched against the first frame's keypoints, decoded, and the
+        resulting frames are handed to ``save_results`` without audio.
+
+        Args:
+            video_tensor: frames shaped (1, F, C, H, W) with values in [-1, 1];
+                only the first batch entry is used.
+            xs: keypoints shaped (1, F, 63); reshaped here to (F, 1, 21, 3).
+            save_path: destination forwarded to ``save_results``.
+        """
+        # [-1, 1] -> [0, 1], then 1xTx3xHxW -> Tx1x3xHxW
+        frames = torch.clip(video_tensor[0:1] * 0.5 + 0.5, 0, 1).permute(1, 0, 2, 3, 4)
+        n_frames = frames.shape[0]
+        frames = frames.to(self.device)
+
+        # 1xTx63 -> Tx1x63 -> Tx1x21x3
+        keypoints = xs[0:1].permute(1, 0, 2).reshape(-1, 1, 21, 3).to(self.device)
+
+        ref_kp = keypoints[0]
+        ref_image = torch.nn.functional.interpolate(frames[0], size=(256, 256), mode='bilinear')
+        ref_feature = self.extract_feature_3d(ref_image)
+
+        def render(frame_idx):
+            # Stitch this frame's keypoints against the reference, then decode.
+            stitched_kp = self.stitching(ref_kp, keypoints[frame_idx])
+            decoded = self.warp_decode(ref_feature, ref_kp, stitched_kp)
+            return self.parse_output(decoded['out'])[0]
+
+        result_lst = [render(frame_idx) for frame_idx in range(n_frames)]
+        self.save_results(result_lst, save_path, audio_path=None)
+
+    @torch.no_grad()
+    def self_driven(self, image_tensor, xs, save_path, length):
+        """Animate a single image with a keypoint sequence and save the frames.
+
+        The image supplies the appearance feature and the first keypoint set is
+        the reference; each keypoint set is stitched against that reference and
+        decoded into one output frame.
+
+        Args:
+            image_tensor: image batch with values in [-1, 1]; only the first
+                entry (1x3xHxW) is used.
+            xs: keypoints shaped (1, T, 63); reshaped here to (T, 1, 21, 3).
+            save_path: destination forwarded to ``save_results``.
+            length: expected number of generated frames.
+
+        Raises:
+            AssertionError: if the number of generated frames differs from ``length``.
+        """
+        # [-1, 1] -> [0, 1], 1x3xHxW
+        image = torch.clip(image_tensor[0:1] * 0.5 + 0.5, 0, 1).to(self.device)
+        source_image = torch.nn.functional.interpolate(image, size=(256, 256), mode='bilinear')
+
+        # 1xTx63 -> Tx1x63 -> Tx1x21x3
+        keypoints = xs[0:1].permute(1, 0, 2).reshape(-1, 1, 21, 3).to(self.device)
+        ref_kp = keypoints[0]
+        source_feature = self.extract_feature_3d(source_image)
+
+        result_lst = []
+        for frame_idx in range(keypoints.shape[0]):
+            driving_kp = self.stitching(ref_kp, keypoints[frame_idx])
+            decoded = self.warp_decode(source_feature, ref_kp, driving_kp)
+            result_lst.append(self.parse_output(decoded['out'])[0])
+
+        assert len(result_lst) == length, f"length of result_lst is {len(result_lst)}, but length is {length}"
+
+        self.save_results(result_lst, save_path, audio_path=None)
+
+
diff --git a/src/examples/driving_audios/10.wav b/src/examples/driving_audios/10.wav
new file mode 100644
index 0000000000000000000000000000000000000000..1648311a3ccf767c42088391ed0e920269d7ce97
--- /dev/null
+++ b/src/examples/driving_audios/10.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79b53cbd91ebd7756b51f4d388a769b461a247f26acae5c362ca326e27c23626
+size 2880078
diff --git a/src/examples/driving_audios/5.wav b/src/examples/driving_audios/5.wav
new file mode 100644
index 0000000000000000000000000000000000000000..1648311a3ccf767c42088391ed0e920269d7ce97
--- /dev/null
+++ b/src/examples/driving_audios/5.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79b53cbd91ebd7756b51f4d388a769b461a247f26acae5c362ca326e27c23626
+size 2880078
diff --git a/src/examples/driving_audios/6.wav b/src/examples/driving_audios/6.wav
new file mode 100644
index 0000000000000000000000000000000000000000..04349c25241530d81ca353eeb7ff91fceedbad18
--- /dev/null
+++ b/src/examples/driving_audios/6.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90be6ae092eaa9be4e74e0bed56ef343a825bc2c899d2868e0e3aee494c86a04
+size 1323078
diff --git a/src/examples/driving_audios/tmp_5.wav b/src/examples/driving_audios/tmp_5.wav
new file mode 100644
index 0000000000000000000000000000000000000000..5351ce292f77f98674d5c47d52ff95db13f5412f
--- /dev/null
+++ b/src/examples/driving_audios/tmp_5.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2f615328211bb938ab7f6b603631695106d2e23ceaa4dfcd4f491bc5dc2faca
+size 544044
diff --git a/src/examples/reference_images/1.jpg b/src/examples/reference_images/1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f51520806078f6876290ff7cdcf6945e56f2b258
--- /dev/null
+++ b/src/examples/reference_images/1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:362a14590bbfa4517e00338941f87f51fa9d6da0beaa827f6ba28a0e490888d4
+size 224540
diff --git a/src/examples/reference_images/2.jpg b/src/examples/reference_images/2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7a9b00030ade7ed0be9fc7fecd5982c93ee4e330
Binary files /dev/null and b/src/examples/reference_images/2.jpg differ
diff --git a/src/examples/reference_images/3.jpg b/src/examples/reference_images/3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3c1161cb418b84fbce4dad5c29deff41090202ff
Binary files /dev/null and b/src/examples/reference_images/3.jpg differ
diff --git a/src/examples/reference_images/4.jpg b/src/examples/reference_images/4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..60069f8c062a10f056198dda87e2e1ee91ef91e6
Binary files /dev/null and b/src/examples/reference_images/4.jpg differ
diff --git a/src/examples/reference_images/5.jpg b/src/examples/reference_images/5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d4f5e47b522e9a8f46576000c13ba201cc3927e3
Binary files /dev/null and b/src/examples/reference_images/5.jpg differ
diff --git a/src/examples/reference_images/6.jpg b/src/examples/reference_images/6.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..22f1940d69740c7505268cdbb3505db4333e430f
Binary files /dev/null and b/src/examples/reference_images/6.jpg differ
diff --git a/src/examples/reference_images/7.jpg b/src/examples/reference_images/7.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c9e4ddba8c7f8757e9de71b975c4a948ba940a40
--- /dev/null
+++ b/src/examples/reference_images/7.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f03a04f1de9055c626aa09c471115da0365d9d6c25a62c227e8eb3dfba53993
+size 774017
diff --git a/src/examples/silent-audio.wav b/src/examples/silent-audio.wav
new file mode 100644
index 0000000000000000000000000000000000000000..933cfa5a4a2a3857b224ab06c1fef2fcc759eab4
--- /dev/null
+++ b/src/examples/silent-audio.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:231cedffe295d0f5c8ea8569af9edc2471c262410689190bb705fb0adb62f63f
+size 352878
diff --git a/src/models/audio/__pycache__/audio_processer.cpython-310.pyc b/src/models/audio/__pycache__/audio_processer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2621e68c41aa8397e9fe3fb1409ada6ae07e119
Binary files /dev/null and b/src/models/audio/__pycache__/audio_processer.cpython-310.pyc differ
diff --git a/src/models/audio/__pycache__/audio_proj.cpython-310.pyc b/src/models/audio/__pycache__/audio_proj.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50c0c28fa0c4cb86b9721f39dbb6f4098b7fa7aa
Binary files /dev/null and b/src/models/audio/__pycache__/audio_proj.cpython-310.pyc differ
diff --git a/src/models/audio/__pycache__/hubert.cpython-310.pyc b/src/models/audio/__pycache__/hubert.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a29a52400e21933ee03c00869508f3abc2d1d20a
Binary files /dev/null and b/src/models/audio/__pycache__/hubert.cpython-310.pyc differ
diff --git a/src/models/audio/__pycache__/wav2vec.cpython-310.pyc b/src/models/audio/__pycache__/wav2vec.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d267be918cd5cad0acf122f2cec7c166085d3608
Binary files /dev/null and b/src/models/audio/__pycache__/wav2vec.cpython-310.pyc differ
diff --git a/src/models/audio/__pycache__/wav2vec2.cpython-310.pyc b/src/models/audio/__pycache__/wav2vec2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb1cc6bd90a7dc7438aed1e72ccabbf3b8bbc9dc
Binary files /dev/null and b/src/models/audio/__pycache__/wav2vec2.cpython-310.pyc differ
diff --git a/src/models/audio/__pycache__/wav2vec_modified.cpython-310.pyc b/src/models/audio/__pycache__/wav2vec_modified.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfed37a31b3ee3920d9035a1a06d0d4e2690bf05
Binary files /dev/null and b/src/models/audio/__pycache__/wav2vec_modified.cpython-310.pyc differ
diff --git a/src/models/audio/audio_processer.py b/src/models/audio/audio_processer.py
new file mode 100644
index 0000000000000000000000000000000000000000..64fe9d6afe1dd56e79c2fb54604f434bf38260c1
--- /dev/null
+++ b/src/models/audio/audio_processer.py
@@ -0,0 +1,407 @@
+"""Audio processer for talking data.
+Author: linzhihui.lzh
+Date: 2024-12-12
+"""
+import os
+from re import A
+import sys
+import os.path as osp
+
+from typing import List, Dict, Tuple, Optional, Union, Any
+
+import yaml
+from omegaconf import OmegaConf
+
+import math
+import librosa
+import numpy as np
+
+from einops import rearrange
+
+import torch
+import torch.nn.functional as F
+
+from pydub import AudioSegment
+# from audio_separator.separator import Separator
+
+sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__))))))
+from src.utils.rprint import rlog as log
+from src.utils.util import resample_audio
+
+from src.models.audio.wav2vec_modified import Wav2VecModel
+from src.models.audio.hubert import HubertModel
+
+
+def pad_audio(audio, audio_unit=320, pad_threshold=80):
+    """Pad a batch of waveforms symmetrically along the sample axis.
+
+    The amount added on each side is
+    ``ceil((audio_unit * n_units + pad_threshold - audio_len) / 2)`` with
+    ``n_units = audio_len // audio_unit``. A negative amount leaves the input
+    untouched.
+
+    Args:
+        audio: 2D tensor; its second dimension is the sample axis.
+        audio_unit: number of samples in one unit.
+        pad_threshold: extra samples added on top of the whole-unit length
+            before computing the per-side amount.
+
+    Returns:
+        The padded tensor, or ``audio`` itself when nothing is added.
+    """
+    _, n_samples = audio.shape
+    whole_units = n_samples // audio_unit
+    per_side = math.ceil((audio_unit * whole_units + pad_threshold - n_samples) / 2)
+    if per_side < 0:
+        return audio
+
+    half, remainder = divmod(per_side, 2)
+    if half > 0:
+        # Two reflect passes of `half` samples add 2 * half samples per side.
+        for _ in range(2):
+            audio = F.pad(audio, (half, half), mode='reflect')
+    if remainder > 0:
+        # An odd per-side amount is completed with one replicated sample.
+        audio = F.pad(audio, (1, 1), mode='replicate')
+
+    return audio
+
+
+def cut_audio(audio_path: str, save_dir: str, length=60) -> List[str]:
+    """Cut audio into sub-divisions and return subfile paths. Supports wav format.
+
+    Segments are written to ``save_dir`` as ``<stem>_segment_<k>.wav`` with
+    ``k`` starting at 1, where ``<stem>`` is the source file name without its
+    final extension.
+
+    Args:
+        audio_path (str): the source audio file path
+        save_dir (str): the save directory of sub-divisions (created if missing)
+        length (int, optional): The max length of each sub-division. Defaults to 60 secs.
+
+    Returns:
+        List[str]: the subfile paths, in playback order
+    """
+    # Strip only the final extension: splitting on the first dot would map
+    # "a.b.wav" and "a.c.wav" to the same stem and overwrite each other's segments.
+    audio_name = osp.splitext(osp.basename(audio_path))[0]
+    audio = AudioSegment.from_wav(audio_path)
+    total_ms = len(audio)  # pydub reports the duration in milliseconds
+    segment_length = length * 1000. # pydub uses milliseconds
+    num_segments = math.ceil(total_ms / segment_length)
+
+    os.makedirs(save_dir, exist_ok=True)
+    audio_list = []
+
+    for i in range(num_segments):
+        start_time = i * segment_length
+        # The last segment stops at the end of the audio.
+        end_time = min((i + 1) * segment_length, total_ms)
+
+        path = osp.join(save_dir, f"{audio_name}_segment_{i+1}.wav")
+        audio[start_time:end_time].export(path, format="wav")
+        audio_list.append(path)
+    return audio_list
+    
+    
+class AudioProcessor(object):
+    """Turn audio files into frame-aligned embeddings from a wav2vec or hubert encoder.
+
+    Device, model weights, data parameters and the optional vocal separator are
+    all read from the OmegaConf YAML file passed to the constructor.
+    """
+
+    def __init__(self, cfg_path: str, is_training: bool = False) -> None:
+        """Load the config, select the device and build the audio encoder.
+
+        Args:
+            cfg_path: path of the YAML config, loaded with ``OmegaConf.load``.
+            is_training: when False, embeddings are extracted under
+                ``torch.no_grad()``.
+        """
+        cfg = OmegaConf.load(cfg_path)
+        self.cfg = cfg
+        self.is_training = is_training
+        log("========================================= Audio Processer =========================================")
+        log(OmegaConf.to_yaml(cfg))
+
+        # setting device
+        self.device_id = cfg.device_params.device_id
+        self.use_half = cfg.device_params.flag_use_half_precision
+        if cfg.device_params.flag_force_cpu:
+            self.device = 'cpu'
+        else:
+            try:
+                use_mps = torch.backends.mps.is_available()
+            except Exception:
+                # Any failure while probing MPS falls back to CUDA.
+                use_mps = False
+            self.device = 'mps' if use_mps else 'cuda:' + str(self.device_id)
+
+        # init audio separator
+        self.audio_separator = None
+        self.cache_dir = cfg.cache_dir
+        self.tmp_dir = cfg.tmp_dir
+        self.use_audio_separator = cfg.model_params.use_audio_separator
+        self.audio_separator_name = cfg.model_params.audio_separator_name
+        self.audio_separator_path = cfg.model_weights.audio_separator_path
+        self.set_audio_separator(cfg.cache_dir)
+
+        # load audio encoder, wav2vec or hubert
+        self.model_name = cfg.model_params.model_name
+        self.is_chinese = cfg.model_params.is_chinese
+        self.audio_encoder = self.load_model(
+            model_name = cfg.model_params.model_name,
+            model_type = cfg.model_params.model_type,
+            is_chinese = cfg.model_params.is_chinese,
+        )
+        self.only_last_features = cfg.model_params.only_last_features
+        if cfg.model_params.only_last_features:
+            self.feature_shape = (1, 768)
+        else:
+            self.feature_shape = (12, 768)     # features of 12 blocks
+
+        # init data params
+        self.sample_strategy = cfg.data_params.sample_strategy
+        self.sample_rate = cfg.data_params.sample_rate
+        self.fps = cfg.data_params.fps
+        self.audio_unit = cfg.data_params.sample_rate / cfg.data_params.fps   # num of audio samples per frame
+        self.max_length = cfg.data_params.max_length
+        self.subclip_len = cfg.data_params.sub_clip_length
+        self.save_to_cpu = cfg.data_params.save_to_cpu
+        self.pad_mode = cfg.data_params.audio_pad_mode
+
+        log("========================================= Audio Processer: Done =========================================")
+
+    def load_model(self, model_name: str="wav2vec", model_type: str="base", is_chinese: bool = False):
+        """Load the pretrained audio encoder from local weights and put it in eval mode.
+
+        Args:
+            model_name: ``"wav2vec"`` or ``"hubert"``.
+            model_type: ``"base"`` or ``"large"``.
+            is_chinese: pick the ``chinese`` weight path instead of ``default``.
+
+        Returns:
+            The encoder moved to ``self.device`` (half precision when
+            ``self.use_half`` is set), in eval mode.
+
+        Raises:
+            AssertionError: for an unknown ``model_name`` or ``model_type``.
+            ValueError: if the configured weight path is None.
+        """
+        assert model_name in ["wav2vec", "hubert"], f"Unknown audio model {model_name}, only support wav2vec or hubert"
+        assert model_type in ["base", "large"], f"Unknown audio model type {model_type}, only support base or large"
+
+        # The config stores paths as <model>_path.<chinese|default>.<base|large>.
+        if model_name == "wav2vec":
+            weight_paths = self.cfg.model_weights.wav2vec_path
+            model_cls = Wav2VecModel
+        else:
+            weight_paths = self.cfg.model_weights.hubert_path
+            model_cls = HubertModel
+        language_paths = weight_paths.chinese if is_chinese else weight_paths.default
+        model_weight_path = language_paths.base if model_type == "base" else language_paths.large
+        if model_weight_path is None:
+            raise ValueError(f"model_weight_path is None")
+        audio_encoder = model_cls.from_pretrained(model_weight_path, local_files_only=True).to(device=self.device)
+
+        log(f"{model_name}-{model_type}-chinese-{is_chinese} model has beed loaded from {model_weight_path}")
+        total_params = sum(p.numel() for p in audio_encoder.parameters())
+        print('Number of parameter: % .4fM' % (total_params / 1e6))
+
+        # weights initialization
+        audio_encoder.feature_extractor._freeze_parameters()
+        if not self.cfg.model_params.is_original:
+            # Additionally freeze the feature projection and encoder layers 0 and 1.
+            frozen_layers = [0, 1]
+            for name, param in audio_encoder.named_parameters():
+                if name.startswith("feature_projection"):
+                    param.requires_grad = False
+                if name.startswith("encoder.layers"):
+                    layer = int(name.split(".")[2])
+                    if layer in frozen_layers:
+                        param.requires_grad = False
+
+        audio_encoder = audio_encoder.to(self.device)
+        if self.use_half:
+            audio_encoder = audio_encoder.half()
+        audio_encoder.eval()
+        return audio_encoder
+
+    def set_audio_separator(self, output_dir: str) -> None:
+        """(Re)create the vocal separator writing to ``output_dir``, or disable it.
+
+        The separator is built only when ``use_audio_separator`` is set and a
+        separator name is configured; otherwise ``self.audio_separator`` is None.
+
+        Args:
+            output_dir: directory handed to the separator as its output dir.
+
+        Raises:
+            ImportError: if separation is enabled but the ``audio_separator``
+                package cannot be imported.
+            AssertionError: if the separator model fails to load.
+        """
+        # Drop any previously built separator before (maybe) creating a new one.
+        self.audio_separator = None
+
+        if self.audio_separator_name is not None and self.use_audio_separator:
+            # Imported lazily: the module-level import is disabled, so the name
+            # would otherwise be undefined when the separator is enabled.
+            try:
+                from audio_separator.separator import Separator
+            except ImportError as err:
+                raise ImportError(
+                    "use_audio_separator is enabled but the 'audio_separator' package is not installed."
+                ) from err
+            try:
+                os.makedirs(output_dir, exist_ok=True)
+            except OSError as _:
+                print("Fail to create the output cache dir.")
+            self.audio_separator = Separator(
+                output_dir=output_dir,
+                output_single_stem="vocals",
+                model_file_dir=self.audio_separator_path,
+            )
+            self.audio_separator.load_model(self.audio_separator_name)
+            assert self.audio_separator.model_instance is not None, "Fail to load audio separate model."
+        else:
+            log("Use audio directly without vocals seperator.")
+
+    def seperate_audio(self, audio_path: str, output_dir: Union[str, None] = None) -> str:
+        """Extract the vocal track of ``audio_path`` when a separator is available.
+
+        Args:
+            audio_path: source audio file.
+            output_dir: optional output directory; when it differs from
+                ``self.cache_dir`` the separator is rebuilt for it.
+
+        Returns:
+            Path of the resampled vocal file, or ``audio_path`` itself when no
+            separator is configured or separation fails.
+        """
+        if output_dir is not None:
+            if output_dir != self.cache_dir:
+                # reload audio separator
+                self.set_audio_separator(output_dir)
+
+        if self.audio_separator is not None:
+            # 1. separate vocals
+            # TODO: process in memory
+            try:
+                outputs = self.audio_separator.separate(audio_path)
+                if len(outputs) <= 0:
+                    raise RuntimeError("Audio separate failed.")
+
+                vocal_audio_file = outputs[0]
+                vocal_audio_name, _ = os.path.splitext(vocal_audio_file)
+                vocal_audio_file = os.path.join(self.audio_separator.output_dir, vocal_audio_file)
+                vocal_audio_file = resample_audio(vocal_audio_file, os.path.join(self.audio_separator.output_dir, f"{vocal_audio_name}-16k.wav"), self.sample_rate)
+            except Exception as e:
+                # Best effort: fall back to the unseparated audio.
+                log(f"Fail to separate vocals from {audio_path}, error info [{e}]")
+                vocal_audio_file=audio_path
+        else:
+            vocal_audio_file=audio_path
+
+        return vocal_audio_file
+
+    def load_audio(self, audio_path: str, mono: bool = True, duration: Optional[float] = None) -> Any:
+        """Load audio with librosa, resampled to ``self.sample_rate``.
+
+        Returns:
+            ``(audio_data, sampling_rate)`` as returned by ``librosa.load``.
+
+        Raises:
+            RuntimeError: if loading fails; the original error is chained.
+        """
+        try:
+            audio_data, sampling_rate = librosa.load(audio_path, sr=self.sample_rate, mono=mono, duration=duration)
+        except Exception as e:
+            raise RuntimeError(f"Fail to load audio from {audio_path}, error info [{e}]") from e
+        return audio_data, sampling_rate
+
+    def prepare_audio_data(self, audio_data: Union[np.ndarray, torch.Tensor], n_frames: Optional[int]=None) -> Tuple[List[Any], int]:
+        """Normalize, pad and split a 1D waveform into segments.
+
+        Args:
+            audio_data: 1D waveform; numpy input is converted to a tensor on
+                ``self.device``.
+            n_frames: expected frame count. When given it must be within 2 of
+                the count derived from the audio length, and it then replaces
+                that count.
+
+        Returns:
+            ``(segments, n_frames)``. Each segment is a dict with keys
+            ``data`` (tensor with a leading batch dimension of 1),
+            ``start_idx``, ``end_idx`` and ``length`` (all in frames).
+            ``segments`` is empty when ``n_frames`` is too far from the
+            derived frame count.
+
+        Raises:
+            ValueError: if padding is needed and ``self.pad_mode`` is neither
+                ``'zero'`` nor ``'replicate'``.
+        """
+        clip_len = int(len(audio_data) / self.audio_unit)
+        if n_frames is not None:
+            if abs(n_frames - clip_len) > 2:
+                log(f"The number of frames must be close to the clip length (in 80ms), got {n_frames} and {clip_len}")
+                return [], n_frames
+            clip_len = n_frames
+        else:
+            n_frames = clip_len
+
+        # normalize audio, replace Wav2Vec2FeatureExtractor
+        if isinstance(audio_data, np.ndarray):
+            audio_data = torch.from_numpy(audio_data).to(self.device)
+        assert audio_data.ndim == 1, 'Audio must be 1D tensor.'
+        audio_data = (audio_data - torch.mean(audio_data)) / (torch.std(audio_data) + 1e-7)
+
+        # padding audio to fit the clip length
+        n_audio_samples = round(self.audio_unit * clip_len)
+        n_padding_audio_samples = n_audio_samples - len(audio_data)
+        if n_padding_audio_samples > 0:
+            if self.pad_mode == 'zero':
+                padding_value = 0
+            elif self.pad_mode == 'replicate':
+                padding_value = float(audio_data[-1])
+            else:
+                raise ValueError(f'Unknown pad mode: {self.pad_mode}')
+            audio_data = F.pad(audio_data, (0, n_padding_audio_samples), value=padding_value)
+
+        # devide audio into sub-divisions for saving GPU memory
+        audio_segments = []
+        if clip_len <= self.subclip_len:
+            n_subdivision = 1
+            subclip_len = clip_len
+        else:
+            n_subdivision = math.ceil(clip_len / self.subclip_len)
+            subclip_len = self.subclip_len
+
+        for i in range(n_subdivision):
+            start_idx = i * subclip_len
+            end_idx = min(start_idx + subclip_len, clip_len)
+            # Frame indices are converted to sample indices via audio_unit.
+            sample_start = round(start_idx * self.audio_unit)
+            sample_end = round(end_idx * self.audio_unit)
+            audio_segments.append(
+                {
+                    "data": audio_data[sample_start:sample_end].unsqueeze(0),
+                    "start_idx": start_idx,
+                    "end_idx": end_idx,
+                    "length": end_idx - start_idx
+                }
+            )
+        return audio_segments, n_frames
+
+    def get_audio_embedding(self, audio, clip_len: int) -> torch.Tensor:
+        """Return the embedding of one segment.
+
+        Args:
+            audio: 2D raw waveform (encoded here) or 3D precomputed features
+                (returned unchanged).
+            clip_len: number of frames the segment covers.
+
+        Raises:
+            AssertionError: if the input length does not match ``clip_len``.
+            ValueError: if ``audio`` is neither 2D nor 3D.
+        """
+        if audio.ndim == 2:
+            # Extract audio features
+            assert audio.shape[1] == 16000 * clip_len / self.fps, \
+                f'Incorrect audio length {audio.shape[1]}'
+
+            if self.use_half:
+                audio = audio.half()
+            embeddings = self.audio_encoder(
+                pad_audio(audio), seq_len=clip_len, sample_strategy=self.sample_strategy, output_hidden_states=True
+            )  # (N, L, 768)
+            assert len(embeddings) > 0, "Fail to extract audio embedding"
+
+            if self.only_last_features:
+                audio_emb = embeddings.last_hidden_state.squeeze(0)
+            else:
+                # Stack every hidden state except the first, then put the
+                # sequence axis in front.
+                audio_emb = torch.stack(
+                    embeddings.hidden_states[1:], dim=1
+                ).squeeze(0)
+                audio_emb = rearrange(audio_emb, "b s d -> s b d")
+
+        elif audio.ndim == 3:
+            assert audio.shape[1] == clip_len, f'Incorrect audio feature length {audio.shape[1]}'
+            audio_emb = audio
+        else:
+            raise ValueError(f'Incorrect audio input shape {audio.shape}')
+
+        return audio_emb
+
+    def get_audio_embeddings(self, audio_segments: List[Any]) -> Optional[torch.Tensor]:
+        """Embed every segment and concatenate the results along dim 0.
+
+        Returns:
+            The concatenated embeddings, or None when ``audio_segments`` is empty.
+        """
+        audio_embs = []
+        for audio_segment in audio_segments:
+            if self.is_training:
+                audio_emb = self.get_audio_embedding(audio_segment["data"], audio_segment["length"])
+            else:
+                with torch.no_grad():
+                    audio_emb = self.get_audio_embedding(audio_segment["data"], audio_segment["length"])
+
+            audio_emb = audio_emb.cpu() if self.save_to_cpu else audio_emb
+            audio_embs.append(audio_emb)
+
+        if len(audio_embs) == 0:
+            return None
+
+        audio_emb = torch.cat(audio_embs, dim=0)
+
+        return audio_emb
+
+    def preprocess(
+        self,
+        audio_path: str,
+        n_frames: Optional[int] = None,
+        duration: Optional[float] = None,
+        need_seperate: bool = False
+    ):
+        """Load an audio file and convert it into encoder embeddings.
+
+        When ``need_seperate`` is set the vocals are separated first (see
+        ``seperate_audio``); the audio is then loaded at ``self.sample_rate``,
+        which must be 16 kHz, segmented and embedded.
+
+        Returns:
+            ``(audio_emb, n_frames)``; ``audio_emb`` is None when no segment
+            was produced.
+        """
+        if need_seperate:
+            vocal_audio_file = self.seperate_audio(audio_path)
+        else:
+            vocal_audio_file = audio_path
+
+        audio_data, sampling_rate = self.load_audio(vocal_audio_file, duration=duration)
+
+        assert sampling_rate == 16000, "The sample rate of audio must be 16000"
+        audio_segments, n_frames = self.prepare_audio_data(audio_data, n_frames)
+        audio_emb = self.get_audio_embeddings(audio_segments)
+        if audio_emb is None:
+            log(f"{audio_path} has been processed, but no audio embedding, set as 'None'.")
+        return audio_emb, n_frames
+
+    def preprocess_long(
+        self,
+        audio_path: str,
+        need_seperate: bool = False
+    ):
+        """Embed a long audio file by cutting it into temporary segments.
+
+        The file is cut into pieces of at most ``self.max_length`` seconds in
+        ``self.tmp_dir``; each piece is embedded with ``preprocess`` and the
+        results are concatenated.
+
+        Returns:
+            ``(audio_emb, audio_length)`` where ``audio_length`` is the sum of
+            the per-segment frame counts.
+        """
+        audio_list = cut_audio(audio_path, self.tmp_dir, length=self.max_length)
+        audio_emb_list = []
+        audio_length = 0
+
+        try:
+            for idx, segment_path in enumerate(audio_list):
+                emb, length = self.preprocess(segment_path, need_seperate=need_seperate)
+                audio_emb_list.append(emb)
+                log(f"Processing audio {idx+1}/{len(audio_list)}, path: {segment_path} length: {length}")
+                audio_length += length
+
+            audio_emb = torch.cat(audio_emb_list)
+        finally:
+            # remove tmp files, also when a segment fails to process
+            for segment_path in audio_list:
+                if osp.exists(segment_path):
+                    os.remove(segment_path)
+
+        return audio_emb, audio_length
+
+    def __enter__(self):
+        """Return the processor itself for use in a ``with`` statement."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Leave the ``with`` block without suppressing exceptions."""
+        return False
+
+
diff --git a/src/models/audio/audio_proj.py b/src/models/audio/audio_proj.py
new file mode 100644
index 0000000000000000000000000000000000000000..9edf7d2ee770f2f4dbe173dd03a1e87cf6417543
--- /dev/null
+++ b/src/models/audio/audio_proj.py
@@ -0,0 +1,124 @@
+"""
+This module provides the implementation of an Audio Projection Model, which is designed for
+audio processing tasks. The model takes audio embeddings as input and outputs context tokens
+that can be used for various downstream applications, such as audio analysis or synthesis.
+
+The AudioProjModel class is based on the ModelMixin class from the diffusers library, which
+provides a foundation for building custom models. This implementation includes multiple linear
+layers with ReLU activation functions and a LayerNorm for normalization.
+
+Key Features:
+- Audio embedding input with flexible sequence length and block structure.
+- Multiple linear layers for feature transformation.
+- ReLU activation for non-linear transformation.
+- LayerNorm for stabilizing and speeding up training.
+- Rearrangement of input embeddings to match the model's expected input shape.
+- Customizable number of blocks, channels, and context tokens for adaptability.
+
+The module is structured to be easily integrated into larger systems or used as a standalone
+component for audio feature extraction and processing.
+
+Classes:
+- AudioProjModel: A class representing the audio projection model with configurable parameters.
+
+Functions:
+- (none)
+
+Dependencies:
+- torch: For tensor operations and neural network components.
+- diffusers: For the ModelMixin base class.
+- einops: For tensor rearrangement operations.
+
+"""
+
+import torch
+from diffusers import ModelMixin
+from einops import rearrange
+from torch import nn
+
+
+class AudioProjModel(ModelMixin):
+    """Project windowed audio features to a fixed number of context tokens.
+
+    For every frame, the window of audio features is flattened, passed through
+    three linear layers (ReLU after the first two), reshaped into
+    ``context_tokens`` vectors of size ``output_dim`` and layer-normalized over
+    the last dimension.
+
+    Attributes:
+        seq_len (int): Window length expected on the ``w`` axis of the input.
+        blocks (int): Size expected on the ``b`` axis of the input.
+        channels (int): Size expected on the ``c`` axis of the input.
+        input_dim (int): ``seq_len * blocks * channels``; the flattened
+            per-frame size consumed by the first linear layer.
+        intermediate_dim (int): Width of the two hidden linear layers.
+        context_tokens (int): Number of tokens produced per frame.
+        output_dim (int): Size of each produced token.
+    """
+
+    def __init__(
+        self,
+        seq_len=5,
+        blocks=12,
+        channels=768,
+        intermediate_dim=512,
+        output_dim=768,
+        context_tokens=32,
+    ):
+        """
+        Build the projection layers.
+
+        Note the positional order: ``output_dim`` comes before
+        ``context_tokens``.
+
+        Parameters:
+            seq_len (int): Window length of the input features.
+            blocks (int): Number of blocks per window step.
+            channels (int): Number of channels per block.
+            intermediate_dim (int): Width of the hidden layers.
+            output_dim (int): Size of each output token.
+            context_tokens (int): Number of output tokens per frame.
+        """
+        super().__init__()
+
+        self.seq_len = seq_len
+        self.blocks = blocks
+        self.channels = channels
+        # One frame is flattened over window, blocks and channels.
+        self.input_dim = seq_len * blocks * channels
+        self.intermediate_dim = intermediate_dim
+        self.context_tokens = context_tokens
+        self.output_dim = output_dim
+
+        self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
+        self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
+        self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
+
+        self.norm = nn.LayerNorm(output_dim)
+
+    def forward(self, audio_embeds):
+        """
+        Defines the forward pass for the AudioProjModel.
+
+        Parameters:
+            audio_embeds (torch.Tensor): Input of shape
+                (batch_size, video_length, window, blocks, channels); the
+                product ``window * blocks * channels`` has to match
+                ``self.input_dim`` for the first linear layer to accept it.
+
+        Returns:
+            context_tokens (torch.Tensor): Output of shape
+                (batch_size, video_length, context_tokens, output_dim).
+        """
+        video_length = audio_embeds.shape[1]
+        # Fold the frame axis into the batch axis so every frame is projected
+        # independently.
+        audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
+        batch_size, window_size, blocks, channels = audio_embeds.shape
+        # reshape instead of view: also works when the input is not contiguous.
+        audio_embeds = audio_embeds.reshape(batch_size, window_size * blocks * channels)
+
+        audio_embeds = torch.relu(self.proj1(audio_embeds))
+        audio_embeds = torch.relu(self.proj2(audio_embeds))
+
+        context_tokens = self.proj3(audio_embeds).reshape(
+            batch_size, self.context_tokens, self.output_dim
+        )
+
+        context_tokens = self.norm(context_tokens)
+        # Restore the separate batch and frame axes.
+        context_tokens = rearrange(
+            context_tokens, "(bz f) m c -> bz f m c", f=video_length
+        )
+
+        return context_tokens
diff --git a/src/models/audio/hubert.py b/src/models/audio/hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc77b7b2b518807b831d8b268f015cf6acf443d
--- /dev/null
+++ b/src/models/audio/hubert.py
@@ -0,0 +1,120 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+
+from transformers import HubertModel
+from transformers.modeling_outputs import BaseModelOutput
+
+
+_CONFIG_FOR_DOC = 'HubertConfig'
+
+
+def linear_interpolation(features, seq_len):
+    """
+    Linearly resample features along their time axis.
+
+    Args:
+        features (torch.Tensor): Features laid out as (batch, time, channels).
+        seq_len: Target number of time steps, passed as ``size`` to
+            ``F.interpolate``.
+
+    Returns:
+        torch.Tensor: Resampled features laid out as (batch, seq_len, channels).
+    """
+    # F.interpolate resamples the last axis, so time has to be moved there.
+    channels_first = torch.transpose(features, 1, 2)
+    resampled = F.interpolate(channels_first, size=seq_len, mode='linear', align_corners=True)
+    return torch.transpose(resampled, 1, 2)
+
+
+class HubertModel_(HubertModel):
+    """HuBERT model whose features are linearly resampled to ``seq_len`` steps.
+
+    Resampling happens either on the convolutional features before the
+    transformer encoder (``"presample"``) or on the encoder outputs
+    (``"postsample"``).
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        seq_len: Optional[int],
+        sample_strategy: Optional[str] = "presample",
+        attention_mask: Optional[torch.LongTensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """
+        Forward pass of the HuBERT model.
+
+        Args:
+            input_values: The input values (waveform) to the model.
+            seq_len: Number of time steps the features are interpolated to.
+            sample_strategy: Where the interpolation happens: 'presample'
+                (before the encoder) or 'postsample' (after the encoder).
+            attention_mask: Attention mask to be used for the model.
+            mask_time_indices: Mask indices to be used for the model.
+            output_attentions: If set to True, returns attentions.
+            output_hidden_states: If set to True, returns hidden states.
+            return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
+
+        Returns:
+            A ``BaseModelOutput`` when ``return_dict`` resolves to true,
+            otherwise a tuple whose first element is the last hidden state.
+        """
+        assert sample_strategy in ["presample", "postsample"], \
+            "sample_strategy must be in ['presample', 'postsample']"
+        # Kept from the original: the flag is switched on in the shared config
+        # on every call, so attentions are returned unless the caller passes
+        # output_attentions=False explicitly.
+        self.config.output_attentions = True
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)  # (N, C, L)
+        extract_features = extract_features.transpose(1, 2)
+        if sample_strategy == "presample":
+            extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states,
+            mask_time_indices=mask_time_indices,
+            attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+        postsample = sample_strategy == "postsample"
+        if postsample:
+            hidden_states = linear_interpolation(hidden_states, seq_len=seq_len)
+
+        if not return_dict:
+            # Tuple output: the per-layer hidden states, when requested, come
+            # right after the last hidden state.
+            extras = encoder_outputs[1:]
+            if postsample and output_hidden_states and extras:
+                resampled = tuple(linear_interpolation(h, seq_len=seq_len) for h in extras[0])
+                extras = (resampled,) + extras[1:]
+            return (hidden_states,) + extras
+
+        # The per-layer hidden states are an immutable tuple (or None when not
+        # requested), so build a new tuple instead of assigning in place.
+        all_hidden_states = encoder_outputs.hidden_states
+        if postsample and all_hidden_states is not None:
+            all_hidden_states = tuple(
+                linear_interpolation(h, seq_len=seq_len) for h in all_hidden_states
+            )
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
diff --git a/src/models/audio/hubert2.py b/src/models/audio/hubert2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5feb39d687d3a28398ee7be892c7769538539737
--- /dev/null
+++ b/src/models/audio/hubert2.py
@@ -0,0 +1,120 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+
+from transformers import HubertModel
+from transformers.modeling_outputs import BaseModelOutput
+
+
+_CONFIG_FOR_DOC = 'HubertConfig'
+
+
+def linear_interpolation(features, seq_len):
+    """
+    Resample (batch, time, channels) features to ``seq_len`` time steps.
+
+    Args:
+        features (torch.Tensor): The extracted features to be interpolated.
+        seq_len: Number of time steps of the result (the ``size`` argument of
+            ``F.interpolate``).
+
+    Returns:
+        torch.Tensor: The interpolated features, time axis back at position 1.
+    """
+    # Axes 1 and 2 are swapped around the call because F.interpolate works on
+    # the last axis.
+    return F.interpolate(
+        features.swapaxes(1, 2),
+        size=seq_len,
+        mode='linear',
+        align_corners=True,
+    ).swapaxes(1, 2)
+
+
+class HubertModel(HubertModel):
+    """HuBERT variant that interpolates its features to a requested length.
+
+    The class deliberately reuses the name of the imported base class; the
+    base is resolved before the new name is bound, so the import is shadowed
+    only after this definition.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        seq_len: Optional[int],
+        sample_strategy: Optional[str] = "presample",
+        attention_mask: Optional[torch.LongTensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """
+        Forward pass of the HuBERT model.
+
+        Args:
+            input_values: The input values (waveform) to the model.
+            seq_len: Target number of time steps for the interpolation.
+            sample_strategy: 'presample' interpolates the convolutional
+                features before the encoder; 'postsample' interpolates the
+                encoder outputs instead.
+            attention_mask: Attention mask to be used for the model.
+            mask_time_indices: Mask indices to be used for the model.
+            output_attentions: If set to True, returns attentions.
+            output_hidden_states: If set to True, returns hidden states.
+            return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
+
+        Returns:
+            The output of the HuBERT model: a ``BaseModelOutput`` or, when
+            ``return_dict`` resolves to false, a plain tuple.
+        """
+        assert sample_strategy in ["presample", "postsample"], \
+            "sample_strategy must be in ['presample', 'postsample']"
+
+        def _to_seq_len(tensor):
+            # Single place that knows how tensors are brought to seq_len steps.
+            return linear_interpolation(tensor, seq_len=seq_len)
+
+        # Original behaviour preserved: the config flag is forced on, which
+        # makes attentions the default unless explicitly disabled per call.
+        self.config.output_attentions = True
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)  # (N, C, L)
+        extract_features = extract_features.transpose(1, 2)
+        if sample_strategy == "presample":
+            extract_features = _to_seq_len(extract_features)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states,
+            mask_time_indices=mask_time_indices,
+            attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+        resample_outputs = sample_strategy == "postsample"
+        if resample_outputs:
+            hidden_states = _to_seq_len(hidden_states)
+
+        if return_dict:
+            layer_states = encoder_outputs.hidden_states
+            # layer_states is None when hidden states were not requested and
+            # an immutable tuple otherwise, so a fresh tuple is built rather
+            # than assigning into it element by element.
+            if resample_outputs and layer_states is not None:
+                layer_states = tuple(_to_seq_len(state) for state in layer_states)
+            return BaseModelOutput(
+                last_hidden_state=hidden_states,
+                hidden_states=layer_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        # Tuple output: per-layer hidden states, if requested, directly follow
+        # the last hidden state.
+        remainder = encoder_outputs[1:]
+        if resample_outputs and output_hidden_states and remainder:
+            remainder = (tuple(_to_seq_len(state) for state in remainder[0]),) + remainder[1:]
+        return (hidden_states,) + remainder
diff --git a/src/models/audio/wav2vec.py b/src/models/audio/wav2vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa21e661cb0258c7635ca657ffbddc78f5765e26
--- /dev/null
+++ b/src/models/audio/wav2vec.py
@@ -0,0 +1,210 @@
+
+
+"""
+This module defines the Wav2Vec model, which is a pre-trained model for speech recognition and understanding.
+It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionalities
+such as feature extraction and encoding.
+
+Classes:
+    Wav2VecModel: Inherits from Wav2Vec2Model and adds additional methods for feature extraction and encoding.
+
+Functions:
+    linear_interpolation: Interpolates the features based on the sequence length.
+"""
+
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from transformers import Wav2Vec2Model
+from transformers.modeling_outputs import BaseModelOutput
+
+
+class Wav2VecModel(Wav2Vec2Model):
+    """
+    Wav2Vec2Model whose convolutional features are resampled to a requested length.
+
+    The pipeline is split in two halves that can also be called on their own:
+
+    Methods:
+        feature_extract(input_values, seq_len):
+            Runs the convolutional feature extractor and linearly interpolates
+            its output to ``seq_len`` time steps.
+
+        encode(extract_features, attention_mask=None, mask_time_indices=None,
+               output_attentions=None, output_hidden_states=None, return_dict=None):
+            Projects, optionally masks and encodes already extracted features.
+
+        forward(input_values, seq_len, attention_mask=None, mask_time_indices=None,
+                output_attentions=None, output_hidden_states=None, return_dict=None):
+            ``feature_extract`` followed by ``encode``.
+    """
+    def forward(
+        self,
+        input_values,
+        seq_len,
+        attention_mask=None,
+        mask_time_indices=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        """
+        Forward pass of the Wav2Vec model.
+
+        Args:
+            input_values: The input values (waveform) to the model.
+            seq_len: Number of time steps the extracted features are interpolated to.
+            attention_mask: Attention mask to be used for the model.
+            mask_time_indices: Mask indices to be used for the model.
+            output_attentions: Forwarded to the encoder as given.
+            output_hidden_states: If set to True, returns hidden states.
+            return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
+
+        Returns:
+            The output of the Wav2Vec model (see ``encode``).
+        """
+        # The two stages are exactly the public helpers below; delegating keeps
+        # a single copy of the pipeline.
+        extract_features = self.feature_extract(input_values, seq_len)
+        return self.encode(
+            extract_features,
+            attention_mask=attention_mask,
+            mask_time_indices=mask_time_indices,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+    def feature_extract(
+        self,
+        input_values,
+        seq_len,
+    ):
+        """
+        Extracts features from the input values and returns the extracted features.
+
+        Parameters:
+        input_values (torch.Tensor): The input values to be processed.
+        seq_len: Number of time steps the features are interpolated to.
+
+        Returns:
+        extracted_features (torch.Tensor): Features with the time axis at
+            position 1, resampled to ``seq_len`` steps.
+        """
+        extract_features = self.feature_extractor(input_values)
+        # Move time to axis 1 before resampling along it.
+        extract_features = extract_features.transpose(1, 2)
+        extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+        return extract_features
+
+    def encode(
+        self,
+        extract_features,
+        attention_mask=None,
+        mask_time_indices=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        """
+        Encodes the input features into the output space.
+
+        Args:
+            extract_features (torch.Tensor): The extracted features from the audio signal.
+            attention_mask (torch.Tensor, optional): Attention mask to be used for padding.
+            mask_time_indices (torch.Tensor, optional): Masked indices for the time dimension.
+            output_attentions (bool, optional): Forwarded to the encoder as given.
+            output_hidden_states (bool, optional): If set to True, returns all hidden states.
+            return_dict (bool, optional): If set to True, returns a BaseModelOutput instead of the tuple.
+
+        Returns:
+            A ``BaseModelOutput`` when ``return_dict`` resolves to true,
+            otherwise a tuple starting with the last hidden state.
+        """
+        # Kept from the original: the flag is set on the shared config, while
+        # the ``output_attentions`` argument is still passed on unchanged.
+        self.config.output_attentions = True
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        # feature_projection yields a pair; only the projected states are used.
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+
+        if not return_dict:
+            return (hidden_states, ) + encoder_outputs[1:]
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+def linear_interpolation(features, seq_len):
+    """
+    Interpolate features linearly along the time axis.
+
+    Args:
+        features (torch.Tensor): Features with time on axis 1 and channels on
+            axis 2.
+        seq_len: Length of the time axis after interpolation; handed to
+            ``F.interpolate`` as ``size``.
+
+    Returns:
+        torch.Tensor: The interpolated features in the input's axis order.
+    """
+    time_last = features.transpose(1, 2)  # F.interpolate resamples the last axis
+    options = {"size": seq_len, "align_corners": True, "mode": 'linear'}
+    stretched = F.interpolate(time_last, **options)
+    return stretched.transpose(1, 2)
diff --git a/src/models/audio/wav2vec2.py b/src/models/audio/wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d42569419e8f6ab01acbc67515a0b2db26bc3d9
--- /dev/null
+++ b/src/models/audio/wav2vec2.py
@@ -0,0 +1,123 @@
+from packaging import version
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from transformers import Wav2Vec2Model
+from transformers.modeling_outputs import BaseModelOutput
+
+_CONFIG_FOR_DOC = 'Wav2Vec2Config'
+
+
+# the implementation of Wav2Vec2Model is borrowed from
+# https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
+# initialize our encoder with the pre-trained wav2vec 2.0 weights.
+def _compute_mask_indices(shape: Tuple[int, int], mask_prob: float, mask_length: int,
+                          attention_mask: Optional[torch.Tensor] = None, min_masks: int = 0, ) -> np.ndarray:
+    """
+    Sample random spans to mask for every row of a ``shape``-sized grid.
+
+    Args:
+        shape: ``(bsz, all_sz)`` -- number of rows and row length.
+        mask_prob: Scales the number of spans per row
+            (``mask_prob * row_length / mask_length``).
+        mask_length: Length of every span.
+        attention_mask: Optional tensor; entries different from 1 are treated
+            as padding and shorten the usable length of their row.
+        min_masks: Lower bound on the number of spans per row.
+
+    Returns:
+        Boolean ``np.ndarray`` of shape ``(bsz, all_sz)``, ``True`` at masked
+        positions. Every row ends up with the same number of ``True`` entries.
+    """
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    # Adding a uniform sample before truncating rounds the span count up or
+    # down at random.
+    all_num_mask = int(mask_prob * all_sz / float(mask_length) + np.random.rand())
+    all_num_mask = max(min_masks, all_num_mask)
+    mask_idcs = []
+    padding_mask = attention_mask.ne(1) if attention_mask is not None else None
+    for i in range(bsz):
+        if padding_mask is not None:
+            # Usable length of this row excludes padded positions; the span
+            # count is re-drawn for that shorter length.
+            sz = all_sz - padding_mask[i].long().sum().item()
+            num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask
+
+        lengths = np.full(num_mask, mask_length)
+
+        # NOTE(review): when num_mask is 0, ``lengths`` is empty, its sum is 0
+        # and ``lengths[0]`` below would raise IndexError -- confirm callers
+        # always end up with at least one span (e.g. via min_masks).
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+
+        min_len = min(lengths)
+        if sz - min_len <= num_mask:
+            min_len = sz - num_mask - 1
+
+        # Draw distinct span start positions, then expand each start into its
+        # span; positions beyond the usable length and duplicates are dropped.
+        mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+        mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+
+    # Rows with more masked positions than the smallest row are randomly
+    # subsampled down to that size.
+    min_len = min([len(m) for m in mask_idcs])
+    for i, mask_idc in enumerate(mask_idcs):
+        if len(mask_idc) > min_len:
+            mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+        mask[i, mask_idc] = True
+    return mask
+
+
+# linear interpolation layer
+def linear_interpolation(features, input_fps, output_fps, output_len=None):
+    """
+    Linearly resample (N, C, L) features from ``input_fps`` to ``output_fps``.
+
+    Args:
+        features (torch.Tensor): Features with the time axis last, (N, C, L).
+        input_fps: Frame rate of ``features``.
+        output_fps: Frame rate of the result.
+        output_len: Explicit length of the result; when None it is derived as
+            ``int(L / input_fps * output_fps)``.
+
+    Returns:
+        torch.Tensor: Features of shape (N, C, output_len).
+    """
+    duration = features.shape[2] / float(input_fps)
+    target_len = int(duration * output_fps) if output_len is None else output_len
+    return F.interpolate(features, size=target_len, mode='linear', align_corners=False)
+
+
+class Wav2Vec2Model(Wav2Vec2Model):
+    """Wav2Vec2 model that resamples its convolutional features to ``output_fps``.
+
+    The class reuses the name of the imported base class; the base is resolved
+    before the new name is bound.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        # From transformers 4.7.0 on, only the first element of the feature
+        # projection's result is used (see forward).
+        self.is_old_version = version.parse(transformers.__version__) < version.parse('4.7.0')
+
+    def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
+                output_hidden_states=None, return_dict=None, frame_num=None):
+        """
+        Encode a waveform, resampling the 50 fps features to ``output_fps``.
+
+        Args:
+            input_values: The input values (waveform) to the model.
+            output_fps: Frame rate the extracted features are resampled to.
+            attention_mask: Optional mask over ``input_values``.
+            output_attentions: If set to True, returns attentions.
+            output_hidden_states: If set to True, returns hidden states.
+            return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
+            frame_num: Optional exact number of output frames; the features
+                are first cut to ``round(frame_num * 50 / output_fps)`` steps
+                and then interpolated to ``frame_num``.
+
+        Returns:
+            A ``BaseModelOutput`` when ``return_dict`` resolves to true,
+            otherwise a tuple starting with the last hidden state.
+        """
+        # Kept from the original: attentions are switched on in the shared
+        # config, so they are returned unless explicitly disabled per call.
+        self.config.output_attentions = True
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = self.feature_extractor(input_values)  # (N, C, L)
+        # Resample the audio feature @ 50 fps to `output_fps`.
+        if frame_num is not None:
+            hidden_states_len = round(frame_num * 50 / output_fps)
+            hidden_states = hidden_states[:, :, :hidden_states_len]
+        hidden_states = linear_interpolation(hidden_states, 50, output_fps, output_len=frame_num)
+        hidden_states = hidden_states.transpose(1, 2)  # (N, L, C)
+
+        if attention_mask is not None:
+            output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+            attention_mask = torch.zeros(hidden_states.shape[:2], dtype=hidden_states.dtype,
+                                         device=hidden_states.device)
+            # Mark the last valid position of every row, then spread the mark
+            # to all earlier positions with a reversed cumulative sum.
+            attention_mask[(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)] = 1
+            attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+
+        if self.is_old_version:
+            hidden_states = self.feature_projection(hidden_states)
+        else:
+            hidden_states = self.feature_projection(hidden_states)[0]
+
+        if self.config.apply_spec_augment and self.training:
+            batch_size, sequence_length, hidden_size = hidden_states.size()
+            if self.config.mask_time_prob > 0:
+                mask_time_indices = _compute_mask_indices((batch_size, sequence_length), self.config.mask_time_prob,
+                                                          self.config.mask_time_length, attention_mask=attention_mask,
+                                                          min_masks=2, )
+                hidden_states[torch.from_numpy(mask_time_indices)] = self.masked_spec_embed.to(hidden_states.dtype)
+            if self.config.mask_feature_prob > 0:
+                mask_feature_indices = _compute_mask_indices((batch_size, hidden_size), self.config.mask_feature_prob,
+                                                             self.config.mask_feature_length, )
+                mask_feature_indices = torch.from_numpy(mask_feature_indices).to(hidden_states.device)
+                hidden_states[mask_feature_indices[:, None].expand(-1, sequence_length, -1)] = 0
+        encoder_outputs = self.encoder(hidden_states, attention_mask=attention_mask,
+                                       output_attentions=output_attentions, output_hidden_states=output_hidden_states,
+                                       return_dict=return_dict, )
+        hidden_states = encoder_outputs[0]
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        # The former debug loop over encoder_outputs.hidden_states was removed:
+        # it printed on every call and raised TypeError whenever hidden states
+        # were not requested (the attribute is None in that case).
+        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
+                               attentions=encoder_outputs.attentions, )
diff --git a/src/models/audio/wav2vec_modified.py b/src/models/audio/wav2vec_modified.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2068dc64c595e40a744c7aeef93fe58bb06e29e
--- /dev/null
+++ b/src/models/audio/wav2vec_modified.py
@@ -0,0 +1,223 @@
+
+
+"""
+This module defines the Wav2Vec model, which is a pre-trained model for speech recognition and understanding.
+It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionalities
+such as feature extraction and encoding.
+
+Classes:
+    Wav2VecModel: Inherits from Wav2Vec2Model and adds additional methods for feature extraction and encoding.
+
+Functions:
+    linear_interpolation: Interpolates the features based on the sequence length.
+"""
+
+from typing import Optional, Tuple, Union
+import torch
+
+import torch.nn.functional as F
+from transformers import Wav2Vec2Model
+from transformers.modeling_outputs import BaseModelOutput, Wav2Vec2BaseModelOutput
+
+
+class Wav2VecModel(Wav2Vec2Model):
+    """
+    Wav2VecModel is a custom model class that extends the Wav2Vec2Model class from the transformers library. 
+    It inherits all the functionality of the Wav2Vec2Model and adds additional methods for feature extraction and encoding.
+    ...
+
+    Attributes:
+        base_model (Wav2Vec2Model): The base Wav2Vec2Model object.
+
+    Methods:
+        forward(input_values, seq_len, sample_strategy="presample", attention_mask=None, mask_time_indices=None,
+                output_attentions=None, output_hidden_states=None, return_dict=None):
+            Forward pass of the Wav2VecModel.
+            It extracts features from input_values, aligns them to seq_len according to sample_strategy, and encodes them.
+
+        feature_extract(input_values, seq_len):
+            Extracts features from the input_values using the base model.
+
+        encode(extract_features, attention_mask=None, mask_time_indices=None, output_attentions=None, output_hidden_states=None, return_dict=None):
+            Encodes the extracted features using the base model and returns the encoded features.
+    """
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        seq_len: Optional[int],
+        sample_strategy: Optional[str] = "presample",
+        attention_mask: Optional[torch.LongTensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+        """
+        Extract, project and encode audio features, resampled to ``seq_len`` time steps.
+
+        Args:
+            input_values: Input handed to ``self.feature_extractor`` (the waveform).
+            seq_len: Target size of the time axis for ``linear_interpolation``.
+            sample_strategy: "presample" resamples the extracted features before the
+                encoder; "postsample" resamples the encoder outputs instead.
+            attention_mask: Optional mask; reduced to feature resolution before use.
+            mask_time_indices: Optional indices forwarded to ``_mask_hidden_states``.
+            output_attentions: Forwarded unchanged to the encoder.
+            output_hidden_states, return_dict: ``None`` falls back to the config value.
+
+        Returns:
+            ``Wav2Vec2BaseModelOutput`` if ``return_dict``, else a tuple of its non-None fields.
+        """
+        assert sample_strategy in ["presample", "postsample"], "sample_strategy must be in ['presample', 'postsample']"
+        # NOTE(review): this mutates the model config on every call; kept for compatibility.
+        self.config.output_attentions = True
+
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+        if sample_strategy == "presample":
+            extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        # Always request a ModelOutput so its fields can be read by name below.
+        encoder_outputs = self.encoder(
+            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states, return_dict=True,
+        )
+        hidden_states = encoder_outputs.last_hidden_state
+        all_hidden_states = encoder_outputs.hidden_states
+        attentions = encoder_outputs.attentions
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+
+        if sample_strategy == "postsample":
+            hidden_states = linear_interpolation(hidden_states, seq_len=seq_len)
+            # The per-layer states are an immutable tuple (or None): rebuild, don't assign.
+            if all_hidden_states is not None:
+                all_hidden_states = tuple(linear_interpolation(h, seq_len=seq_len) for h in all_hidden_states)
+
+        if not return_dict:
+            extras = tuple(v for v in (all_hidden_states, attentions) if v is not None)
+            return (hidden_states, extract_features) + extras
+
+        return Wav2Vec2BaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=all_hidden_states,
+            attentions=attentions,
+        )
+
+
+    def feature_extract(self, input_values, seq_len):
+        """
+        Run the feature extractor and linearly resample its output to ``seq_len`` steps.
+
+        Mirrors the first stage of ``forward`` with the "presample" strategy; the
+        result can be passed to ``encode``.
+
+        Args:
+            input_values: Input handed to ``self.feature_extractor``.
+            seq_len: Target size of the resampled axis, passed to ``linear_interpolation``.
+
+        Returns:
+            The extractor output with dims 1 and 2 swapped, interpolated to ``seq_len``.
+        """
+        raw = self.feature_extractor(input_values)
+        # linear_interpolation resamples dim 1, so bring the extractor's dim 2 there first.
+        swapped = raw.transpose(1, 2)
+        resampled = linear_interpolation(swapped, seq_len=seq_len)
+
+        return resampled
+
+    def encode(
+        self,
+        extract_features,
+        attention_mask=None,
+        mask_time_indices=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        """
+        Project already-extracted features and run them through the encoder.
+
+        Args:
+            extract_features: Features to project, e.g. the result of ``feature_extract``.
+            attention_mask: Optional mask; reduced to feature resolution before use.
+            mask_time_indices: Optional indices forwarded to ``_mask_hidden_states``.
+            output_attentions: Forwarded unchanged to the encoder.
+            output_hidden_states: ``None`` falls back to ``config.output_hidden_states``.
+            return_dict: ``None`` falls back to ``config.use_return_dict``.
+
+        Returns:
+            ``BaseModelOutput`` if ``return_dict`` is true, otherwise a tuple holding the
+            final hidden states followed by the encoder's remaining outputs.
+        """
+        # NOTE(review): mutates the model config on every call — confirm this is intended.
+        self.config.output_attentions = True
+
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        if attention_mask is not None:
+            # Reduce the mask so it corresponds to the feature vectors.
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        projected, _ = self.feature_projection(extract_features)
+        masked = self._mask_hidden_states(
+            projected, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        enc_out = self.encoder(
+            masked,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden = enc_out[0]
+        if self.adapter is not None:
+            last_hidden = self.adapter(last_hidden)
+
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=last_hidden,
+                hidden_states=enc_out.hidden_states,
+                attentions=enc_out.attentions,
+            )
+        return (last_hidden,) + enc_out[1:]
+
+
+def linear_interpolation(features, seq_len):
+    """
+    Linearly resample ``features`` along dim 1 to ``seq_len`` steps.
+
+    Args:
+        features: 3-D tensor whose dim 1 is the axis to resample.
+        seq_len: Output size of dim 1, passed to ``F.interpolate`` as ``size``.
+
+    Returns:
+        Tensor shaped like ``features`` except that dim 1 has size ``seq_len``.
+    """
+    # F.interpolate(mode='linear') resamples the last dim, so swap dims 1 and 2 around the call.
+    resampled = F.interpolate(features.transpose(1, 2), size=seq_len, mode='linear', align_corners=True)
+    return resampled.transpose(1, 2)
diff --git a/src/models/dit/__init__.py b/src/models/dit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/models/dit/__pycache__/__init__.cpython-310.pyc b/src/models/dit/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b7ba1f0e938daef26892de97ba4b7683f3dbec5
Binary files /dev/null and b/src/models/dit/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/blocks.cpython-310.pyc b/src/models/dit/__pycache__/blocks.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e23ed7b8c9f03b83a8ce32a4de371e4721bbd3f
Binary files /dev/null and b/src/models/dit/__pycache__/blocks.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/common.cpython-310.pyc b/src/models/dit/__pycache__/common.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..592bd5ff3e42dfde92b993a09c0c42d25a5a1bd4
Binary files /dev/null and b/src/models/dit/__pycache__/common.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/config.cpython-310.pyc b/src/models/dit/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55e5b7a18fd48243bdfb83a816d0406af9fb9707
Binary files /dev/null and b/src/models/dit/__pycache__/config.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/dit_talking_head.cpython-310.pyc b/src/models/dit/__pycache__/dit_talking_head.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18ef4549848ec4b4f8715dfbf22c08f86defd9d2
Binary files /dev/null and b/src/models/dit/__pycache__/dit_talking_head.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders.cpython-310.pyc b/src/models/dit/__pycache__/embedders.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43a656374cf03bc74a767408b15c7d10b7bdb835
Binary files /dev/null and b/src/models/dit/__pycache__/embedders.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v10.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v10.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3d575d847832a887b554dfa250697558c4bef70
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v10.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v2.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..812e8735d1e17eaf02da5eb3b37f1acb58fa87e4
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v2.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v3.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v3.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2a79d9165745710badb537428f72d479b9f1024
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v3.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v4.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v4.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03eaea48a076eec203e07b675ab04c98c6102ca1
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v4.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v5.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v5.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..138c4d3bfb31bd53dc1bc61c0da6cf59de1d7205
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v5.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v6.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v6.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75e330e9e5ace2d75f1b92de6c2e540af0f089fc
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v6.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v8.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v8.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13f5846e9431b88d11d869c51f7f9264ac6a9993
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v8.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/embedders_v9.cpython-310.pyc b/src/models/dit/__pycache__/embedders_v9.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b7aa1c9e3807276b85158657087964440bc7e84
Binary files /dev/null and b/src/models/dit/__pycache__/embedders_v9.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/keygeneration.cpython-310.pyc b/src/models/dit/__pycache__/keygeneration.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29b12d3fb6659db99c347c35b9d966cfe9cd5368
Binary files /dev/null and b/src/models/dit/__pycache__/keygeneration.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/light_dit.cpython-310.pyc b/src/models/dit/__pycache__/light_dit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edd924f672b2b3788f2371788940945a105dcf5f
Binary files /dev/null and b/src/models/dit/__pycache__/light_dit.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/modules.cpython-310.pyc b/src/models/dit/__pycache__/modules.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e661396a11ae6be444d74f302dff94df847bc33
Binary files /dev/null and b/src/models/dit/__pycache__/modules.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/modules2.cpython-310.pyc b/src/models/dit/__pycache__/modules2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b46b7a8309c26acb2adf0270a50dfa2ffd81b3e
Binary files /dev/null and b/src/models/dit/__pycache__/modules2.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/motion_diffusion.cpython-310.pyc b/src/models/dit/__pycache__/motion_diffusion.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..404554853970e5522c0a01e7403a5a4e0f10eed2
Binary files /dev/null and b/src/models/dit/__pycache__/motion_diffusion.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/posemb_layers.cpython-310.pyc b/src/models/dit/__pycache__/posemb_layers.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7fc4eb13c5a5400888f16f7c8aed2b8abe475ddb
Binary files /dev/null and b/src/models/dit/__pycache__/posemb_layers.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/posemb_layers_xtrans.cpython-310.pyc b/src/models/dit/__pycache__/posemb_layers_xtrans.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3876db8d9442b2f6b953dbe0577a6f47ba925b68
Binary files /dev/null and b/src/models/dit/__pycache__/posemb_layers_xtrans.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/positional_embedding.cpython-310.pyc b/src/models/dit/__pycache__/positional_embedding.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15754faffd971dc5d485ef313dd2fe679bc22a97
Binary files /dev/null and b/src/models/dit/__pycache__/positional_embedding.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_dual_head.cpython-310.pyc b/src/models/dit/__pycache__/talking_dual_head.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d399eb22f7e12bef0a484f45bb912459fcc72153
Binary files /dev/null and b/src/models/dit/__pycache__/talking_dual_head.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8cbd5cb3d3f5ce8a6bef60f96ccf82611541e373
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v10.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v10.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f51bf63ce6f24dca53f341dd96c2a7dfac91db9a
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v10.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v11.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v11.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4a88b638870fe83be2e143d496a06f0221c4935
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v11.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v12.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v12.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ec56ec5bb5eb272c54ddb9f85ba35e33a92545b
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v12.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v13.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v13.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a22749b929982573e92407f1b7cd8774889fa753
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v13.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v14.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v14.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3a398dae95bbd3683459184a52ca26dd004fd4f
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v14.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v2.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdb1219cd2ce5060bc9d28607eae3b41f35392bd
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v2.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v3.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v3.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66b2fd3e1f8c71faebce3907ae607c3622411b20
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v3.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v4.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v4.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..551e30688df8b52b984d9cd86c91e459f63d91c2
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v4.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v5.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v5.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9b2ef86763c7c6ffe3ad2015907f36eba817328
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v5.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v6.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v6.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59cbfba6c62c4dc15065bb15e7ef35570f04a245
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v6.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v7.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v7.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d31ab0b3d9fa93d36b161bd383822bc3bfc2b68d
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v7.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v8.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v8.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..915b50b485c6674d9ddddd797938e8e50a3f91b9
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v8.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_diffusion_v9.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_diffusion_v9.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9de6f7aa1f0f648d9c0e15148ee48f764473e0a6
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_diffusion_v9.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f57834ba3290b545c51c7b1ef48e8745fada9b8c
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v10.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v10.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..142e205381e1333931e0d7baa4891f6c66cbfd5c
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v10.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v11.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v11.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1457b187294dd0a483c153a38486c32ff79cb12c
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v11.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v12.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v12.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3889d85206c80641619a3b9c16f93ae5f949abfc
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v12.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v13.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v13.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2fbbd631bddc4d40b8520f1ff2d73b2b653c991
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v13.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v14.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v14.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94fe6de678d45f70804d0e909db99c5a440483fc
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v14.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v2.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de1306ab24b345fd38dc3f65b99a46ecc8d98aad
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v2.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v3.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v3.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3db1910ce0053b6f7f8fa4a9b0c0080317d47ceb
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v3.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v4.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v4.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2897401ebdacac523101b28387c7aa04e9baa8a
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v4.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v5.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v5.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b764148e2864d243f8ae43bb81e09829ce4cb8b3
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v5.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v6.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v6.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d7a23895fbb45499bdb25635c53c604705e9041
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v6.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v7.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v7.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67c7f27dafaae628a524b8bc38e8925c0d98bbdd
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v7.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v8.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v8.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f311412ff63bd93965d3db2efae8dc8e80884a2
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v8.cpython-310.pyc differ
diff --git a/src/models/dit/__pycache__/talking_head_dit_v9.cpython-310.pyc b/src/models/dit/__pycache__/talking_head_dit_v9.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e04a1135ba4e6270af3400eccee9ed3077e2d953
Binary files /dev/null and b/src/models/dit/__pycache__/talking_head_dit_v9.cpython-310.pyc differ
diff --git a/src/models/dit/appearance_feature_extractor.py b/src/models/dit/appearance_feature_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d89e4f18a2fbe58447f52ab4c5e3f2011a4ec80
--- /dev/null
+++ b/src/models/dit/appearance_feature_extractor.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+
+"""
+Appearance extractor(F) defined in paper, which maps the source image s to a 3D appearance feature volume.
+"""
+
+import torch
+from torch import nn
+from .util import SameBlock2d, DownBlock2d, ResBlock3d
+
+
+class AppearanceFeatureExtractor(nn.Module):
+    """Appearance extractor (F): maps a source image to a 3D appearance feature volume."""
+
+    def __init__(self, image_channel, block_expansion, num_down_blocks, max_features, reshape_channel, reshape_depth, num_resblocks):
+        """Build the first 2D block, the down blocks, the 1x1 conv and ``num_resblocks`` 3D residual blocks."""
+        super(AppearanceFeatureExtractor, self).__init__()
+        self.image_channel = image_channel
+        self.block_expansion = block_expansion
+        self.num_down_blocks = num_down_blocks
+        self.max_features = max_features
+        self.reshape_channel = reshape_channel
+        self.reshape_depth = reshape_depth
+
+        self.first = SameBlock2d(image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1))
+        # Default for num_down_blocks == 0, where ``second`` reads ``first`` directly (assumes block_expansion channels — TODO confirm).
+        out_features = block_expansion
+        down_blocks = []
+        for i in range(num_down_blocks):
+            in_features = min(max_features, block_expansion * (2 ** i))
+            out_features = min(max_features, block_expansion * (2 ** (i + 1)))
+            down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))
+        self.down_blocks = nn.ModuleList(down_blocks)
+        self.second = nn.Conv2d(in_channels=out_features, out_channels=max_features, kernel_size=1, stride=1)
+
+        self.resblocks_3d = torch.nn.Sequential()
+        for i in range(num_resblocks):
+            self.resblocks_3d.add_module('3dr' + str(i), ResBlock3d(reshape_channel, kernel_size=3, padding=1))
+
+    def forward(self, source_image):
+        """Encode ``source_image``; the 2D features are viewed as (B, reshape_channel, reshape_depth, H, W) for the 3D blocks."""
+        out = self.first(source_image)  # e.g. Bx3x256x256 -> Bx64x256x256
+        for down_block in self.down_blocks:
+            out = down_block(out)
+        out = self.second(out)  # e.g. -> Bx512x64x64
+        bs, _, h, w = out.shape
+        f_s = out.view(bs, self.reshape_channel, self.reshape_depth, h, w)  # e.g. -> Bx32x16x64x64
+        return self.resblocks_3d(f_s)  # e.g. -> Bx32x16x64x64
diff --git a/src/models/dit/blocks.py b/src/models/dit/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..172e86d7a8f590a41dc93156426ffbb8f374bdc1
--- /dev/null
+++ b/src/models/dit/blocks.py
@@ -0,0 +1,366 @@
+import torch
+import torch.nn as nn
+import numbers
+
+from .modules import RMSNorm, SelfAttention, CrossAttention, Mlp,MMdual_attention,MMsingle_attention,MMfour_attention
+
+from einops import rearrange, repeat
+def modulate(x, shift, scale):  # adaLN affine: shift and scale gain a new dim 1 and broadcast against x
+    return shift.unsqueeze(1) + x * (scale.unsqueeze(1) + 1)
+
+
+def _basic_init(module):  # written for Module.apply(): re-initialises nn.Linear layers, leaves other modules alone
+    if isinstance(module, nn.Linear):
+        nn.init.xavier_uniform_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+
+
+#################################################################################
+#                                 Core DiT Model                                #
+#################################################################################
+class DiTBlock(nn.Module):
+    """
+    Self-attention + MLP block whose two residual branches are shifted, scaled and gated from `c` (adaLN-Zero).
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        # Norms are built with elementwise_affine=False; shift and scale come from adaLN_modulation instead.
+        norm_cls = nn.LayerNorm if norm_type == "layer_norm" else RMSNorm
+        ffn_width = int(hidden_size * mlp_ratio)
+
+        self.norm1 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn1 = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+
+        self.norm2 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=ffn_width, act_layer=lambda: nn.GELU(approximate="tanh"), drop=0)
+        # One projection of `c` is chunked six ways: (shift, scale, gate) for attention and for the MLP.
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+
+    def initialize_weights(self):
+        self.apply(_basic_init)
+        # Zero the last modulation layer so every shift, scale and gate starts at 0.
+        modulation_out = self.adaLN_modulation[-1]
+        nn.init.zeros_(modulation_out.weight)
+        nn.init.zeros_(modulation_out.bias)
+
+    def forward(self, x, c,mask=None,freqs_cis=None):
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
+        attn_out = self.attn1(modulate(self.norm1(x), shift_msa, scale_msa),mask,freqs_cis)
+        x = x + gate_msa.unsqueeze(1) * attn_out
+        return x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
+
+class MMSingleStreamBlock(nn.Module):
+    '''
+    Single-stream block: one fused projection feeds attention (qkv) and an MLP branch; one adaLN gate scales the fused output.
+    '''
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        # The same norm class is used for every norm layer in the block.
+        norm_cls = nn.LayerNorm if norm_type == "layer_norm" else RMSNorm
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        self.hidden_size = hidden_size
+        self.mlp_hidden_dim = mlp_hidden_dim
+
+        self.norm1 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn1 = MMsingle_attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+        # NOTE(review): norm2, norm3 and norm4 are constructed but forward() never reads them.
+        self.norm2 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm3 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm4 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+
+        # Fused input projection: 3 * hidden_size columns for q/k/v plus mlp_hidden_dim columns for the MLP branch.
+        self.qkv_xs = nn.Linear(hidden_size, hidden_size * 3+mlp_hidden_dim, bias=True)
+        # Fused output projection over the concatenated [attention output, activated MLP features].
+        self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size)
+        self.mlp_act = nn.GELU(approximate="tanh")
+        # One projection of `c` is chunked three ways: shift, scale, gate.
+        self.adaLN_modulation_xs = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 3* hidden_size, bias=True)
+        )
+
+    def initialize_weights(self):
+        self.apply(_basic_init)
+        # Zero the last modulation layer so shift, scale and gate all start at 0.
+        modulation_out = self.adaLN_modulation_xs[-1]
+        nn.init.zeros_(modulation_out.weight)
+        nn.init.zeros_(modulation_out.bias)
+
+    def forward(self,seq_len, x, c,mask=None,freqs_cis=None,freqs_cis2=None,causal=False):
+        shift, scale, gate = self.adaLN_modulation_xs(c).chunk(3, dim=1)
+        # Project the modulated input once, then split it into the attention part and the MLP part.
+        fused = self.qkv_xs(modulate(self.norm1(x), shift, scale))
+        qkv, mlp_in = fused.split([3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        attn_out = self.attn1(seq_len,qkv,mask,causal=causal,freqs_cis=freqs_cis,freqs_cis2=freqs_cis2)
+        # Attention output and activated MLP features share one output projection.
+        merged = torch.cat((attn_out, self.mlp_act(mlp_in)), 2)
+        return x + gate.unsqueeze(1) * self.linear2(merged)
+class MMfourStreamBlock(nn.Module):
+    ''' Four-stream block: x, y1, y2, y3 go through one attention call but keep separate norms, MLPs and adaLN modulation. '''
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        make_norm_layer = (
+            nn.LayerNorm if norm_type == "layer_norm" else RMSNorm
+        )
+
+        self.norm1 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn1 = MMfour_attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+
+        self.norm2 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        # norm1-norm4 precede attention for x, y1, y2, y3; norm5-norm8 precede the four MLPs (see forward()).
+
+        self.norm3 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+
+        self.norm4 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm5 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm6 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm7 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm8 = make_norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        approx_gelu = lambda: nn.GELU(approximate="tanh")  # factory handed to each Mlp as act_layer
+        self.xs_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+        self.audio_mlp1 = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+        self.audio_mlp2 = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+        self.audio_mlp3 = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+        self.adaLN_modulation_xs = nn.Sequential(  # one modulation head per stream, each chunked six ways in forward()
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+        self.adaLN_modulation_audio1 = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+
+        )
+        self.adaLN_modulation_audio2 = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+        self.adaLN_modulation_audio3 = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+    def initialize_weights(self):
+        self.apply(_basic_init)
+        # Zero the last layer of all four modulation heads, so every shift, scale and gate starts at 0:
+        nn.init.constant_(self.adaLN_modulation_xs[-1].weight, 0)
+        nn.init.constant_(self.adaLN_modulation_xs[-1].bias, 0)
+
+        nn.init.constant_(self.adaLN_modulation_audio1[-1].weight, 0)
+        nn.init.constant_(self.adaLN_modulation_audio1[-1].bias, 0)
+
+        nn.init.constant_(self.adaLN_modulation_audio2[-1].weight, 0)
+        nn.init.constant_(self.adaLN_modulation_audio2[-1].bias, 0)
+
+        nn.init.constant_(self.adaLN_modulation_audio3[-1].weight, 0)
+        nn.init.constant_(self.adaLN_modulation_audio3[-1].bias, 0)
+
+
+    def forward(self, x, c, y1,y2,y3,mask=None,freqs_cis=None,freqs_cis2=None,causal=False):  # returns the updated (x, y1, y2, y3)
+        shift_msa_xs, scale_msa_xs, gate_msa_xs, shift_mlp_xs, scale_mlp_xs, gate_mlp_xs = self.adaLN_modulation_xs(c).chunk(6, dim=1)
+        shift_mca_audio1, scale_mca_audio1, gate_mca_audio1, shift_mlp_audio1, scale_mlp_audio1, gate_mlp_audio1 = self.adaLN_modulation_audio1(c).chunk(6, dim=1)
+        shift_mca_audio2, scale_mca_audio2, gate_mca_audio2, shift_mlp_audio2, scale_mlp_audio2, gate_mlp_audio2 = self.adaLN_modulation_audio2(c).chunk(6, dim=1)
+        shift_mca_audio3, scale_mca_audio3, gate_mca_audio3, shift_mlp_audio3, scale_mlp_audio3, gate_mlp_audio3= self.adaLN_modulation_audio3(c).chunk(6, dim=1)
+        # Attention: each stream is normed and modulated on its own; attn1 returns one output per stream.
+        att1,att2,att3,att4= self.attn1( modulate(self.norm1(x), shift_msa_xs, scale_msa_xs),
+                                modulate(self.norm2(y1), shift_mca_audio1, scale_mca_audio1),
+                                modulate(self.norm3(y2), shift_mca_audio2, scale_mca_audio2),
+                                modulate(self.norm4(y3), shift_mca_audio3, scale_mca_audio3),
+                                mask,causal=causal,freqs_cis=freqs_cis,freqs_cis2=freqs_cis2)
+        x=x+gate_msa_xs.unsqueeze(1)*att1  # gated attention residual, stream by stream
+        y1=y1+gate_mca_audio1.unsqueeze(1)*att2
+        y2=y2+gate_mca_audio2.unsqueeze(1)*att3
+        y3=y3+gate_mca_audio3.unsqueeze(1)*att4
+
+        x = x + gate_mlp_xs.unsqueeze(1) * self.xs_mlp(modulate(self.norm5(x), shift_mlp_xs, scale_mlp_xs))  # gated MLP residual, stream by stream
+        y1 = y1 + gate_mlp_audio1.unsqueeze(1) * self.audio_mlp1(modulate(self.norm6(y1), shift_mlp_audio1, scale_mlp_audio1))
+        y2 = y2 + gate_mlp_audio2.unsqueeze(1) * self.audio_mlp2(modulate(self.norm7(y2), shift_mlp_audio2, scale_mlp_audio2))
+        y3 = y3 + gate_mlp_audio3.unsqueeze(1) * self.audio_mlp3(modulate(self.norm8(y3), shift_mlp_audio3, scale_mlp_audio3))
+        return x,y1,y2,y3
+class MMDoubleStreamBlock(nn.Module):
+    ''' Two-stream block: x and y go through one MMdual_attention call but keep their own norms, MLPs and adaLN modulation. '''
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        norm_cls = nn.LayerNorm if norm_type == "layer_norm" else RMSNorm
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        gelu_tanh = lambda: nn.GELU(approximate="tanh")
+
+        # norm1 / norm2 precede attention for x / y; norm3 / norm4 precede the two MLPs.
+        self.norm1 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn1 = MMdual_attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+        self.norm2 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm3 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm4 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.xs_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=gelu_tanh, drop=0)
+        self.audio_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=gelu_tanh, drop=0)
+
+        # Each stream gets six chunks from `c`: (shift, scale, gate) for attention and for its MLP.
+        self.adaLN_modulation_xs = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+        self.adaLN_modulation_audio = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+
+    def initialize_weights(self):
+        self.apply(_basic_init)
+        # Zero the last layer of both modulation heads so every shift, scale and gate starts at 0.
+        for head in (self.adaLN_modulation_xs, self.adaLN_modulation_audio):
+            nn.init.zeros_(head[-1].weight)
+            nn.init.zeros_(head[-1].bias)
+
+    def forward(self, seq_len,x, c, y,mask=None,freqs_cis=None,freqs_cis2=None,causal=False):
+        shift_msa_xs, scale_msa_xs, gate_msa_xs, shift_mlp_xs, scale_mlp_xs, gate_mlp_xs = self.adaLN_modulation_xs(c).chunk(6, dim=1)
+        shift_mca_audio, scale_mca_audio, gate_mca_audio, shift_mlp_audio, scale_mlp_audio, gate_mlp_audio = self.adaLN_modulation_audio(c).chunk(6, dim=1)
+        # Attention over the two modulated streams; attn1 returns one output per stream.
+        x_in = modulate(self.norm1(x), shift_msa_xs, scale_msa_xs)
+        y_in = modulate(self.norm2(y), shift_mca_audio, scale_mca_audio)
+        att_x, att_y = self.attn1(seq_len,x_in,y_in,mask,causal=causal,freqs_cis=freqs_cis,freqs_cis2=freqs_cis2)
+        x = x + gate_msa_xs.unsqueeze(1) * att_x
+        y = y + gate_mca_audio.unsqueeze(1) * att_y
+        # Per-stream gated MLP residuals.
+        x = x + gate_mlp_xs.unsqueeze(1) * self.xs_mlp(modulate(self.norm3(x), shift_mlp_xs, scale_mlp_xs))
+        y = y + gate_mlp_audio.unsqueeze(1) * self.audio_mlp(modulate(self.norm4(y), shift_mlp_audio, scale_mlp_audio))
+        return x, y
+class CrossDiTBlock(nn.Module):
+    """
+    adaLN-Zero block with three gated residual branches: self-attention on x, cross-attention to y, then an MLP.
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        norm_cls = nn.LayerNorm if norm_type == "layer_norm" else RMSNorm
+        ffn_width = int(hidden_size * mlp_ratio)
+
+        self.norm1 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn1 = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+
+        self.norm2 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn2 = CrossAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+
+        self.norm3 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=ffn_width, act_layer=lambda: nn.GELU(approximate="tanh"), drop=0)
+        # One projection of `c` is chunked nine ways: (shift, scale, gate) for each of the three branches.
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 9 * hidden_size, bias=True)
+        )
+
+    def initialize_weights(self):
+        self.apply(_basic_init)
+        # Zero the last modulation layer so every shift, scale and gate starts at 0.
+        modulation_out = self.adaLN_modulation[-1]
+        nn.init.zeros_(modulation_out.weight)
+        nn.init.zeros_(modulation_out.bias)
+
+    def forward(self, x, c, y,mask=None):
+        shift_msa, scale_msa, gate_msa, shift_mca, scale_mca, gate_mca, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(9, dim=1)
+        self_out = self.attn1(modulate(self.norm1(x), shift_msa, scale_msa),mask)
+        x = x + gate_msa.unsqueeze(1) * self_out
+        cross_out = self.attn2(modulate(self.norm2(x), shift_mca, scale_mca), y,mask)
+        x = x + gate_mca.unsqueeze(1) * cross_out
+        return x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm3(x), shift_mlp, scale_mlp))
+class SelfBlock(nn.Module):
+    """
+    Pre-norm self-attention with a plain residual connection (no adaLN conditioning, no MLP).
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        # mlp_ratio is accepted but not used by this block.
+        if norm_type == "layer_norm":
+            norm_cls = nn.LayerNorm
+        else:
+            norm_cls = RMSNorm
+
+        self.norm2 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn2 = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+
+    def initialize_weights(self):
+        # Only the generic Linear initialisation applies; there is no modulation layer to zero here.
+        self.apply(_basic_init)
+
+    def forward(self, x, y,mask=None):
+        return x + self.attn2(self.norm2(x),mask)  # `y` is accepted but not used
+class CrossBlock(nn.Module):
+    """
+    Pre-norm cross-attention from x to y with a plain residual connection (no adaLN conditioning, no MLP).
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
+        super().__init__()
+        norm_type = block_kwargs.get("norm_type", "rms_norm")
+
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        # mlp_ratio is accepted but not used by this block.
+        if norm_type == "layer_norm":
+            norm_cls = nn.LayerNorm
+        else:
+            norm_cls = RMSNorm
+
+        self.norm2 = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn2 = CrossAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+
+    def initialize_weights(self):
+        # Only the generic Linear initialisation applies; there is no modulation layer to zero here.
+        self.apply(_basic_init)
+
+    def forward(self, x, y,mask=None):
+        return x + self.attn2(self.norm2(x), y,mask)  # only x is normalised; y is passed through as given
+class FinalLayer(nn.Module):
+    """
+    Output head of the DiT: norm, adaLN shift/scale computed from `c` (no gate), then a linear projection.
+    """
+    def __init__(self, hidden_size, out_channels, norm_type="rms_norm"):
+        super().__init__()
+        assert norm_type in ["layer_norm", "rms_norm"]
+
+        if norm_type == "layer_norm":
+            norm_cls = nn.LayerNorm
+        else:
+            norm_cls = RMSNorm
+
+        self.norm_final = norm_cls(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+        # One projection of `c` is chunked two ways: shift and scale.
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+
+    def initialize_weights(self):
+        self.apply(_basic_init)
+        # Zero both the modulation projection and the output projection, so the layer initially outputs zeros.
+        for layer in (self.adaLN_modulation[-1], self.linear):
+            nn.init.zeros_(layer.weight)
+            nn.init.zeros_(layer.bias)
+
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        # Normalise, modulate, project.
+        return self.linear(modulate(self.norm_final(x), shift, scale))
\ No newline at end of file
diff --git a/src/models/dit/common.py b/src/models/dit/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..492103b00ee671d931738a2c05773d3c2c999828
--- /dev/null
+++ b/src/models/dit/common.py
@@ -0,0 +1,30 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, max_len=600):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        # vanilla sinusoidal table of shape (1, max_len, d_model): even columns hold sin, odd columns hold cos
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        angles = position * div_term
+        pe = torch.zeros(max_len, d_model)
+        pe[:, 0::2] = torch.sin(angles)
+        pe[:, 1::2] = torch.cos(angles)
+        self.register_buffer('pe', pe.unsqueeze(0))
+
+    def forward(self, x):
+        # NOTE(review): integer index -> the single row at position x.shape[1] is added to every step (IndexError once x.shape[1] >= max_len); a per-step encoding would slice `:x.shape[1]`. Left unchanged in case trained weights rely on it - confirm.
+        return self.dropout(x + self.pe[:, x.shape[1], :])
+
+
+def enc_dec_mask(T, S, frame_width=2, expansion=0, device='cuda'):
+    blocked = torch.ones(T, S, dtype=torch.bool)  # (T, S) boolean mask; True marks an entry outside row t's window
+    for t in range(T):
+        blocked[t, max(0, (t - expansion) * frame_width):(t + expansion + 1) * frame_width] = False
+    return blocked.to(device=device)
diff --git a/src/models/dit/config.py b/src/models/dit/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..bddd73bb82ce54a4a615891c531f5a92ff25e7f6
--- /dev/null
+++ b/src/models/dit/config.py
@@ -0,0 +1,150 @@
+""" Model / Layer Config singleton state
+Borrowed from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/config.py#L130
+"""
+import os
+import warnings
+from typing import Any, Optional
+
+import torch
+
+__all__ = [
+    'is_exportable', 'is_scriptable', 'is_no_jit', 'use_fused_attn',
+    'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config', 'set_fused_attn'
+]
+
+# Set to True to prefer layers with no jit optimization (includes activations)
+_NO_JIT = False
+
+# Set to True to prefer activation layers with no jit optimization
+# NOTE: not currently used, as there is no difference between no_jit and no_activation_jit while the only layers
+# obeying the jit flags are activations. This will change as more layers are updated and/or added.
+_NO_ACTIVATION_JIT = False
+
+# Set to True if exporting a model with Same padding via ONNX
+_EXPORTABLE = False
+
+# Set to True if wanting to use torch.jit.script on a model
+_SCRIPTABLE = False
+
+
+# use torch.scaled_dot_product_attention where possible
+_HAS_FUSED_ATTN = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+if 'TIMM_FUSED_ATTN' in os.environ:
+    _USE_FUSED_ATTN = int(os.environ['TIMM_FUSED_ATTN'])  # parsed once at import; a non-integer value raises ValueError
+else:
+    _USE_FUSED_ATTN = 1  # 0 == off, 1 == on (for tested use), 2 == on (for experimental use)
+
+
+def is_no_jit():  # current value of the module-level _NO_JIT flag
+    return _NO_JIT
+
+
+class set_no_jit:
+    """Set the module-level _NO_JIT flag on construction; restore the previous value when the context exits."""
+    def __init__(self, mode: bool) -> None:
+        global _NO_JIT
+        self.prev, _NO_JIT = _NO_JIT, mode
+
+    def __enter__(self) -> None:
+        return None
+
+    def __exit__(self, *args: Any) -> bool:
+        global _NO_JIT
+        _NO_JIT = self.prev
+        return False
+
+
+def is_exportable():  # current value of the module-level _EXPORTABLE flag
+    return _EXPORTABLE
+
+
+class set_exportable:
+    """Set the module-level _EXPORTABLE flag on construction; restore the previous value when the context exits."""
+    def __init__(self, mode: bool) -> None:
+        global _EXPORTABLE
+        self.prev, _EXPORTABLE = _EXPORTABLE, mode
+
+    def __enter__(self) -> None:
+        return None
+
+    def __exit__(self, *args: Any) -> bool:
+        global _EXPORTABLE
+        _EXPORTABLE = self.prev
+        return False
+
+
+def is_scriptable():  # current value of the module-level _SCRIPTABLE flag
+    return _SCRIPTABLE
+
+
+class set_scriptable:
+    """Set the module-level _SCRIPTABLE flag on construction; restore the previous value when the context exits."""
+    def __init__(self, mode: bool) -> None:
+        global _SCRIPTABLE
+        self.prev, _SCRIPTABLE = _SCRIPTABLE, mode
+
+    def __enter__(self) -> None:
+        return None
+
+    def __exit__(self, *args: Any) -> bool:
+        global _SCRIPTABLE
+        _SCRIPTABLE = self.prev
+        return False
+
+
+class set_layer_config:
+    """ Context manager that overrides any subset of the four layer config flags at once.
+    An argument left as None keeps that flag's current value.
+    """
+    def __init__(
+            self,
+            scriptable: Optional[bool] = None,
+            exportable: Optional[bool] = None,
+            no_jit: Optional[bool] = None,
+            no_activation_jit: Optional[bool] = None):
+        global _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT
+        # Snapshot all four flags before touching any of them; __exit__ restores this tuple.
+        self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT
+
+        # The overrides take effect here, at construction time, not in __enter__.
+        if scriptable is not None:
+            _SCRIPTABLE = scriptable
+        if exportable is not None:
+            _EXPORTABLE = exportable
+        if no_jit is not None:
+            _NO_JIT = no_jit
+        if no_activation_jit is not None:
+            _NO_ACTIVATION_JIT = no_activation_jit
+
+    def __enter__(self) -> None:
+        # Nothing left to do: __init__ already applied the overrides.
+        return None
+
+    def __exit__(self, *args: Any) -> bool:
+        global _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT
+        # Unpack in the same order the snapshot was packed.
+        _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev
+        # False: an exception raised inside the with-body is not suppressed.
+        return False
+
+
+def use_fused_attn(experimental: bool = False) -> bool:
+    """Report whether fused attention should be used: level above 1 for experimental callers, above 0 otherwise."""
+    # NOTE: ONNX export cannot handle F.scaled_dot_product_attention as of pytorch 2.0
+    if _EXPORTABLE or not _HAS_FUSED_ATTN:
+        return False
+    required_level = 2 if experimental else 1
+    return _USE_FUSED_ATTN >= required_level
+
+
+def set_fused_attn(enable: bool = True, experimental: bool = False):
+    """Set the module-level fused-attention level: 0 when disabling, 2 for experimental use, 1 otherwise."""
+    global _USE_FUSED_ATTN
+    if not _HAS_FUSED_ATTN:
+        warnings.warn('This version of pytorch does not have F.scaled_dot_product_attention, fused_attn flag ignored.')
+        return
+    # `experimental` only matters when enabling.
+    if not enable:
+        _USE_FUSED_ATTN = 0
+    else:
+        _USE_FUSED_ATTN = 2 if experimental else 1
\ No newline at end of file
diff --git a/src/models/dit/dense_motion.py b/src/models/dit/dense_motion.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1a7f9a5a1b5463c4a1ddcee2bf36dcd34735706
--- /dev/null
+++ b/src/models/dit/dense_motion.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+
+"""
+The module that predicts a dense motion field from the sparse motion representation given by kp_source and kp_driving
+"""
+
+from torch import nn
+import torch.nn.functional as F
+import torch
+from .util import Hourglass, make_coordinate_grid, kp2gaussian
+
+
+class DenseMotionNetwork(nn.Module):  # forward() returns a dict with 'mask', 'deformation' and, optionally, 'occlusion_map'
+    def __init__(self, block_expansion, num_blocks, max_features, num_kp, feature_channel, reshape_depth, compress, estimate_occlusion_map=True):
+        super(DenseMotionNetwork, self).__init__()
+        self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp+1)*(compress+1), max_features=max_features, num_blocks=num_blocks)  # ~60+G
+        # in_features: (num_kp + 1) motions, each with 1 heatmap channel + `compress` feature channels (see forward()).
+        self.mask = nn.Conv3d(self.hourglass.out_filters, num_kp + 1, kernel_size=7, padding=3)  # 65G! NOTE: computation cost is large
+        self.compress = nn.Conv3d(feature_channel, compress, kernel_size=1)  # 0.8G
+        self.norm = nn.BatchNorm3d(compress, affine=True)
+        self.num_kp = num_kp
+        self.flag_estimate_occlusion_map = estimate_occlusion_map
+
+        if self.flag_estimate_occlusion_map:
+            self.occlusion = nn.Conv2d(self.hourglass.out_filters*reshape_depth, 1, kernel_size=7, padding=3)
+        else:
+            self.occlusion = None
+
+    def create_sparse_motions(self, feature, kp_driving, kp_source):  # one sampling grid per keypoint, plus an identity grid at index 0
+        bs, _, d, h, w = feature.shape  # (bs, 4, 16, 64, 64)
+        identity_grid = make_coordinate_grid((d, h, w), ref=kp_source)  # (16, 64, 64, 3)
+        identity_grid = identity_grid.view(1, 1, d, h, w, 3)  # (1, 1, d=16, h=64, w=64, 3)
+        coordinate_grid = identity_grid - kp_driving.view(bs, self.num_kp, 1, 1, 1, 3)
+
+        k = coordinate_grid.shape[1]  # NOTE(review): k is never read afterwards
+
+        # NOTE: only a per-keypoint translation is applied here; there is no first-order term
+        driving_to_source = coordinate_grid + kp_source.view(bs, self.num_kp, 1, 1, 1, 3)    # (bs, num_kp, d, h, w, 3)
+
+        # adding background feature
+        identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1, 1)
+        sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1)  # (bs, 1+num_kp, d, h, w, 3)
+        return sparse_motions
+
+    def create_deformed_feature(self, feature, sparse_motions):  # samples `feature` once per sparse motion grid
+        bs, _, d, h, w = feature.shape
+        feature_repeat = feature.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp+1, 1, 1, 1, 1, 1)      # (bs, num_kp+1, 1, c, d, h, w)
+        feature_repeat = feature_repeat.view(bs * (self.num_kp+1), -1, d, h, w)                         # (bs*(num_kp+1), c, d, h, w)
+        sparse_motions = sparse_motions.view((bs * (self.num_kp+1), d, h, w, -1))                       # (bs*(num_kp+1), d, h, w, 3)
+        sparse_deformed = F.grid_sample(feature_repeat, sparse_motions, align_corners=False)
+        sparse_deformed = sparse_deformed.view((bs, self.num_kp+1, -1, d, h, w))                        # (bs, num_kp+1, c, d, h, w)
+
+        return sparse_deformed
+
+    def create_heatmap_representations(self, feature, kp_driving, kp_source):  # forward() passes the 6-D deformed feature here
+        spatial_size = feature.shape[3:]  # (d=16, h=64, w=64)
+        gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)
+        gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)
+        heatmap = gaussian_driving - gaussian_source  # (bs, num_kp, d, h, w)
+
+        # adding background feature
+        zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1], spatial_size[2]).type(heatmap.dtype).to(heatmap.device)
+        heatmap = torch.cat([zeros, heatmap], dim=1)
+        heatmap = heatmap.unsqueeze(2)         # (bs, 1+num_kp, 1, d, h, w)
+        return heatmap
+
+    def forward(self, feature, kp_driving, kp_source):
+        bs, _, d, h, w = feature.shape  # (bs, 32, 16, 64, 64)
+
+        feature = self.compress(feature)  # (bs, 4, 16, 64, 64)
+        feature = self.norm(feature)  # (bs, 4, 16, 64, 64)
+        feature = F.relu(feature)  # (bs, 4, 16, 64, 64)
+
+        out_dict = dict()
+
+        # 1. deform 3d feature
+        sparse_motion = self.create_sparse_motions(feature, kp_driving, kp_source)  # (bs, 1+num_kp, d, h, w, 3)
+        deformed_feature = self.create_deformed_feature(feature, sparse_motion)  # (bs, 1+num_kp, c=4, d=16, h=64, w=64)
+
+        # 2. (bs, 1+num_kp, d, h, w)
+        heatmap = self.create_heatmap_representations(deformed_feature, kp_driving, kp_source)  # (bs, 1+num_kp, 1, d, h, w)
+        # NOTE(review): `input` below shadows the Python builtin of the same name.
+        input = torch.cat([heatmap, deformed_feature], dim=2)  # (bs, 1+num_kp, c=5, d=16, h=64, w=64)
+        input = input.view(bs, -1, d, h, w)  # (bs, (1+num_kp)*c=105, d=16, h=64, w=64)
+
+        prediction = self.hourglass(input)
+
+        mask = self.mask(prediction)
+        mask = F.softmax(mask, dim=1)  # (bs, 1+num_kp, d=16, h=64, w=64)
+        out_dict['mask'] = mask
+        mask = mask.unsqueeze(2)                                   # (bs, num_kp+1, 1, d, h, w)
+        sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4)    # (bs, num_kp+1, 3, d, h, w)
+        deformation = (sparse_motion * mask).sum(dim=1)            # (bs, 3, d, h, w)  mask take effect in this place
+        deformation = deformation.permute(0, 2, 3, 4, 1)           # (bs, d, h, w, 3)
+
+        out_dict['deformation'] = deformation
+
+        if self.flag_estimate_occlusion_map:
+            bs, _, d, h, w = prediction.shape
+            prediction_reshape = prediction.view(bs, -1, h, w)  # channel and depth axes merged for the 2D conv
+            occlusion_map = torch.sigmoid(self.occlusion(prediction_reshape))  # Bx1x64x64
+            out_dict['occlusion_map'] = occlusion_map
+
+        return out_dict
diff --git a/src/models/dit/embedders.py b/src/models/dit/embedders.py
new file mode 100644
index 0000000000000000000000000000000000000000..f067005afc3324ef18caee64a2cb83e8d64ea4ad
--- /dev/null
+++ b/src/models/dit/embedders.py
@@ -0,0 +1,665 @@
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from diffusers.models.modeling_utils import ModelMixin
+
+from .blocks import _basic_init, DiTBlock
+from .modules import RMSNorm
+from .positional_embedding import get_1d_sincos_pos_embed
+
+#################################################################################
+#          Embedding Layers for Timesteps, Emotion Labels and Motions           #
+#################################################################################
+
+class TimestepEmbedder(nn.Module):
+    """
+    Turns scalar timesteps into vectors: sinusoidal features followed by a small MLP.
+    """
+    def __init__(self, hidden_size: int, frequency_embedding_size: int=256, dtype=None, device=None):
+        super().__init__()
+        factory_kwargs = {"dtype": dtype, "device": device}
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True, **factory_kwargs),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+
+    def initialize_weights(self):
+        """Run `_basic_init` on every submodule, then redraw both Linear weights with std 0.02."""
+        self.apply(_basic_init)
+        for idx in (0, 2):  # positions of the two Linear layers inside self.mlp
+            nn.init.normal_(self.mlp[idx].weight, std=0.02)
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Build sinusoidal embeddings for a batch of timesteps.
+        :param t: 1-D tensor with N (possibly fractional) timesteps, one per batch element.
+        :param dim: width of the returned embedding.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: (N, dim) tensor laid out as [cos | sin], zero-padded by one column if dim is odd.
+        """
+        # Adapted from https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        steps = torch.arange(start=0, end=half, dtype=torch.float32)
+        freqs = torch.exp(-math.log(max_period) * steps / half).to(device=t.device)
+        angles = t[:, None].float() * freqs[None]
+        parts = [torch.cos(angles), torch.sin(angles)]
+        if dim % 2:
+            parts.append(torch.zeros_like(parts[0][:, :1]))
+        embedding = torch.cat(parts, dim=-1)
+        # Return in the caller's dtype when `t` is floating point (e.g. half precision).
+        if torch.is_floating_point(t):
+            embedding = embedding.to(dtype=t.dtype)
+        return embedding
+
+    def forward(self, t):
+        """Embed timesteps `t` and project them through the MLP."""
+        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
+
+
+class LabelEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    When `dropout_prob > 0` the table gets one extra row (index `num_classes`) used for dropped labels.
+    """
+    def __init__(self, num_classes: int, hidden_size: int, dropout_prob: float, dtype=None, device=None):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        # Forward dtype/device to the table; they were previously accepted but silently ignored.
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size, dtype=dtype, device=device)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+
+    def initialize_weights(self):
+        """Draw the label embedding table from a normal distribution with std 0.02."""
+        nn.init.normal_(self.embedding_table.weight, std=0.02)
+
+    def token_drop(self, labels, force_drop_ids=None):
+        """Replace dropped labels by `num_classes`: at random, or where `force_drop_ids == 1` if given."""
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        return torch.where(drop_ids, self.num_classes, labels)
+
+    def forward(self, labels, train, force_drop_ids=None):
+        """Look up embeddings; labels are dropped first when training with dropout or when forced."""
+        use_dropout = self.dropout_prob > 0
+        # NOTE: forcing drops while dropout_prob == 0 indexes a row the table does not have.
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        return self.embedding_table(labels)
+
+
+class MotionEmbedder(nn.Module):
+    """Embeds motion of shape B x L x D into `hidden_size` vectors (Linear -> SiLU -> Linear)."""
+
+    def __init__(self, motion_dim: int, hidden_size: int, dtype=None, device=None):
+        super().__init__()
+        # Forward dtype/device to both Linear layers; they used to be accepted but ignored.
+        self.mlp = nn.Sequential(
+            nn.Linear(motion_dim, hidden_size, bias=True, dtype=dtype, device=device),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+    def initialize_weights(self):
+        """Apply `_basic_init`, then Xavier-uniform weights and zero biases for both Linear layers."""
+        self.apply(_basic_init)
+        for layer in (self.mlp[0], self.mlp[2]):
+            nn.init.xavier_uniform_(layer.weight)
+            nn.init.constant_(layer.bias, 0)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the MLP over the last axis (D -> hidden_size) of `x`."""
+        return self.mlp(x)
+
+
+class AudioEmbedder(ModelMixin):
+    """Audio projection model.
+
+    Flattens the audio feature window of every video frame and maps it through
+    three Linear layers (ReLU after the first two) into `context_tokens` tokens
+    of width `output_dim`, which are then normalized over their last axis.
+
+    Args:
+        seq_len (int): Together with `blocks` and `channels` fixes the flattened
+            per-frame input width, seq_len * blocks * channels.
+        blocks (int): See `seq_len`.
+        channels (int): See `seq_len`.
+        intermediate_dim (int): Width of the two hidden Linear layers.
+        output_dim (int): Width of each output context token.
+        context_tokens (int): Number of context tokens produced per frame.
+        input_len (int): Not used by this class.
+        condition_dim (int): Not used by this class.
+        norm_type (str): "layer_norm" selects nn.LayerNorm; any other value selects RMSNorm.
+        qk_norm (str): Not used by this class.
+
+    Shapes:
+        input:  (batch_size, video_length, w, b, c) with w * b * c == seq_len * blocks * channels
+        output: (batch_size, video_length, context_tokens, output_dim)
+    """
+
+    def __init__(
+        self,
+        seq_len=5,
+        blocks=12,
+        channels=768,
+        intermediate_dim=512,
+        output_dim=768,
+        context_tokens=32,
+        input_len=80,
+        condition_dim=63,
+        norm_type="rms_norm",
+        qk_norm="rms_norm",
+    ):
+        super().__init__()
+        self.context_tokens = context_tokens
+        self.output_dim = output_dim
+
+        # Each frame contributes one flattened feature vector of this width.
+        flat_dim = seq_len * blocks * channels
+
+        # Projection stack: flat_dim -> intermediate_dim -> intermediate_dim -> tokens.
+        self.proj1 = nn.Linear(flat_dim, intermediate_dim)
+        self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
+        self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
+
+        # Normalization applied to every output token.
+        if norm_type == "layer_norm":
+            self.norm = nn.LayerNorm(output_dim)
+        else:
+            self.norm = RMSNorm(output_dim)
+
+    def initialize_weights(self):
+        """
+        Apply `_basic_init` to every submodule, then give the three projections
+        Xavier-uniform weights and zero biases.
+        """
+        self.apply(_basic_init)
+        for proj in (self.proj1, self.proj2, self.proj3):
+            weight = proj.weight.data
+            nn.init.xavier_uniform_(weight.view([weight.shape[0], -1]))
+            nn.init.constant_(proj.bias, 0)
+
+    def forward(self, audio_embeds, conditions=None, emo=None):
+        """
+        Project per-frame audio features to context tokens.
+
+        Parameters:
+            audio_embeds (torch.Tensor): Audio features. The rearrange pattern below requires
+                five axes: (batch_size, video_length, window, blocks, channels).
+            conditions: Ignored by this embedder.
+            emo: Ignored by this embedder.
+        Returns:
+            context_tokens (torch.Tensor): The output context tokens with shape
+                (batch_size, video_length, context_tokens, output_dim).
+        """
+        video_length = audio_embeds.shape[1]
+        # Fold frames into the batch axis so every frame is projected independently.
+        per_frame = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
+        n_frames, window_size, blocks, channels = per_frame.shape
+        hidden = per_frame.reshape(n_frames, window_size * blocks * channels)
+
+        # Two hidden layers, each followed by ReLU.
+        hidden = torch.relu(self.proj1(hidden))
+        hidden = torch.relu(self.proj2(hidden))
+
+        # Split the flat projection into `context_tokens` tokens of width `output_dim`.
+        tokens = self.proj3(hidden).reshape(
+            n_frames, self.context_tokens, self.output_dim
+        )
+        tokens = self.norm(tokens)
+
+        # Restore the separate batch and frame axes.
+        return rearrange(
+            tokens, "(bz f) m c -> bz f m c", f=video_length
+        )
+
+
+class ConditionAudioEmbedder(ModelMixin):
+    """Audio projection model with per-frame conditions.
+
+    Flattens the audio feature window of every video frame, appends that frame's
+    condition vector, and maps the result through three Linear layers (ReLU
+    after the first two) into `context_tokens` tokens of width `output_dim`,
+    which are then normalized over their last axis.
+
+    Args:
+        seq_len (int): Together with `blocks` and `channels` fixes the flattened
+            per-frame audio width, seq_len * blocks * channels.
+        blocks (int): See `seq_len`.
+        channels (int): See `seq_len`.
+        intermediate_dim (int): Width of the two hidden Linear layers.
+        output_dim (int): Width of each output context token.
+        context_tokens (int): Number of context tokens produced per frame.
+        input_len (int): Not used by this class.
+        condition_dim (int): Width of the condition vector appended to each frame.
+        norm_type (str): "layer_norm" selects nn.LayerNorm; any other value selects RMSNorm.
+        qk_norm (str): Not used by this class.
+
+    Attributes:
+        input_dim (int): seq_len * blocks * channels + condition_dim, the input width of `proj1`.
+    """
+
+    def __init__(
+        self,
+        seq_len=5,
+        blocks=12,
+        channels=768,
+        intermediate_dim=512,
+        output_dim=768,
+        context_tokens=32,
+        input_len=80,
+        condition_dim=63,
+        norm_type="rms_norm",
+        qk_norm="rms_norm",
+    ):
+        super().__init__()
+
+        # Flattened audio window plus the concatenated condition vector.
+        self.input_dim = seq_len * blocks * channels + condition_dim
+        self.context_tokens = context_tokens
+        self.output_dim = output_dim
+
+        # Projection stack: input_dim -> intermediate_dim -> intermediate_dim -> tokens.
+        self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
+        self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
+        self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
+
+        # Normalization applied to every output token.
+        if norm_type == "layer_norm":
+            self.norm = nn.LayerNorm(output_dim)
+        else:
+            self.norm = RMSNorm(output_dim)
+
+    def initialize_weights(self):
+        """
+        Apply `_basic_init` to every submodule, then give the three projections
+        Xavier-uniform weights and zero biases.
+        """
+        self.apply(_basic_init)
+        for proj in (self.proj1, self.proj2, self.proj3):
+            weight = proj.weight.data
+            nn.init.xavier_uniform_(weight.view([weight.shape[0], -1]))
+            nn.init.constant_(proj.bias, 0)
+
+    def forward(self, audio_embeds, conditions, emo=None):
+        """
+        Project per-frame audio features and conditions to context tokens.
+
+        Parameters:
+            audio_embeds (torch.Tensor): Audio features. The rearrange pattern below requires
+                five axes: (batch_size, video_length, window, blocks, channels).
+            conditions (torch.Tensor): Per-frame conditions with shape
+                (batch_size, video_length, condition_dim).
+            emo: Ignored by this embedder.
+        Returns:
+            context_tokens (torch.Tensor): The output context tokens with shape
+                (batch_size, video_length, context_tokens, output_dim).
+        """
+        video_length = audio_embeds.shape[1]
+        # Fold frames into the batch axis so every frame is projected independently.
+        per_frame = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
+        n_frames, window_size, blocks, channels = per_frame.shape
+        flat_audio = per_frame.reshape(n_frames, window_size * blocks * channels)  # bz*f, C
+        # Append each frame's condition vector to its flattened audio features.
+        flat_cond = rearrange(conditions, "bz f c -> (bz f) c")                    # bz*f, c
+        hidden = torch.cat([flat_audio, flat_cond], dim=1)                         # bz*f, C+c
+
+        # Two hidden layers, each followed by ReLU.
+        hidden = torch.relu(self.proj1(hidden))
+        hidden = torch.relu(self.proj2(hidden))
+
+        # Split the flat projection into `context_tokens` tokens of width `output_dim`.
+        tokens = self.proj3(hidden).reshape(
+            n_frames, self.context_tokens, self.output_dim
+        )
+        tokens = self.norm(tokens)
+
+        # Restore the separate batch and frame axes.
+        return rearrange(
+            tokens, "(bz f) m c -> bz f m c", f=video_length
+        )
+
+
+class SimpleAudioEmbedder(ModelMixin):
+    """Simplified audio projection model.
+
+    This class takes windowed audio embeddings, per-frame conditions and an
+    emotion embedding as input. Its `forward` returns three sets of context
+    tokens (named kp, emo and audio in the code) together with several
+    intermediate tensors, instead of one merged context tensor.
+
+    Attributes:
+        input_dim (int): seq_len * blocks * channels, the input width of `input_layer`.
+        context_tokens (int): The number of context tokens in each output.
+        output_dim (int): The output dimension of the context tokens.
+        condition_dim (int): The width of the `conditions` input.
+        use_condition (bool): Set to True in `__init__`; gates `condition_layer` in `forward`.
+        pos_embed (nn.Parameter): Non-trainable (1, input_len, intermediate_dim) positional table.
+
+    Methods:
+        __init__(self, seq_len=5, blocks=12, channels=768, intermediate_dim=512, output_dim=768, context_tokens=32, ...):
+            Builds the input, condition, emotion and output layers plus `n_blocks` DiTBlock mid blocks.
+        initialize_weights(self):
+            Initializes the Linear layers outside the mid blocks and fills `pos_embed` with sin-cos values.
+        forward(self, audio_embeds, conditions, emo_embeds, mask=None, freqs_cis=None):
+            Defines the forward pass.
+            Returns:
+            A 7-tuple (kp_context, emo_context, audio_xs, audio_kp, audio_emo, conditions, emo_embeds).
+
+    """
+
+    def __init__(
+        self,
+        seq_len=5,
+        blocks=12,  # add a new parameter blocks
+        channels=768,  # add a new parameter channels
+        intermediate_dim=512,
+        output_dim=768,
+        context_tokens=32,
+        input_len = 80,
+        condition_dim = 63,
+        norm_type="rms_norm",
+        qk_norm="rms_norm",
+        n_blocks = 4,
+        n_heads = 4,
+        mlp_ratio = 4
+    ):
+        super().__init__()
+        self.input_dim = (
+            seq_len * blocks * channels
+        )  # flattened per-frame audio width (seq_len * blocks * channels).
+        self.context_tokens = context_tokens
+        self.output_dim = output_dim
+        self.condition_dim=condition_dim
+        # input layer: its output is later split into [condition_dim | intermediate_dim | intermediate_dim]
+        
+        self.input_layer = nn.Sequential(
+            nn.Linear(self.input_dim, intermediate_dim, bias=True, dtype=None, device=None),
+            nn.SiLU(),
+            nn.Linear(intermediate_dim, condition_dim+2*intermediate_dim, bias=True, dtype=None, device=None),
+        )
+
+        self.condition2_layer = nn.Linear(condition_dim, condition_dim)
+        self.emo_layer =nn.Linear(intermediate_dim, intermediate_dim)
+        # fusion layer for additional conditions, like ref_kp
+        self.use_condition = True
+        self.condition_layer = nn.Linear(condition_dim+intermediate_dim, intermediate_dim)
+        # Will use fixed sin-cos embedding:
+        self.pos_embed = nn.Parameter(torch.zeros(1, input_len, intermediate_dim), requires_grad=False)
+
+        # mid blocks
+        self.mid_blocks = nn.ModuleList([
+            DiTBlock(
+                intermediate_dim, n_heads, 
+                mlp_ratio=mlp_ratio, 
+                norm_type=norm_type, 
+                qk_norm=qk_norm
+            ) for _ in range(n_blocks)
+        ])
+        # output layers (audio / kp / emo branches) and their norms
+        self.output_layer = nn.Linear(intermediate_dim, context_tokens * output_dim)
+        self.output_layer2 = nn.Linear(condition_dim+condition_dim, context_tokens * output_dim)
+        self.output_layer3 = nn.Linear(intermediate_dim+intermediate_dim, context_tokens * output_dim)
+        self.norm = nn.LayerNorm(output_dim) if norm_type == "layer_norm" else RMSNorm(output_dim)
+        self.norm2= nn.LayerNorm(output_dim) if norm_type == "layer_norm" else RMSNorm(output_dim)
+        self.norm3= nn.LayerNorm(output_dim) if norm_type == "layer_norm" else RMSNorm(output_dim)
+    def initialize_weights(self):
+        # 1. Initialize input layer
+        for l in [0, 2]:
+            w = self.input_layer[l].weight.data
+            nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+            nn.init.constant_(self.input_layer[l].bias, 0)
+        w = self.emo_layer.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.emo_layer.bias, 0)
+        # NOTE: input_layer is an nn.Sequential, so it has no .weight/.bias of its
+        # own; its two Linear layers are initialized in the loop above and
+        # emo_layer right after it.
+        # 2. Initialize (and freeze) pos_embed by sin-cos embedding:
+        pos_embed = get_1d_sincos_pos_embed(self.pos_embed.shape[-1], self.pos_embed.shape[-2])
+        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+        # 3. Initialize condition layer
+        nn.init.normal_(self.condition_layer.weight, std=0.02)
+        nn.init.constant_(self.condition_layer.bias, 0)
+        nn.init.normal_(self.condition2_layer.weight, std=0.02)
+        nn.init.constant_(self.condition2_layer.bias, 0)
+        # 4. Mid blocks
+        # NOTE(review): the mid blocks are not initialized here, so they keep
+        # whatever initialization DiTBlock's constructor gives them -- confirm this is intended.
+        # 5. Initialize output layer
+        w = self.output_layer.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.output_layer.bias, 0)
+
+        w = self.output_layer2.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.output_layer2.bias, 0)
+
+        w = self.output_layer3.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.output_layer3.bias, 0)
+
+    def forward(self, audio_embeds, conditions, emo_embeds,mask=None,freqs_cis=None):
+        """
+        Defines the forward pass.
+
+        Parameters:
+            audio_embeds (torch.Tensor): audio embeddings with five axes (batch_size, video_length, window, blocks, channels).
+            conditions (torch.Tensor): other conditions with shape (batch_size, video_length, condition_dim).
+            emo_embeds (torch.Tensor): emotion embedding with shape (batch_size, intermediate_dim).
+        Returns:
+            Tuple (kp_context, emo_context, audio_xs, audio_kp, audio_emo, conditions, emo_embeds); see the comment above the return.
+        """
+        # prepare inputs
+        condition2=self.condition2_layer(conditions)
+        emo2=self.emo_layer(emo_embeds)
+
+        video_length = audio_embeds.shape[1]
+        emo_embeds=emo_embeds.unsqueeze(1).repeat(1,video_length,1)
+        audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
+
+        batch_size, window_size, blocks, channels = audio_embeds.shape  # batch_size here is bz * f
+        audio_embeds = audio_embeds.reshape(batch_size, window_size * blocks * channels)
+        
+        # input layer
+        audio_embeds = self.input_layer(audio_embeds)
+        audio_embeds = rearrange(audio_embeds, "(bz f) c -> bz f c", f=video_length)
+        # split the input-layer output into [kp | xs | emo] along the last axis
+        audio_kp=audio_embeds[:,:,:self.condition_dim]
+        audio_xs,audio_emo=audio_embeds[:,:,self.condition_dim:].chunk(2, dim=-1)
+        # concatenate the kp / emo slices with their raw conditioning inputs
+        audio_enc_kp=torch.cat([audio_kp,conditions], dim=-1)
+        audio_enc_emo=torch.cat([audio_emo,emo_embeds], dim=-1)
+        audio_enc_kp=rearrange(audio_enc_kp, "bz f c -> (bz f) c")
+        audio_enc_emo=rearrange(audio_enc_emo, "bz f c -> (bz f) c")
+        kp_context = self.output_layer2(audio_enc_kp).reshape(
+            batch_size, self.context_tokens, self.output_dim
+        )
+        kp_context=kp_context  # no-op self-assignment
+        kp_context=self.norm2(kp_context)
+        emo_context = self.output_layer3(audio_enc_emo).reshape(
+            batch_size, self.context_tokens, self.output_dim
+        )
+        emo_context=self.norm3(emo_context)
+        # condition layer
+        if self.use_condition:
+            audio_xs = self.condition_layer(torch.cat([audio_xs, condition2], dim=-1))
+        # positional embeddings: pos_embed is (1, input_len, intermediate_dim), so the
+        # frame axis of audio_xs has to match (or broadcast against) input_len
+        audio_xs=audio_xs+self.pos_embed
+        # mid blocks (NOTE(review): the freqs_cis argument is ignored; None is always passed)
+        for block in self.mid_blocks:
+            audio_xs = block(audio_xs, emo2,mask=mask,freqs_cis=None)
+        # output layer
+        audio_xs = rearrange(audio_xs, "bz f c -> (bz f) c")
+        audio_xs = self.output_layer(audio_xs).reshape(
+            batch_size, self.context_tokens, self.output_dim
+        )
+        audio_xs = self.norm(audio_xs)
+
+        kp_context=rearrange(kp_context, "(bz f) m c -> bz f m c", f=video_length)
+        emo_context=rearrange(emo_context, "(bz f) m c -> bz f m c", f=video_length)
+        audio_xs=rearrange(audio_xs, "(bz f) m c -> bz f m c", f=video_length)
+        # Returned tuple:
+        #   kp_context, emo_context, audio_xs: (batch, video_length, context_tokens, output_dim)
+        #   audio_kp:   (batch, video_length, condition_dim) slice of the input-layer output
+        #   audio_emo:  (batch, video_length, intermediate_dim) slice of the input-layer output
+        #   conditions: the `conditions` argument, unchanged
+        #   emo_embeds: the emotion embedding repeated along the frame axis
+        # The three context tensors are returned separately rather than
+        # concatenated into one tensor.
+
+        return kp_context,emo_context,audio_xs,audio_kp,audio_emo,conditions,emo_embeds
+
+
+class ConditionEmbedder(nn.Module):
+    """Embeds a condition sequence: MLP -> fixed sin-cos positions -> DiT blocks -> Linear -> norm."""
+
+    def __init__(
+        self,
+        input_dim=768,
+        intermediate_dim=1024,
+        output_dim=2048,
+        input_len=80,
+        norm_type="rms_norm",
+        qk_norm="rms_norm",
+        n_blocks=4,
+        n_heads=4,
+        mlp_ratio=4,
+    ):
+        super().__init__()
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+
+        # Input MLP: input_dim -> intermediate_dim -> intermediate_dim.
+        self.input_layer = nn.Sequential(
+            nn.Linear(self.input_dim, intermediate_dim, bias=True, dtype=None, device=None),
+            nn.SiLU(),
+            nn.Linear(intermediate_dim, intermediate_dim, bias=True, dtype=None, device=None),
+        )
+        # Frozen positional table; filled with sin-cos values by initialize_weights().
+        self.pos_embed = nn.Parameter(torch.zeros(1, input_len, intermediate_dim), requires_grad=False)
+
+        # Stack of DiT blocks operating at width intermediate_dim.
+        block_kwargs = dict(mlp_ratio=mlp_ratio, norm_type=norm_type, qk_norm=qk_norm)
+        self.mid_blocks = nn.ModuleList(
+            [DiTBlock(intermediate_dim, n_heads, **block_kwargs) for _ in range(n_blocks)]
+        )
+
+        # Output projection followed by normalization over output_dim.
+        self.output_layer = nn.Linear(intermediate_dim, output_dim)
+        if norm_type == "layer_norm":
+            self.norm = nn.LayerNorm(output_dim)
+        else:
+            self.norm = RMSNorm(output_dim)
+
+    def initialize_weights(self):
+        """Initialize the input MLP, the positional table, the DiT blocks and the output layer."""
+        # 1. Input MLP: Xavier-uniform weights, zero biases.
+        for linear in (self.input_layer[0], self.input_layer[2]):
+            weight = linear.weight.data
+            nn.init.xavier_uniform_(weight.view([weight.shape[0], -1]))
+            nn.init.constant_(linear.bias, 0)
+
+        # 2. Positional table: 1-D sin-cos embedding sized from pos_embed itself.
+        _, length, width = self.pos_embed.shape
+        sincos = get_1d_sincos_pos_embed(width, length)
+        self.pos_embed.data.copy_(torch.from_numpy(sincos).float().unsqueeze(0))
+
+        # 3. DiT blocks initialize themselves.
+        for block in self.mid_blocks:
+            block.initialize_weights()
+
+        # 4. Output layer: Xavier-uniform weight, zero bias.
+        out_weight = self.output_layer.weight.data
+        nn.init.xavier_uniform_(out_weight.view([out_weight.shape[0], -1]))
+        nn.init.constant_(self.output_layer.bias, 0)
+
+    def forward(self, cond_embeds, emo_embeds):
+        """
+        Encode `cond_embeds` (B, L, D) into context tokens, passing `emo_embeds` (B, D) to every block.
+
+        L must be compatible with the `input_len` rows of `pos_embed`, which is added as-is.
+        """
+        # input layer + positional embeddings
+        hidden = self.input_layer(cond_embeds) + self.pos_embed
+        # mid blocks, each receiving the emotion embedding as second argument
+        for block in self.mid_blocks:
+            hidden = block(hidden, emo_embeds)
+        return self.norm(self.output_layer(hidden))
+
+
+class VectorEmbedder(nn.Module):
+    """Projects a flat `input_dim` vector to `hidden_size` with Linear -> SiLU -> Linear."""
+
+    def __init__(self, input_dim: int, hidden_size: int, dtype=None, device=None):
+        super().__init__()
+        factory_kwargs = {"dtype": dtype, "device": device}
+        first = nn.Linear(input_dim, hidden_size, bias=True, **factory_kwargs)
+        second = nn.Linear(hidden_size, hidden_size, bias=True, **factory_kwargs)
+        self.mlp = nn.Sequential(first, nn.SiLU(), second)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the MLP over the last axis of `x`."""
+        return self.mlp(x)
+
+
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding
+
+    A Conv2d with kernel size and stride `patch_size` turns every patch into one
+    `embed_dim` vector; with `flatten` the result is returned as (B, L, embed_dim).
+    """
+
+    def __init__(
+        self,
+        img_size: Optional[int] = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        flatten: bool = True,
+        bias: bool = True,
+        strict_img_size: bool = True,
+        dynamic_img_pad: bool = False,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.patch_size = (patch_size, patch_size)
+
+        # Grid bookkeeping is only available when a fixed square image size is given.
+        self.img_size = None
+        self.grid_size = None
+        self.num_patches = None
+        if img_size is not None:
+            self.img_size = (img_size, img_size)
+            rows = self.img_size[0] // self.patch_size[0]
+            cols = self.img_size[1] // self.patch_size[1]
+            self.grid_size = (rows, cols)
+            self.num_patches = rows * cols
+
+        # flatten spatial dim and transpose to channels last, kept for bwd compat
+        self.flatten = flatten
+        # Stored only; this class's forward() does not read these two flags.
+        self.strict_img_size = strict_img_size
+        self.dynamic_img_pad = dynamic_img_pad
+
+        conv_kwargs = dict(kernel_size=patch_size, stride=patch_size, bias=bias, dtype=dtype, device=device)
+        self.proj = nn.Conv2d(in_chans, embed_dim, **conv_kwargs)
+
+    def forward(self, x):
+        """Embed `x` (B, C, H, W); returns (B, L, embed_dim) if `flatten`, else the conv output."""
+        B, C, H, W = x.shape  # unpacking requires a 4-D input
+        patches = self.proj(x)
+        if not self.flatten:
+            return patches
+        return patches.flatten(2).transpose(1, 2)  # NCHW -> NLC
+
diff --git a/src/models/dit/hubert.py b/src/models/dit/hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ab52969eab24e54f438eb89361f541cc9e27bcf
--- /dev/null
+++ b/src/models/dit/hubert.py
@@ -0,0 +1,52 @@
+from transformers import HubertModel
+from transformers.modeling_outputs import BaseModelOutput
+
+from .wav2vec2 import linear_interpolation
+# from wav2vec2 import linear_interpolation
+
+_CONFIG_FOR_DOC = 'HubertConfig'
+
+# Subclass that shadows the imported HubertModel; forward() resamples the extracted features from 50 fps to `output_fps`.
+class HubertModel(HubertModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
+                output_hidden_states=None, return_dict=None, frame_num=None):
+        self.config.output_attentions = True  # NOTE: mutates the config on every call; attentions default to on
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)  # (N, C, L)
+        # Resample the audio feature @ 50 fps to `output_fps`.
+        if frame_num is not None:  # keep only the 50 fps features that cover `frame_num` output frames
+            extract_features_len = round(frame_num * 50 / output_fps)
+            extract_features = extract_features[:, :, :extract_features_len]
+        extract_features = linear_interpolation(extract_features, 50, output_fps, output_len=frame_num)
+        extract_features = extract_features.transpose(1, 2)  # (N, L, C)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(hidden_states)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]  # first encoder output is used as the last hidden state
+
+        if not return_dict:  # tuple form: (hidden_states, *remaining encoder outputs)
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
+                               attentions=encoder_outputs.attentions, )
diff --git a/src/models/dit/modules.py b/src/models/dit/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..8776e6951ae68a2a975187665bd04379d8f7869d
--- /dev/null
+++ b/src/models/dit/modules.py
@@ -0,0 +1,830 @@
+"""
+Modules, including:
+    - Attention: Attention module used in transformers
+    - MLP: MLP module used in transformers
+    - PositionalEncoding: Positional encoding module used in transformers
+    - ROPE: ROPE module used in transformers
+"""
+import os
+import copy
+import logging
+import math
+import numbers
+from itertools import repeat
+from collections import OrderedDict
+import collections.abc
+from functools import partial
+from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, Union, List, Final
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from einops import rearrange, repeat
+
+from .posemb_layers import apply_rotary_emb
+try:
+    from apex.normalization.fused_layer_norm import fused_layer_norm_affine
+    has_apex = True
+except ImportError:
+    has_apex = False
+
+try:
+    from apex.normalization.fused_layer_norm import fused_rms_norm_affine, fused_rms_norm
+    has_apex_rmsnorm = True
+except ImportError:
+    has_apex_rmsnorm = False
+
+has_torch_rms_norm = hasattr(F, 'rms_norm')
+
+from .config import use_fused_attn
+
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+# From PyTorch internals
+def _ntuple(n):
+    """Build a converter that normalises a value into a tuple.
+
+    The returned ``parse(x)`` gives ``tuple(x)`` for any non-string iterable
+    (its length is NOT checked against ``n``) and an ``n``-tuple filled with
+    ``x`` for everything else (scalars, strings, ``None``).
+    """
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return tuple(x)
+        # Deliberately not `tuple(repeat(x, n))`: this module imports `repeat`
+        # from itertools and later from einops, so the name resolves to the
+        # einops function here and the call would not build an n-tuple.
+        return (x,) * n
+    return parse
+
+
+# Ready-made converters for the common arities.
+to_1tuple = _ntuple(1)
+to_2tuple = _ntuple(2)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+to_ntuple = _ntuple
+
+
+def make_divisible(v, divisor=8, min_value=None, round_limit=.9):
+    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.
+
+    ``min_value`` falls back to ``divisor`` when it is falsy. If the rounded
+    value is smaller than ``round_limit * v`` one extra ``divisor`` is added,
+    so rounding never shrinks ``v`` by more than ``1 - round_limit``.
+    """
+    lower_bound = min_value if min_value else divisor
+    nearest_multiple = (int(v + divisor / 2) // divisor) * divisor
+    result = max(lower_bound, nearest_multiple)
+    # Bump up one step when rounding down lost too much.
+    return result + divisor if result < round_limit * v else result
+
+
+def extend_tuple(x, n):
+    """Return ``x`` as a tuple of exactly ``n`` items.
+
+    A non-tuple/list value is wrapped into a 1-tuple first. Short inputs are
+    padded by repeating their last element; long inputs are truncated.
+    """
+    items = tuple(x) if isinstance(x, (tuple, list)) else (x,)
+    missing = n - len(items)
+    if missing > 0:
+        return items + (items[-1],) * missing
+    return items[:n]
+
+
+# RMS_NORM
+def get_autocast_dtype(device: str = 'cuda'):
+    """Return the dtype autocast uses on ``device``.
+
+    The device-generic ``torch.get_autocast_dtype`` is tried first; when that
+    call is missing or rejects the argument, the older per-device getters are
+    used instead (only 'cpu' and 'cuda' are handled there).
+    """
+    try:
+        return torch.get_autocast_dtype(device)
+    except (AttributeError, TypeError):
+        pass
+    # Legacy API: one getter per backend.
+    if device != 'cpu':
+        assert device == 'cuda'
+        return torch.get_autocast_gpu_dtype()
+    return torch.get_autocast_cpu_dtype()
+
+
+def is_autocast_enabled(device: str = 'cuda'):
+    """Report whether autocast is currently active for ``device``.
+
+    Uses the device-aware ``torch.is_autocast_enabled(device)`` when the
+    installed torch accepts an argument, and otherwise the older per-device
+    functions (only 'cpu' and 'cuda' are handled there).
+    """
+    try:
+        return torch.is_autocast_enabled(device)
+    except TypeError:
+        pass
+    # Older torch: the argument-less call answers for cuda only.
+    if device != 'cpu':
+        assert device == 'cuda'
+        return torch.is_autocast_enabled()
+    return torch.is_autocast_cpu_enabled()
+
+
+# Module-wide switch for the fused/"fast" norm path; off by default for now.
+_USE_FAST_NORM = False
+def is_fast_norm():
+    """Return the current value of the module-level fast-norm switch."""
+    return _USE_FAST_NORM
+
+
+def rms_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: Optional[torch.Tensor] = None,
+    eps: float = 1e-5,
+):
+    """Scale ``x`` by the reciprocal root-mean-square of its trailing dims.
+
+    The mean of ``x**2`` is taken over the last ``len(normalized_shape)``
+    dimensions, ``eps`` is added before the reciprocal square root, and the
+    result is multiplied by ``weight`` when one is given.
+    """
+    n_norm_dims = len(normalized_shape)
+    mean_sq = x.pow(2)
+    if torch.jit.is_scripting():
+        # TorchScript workaround: negative dims combined with keepdim=True can
+        # crash the compiler, so only a single trailing dim is supported here.
+        assert n_norm_dims == 1
+        mean_sq = torch.mean(mean_sq, dim=-1).unsqueeze(-1)
+    else:
+        reduce_dims = tuple(range(-1, -n_norm_dims - 1, -1))
+        mean_sq = torch.mean(mean_sq, dim=reduce_dims, keepdim=True)
+    x = x * torch.rsqrt(mean_sq + eps)
+    if weight is not None:
+        x = x * weight
+    return x
+
+
+def fast_rms_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: Optional[torch.Tensor] = None,
+    eps: float = 1e-5,
+) -> torch.Tensor:
+    """RMS-normalise ``x`` using the fastest implementation available.
+
+    Dispatch order: the pure-PyTorch ``rms_norm`` when scripting, the apex
+    fused kernels when apex is importable, otherwise ``F.rms_norm`` (if this
+    torch build has it) or the local ``rms_norm`` with autocast disabled.
+
+    Args:
+        x: input tensor.
+        normalized_shape: trailing dimensions to normalise over.
+        weight: optional elementwise scale; ``None`` means no affine scale.
+        eps: value added to the mean square before the reciprocal sqrt.
+    """
+    if torch.jit.is_scripting():
+        # this must be by itself, cannot merge with has_apex_rmsnorm
+        return rms_norm(x, normalized_shape, weight, eps)
+
+    if has_apex_rmsnorm:
+        if weight is None:
+            return fused_rms_norm(x, normalized_shape, eps)
+        else:
+            return fused_rms_norm_affine(x, weight, normalized_shape, eps)
+
+    if is_autocast_enabled(x.device.type):
+        # normally native AMP casts LN inputs to float32
+        # apex LN does not, this is behaving like Apex
+        dt = get_autocast_dtype(x.device.type)
+        x = x.to(dt)
+        # weight is optional (elementwise_affine=False); only cast it if present.
+        if weight is not None:
+            weight = weight.to(dt)
+
+    with torch.autocast(device_type=x.device.type, enabled=False):
+        if has_torch_rms_norm:
+            x = F.rms_norm(x, normalized_shape, weight, eps)
+        else:
+            x = rms_norm(x, normalized_shape, weight, eps)
+
+    return x
+
+
+class RMSNorm(nn.Module):
+    """RMS normalisation layer with an optional learnable elementwise scale.
+
+    Whether the "fast" dispatch (``fast_rms_norm``) or the plain ``rms_norm``
+    is used is decided once at construction time from ``is_fast_norm()``.
+    """
+    __constants__ = ['normalized_shape', 'eps', 'elementwise_affine', '_fast_norm']
+    normalized_shape: Tuple[int, ...]
+    eps: float
+    elementwise_affine: bool
+    _fast_norm: bool
+
+    def __init__(self, channels, eps=1e-6, elementwise_affine=True, device=None, dtype=None) -> None:
+        """Create the layer.
+
+        Args:
+            channels: an int, or an iterable of ints, giving the trailing
+                shape that is normalised over.
+            eps: stabiliser added to the mean square.
+            elementwise_affine: when True a ``weight`` parameter of shape
+                ``normalized_shape`` is created and initialised to ones.
+            device, dtype: forwarded to the parameter allocation.
+        """
+        super().__init__()
+        shape = (channels,) if isinstance(channels, numbers.Integral) else channels
+        self.normalized_shape = tuple(shape)  # type: ignore[arg-type]
+        self.eps = eps
+        self.elementwise_affine = elementwise_affine
+        # Snapshot the global flag as an attribute: scripting cannot read globals.
+        self._fast_norm = is_fast_norm()
+
+        if not self.elementwise_affine:
+            self.register_parameter('weight', None)
+        else:
+            self.weight = nn.Parameter(
+                torch.empty(self.normalized_shape, device=device, dtype=dtype))
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """Reset the scale to ones (no-op without an affine weight)."""
+        if not self.elementwise_affine:
+            return
+        nn.init.ones_(self.weight)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalise ``x`` over ``normalized_shape`` and apply the scale."""
+        if self._fast_norm:
+            return fast_rms_norm(x, self.normalized_shape, self.weight, self.eps)
+        return rms_norm(x, self.normalized_shape, self.weight, self.eps)
+
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward block: fc1 -> act -> dropout -> norm -> fc2 -> dropout.
+
+    ``hidden_features`` and ``out_features`` default to ``in_features``.
+    With ``use_conv=True`` both projections are 1x1 ``nn.Conv2d`` layers
+    instead of ``nn.Linear``. ``norm_layer`` (if given) is built on the hidden
+    width; otherwise the norm stage is an identity.
+    """
+    def __init__(
+            self,
+            in_features,
+            hidden_features=None,
+            out_features=None,
+            act_layer=nn.GELU,
+            norm_layer=None,
+            bias=True,
+            drop=0.,
+            use_conv=False,
+    ):
+        super().__init__()
+        hidden_dim = hidden_features or in_features
+        out_dim = out_features or in_features
+        # The same bias flag and dropout rate are used for both halves.
+        make_proj = nn.Linear
+        if use_conv:
+            make_proj = partial(nn.Conv2d, kernel_size=1)
+
+        self.fc1 = make_proj(in_features, hidden_dim, bias=bias)
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop)
+        self.norm = nn.Identity() if norm_layer is None else norm_layer(hidden_dim)
+        self.fc2 = make_proj(hidden_dim, out_dim, bias=bias)
+        self.drop2 = nn.Dropout(drop)
+
+    def forward(self, x):
+        """Run the stages in order and return the projected tensor."""
+        for stage in (self.fc1, self.act, self.drop1, self.norm, self.fc2, self.drop2):
+            x = stage(x)
+        return x
+class MMsingle_attention(nn.Module):
+    """
+    Attention over a pre-projected, packed QKV sequence.
+
+    The module owns no linear projections: ``forward`` receives a tensor whose
+    channel dimension already holds q, k and v side by side (3 * dim
+    channels), splits it, normalises q/k per head, optionally applies rotary
+    embeddings and returns the attention output without an output projection.
+    """
+    fused_attn: Final[bool]
+
+    def __init__(
+        self, dim: int,
+        num_heads: int = 8,
+        proj_bias: bool = True,
+        attn_drop: float = 0.,
+        proj_drop: float = 0.,
+        qkv_bias: bool = False,
+        qk_norm: Optional[str] = "rms_norm",
+        **block_kwargs
+    ) -> None:
+        """
+        Args:
+            dim: channel width of each of q, k and v.
+            num_heads: number of heads; must divide ``dim``.
+            attn_drop: dropout probability on the attention weights.
+            qk_norm: ``None``, ``"rms_norm"`` or ``"layer_norm"``; the norm is
+                applied per head (over ``head_dim``).
+            proj_bias, proj_drop, qkv_bias, block_kwargs: accepted for
+                signature parity with the other attention modules; unused.
+
+        Raises:
+            ValueError: for an unknown ``qk_norm``.
+        """
+        super().__init__()
+
+        assert dim % num_heads == 0, f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.fused_attn = use_fused_attn()
+        self.attn_drop = nn.Dropout(attn_drop)
+        if qk_norm is None:
+            self.xs_q_norm = nn.Identity()
+            self.xs_k_norm = nn.Identity()
+        elif qk_norm == "rms_norm":
+            self.xs_q_norm = RMSNorm(self.head_dim, eps=1e-5)
+            self.xs_k_norm = RMSNorm(self.head_dim, eps=1e-5)
+        elif qk_norm == "layer_norm":
+            # q/k are normalised after being viewed as (..., head_dim), so the
+            # norm must be sized by head_dim (sizing it by dim only worked for
+            # num_heads == 1).
+            self.xs_q_norm = nn.LayerNorm(self.head_dim, eps=1e-5)
+            self.xs_k_norm = nn.LayerNorm(self.head_dim, eps=1e-5)
+        else:
+            raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
+
+    def forward(self, txt_len, x: torch.Tensor, mask: Optional[torch.Tensor] = None, causal: bool = False, freqs_cis=None, freqs_cis2=None) -> torch.Tensor:
+        """
+        Args:
+            txt_len: index at which the token axis is split for rotary
+                embeddings; tokens ``[:txt_len]`` use ``freqs_cis`` and tokens
+                ``[txt_len:]`` use ``freqs_cis2``.
+            x: packed q/k/v, shape (B, N1, 3 * dim).
+            mask: optional per-key mask, indexed as (B, keys); it is broadcast
+                over heads and queries and cast to the q dtype.
+            causal: when True, ``mask`` is replaced by a fixed block mask built
+                from four segments of length ``N1 // 4``.
+            freqs_cis, freqs_cis2: optional rotary tables for the two segments.
+
+        Returns:
+            Tensor of shape (B, N1, dim); no output projection is applied.
+        """
+        B, N1, C = x.shape
+        xs_qkv = x.reshape(B, N1, 3, -1)
+        xs_q, xs_k, xs_v = xs_qkv.permute(2, 0, 1, 3).unbind(0)
+        N2 = N1 // 4
+        q = xs_q.view(B, N1, self.num_heads, self.head_dim)
+        k = xs_k.view(B, N1, self.num_heads, self.head_dim)
+        v = xs_v.view(B, N1, self.num_heads, self.head_dim).transpose(1, 2)
+        q, k = self.xs_q_norm(q), self.xs_k_norm(k)
+        if freqs_cis is not None or freqs_cis2 is not None:
+            img_q, txt_q = q[:, :txt_len, :, :], q[:, txt_len:, :, :]
+            img_k, txt_k = k[:, :txt_len, :, :], k[:, txt_len:, :, :]
+            # Each segment is rotated only when its own table was supplied.
+            if freqs_cis is not None:
+                img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+                assert (
+                    img_qq.shape == img_q.shape and img_kk.shape == img_k.shape
+                ), f"img_kk: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}"
+                img_q, img_k = img_qq, img_kk
+            if freqs_cis2 is not None:
+                txt_qq, txt_kk = apply_rotary_emb(txt_q, txt_k, freqs_cis2, head_first=False)
+                assert (
+                    txt_qq.shape == txt_q.shape and txt_kk.shape == txt_k.shape
+                ), f"img_kk: {txt_q.shape}, img_q: {txt_q.shape}, img_kk: {txt_kk.shape}, img_k: {txt_k.shape}"
+                txt_q, txt_k = txt_qq, txt_kk
+            q = torch.cat((img_q, txt_q), dim=1)
+            k = torch.cat((img_k, txt_k), dim=1)
+        # (B, N1, H, D) -> (B, H, N1, D); done unconditionally so that the
+        # no-rotary path also matches the layout of v.
+        q, k = q.transpose(1, 2), k.transpose(1, 2)
+        if mask is not None:
+            mask = mask[:, None, None, :].expand(-1, self.num_heads, N1, -1)  # (B, num_heads, N, N)
+            # NOTE(review): as a float mask this is ADDED to the scores by
+            # scaled_dot_product_attention; confirm callers pass additive
+            # values rather than a 0/1 validity mask.
+            mask = mask.to(dtype=q.dtype)
+        if causal:
+            # Boolean block mask (True = may attend) over four N2-long segments;
+            # it replaces any mask passed in. Only matches q when N1 == 4 * N2.
+            mask2 = torch.ones((N2 + 3 * N2, N2 + 3 * N2), dtype=torch.bool, device=v.device)
+            mask2[-N2 - N2:, :N2] = 0
+            mask2[-N2 - N2:-N2, -N2:] = 0
+            mask2[-N2:, -N2 - N2:-N2] = 0
+            mask = mask2.to(dtype=torch.bool)
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask=mask,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            if mask is not None:
+                # Mirror scaled_dot_product_attention: boolean masks mark the
+                # positions that take part (True = keep), float masks are added.
+                if mask.dtype == torch.bool:
+                    attn = attn.masked_fill(~mask, float("-inf"))
+                else:
+                    attn = attn + mask
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N1, -1)
+        return x
+class MMfour_attention(nn.Module):
+    """
+    Joint attention over four token streams.
+
+    One main stream (``x``) and three auxiliary streams (``y1``..``y3``) each
+    get their own QKV projection, q/k norms and output projection; attention
+    itself is computed once over the concatenation of all four streams.
+    """
+    fused_attn: Final[bool]
+
+    def __init__(
+        self, dim: int,
+        num_heads: int = 8,
+        proj_bias: bool = True,
+        attn_drop: float = 0.,
+        proj_drop: float = 0.,
+        qkv_bias: bool = False,
+        qk_norm: Optional[str] = "rms_norm",
+        **block_kwargs
+    ) -> None:
+        """
+        Args:
+            dim: channel width of every stream.
+            num_heads: number of heads; must divide ``dim``.
+            proj_bias: bias flag of the four output projections.
+            attn_drop: dropout probability on the attention weights.
+            proj_drop: dropout probability after each output projection.
+            qkv_bias: bias flag of the four QKV projections.
+            qk_norm: ``None``, ``"rms_norm"`` or ``"layer_norm"``; applied per
+                head (over ``head_dim``).
+
+        Raises:
+            ValueError: for an unknown ``qk_norm``.
+        """
+        super().__init__()
+
+        assert dim % num_heads == 0, f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.fused_attn = use_fused_attn()
+
+        self.qkv_xs = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.qkv_au1 = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.qkv_au2 = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.qkv_au3 = nn.Linear(dim, dim * 3, bias=qkv_bias)
+
+        # forward() always uses xs_*_norm and au_*_norm1..3, so every qk_norm
+        # option must create exactly these attributes. Norms act on tensors
+        # viewed as (..., head_dim), hence they are sized by head_dim.
+        if qk_norm is None:
+            make_norm = nn.Identity
+        elif qk_norm == "rms_norm":
+            make_norm = partial(RMSNorm, self.head_dim, eps=1e-5, elementwise_affine=True)
+        elif qk_norm == "layer_norm":
+            make_norm = partial(nn.LayerNorm, self.head_dim, eps=1e-5)
+        else:
+            raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
+        self.xs_q_norm = make_norm()
+        self.xs_k_norm = make_norm()
+        self.au_q_norm1 = make_norm()
+        self.au_k_norm1 = make_norm()
+        self.au_q_norm2 = make_norm()
+        self.au_k_norm2 = make_norm()
+        self.au_q_norm3 = make_norm()
+        self.au_k_norm3 = make_norm()
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.xs_proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.au_proj1 = nn.Linear(dim, dim, bias=proj_bias)
+        self.au_proj2 = nn.Linear(dim, dim, bias=proj_bias)
+        self.au_proj3 = nn.Linear(dim, dim, bias=proj_bias)
+        self.xs_proj_drop = nn.Dropout(proj_drop)
+        self.au_proj_drop1 = nn.Dropout(proj_drop)
+        self.au_proj_drop2 = nn.Dropout(proj_drop)
+        self.au_proj_drop3 = nn.Dropout(proj_drop)
+
+    def _qkv_heads(self, proj, tokens, q_norm, k_norm):
+        """Project one stream and split it into per-head q, k (B, N, H, D) and v (B, H, N, D)."""
+        B, N, _ = tokens.shape
+        q, k, v = proj(tokens).reshape(B, N, 3, -1).permute(2, 0, 1, 3).unbind(0)
+        q = q_norm(q.view(B, N, self.num_heads, self.head_dim))
+        k = k_norm(k.view(B, N, self.num_heads, self.head_dim))
+        v = v.view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
+        return q, k, v
+
+    def forward(self, x: torch.Tensor, y1: torch.Tensor, y2: torch.Tensor, y3: torch.Tensor, mask: Optional[torch.Tensor] = None, causal=False, freqs_cis=None, freqs_cis2=None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x: main stream, shape (B, N1, dim).
+            y1, y2, y3: auxiliary streams, shapes (B, N2 / N3 / N4, dim).
+            mask: optional mask indexed as (B, N1); see the review note below.
+            causal: when True, ``mask`` is replaced by a fixed boolean block mask.
+            freqs_cis: optional rotary table for the main stream.
+            freqs_cis2: optional rotary table, applied to ``y1`` only.
+
+        Returns:
+            ``(xs, au1, au2, au3)``: the attended and re-projected streams, each
+            with the token length of its input.
+        """
+        B, N1, C = x.shape
+        N2, N3, N4 = y1.shape[1], y2.shape[1], y3.shape[1]
+
+        xs_q, xs_k, xs_v = self._qkv_heads(self.qkv_xs, x, self.xs_q_norm, self.xs_k_norm)
+        au_q1, au_k1, au_v1 = self._qkv_heads(self.qkv_au1, y1, self.au_q_norm1, self.au_k_norm1)
+        au_q2, au_k2, au_v2 = self._qkv_heads(self.qkv_au2, y2, self.au_q_norm2, self.au_k_norm2)
+        au_q3, au_k3, au_v3 = self._qkv_heads(self.qkv_au3, y3, self.au_q_norm3, self.au_k_norm3)
+
+        if freqs_cis is not None:
+            img_qq, img_kk = apply_rotary_emb(xs_q, xs_k, freqs_cis, head_first=False)
+            assert (
+                img_qq.shape == xs_q.shape and img_kk.shape == xs_k.shape
+            ), f"img_kk: {img_qq.shape}, img_q: {xs_q.shape}, img_kk: {img_kk.shape}, img_k: {xs_k.shape}"
+            xs_q, xs_k = img_qq, img_kk
+
+        if freqs_cis2 is not None:
+            au_q11, au_k11 = apply_rotary_emb(au_q1, au_k1, freqs_cis2, head_first=False)
+            # Check the shapes BEFORE rebinding, otherwise the assert is vacuous.
+            assert (
+                au_q11.shape == au_q1.shape and au_k11.shape == au_k1.shape
+            ), f"au_q11: {au_q11.shape}, img_q: {au_q1.shape}, img_kk: {au_k11.shape}, img_k: {au_k1.shape}"
+            au_q1, au_k1 = au_q11, au_k11
+
+        # Concatenate along tokens, then move heads first: (B, H, N1+N2+N3+N4, D).
+        # Transposing after the concat keeps the no-rotary path shape-consistent.
+        q = torch.cat((xs_q, au_q1, au_q2, au_q3), dim=1).transpose(1, 2)
+        k = torch.cat((xs_k, au_k1, au_k2, au_k3), dim=1).transpose(1, 2)
+        v = torch.cat((xs_v, au_v1, au_v2, au_v3), dim=2)
+
+        if mask is not None:
+            # The (B, N1) mask is reused for x and tiled M = N2 // N1 times for y1.
+            # NOTE(review): the result only spans N1 + M * N1 keys and N1 + N2
+            # queries, while q/k cover N1 + N2 + N3 + N4 tokens; confirm how
+            # y2 / y3 are meant to be masked.
+            M = N2 // N1
+            mask2 = mask[:, None, :].expand(-1, self.num_heads, -1)
+            mask = mask[:, None, None, :].expand(-1, self.num_heads, M, -1)
+            mask = rearrange(mask, "b n m d -> b n (m d)")
+            att_mask = torch.cat((mask2, mask), dim=-1)
+            att_mask = att_mask[:, :, None, :].expand(-1, -1, N1 + N2, -1)
+            mask = att_mask.to(dtype=q.dtype)
+        if causal:
+            # Boolean block mask (True = may attend); replaces any mask passed in.
+            # NOTE(review): sized N1 + 3 * N2 and sliced with N1 / N3 / N4;
+            # this presumably assumes equal-length streams; verify.
+            mask2 = torch.ones((N1 + 3 * N2, N1 + 3 * N2), dtype=torch.bool, device=v.device)
+            mask2[-N3 - N4:, :N1] = 0
+            mask2[-N1 - N1:-N1, -N1:] = 0
+            mask2[-N1:, -N1 - N1:-N1] = 0
+            mask = mask2.to(dtype=torch.bool)
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask=mask,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            if mask is not None:
+                # Mirror scaled_dot_product_attention: boolean masks mark the
+                # positions that take part (True = keep), float masks are added.
+                if mask.dtype == torch.bool:
+                    attn = attn.masked_fill(~mask, float("-inf"))
+                else:
+                    attn = attn + mask
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N1 + N2 + N3 + N4, C)
+        xs, au1, au2, au3 = x[:, :N1], x[:, N1:N1 + N2], x[:, N1 + N2:N1 + N2 + N3], x[:, N1 + N2 + N3:N1 + N2 + N3 + N4]
+        xs = self.xs_proj(xs)
+        xs = self.xs_proj_drop(xs)
+        au1 = self.au_proj1(au1)
+        au1 = self.au_proj_drop1(au1)
+
+        au2 = self.au_proj2(au2)
+        au2 = self.au_proj_drop2(au2)
+
+        au3 = self.au_proj3(au3)
+        au3 = self.au_proj_drop3(au3)
+        return xs, au1, au2, au3
+class MMdual_attention(nn.Module):
+    """
+    Joint attention over two token streams.
+
+    A main stream (``x``) and an auxiliary stream (``y``) each get their own
+    QKV projection, q/k norms and output projection; attention is computed
+    once over the concatenation of both streams.
+    """
+    fused_attn: Final[bool]
+
+    def __init__(
+        self, dim: int,
+        num_heads: int = 8,
+        proj_bias: bool = True,
+        attn_drop: float = 0.,
+        proj_drop: float = 0.,
+        qkv_bias: bool = False,
+        qk_norm: Optional[str] = "rms_norm",
+        **block_kwargs
+    ) -> None:
+        """
+        Args:
+            dim: channel width of both streams.
+            num_heads: number of heads; must divide ``dim``.
+            proj_bias: bias flag of the two output projections.
+            attn_drop: dropout probability on the attention weights.
+            proj_drop: dropout probability after each output projection.
+            qkv_bias: bias flag of the two QKV projections.
+            qk_norm: ``None``, ``"rms_norm"`` or ``"layer_norm"``; applied per
+                head (over ``head_dim``).
+
+        Raises:
+            ValueError: for an unknown ``qk_norm``.
+        """
+        super().__init__()
+
+        assert dim % num_heads == 0, f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.fused_attn = use_fused_attn()
+
+        self.qkv_xs = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.qkv_au = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        # Norms act on tensors viewed as (..., head_dim), so they are sized by
+        # head_dim for every option (LayerNorm(dim) only worked for one head).
+        if qk_norm is None:
+            make_norm = nn.Identity
+        elif qk_norm == "rms_norm":
+            make_norm = partial(RMSNorm, self.head_dim, eps=1e-5, elementwise_affine=True)
+        elif qk_norm == "layer_norm":
+            make_norm = partial(nn.LayerNorm, self.head_dim, eps=1e-5)
+        else:
+            raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
+        self.xs_q_norm = make_norm()
+        self.xs_k_norm = make_norm()
+        self.au_q_norm = make_norm()
+        self.au_k_norm = make_norm()
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.xs_proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.au_proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.xs_proj_drop = nn.Dropout(proj_drop)
+        self.au_proj_drop = nn.Dropout(proj_drop)
+
+    def _qkv_heads(self, proj, tokens, q_norm, k_norm):
+        """Project one stream and split it into per-head q, k (B, N, H, D) and v (B, H, N, D)."""
+        B, N, _ = tokens.shape
+        q, k, v = proj(tokens).reshape(B, N, 3, -1).permute(2, 0, 1, 3).unbind(0)
+        q = q_norm(q.view(B, N, self.num_heads, self.head_dim))
+        k = k_norm(k.view(B, N, self.num_heads, self.head_dim))
+        v = v.view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
+        return q, k, v
+
+    def forward(self, seq_len, x: torch.Tensor, y: torch.Tensor, mask: Optional[torch.Tensor] = None, causal=False, freqs_cis=None, freqs_cis2=None) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            seq_len: unused; kept for call compatibility.
+            x: main stream, shape (B, N1, dim).
+            y: auxiliary stream, shape (B, N2, dim).
+            mask: optional mask indexed as (B, N1); it is reused for ``x`` and
+                tiled ``N2 // N1`` times to cover ``y``.
+            causal: when True, ``mask`` is replaced by a fixed boolean block mask.
+            freqs_cis, freqs_cis2: optional rotary tables for ``x`` and ``y``.
+
+        Returns:
+            ``(xs, au)``: attended and re-projected streams of shapes
+            (B, N1, dim) and (B, N2, dim).
+        """
+        B, N1, C = x.shape
+        N2 = y.shape[1]
+
+        xs_q, xs_k, xs_v = self._qkv_heads(self.qkv_xs, x, self.xs_q_norm, self.xs_k_norm)
+        au_q, au_k, au_v = self._qkv_heads(self.qkv_au, y, self.au_q_norm, self.au_k_norm)
+
+        if freqs_cis is not None:
+            img_qq, img_kk = apply_rotary_emb(xs_q, xs_k, freqs_cis, head_first=False)
+            assert (
+                img_qq.shape == xs_q.shape and img_kk.shape == xs_k.shape
+            ), f"img_kk: {img_qq.shape}, img_q: {xs_q.shape}, img_kk: {img_kk.shape}, img_k: {xs_k.shape}"
+            xs_q, xs_k = img_qq, img_kk
+        if freqs_cis2 is not None:
+            img_qq, img_kk = apply_rotary_emb(au_q, au_k, freqs_cis2, head_first=False)
+            assert (
+                img_qq.shape == au_q.shape and img_kk.shape == au_k.shape
+            ), f"img_kk: {img_qq.shape}, img_q: {xs_q.shape}, img_kk: {img_kk.shape}, img_k: {xs_k.shape}"
+            au_q, au_k = img_qq, img_kk
+
+        # Concatenate along tokens, then move heads first: (B, H, N1+N2, D).
+        # Transposing after the concat keeps the no-rotary path shape-consistent.
+        q = torch.cat((xs_q, au_q), dim=1).transpose(1, 2)
+        k = torch.cat((xs_k, au_k), dim=1).transpose(1, 2)
+        v = torch.cat((xs_v, au_v), dim=2)
+
+        if mask is not None:
+            # Tiling factor for the auxiliary stream (was an undefined name).
+            M = N2 // N1
+            mask2 = mask[:, None, :].expand(-1, self.num_heads, -1)
+            mask = mask[:, None, None, :].expand(-1, self.num_heads, M, -1)
+            mask = rearrange(mask, "b n m d -> b n (m d)")
+            att_mask = torch.cat((mask2, mask), dim=-1)
+            att_mask = att_mask[:, :, None, :].expand(-1, -1, N1 + N2, -1)
+            # NOTE(review): as a float mask this is ADDED to the scores by
+            # scaled_dot_product_attention; confirm callers pass additive
+            # values rather than a 0/1 validity mask.
+            mask = att_mask.to(dtype=q.dtype)
+        if causal:
+            # Boolean block mask (True = may attend); replaces any mask passed in.
+            # NOTE(review): sized 4 * N1, which matches q only when N2 == 3 * N1.
+            mask2 = torch.ones((N1 + 3 * N1, N1 + 3 * N1), dtype=torch.bool, device=v.device)
+            mask2[-N1 - N1:, :N1] = 0
+            mask2[-N1 - N1:-N1, -N1:] = 0
+            mask2[-N1:, -N1 - N1:-N1] = 0
+
+            mask = mask2.to(dtype=torch.bool)
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask=mask,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            if mask is not None:
+                # Mirror scaled_dot_product_attention: boolean masks mark the
+                # positions that take part (True = keep), float masks are added.
+                if mask.dtype == torch.bool:
+                    attn = attn.masked_fill(~mask, float("-inf"))
+                else:
+                    attn = attn + mask
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N1 + N2, C)
+        xs, au = x[:, :N1], x[:, N1:]
+        xs = self.xs_proj(xs)
+        xs = self.xs_proj_drop(xs)
+        au = self.au_proj(au)
+        au = self.au_proj_drop(au)
+        return xs, au
+
+class SelfAttention(nn.Module):
+    """
+    Multi-head self-attention with optional q/k normalisation and rotary embeddings.
+    """
+    fused_attn: Final[bool]
+
+    def __init__(
+        self, dim: int,
+        num_heads: int = 8,
+        proj_bias: bool = True,
+        attn_drop: float = 0.,
+        proj_drop: float = 0.,
+        qkv_bias: bool = False,
+        qk_norm: Optional[str] = "rms_norm",
+        **block_kwargs
+    ) -> None:
+        """
+        Args:
+            dim: channel width of the input and output.
+            num_heads: number of heads; must divide ``dim``.
+            proj_bias: bias flag of the output projection.
+            attn_drop: dropout probability on the attention weights.
+            proj_drop: dropout probability after the output projection.
+            qkv_bias: bias flag of the fused QKV projection.
+            qk_norm: ``None``, ``"rms_norm"`` or ``"layer_norm"``; here the
+                norm runs over the full ``dim`` before the split into heads.
+
+        Raises:
+            ValueError: for an unknown ``qk_norm``.
+        """
+        super().__init__()
+
+        assert dim % num_heads == 0, f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.fused_attn = use_fused_attn()
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        if qk_norm is None:
+            self.q_norm = nn.Identity()
+            self.k_norm = nn.Identity()
+        elif qk_norm == "rms_norm":
+            self.q_norm = RMSNorm(dim, eps=1e-5)
+            self.k_norm = RMSNorm(dim, eps=1e-5)
+        elif qk_norm == "layer_norm":
+            self.q_norm = nn.LayerNorm(dim, eps=1e-5)
+            self.k_norm = nn.LayerNorm(dim, eps=1e-5)
+        else:
+            raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None, freqs_cis=None) -> torch.Tensor:
+        """
+        Args:
+            x: input tokens, shape (B, N, dim).
+            mask: optional per-key mask indexed as (B, N); broadcast over heads
+                and queries and cast to the q dtype.
+            freqs_cis: optional rotary table applied to q and k.
+
+        Returns:
+            Tensor of shape (B, N, dim).
+        """
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, -1).permute(2, 0, 1, 3)
+        q, k, v = qkv.unbind(0)
+        q, k = self.q_norm(q), self.k_norm(k)
+        q = q.view(B, N, self.num_heads, self.head_dim)
+        k = k.view(B, N, self.num_heads, self.head_dim)
+        v = v.view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if freqs_cis is not None:
+            img_qq, img_kk = apply_rotary_emb(q, k, freqs_cis, head_first=False)
+            assert (
+                img_qq.shape == q.shape and img_kk.shape == k.shape
+            ), f"img_kk: {img_qq.shape}, img_q: {q.shape}, img_kk: {img_kk.shape}, img_k: {k.shape}"
+            q, k = img_qq, img_kk
+
+        # (B, N, H, D) -> (B, H, N, D) for BOTH attention paths; the manual
+        # path previously multiplied the un-transposed tensors against v.
+        q, k = q.transpose(1, 2), k.transpose(1, 2)
+
+        if mask is not None:
+            mask = mask[:, None, None, :].expand(-1, self.num_heads, N, -1)  # (B, num_heads, N, N)
+            # NOTE(review): as a float mask this is ADDED to the scores by
+            # scaled_dot_product_attention; confirm callers pass additive
+            # values rather than a 0/1 validity mask.
+            mask = mask.to(dtype=q.dtype)
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask=mask,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            if mask is not None:
+                # Mirror scaled_dot_product_attention: boolean masks mark the
+                # positions that take part (True = keep), float masks are added.
+                if mask.dtype == torch.bool:
+                    attn = attn.masked_fill(~mask, float("-inf"))
+                else:
+                    attn = attn + mask
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class CrossAttention(nn.Module):
+    """
+    Multi-head cross-attention: queries come from ``x``, keys/values from ``y``.
+    """
+    fused_attn: Final[bool]
+
+    def __init__(
+        self, dim: int,
+        num_heads: int = 8,
+        proj_bias: bool = True,
+        attn_drop: float = 0.,
+        proj_drop: float = 0.,
+        qkv_bias: bool = False,
+        qk_norm: Optional[str] = "rms_norm",
+        **block_kwargs
+    ) -> None:
+        """
+        Args:
+            dim: channel width of ``x``, ``y`` and the output.
+            num_heads: number of heads; must divide ``dim``.
+            proj_bias: bias flag of the output projection.
+            attn_drop: dropout probability on the attention weights.
+            proj_drop: dropout probability after the output projection.
+            qkv_bias: bias flag of the q and kv projections.
+            qk_norm: ``None``, ``"rms_norm"`` or ``"layer_norm"``.
+            block_kwargs: ``window_size`` (default 1) is read from here.
+
+        Raises:
+            ValueError: for an unknown ``qk_norm``.
+        """
+        super().__init__()
+
+        assert dim % num_heads == 0, f"dim {dim} should be divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.fused_attn = use_fused_attn()
+
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
+
+        self.window_size = int(block_kwargs.get('window_size', 1))
+        if self.window_size > 1:
+            # Relative offsets of a window centred on each position,
+            # e.g. [-3, -2, -1, 0, 1, 2, 3] for window_size 7; shape (1, window_size).
+            self.indices = (
+                torch.arange(self.window_size) - (self.window_size - 1) // 2
+            ).unsqueeze(0)
+            # NOTE(review): forward() normalises per-head tensors (last dim
+            # head_dim), so a norm sized by dim looks incompatible with it
+            # unless num_heads == 1; verify before using window_size > 1.
+            norm_dim = dim
+        else:
+            self.indices = None
+            norm_dim = self.head_dim
+
+        if qk_norm is None:
+            self.q_norm = nn.Identity()
+            self.k_norm = nn.Identity()
+        elif qk_norm == "rms_norm":
+            self.q_norm = RMSNorm(norm_dim, eps=1e-5)
+            self.k_norm = RMSNorm(norm_dim, eps=1e-5)
+        elif qk_norm == "layer_norm":
+            self.q_norm = nn.LayerNorm(norm_dim, eps=1e-5)
+            self.k_norm = nn.LayerNorm(norm_dim, eps=1e-5)
+        else:
+            raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Every query attends to all keys (no windowing is applied here).
+
+        Args:
+            x: queries, shape (B, N, dim).
+            y: context, either (B, n, m, dim), which is flattened to
+                (B, n * m, dim), or already (B, N2, dim).
+            mask: optional mask indexed as (B, keys_per_group); it is tiled
+                ``M`` times along the key axis, where ``M`` is ``y.shape[2]``
+                for 4-D ``y`` and ``N2 // N`` otherwise.
+
+        Returns:
+            Tensor of shape (B, N, dim).
+        """
+        B, N, C = x.shape
+
+        # y.ndim (not y.shape) has to be compared with 4: a torch.Size never
+        # equals an int, so the 4-D branch used to be unreachable.
+        if y.ndim == 4:
+            M = y.shape[2]
+            y = rearrange(y, "b n m d -> b (n m) d")
+        else:
+            N2 = y.shape[1]
+            M = N2 // N
+        q = self.to_q(x).reshape(B, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+        kv = self.to_kv(y).reshape(B, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        k, v = kv.unbind(0)
+        q, k = self.q_norm(q), self.k_norm(k)
+        if mask is not None:
+            mask = mask[:, None, None, :].expand(-1, self.num_heads, M, -1)
+            mask = rearrange(mask, "b n m d -> b n (m d)")
+            mask = mask[:, :, None, :].expand(-1, -1, N, -1)
+            # NOTE(review): as a float mask this is ADDED to the scores by
+            # scaled_dot_product_attention, whereas the manual path below
+            # treats mask == 0 as "masked"; confirm which is intended.
+            mask = mask.to(dtype=q.dtype)
+
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask=mask,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)         # B x H x N x (N*M)
+            if mask is not None:
+                # NOTE(review): -1e-9 is practically zero, so this does not
+                # suppress the masked keys; -1e9 / -inf was probably meant.
+                # The value is left unchanged pending confirmation.
+                attn = attn.masked_fill(mask == 0, float(-1e-9))
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+        # B, H, N, C//H
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
diff --git a/src/models/dit/motion_extractor.py b/src/models/dit/motion_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2982e53c52d9ec1e0bec0453cc05edb51a15d23
--- /dev/null
+++ b/src/models/dit/motion_extractor.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+"""
+Motion extractor(M), which directly predicts the canonical keypoints, head pose and expression deformation of the input image
+"""
+
+from torch import nn
+import torch
+
+from .convnextv2 import convnextv2_tiny
+from .util import filter_state_dict
+
+model_dict = dict(
+    convnextv2_tiny=convnextv2_tiny,  # backbone name -> factory called with the extractor's kwargs
+)
+
+
+class MotionExtractor(nn.Module):
+    """Wraps the backbone selected by the `backbone` kwarg (default 'convnextv2_tiny') as `self.detector`."""
+    def __init__(self, **kwargs):
+        super(MotionExtractor, self).__init__()
+        backbone = kwargs.get('backbone', 'convnextv2_tiny')  # default is convnextv2_tiny
+        if backbone not in model_dict:  # fail with a clear message instead of calling None
+            raise ValueError(f'Unknown backbone {backbone!r}; expected one of {list(model_dict)}')
+        self.detector = model_dict[backbone](**kwargs)  # all kwargs, including `backbone`, are forwarded
+
+    def load_pretrained(self, init_path: str):  # no-op when init_path is None or ''
+        if init_path not in (None, ''):
+            state_dict = torch.load(init_path, map_location=lambda storage, loc: storage)['model']
+            state_dict = filter_state_dict(state_dict, remove_name='head')
+            ret = self.detector.load_state_dict(state_dict, strict=False)
+            print(f'Load pretrained model from {init_path}, ret: {ret}')
+
+    def forward(self, x):
+        return self.detector(x)
diff --git a/src/models/dit/posemb_layers.py b/src/models/dit/posemb_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..11cd1dec61fb8aec6006a53e0d1dcbb2300d9db0
--- /dev/null
+++ b/src/models/dit/posemb_layers.py
@@ -0,0 +1,314 @@
+import torch
+from typing import Union, Tuple, List
+
+
+def _to_tuple(x, dim=2):
+    """Repeat an int `dim` times as a tuple; return a length-`dim` sequence unchanged, else raise ValueError."""
+    if not isinstance(x, int):
+        if len(x) != dim:
+            raise ValueError(f"Expected length {dim} or int, but got {x}")
+        return x
+    return (x,) * dim
+
+
+def get_meshgrid_nd(start, *args, dim=2):
+    """
+    Build an n-D coordinate grid from start / stop / num specifications.
+
+    Args:
+        start (int or tuple): With no extra args, `start` is the grid size (start=0, stop=num). With one extra
+            arg, `start` is the start and args[0] the stop, sampled with unit step. With two extra args, `start`
+            is the start, args[0] the stop and args[1] the number of samples. Each value is an int or a
+            `dim`-tuple; tuple entries follow the axis order of the returned grid.
+        *args: See above.
+        dim (int): Dimension of the meshgrid. Defaults to 2.
+
+    Returns:
+        grid (torch.Tensor): float32 tensor of shape [dim, *num], one coordinate plane per axis.
+
+    Raises:
+        ValueError: If more than two extra positional arguments are given.
+    """
+    n_extra = len(args)
+    if n_extra > 2:
+        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
+
+    if n_extra == 0:
+        # only the grid size is given: sample [0, num) on every axis
+        num = _to_tuple(start, dim=dim)
+        start, stop = (0,) * dim, num
+    elif n_extra == 1:
+        # start / stop given: unit step, so num = stop - start
+        start = _to_tuple(start, dim=dim)
+        stop = _to_tuple(args[0], dim=dim)
+        num = [hi - lo for lo, hi in zip(start, stop)]
+    else:
+        # start / stop / num all given explicitly
+        start = _to_tuple(start, dim=dim)
+        stop = _to_tuple(args[0], dim=dim)
+        num = _to_tuple(args[1], dim=dim)
+
+    # torch equivalent of np.linspace(lo, hi, n, endpoint=False): sample n + 1 points and drop the last
+    axes = [
+        torch.linspace(lo, hi, n + 1, dtype=torch.float32)[:n]
+        for lo, hi, n in zip(start, stop, num)
+    ]
+    return torch.stack(torch.meshgrid(*axes, indexing="ij"), dim=0)  # [dim, *num]
+
+
+#################################################################################
+#                   Rotary Positional Embedding Functions                       #
+#################################################################################
+# https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
+
+
+def reshape_for_broadcast(
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    x: torch.Tensor,
+    head_first=False,
+):
+    """
+    Reshape frequency tensor for broadcasting it with another tensor.
+
+    The frequency tensor is viewed with singleton dims everywhere except the sequence and feature dims of
+    'x' (plus the leading dim when head_first is False and a 3-D frequency tensor is given).
+
+    Notes:
+        When using FlashMHAModified, head_first should be False.
+        When using Attention, head_first should be True.
+
+    Args:
+        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
+        x (torch.Tensor): Target tensor for broadcasting compatibility.
+        head_first (bool): head dimension first (except batch dim) or not.
+
+    Returns:
+        torch.Tensor or tuple: Reshaped frequency tensor, or a (cos, sin) tuple when a tuple was passed.
+
+    Raises:
+        AssertionError: If the frequency tensor doesn't match the expected shape.
+        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
+    """
+    ndim = x.ndim
+    assert 0 <= 1 < ndim  # dim 1 must exist, i.e. x has at least two dims
+
+    if isinstance(freqs_cis, tuple):
+        # freqs_cis: (cos, sin) in real space
+        if head_first:
+            assert freqs_cis[0].shape == (
+                x.shape[-2],
+                x.shape[-1],
+            ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
+            shape = [
+                d if i == ndim - 2 or i == ndim - 1 else 1
+                for i, d in enumerate(x.shape)
+            ]
+        else:
+            assert freqs_cis[0].shape[-2:] == (
+                x.shape[1],
+                x.shape[-1],
+            ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
+            shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+            if len(freqs_cis[0].shape) == 3:  # 3-D cos/sin keep their own leading dim
+                shape[0] = freqs_cis[0].shape[0]
+        return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
+    else:
+        # freqs_cis: values in complex space
+        if head_first:
+            assert freqs_cis.shape == (
+                x.shape[-2],
+                x.shape[-1],
+            ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
+            shape = [
+                d if i == ndim - 2 or i == ndim - 1 else 1
+                for i, d in enumerate(x.shape)
+            ]
+        else:
+            assert freqs_cis.shape[-2:] == (
+                x.shape[1],
+                x.shape[-1],
+            ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
+            shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+            if len(freqs_cis.shape) == 3:  # test the tensor itself; freqs_cis[0] would be its first slice
+                shape[0] = freqs_cis.shape[0]
+        return freqs_cis.view(*shape)
+
+
+def rotate_half(x):
+    """Map each adjacent pair (a, b) along the last dim to (-b, a); computed on `x.float()`, for any rank >= 1."""
+    x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)  # each [..., D//2]
+    # flatten(-2) merges the trailing (D//2, 2) axes for any rank; flatten(3) was only right for 4-D input
+    return torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
+
+
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+    head_first: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate query and key tensors by precomputed rotary position frequencies.
+
+    `freqs_cis` is either a (cos, sin) tuple of real tensors or a single complex tensor. In both cases it is
+    first reshaped by `reshape_for_broadcast` against the query, moved to the query's device, and then the
+    same frequencies are applied to the query and the key. Inputs are converted with `.float()` for the
+    arithmetic and each result is cast back to the dtype of its input.
+
+    Args:
+        xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
+        xk (torch.Tensor): Key tensor to apply rotary embeddings.   [B, S, H, D]
+        freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential.
+        head_first (bool): head dimension first (except batch dim) or not.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if not isinstance(freqs_cis, tuple):
+        # complex path: view_as_complex packs [..., D/2, 2] (real) into [..., D/2] (complex)
+        q_complex = torch.view_as_complex(
+            xq.float().reshape(*xq.shape[:-1], -1, 2)
+        )  # [B, S, H, D//2]
+        rotation = reshape_for_broadcast(freqs_cis, q_complex, head_first).to(
+            xq.device
+        )  # [S, D//2] --> [1, S, 1, D//2]
+        # complex multiply: (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin);
+        # view_as_real then expands [..., D/2] (complex) back to [..., D/2, 2] (real)
+        q_rotated = torch.view_as_real(q_complex * rotation).flatten(3).type_as(xq)
+        k_complex = torch.view_as_complex(
+            xk.float().reshape(*xk.shape[:-1], -1, 2)
+        )  # [B, S, H, D//2]
+        k_rotated = torch.view_as_real(k_complex * rotation).flatten(3).type_as(xk)
+        return q_rotated, k_rotated
+
+    # real path: freqs_cis is a (cos, sin) pair
+    cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)  # [S, D]
+    cos, sin = cos.to(xq.device), sin.to(xq.device)
+    # real * cos - imag * sin
+    # imag * cos + real * sin
+    q_rotated = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
+    k_rotated = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
+    return q_rotated, k_rotated
+
+
+
+def get_nd_rotary_pos_embed(
+    rope_dim_list,
+    start,
+    *args,
+    theta=10000.0,
+    use_real=False,
+    theta_rescale_factor: Union[float, List[float]] = 1.0,
+    interpolation_factor: Union[float, List[float]] = 1.0,
+):
+    """
+    n-d rotary position embedding: one 1-D RoPE per grid axis, concatenated along the feature dim.
+
+    Args:
+        rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
+            sum(rope_dim_list) should equal to head_dim of attention layer.
+        start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
+            args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
+        *args: See above.
+        theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+            Some libraries such as TensorRT does not support complex64 data type. So it is useful to provide a real
+            part and an imaginary part separately.
+        theta_rescale_factor (float or list of float): Rescale factor for theta, one per axis. A scalar or a
+            one-element list is applied to every axis. Defaults to 1.0.
+        interpolation_factor (float or list of float): Position multiplier, one per axis, broadcast the same
+            way as theta_rescale_factor. Defaults to 1.0.
+
+    Returns:
+        A complex tensor, or a (cos, sin) tuple of real tensors when use_real is True; rows index grid points.
+    """
+    n_axes = len(rope_dim_list)
+    grid = get_meshgrid_nd(start, *args, dim=n_axes)  # [3, W, H, D] / [2, W, H]
+
+    def per_axis(factor):
+        # broadcast a scalar or a one-element list to one value per axis; anything else passes through
+        if isinstance(factor, (int, float)):
+            return [factor] * n_axes
+        if isinstance(factor, list) and len(factor) == 1:
+            return [factor[0]] * n_axes
+        return factor
+
+    theta_rescale_factor = per_axis(theta_rescale_factor)
+    assert (
+        len(theta_rescale_factor) == n_axes
+    ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"
+    interpolation_factor = per_axis(interpolation_factor)
+    assert (
+        len(interpolation_factor) == n_axes
+    ), "len(interpolation_factor) should equal to len(rope_dim_list)"
+
+    # each axis gets its own 1-D rotary embedding over the flattened grid coordinates of that axis
+    embs = [
+        get_1d_rotary_pos_embed(
+            axis_dim,
+            grid[axis].reshape(-1),
+            theta,
+            use_real=use_real,
+            theta_rescale_factor=theta_rescale_factor[axis],
+            interpolation_factor=interpolation_factor[axis],
+        )  # 2 x [WHD, rope_dim_list[i]]
+        for axis, axis_dim in enumerate(rope_dim_list)
+    ]
+
+    if not use_real:
+        return torch.cat(embs, dim=1)  # (WHD, D/2)
+    cos = torch.cat([pair[0] for pair in embs], dim=1)  # per-axis cos blocks joined along the feature dim
+    sin = torch.cat([pair[1] for pair in embs], dim=1)  # per-axis sin blocks joined along the feature dim
+    return cos, sin
+
+
+def get_1d_rotary_pos_embed(
+    dim: int,
+    pos: Union[torch.FloatTensor, int],
+    theta: float = 10000.0,
+    use_real: bool = False,
+    theta_rescale_factor: float = 1.0,
+    interpolation_factor: float = 1.0,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    """
+    Precompute the frequency tensor for complex exponential (cis) with given dimensions.
+    (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)
+
+    The angle for position p and frequency index k is p * interpolation_factor * theta ** (-2k / dim).
+
+    Args:
+        dim (int): Dimension of the frequency tensor.
+        pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar;
+            an int n is expanded to the positions 0 .. n - 1.
+        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (bool, optional): If True, return real part and imaginary part separately.
+                                   Otherwise, return complex numbers.
+        theta_rescale_factor (float, optional): Rescale factor for theta. When it is not 1.0, theta is
+            multiplied by theta_rescale_factor ** (dim / (dim - 2)). Defaults to 1.0.
+        interpolation_factor (float, optional): Multiplier applied to the positions. Defaults to 1.0.
+
+    Returns:
+        freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
+        freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
+
+    Note:
+        dim == 2 together with theta_rescale_factor != 1.0 divides by zero in the rescale exponent.
+    """
+    if isinstance(pos, int):
+        pos = torch.arange(pos).float()
+
+    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
+    # has some connection to NTK literature
+    if theta_rescale_factor != 1.0:
+        theta = theta * theta_rescale_factor ** (dim / (dim - 2))
+
+    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
+    inv_freq = 1.0 / (theta ** exponents)  # [D/2]
+    angles = torch.outer(pos * interpolation_factor, inv_freq)  # [S, D/2]
+
+    if not use_real:
+        return torch.polar(torch.ones_like(angles), angles)  # unit-magnitude complex values, [S, D/2]
+    # every angle is repeated twice so cos / sin cover adjacent feature pairs
+    freqs_cos = angles.cos().repeat_interleave(2, dim=1)  # [S, D]
+    freqs_sin = angles.sin().repeat_interleave(2, dim=1)  # [S, D]
+    return freqs_cos, freqs_sin
\ No newline at end of file
diff --git a/src/models/dit/posemb_layers_xtrans.py b/src/models/dit/posemb_layers_xtrans.py
new file mode 100644
index 0000000000000000000000000000000000000000..101992b37a22a4bc40b439522e7f9d15b6699640
--- /dev/null
+++ b/src/models/dit/posemb_layers_xtrans.py
@@ -0,0 +1,3255 @@
+from __future__ import annotations
+from typing import Callable
+
+import math
+from copy import deepcopy
+from random import random, randrange
+from packaging import version
+
+import torch
+from torch.amp import autocast
+import torch.nn.functional as F
+from torch import nn, einsum, tensor, Tensor, cat, stack, arange, is_tensor
+from torch.utils._pytree import tree_flatten, tree_unflatten
+from torch.nn import Module, ModuleList, ModuleDict
+
+from functools import partial, wraps
+from collections import namedtuple
+from contextlib import nullcontext
+from dataclasses import dataclass
+
+from loguru import logger
+
+from x_transformers.attend import Attend, Intermediates
+from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
+
+import einx
+from einops.layers.torch import Rearrange
+from einops import rearrange, repeat, reduce, pack, unpack
+
+# einstein notation
+
+# b - batch
+# n - sequence
+# d - feature dimension
+# h - attention heads
+# i, j - sequence (source, target)
+
+# constants
+
+DEFAULT_DIM_HEAD = 64
+
+@dataclass
+class LayerIntermediates:  # plain record; every field is optional and defaults to None
+    hiddens:            list[Tensor] | None = None   # all hiddens, before the final norm (in pre-norm architecture)
+    last_hidden:        Tensor | None = None         # very last hidden after all attention layers, after the final norm
+    attn_intermediates: list[Intermediates] | None = None
+    layer_hiddens:      list[Tensor] | None = None
+    attn_z_loss:        Tensor | None = None
+    mems:               Tensor | None = None
+    memory_tokens:      Tensor | None = None
+    logit_entropies:    Tensor | None = None
+
+LinearNoBias = partial(nn.Linear, bias = False)  # nn.Linear constructor with bias preset to False
+
+# helpers
+
+def exists(val):  # True for anything that is not None, including falsy values such as 0 or ''
+    return val is not None
+
+def default(val, d):
+    if not exists(val):
+        return d() if callable(d) else d  # a callable fallback is invoked to produce the value
+    return val
+
+def identity(t, *args, **kwargs):  # pass-through: returns `t`, ignoring every other argument
+    return t
+
+def first(it, default = None):
+    return default if len(it) == 0 else it[0]
+
+def is_empty(x):
+    return not len(x)
+
+def cast_tuple(val, depth = 1):
+    return (val,) * depth if not isinstance(val, tuple) else val
+
+def divisible_by(num, den):  # whether `num` leaves no remainder when divided by `den`
+    return (num % den) == 0
+
+def maybe(fn = None):
+    """Wrap `fn` (identity when omitted) so that a call whose first argument is None returns None untouched."""
+    wrapped_fn = fn if exists(fn) else identity
+
+    @wraps(wrapped_fn)
+    def inner(x, *args, **kwargs):
+        if exists(x):
+            return wrapped_fn(x, *args, **kwargs)
+        return x
+    return inner
+
+def at_most_one_of(*bools):
+    return sum(int(flag) for flag in bools) <= 1
+
+class always():  # callable that ignores its arguments and returns the stored value
+    def __init__(self, val):
+        self.val = val
+    def __call__(self, *args, **kwargs):
+        return self.val
+
+class not_equals():  # callable returning `x != val` for the stored value; extra arguments are ignored
+    def __init__(self, val):
+        self.val = val
+    def __call__(self, x, *args, **kwargs):
+        return x != self.val
+
+class equals():  # callable returning `x == val` for the stored value; extra arguments are ignored
+    def __init__(self, val):
+        self.val = val
+    def __call__(self, x, *args, **kwargs):
+        return x == self.val
+
+def Sequential(*modules):
+    return nn.Sequential(*[module for module in modules if exists(module)])  # None entries are dropped
+
+# tensor helpers
+
+def log(t, eps = 1e-20):
+    return torch.log(torch.clamp(t, min = eps))  # clamp first so zeros give a finite log
+
+def max_neg_value(tensor):
+    return torch.finfo(tensor.dtype).min  # finfo.min equals -finfo.max: the most negative finite value
+
+def l2norm(t, groups = 1):
+    grouped = rearrange(t, '... (g d) -> ... g d', g = groups)
+    unit = F.normalize(grouped, p = 2, dim = -1)  # unit L2 norm within each of the `groups` chunks
+    return rearrange(unit, '... g d -> ... (g d)')
+
+def softclamp(t, value):
+    return torch.tanh(t / value) * value  # smooth clamp: output stays within [-value, value]
+
+def masked_mean(t, mask = None, dim = 1):
+    # mean of `t` over `dim` weighted by `mask`; plain mean when no mask is given
+    if not exists(mask):
+        return t.mean(dim = dim)
+
+    trailing = (1,) * (t.ndim - mask.ndim)  # right-pad mask with singleton dims up to t's rank
+    mask = mask.reshape(*mask.shape, *trailing)
+
+    total = (t * mask).sum(dim = dim)
+    return total / mask.sum(dim = dim).clamp(min = 1.)  # clamp avoids dividing by zero
+
+def pad_at_dim(t, pad: tuple[int, int], dim = -1, value = 0.):
+    # pad `t` by (left, right) = pad along `dim`; F.pad takes its pad pairs starting from the last dim
+    if pad != (0, 0):
+        trailing_dims = (t.ndim - dim - 1) if dim >= 0 else (- dim - 1)
+        skip = (0, 0) * trailing_dims  # leave every dim to the right of `dim` unpadded
+        return F.pad(t, (*skip, *pad), value = value)
+    return t
+
+def or_reduce(masks):  # OR all masks together; at least one mask is required
+    combined, *remaining = masks
+    for mask in remaining:
+        combined = combined | mask
+    return combined
+
+# entropy
+
+def calc_entropy(
+    t: Tensor,
+    is_prob = False
+):
+    prob = t if is_prob else t.softmax(dim = -1)  # `t` is treated as logits unless is_prob is set
+    return -(prob * log(prob)).sum(dim = -1)  # entropy over the last dim
+
+# auxiliary loss helpers
+
+def calc_z_loss(
+    pre_softmax_attns: list[Tensor],
+    mask = None,
+    weight = 1.
+):
+    # the same loss applied to the mixture of experts router logits in https://arxiv.org/abs/2202.08906
+    # in the paper, in a tiny footnote, they mention using it on attention logits with stabilizing effects
+    # also used in PaLM as one of the measures
+
+    # accumulate log-sum-exp over the last axis across all given attention maps
+    lse = sum((attn.logsumexp(dim = -1) for attn in pre_softmax_attns), 0.)
+
+    # squared lse, summed over heads -> one value per (batch, position)
+    loss = reduce(torch.square(lse), 'b h n -> b n', 'sum')
+
+    if exists(mask):
+        # average over the selected positions only; the clamp guards an all-False mask
+        masked_total = loss[mask].sum()
+        num_selected = mask.sum().clamp(min = 1e-5)
+        return masked_total / num_selected * weight
+
+    return loss.mean() * weight
+
+# init helpers
+
+def init_zero_(layer):
+    nn.init.zeros_(layer.weight)  # in place; same result as constant_(..., 0.)
+    if exists(layer.bias):
+        nn.init.zeros_(layer.bias)
+
+# keyword argument helpers
+
+def pick_and_pop(keys, d):
+    # remove every key in `keys` from `d` (KeyError if one is missing) and return the removed pairs
+    return {key: d.pop(key) for key in keys}
+
+def group_dict_by_key(cond, d):
+    # split `d` into (entries whose key satisfies cond, all other entries), keeping insertion order
+    matched, unmatched = {}, {}
+    for key, value in d.items():
+        bucket = matched if cond(key) else unmatched
+        bucket[key] = value
+    return matched, unmatched
+
+def string_begins_with(prefix, str):  # note: the `str` parameter shadows the builtin inside this function
+    return str.startswith(prefix)
+
+def group_by_key_prefix(prefix, d):
+    return group_dict_by_key(lambda key: key.startswith(prefix), d)
+
+def groupby_prefix_and_trim(prefix, d):
+    # split off the entries whose key starts with `prefix`, then strip that prefix from their keys
+    prefixed, remaining = group_dict_by_key(lambda key: key.startswith(prefix), d)
+    trimmed = {key[len(prefix):]: value for key, value in prefixed.items()}
+    return trimmed, remaining
+
+# structured dropout, more effective than traditional attention dropouts
+
+def dropout_seq(seq, mask, dropout):
+    """Keep a random subset of max(1, int((1 - dropout) * n)) sequence positions per sample; returns (seq, mask)."""
+    (b, n), device = seq.shape[:2], seq.device
+    scores = torch.randn(b, n, device = device)  # random score per position; top-k decides what is kept
+
+    if exists(mask):
+        # masked-out positions get the lowest possible score so they are selected last
+        scores = scores.masked_fill(~mask, max_neg_value(scores))
+
+    keep_prob = 1. - dropout
+    num_keep = max(1,  int(keep_prob * n))
+    keep_indices = scores.topk(num_keep, dim = 1).indices
+
+    batch_indices = rearrange(arange(b, device = device), 'b -> b 1')
+    seq = seq[batch_indices, keep_indices]
+
+    if not exists(mask):
+        return seq, mask
+
+    # per sample, only ceil(valid_count * keep_prob) of the kept slots remain marked valid
+    valid_counts = mask.sum(dim = -1)
+    kept_counts = torch.ceil(valid_counts * keep_prob).int()
+    keep_mask = arange(num_keep, device = device) < rearrange(kept_counts, 'b -> b 1')
+
+    return seq, mask[batch_indices, keep_indices] & keep_mask
+
+# activations
+
+class ReluSquared(Module):
+    def forward(self, x):
+        return F.relu(x).pow(2)
+
+# embedding
+
+class TokenEmbedding(Module):
+    def __init__(self, dim, num_tokens, l2norm_embed = False):
+        super().__init__()
+        self.emb = nn.Embedding(num_tokens, dim)
+        self.l2norm_embed = l2norm_embed  # when set, looked-up embeddings are l2-normalized
+
+    def forward(self, x):
+        embedded = self.emb(x.long())
+        return embedded if not self.l2norm_embed else l2norm(embedded)
+
+    def init_(self):
+        if not self.l2norm_embed:
+            nn.init.kaiming_normal_(self.emb.weight)
+        else:
+            nn.init.normal_(self.emb.weight, std=1e-5)  # tiny init for the l2-normalized table
+
+# positional embeddings
+
+class AbsolutePositionalEmbedding(Module):
+    def __init__(self, dim, max_seq_len, l2norm_embed = False):
+        super().__init__()
+        self.max_seq_len = max_seq_len
+        self.l2norm_embed = l2norm_embed
+        self.scale = 1. if l2norm_embed else dim ** -0.5  # no extra scaling when the output is l2-normalized
+        self.emb = nn.Embedding(max_seq_len, dim)
+
+    def forward(self, x, pos = None, seq_start_pos = None):
+        seq_len = x.shape[1]
+        assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}'
+
+        # default positions are 0 .. seq_len - 1 on the input's device
+        pos = pos if exists(pos) else arange(seq_len, device = x.device)
+
+        if exists(seq_start_pos):
+            # shift positions by each sequence's start and floor them at 0
+            pos = (pos - seq_start_pos[..., None]).clamp(min = 0)
+
+        pos_emb = self.emb(pos) * self.scale
+        return l2norm(pos_emb) if self.l2norm_embed else pos_emb
+
+class ScaledSinusoidalEmbedding(Module):
+    """Sinusoidal position embedding of width `dim` (must be even), multiplied by a learned scalar scale."""
+    def __init__(self, dim, theta = 10000):
+        super().__init__()
+        assert divisible_by(dim, 2)
+        self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5)
+
+        half_dim = dim // 2
+        freq_seq = arange(half_dim).float() / half_dim
+        inv_freq = theta ** -freq_seq
+        self.register_buffer('inv_freq', inv_freq, persistent = False)
+
+    def forward(self, x, pos = None, seq_start_pos = None):
+        """Return (..., n, dim) embeddings for `pos` (default 0 .. x.shape[1] - 1), shifted by `seq_start_pos` if given."""
+        if not exists(pos):
+            pos = arange(x.shape[1], device = x.device)
+
+        if exists(seq_start_pos):
+            pos = pos - seq_start_pos[..., None]  # e.g. (batch, n) when seq_start_pos is (batch,)
+
+        # broadcasting product instead of einsum('i, j -> i j'), which rejects the batched 2-D positions above
+        emb = pos[..., None] * self.inv_freq
+        return cat((emb.sin(), emb.cos()), dim = -1) * self.scale
+
+class RelativePositionBias(Module):
+    def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 8):
+        super().__init__()
+        self.scale = scale
+        self.causal = causal
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+        self.relative_attention_bias = nn.Embedding(num_buckets, heads)  # one learned bias per (bucket, head)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
+        # map signed relative positions to bucket ids: exact buckets for small distances, log-spaced beyond
+        distance = -relative_position
+        if causal:
+            offset = 0
+            distance = torch.max(distance, torch.zeros_like(distance))
+        else:
+            # split the buckets in two halves, the upper half serving negative `distance`
+            num_buckets //= 2
+            offset = (distance < 0).long() * num_buckets
+            distance = torch.abs(distance)
+
+        max_exact = num_buckets // 2
+        log_ratio = torch.log(distance.float() / max_exact) / math.log(max_distance / max_exact)
+        large_bucket = max_exact + (log_ratio * (num_buckets - max_exact)).long()
+        large_bucket = torch.min(large_bucket, torch.full_like(large_bucket, num_buckets - 1))
+
+        return offset + torch.where(distance < max_exact, distance, large_bucket)
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def forward(self, i, j):
+        # bias for i query positions (the last i of j) against j key positions, shaped (heads, i, j)
+        device = self.device
+        q_pos = arange(j - i, j, dtype = torch.long, device = device)
+        k_pos = arange(j, dtype = torch.long, device = device)
+        rel_pos = einx.subtract('j, i -> i j', k_pos, q_pos)
+        buckets = self._relative_position_bucket(
+            rel_pos, causal = self.causal, num_buckets = self.num_buckets, max_distance = self.max_distance
+        )
+        per_head = self.relative_attention_bias(buckets)
+        return rearrange(per_head, 'i j h -> h i j') * self.scale
+
+class CoPE(Module):
+    """
+    Appendix B of https://arxiv.org/abs/2405.18719
+    """
+    def __init__(
+        self,
+        dim,
+        heads,
+        max_pos,
+        soft_onehot = False,
+        talking_heads = False,
+        soft_onehot_temp = 5e-2
+    ):
+        super().__init__()
+        self.max_pos = max_pos
+        self.pos_emb = nn.Parameter(torch.zeros(max_pos, dim))  # one learned vector per integer position
+
+        # optional 1x1 conv over the head channel of the attention logits
+        self.talking_heads = nn.Conv2d(heads, heads, 1, bias = False) if talking_heads else None
+        self.soft_onehot = soft_onehot
+        self.soft_onehot_temp = soft_onehot_temp
+
+        # the integer position grid is only needed by the soft one-hot path
+        if soft_onehot:
+            self.register_buffer('positions', arange(max_pos))
+
+    def forward(self, query, attn_logits):
+        """
+        Return the contextual position logits for `attn_logits`; the einsum patterns below treat `query`
+        as (b, h, n, d) and `pos_emb` as (p, d).
+        """
+        if exists(self.talking_heads):
+            i, j = attn_logits.shape[-2:]
+            causal_mask = attn_logits.new_ones(i, j).triu_(j - i + 1).bool()
+
+            # re-apply the causal mask after the conv has mixed the heads
+            mixed = self.talking_heads(attn_logits)
+            attn_logits = mixed.masked_fill(causal_mask, -torch.finfo(mixed.dtype).max)
+
+        # compute positions: reverse cumulative sum of the sigmoid gates, capped at max_pos - 1
+        gates = attn_logits.sigmoid()
+        pos = gates.flip(-1).cumsum(dim = -1).flip(-1)
+        pos = pos.clamp(max = self.max_pos - 1)
+
+        # query logits against every integer position embedding
+        logits_int = einsum('b h n d, p d -> b h n p', query, self.pos_emb)
+
+        if self.soft_onehot:
+            diff_pos = einx.subtract('i, j -> i j', pos, self.positions).abs()
+            soft_onehot_pos = F.softmax(-diff_pos / self.soft_onehot_temp, dim = -1)
+            return einsum('b h i j p, b h i p -> b h i j', soft_onehot_pos, logits_int)
+
+        # interpolate from integer positions
+        pos_ceil = pos.ceil().long()
+        pos_floor = pos.floor().long()
+        logits_ceil = logits_int.gather(-1, pos_ceil)
+        logits_floor = logits_int.gather(-1, pos_floor)
+
+        w = pos - pos_floor
+        return logits_ceil * w + logits_floor * (1 - w)
+
+class DynamicPositionBias(Module):
+    """Relative position bias produced by an MLP that maps a scalar relative distance to one value per head."""
+    def __init__(self, dim, *, heads, depth, log_distance = False, norm = False):
+        super().__init__()
+        assert depth >= 1, 'depth for dynamic position bias MLP must be greater or equal to 1'
+        self.log_distance = log_distance
+
+        # input block, then depth - 1 hidden blocks, then a projection to `heads`
+        input_block = Sequential(
+            nn.Linear(1, dim),
+            LayerNorm(dim) if norm else None,
+            nn.SiLU()
+        )
+
+        hidden_blocks = [
+            Sequential(
+                nn.Linear(dim, dim),
+                nn.LayerNorm(dim) if norm else None,
+                nn.SiLU()
+            )
+            for _ in range(depth - 1)
+        ]
+
+        self.mlp = ModuleList([input_block, *hidden_blocks, nn.Linear(dim, heads)])
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def forward(self, i, j):
+        """Return the bias shaped (heads, i, j) for the last `i` of `j` positions against all `j` positions."""
+        device = self.device
+
+        # (i x j) matrix of relative distances, shifted by j - 1 so it indexes the table built below
+        seq_arange = arange(j - i, j, device = device)
+        context_arange = arange(j, device = device)
+        indices = einx.subtract('i, j -> i j', seq_arange, context_arange) + (j - 1)
+
+        # every possible relative distance -(j - 1) .. (j - 1) is fed through the MLP once
+        pos = arange(-j + 1, j, device = device).float()
+        pos = rearrange(pos, '... -> ... 1')
+
+        if self.log_distance:
+            pos = torch.sign(pos) * torch.log(pos.abs() + 1)  # log of distance is sign(rel_pos) * log(abs(rel_pos) + 1)
+
+        for layer in self.mlp:
+            pos = layer(pos)
+
+        # look up the per-head bias of every (i, j) pair and move heads to the front
+        return rearrange(pos[indices], 'i j h -> h i j')
+
+class AlibiPositionalBias(Module):
+    """Attention bias equal to minus the absolute position distance, scaled by a fixed slope per head."""
+    def __init__(
+        self,
+        heads,
+        total_heads = None,
+        slopes: list[int] | None = None,
+        **kwargs
+    ):
+        super().__init__()
+        self.heads = heads
+        self.total_heads = default(total_heads, heads)
+
+        slope_values = default(slopes, self._get_slopes(heads))
+        slopes = rearrange(Tensor(slope_values), 'h -> h 1 1')
+
+        self.register_buffer('slopes', slopes, persistent = False)
+        self.register_buffer('bias', None, persistent = False)  # filled lazily by forward()
+
+    @property
+    def device(self):
+        return next(self.buffers()).device
+
+    @staticmethod
+    def _get_slopes(heads):
+        def get_slopes_power_of_2(n):
+            # geometric sequence start, start**2, ..., start**n
+            start = (2**(-2**-(math.log2(n)-3)))
+            return [start*start**i for i in range(n)]
+
+        if math.log2(heads).is_integer():
+            return get_slopes_power_of_2(heads)
+
+        # otherwise: the slopes of the power of two below, plus every other slope of the one above
+        closest_power_of_2 = 2 ** math.floor(math.log2(heads))
+        extra = get_slopes_power_of_2(2 * closest_power_of_2)[0::2][:heads-closest_power_of_2]
+        return get_slopes_power_of_2(closest_power_of_2) + extra
+
+    def forward_custom_pos(
+        self,
+        pos_i: Tensor,
+        pos_j: Tensor | None = None
+    ):
+        pos_j = default(pos_j, pos_i)
+        bias = -einx.subtract('... j, ... i -> ... i j', pos_j, pos_i).abs()
+
+        if bias.ndim == 3:
+            bias = rearrange(bias, 'b i j -> b 1 i j')
+
+        return self._scale_and_pad(bias)
+
+    def _scale_and_pad(self, bias):
+        # apply the per-head slopes, then zero-pad the head dim up to total_heads
+        bias = bias * self.slopes
+        num_heads_unalibied = self.total_heads - bias.shape[-3]
+        return pad_at_dim(bias, (0, num_heads_unalibied), dim = -3)
+
+    def forward(self, i, j):
+        # reuse the cached bias when it already covers an (i, j) window
+        if exists(self.bias) and self.bias.shape[-1] >= j and self.bias.shape[-2] >= i:
+            return self.bias[..., -i:, -j:]
+
+        device = self.device
+        seq_arange = arange(j - i, j, device = device)
+        context_arange = arange(j, device = device)
+        bias = -einx.subtract('j, i -> 1 i j', context_arange, seq_arange).abs()
+
+        # cache the freshly built bias as a non-persistent buffer
+        self.register_buffer('bias', self._scale_and_pad(bias), persistent = False)
+        return self.bias
+
+class DataDependentAlibi(Module):
+    """ https://openreview.net/forum?id=q2Lnyegkr8 """
+
+    def __init__(
+        self,
+        dim,
+        heads,
+        causal = True,
+        bias_init = 5.,
+        post_log_scale = 1.,
+    ):
+        super().__init__()
+        self.causal = causal
+        self.post_log_scale = post_log_scale
+
+        # bidirectional needs a second set of gates per head, for the reversed direction
+        gates_per_head = 1 if causal else 2
+        to_gate_logits = nn.Linear(dim, heads * gates_per_head)
+        # bias_init is written into the projection bias, which feeds the log sigmoid
+        nn.init.constant_(to_gate_logits.bias, bias_init)
+
+        self.to_forget_gates = nn.Sequential(
+            to_gate_logits,
+            Rearrange('b n h -> b h n'),
+            nn.LogSigmoid()
+        )
+
+    def forward(self, x):
+        """ returns a (b h i j) bias made of pairwise differences of cumulative log forget gates """
+        log_gates = self.to_forget_gates(x) * self.post_log_scale
+        # running sum of the log gates along the sequence
+        cum_gates = log_gates.cumsum(dim = -1)
+
+        if self.causal:
+            return einx.subtract('b h i, b h j -> b h i j', cum_gates, cum_gates)
+
+        # first half of the heads dimension is the forward direction, second half the reversed one
+        fwd, bwd = cum_gates.chunk(2, dim = 1)
+        fwd_bias = einx.subtract('b h i, b h j -> b h i j', fwd, fwd)
+        bwd_bias = einx.subtract('b h j, b h i -> b h i j', bwd, bwd)
+
+        # forward differences fill the lower triangle, reversed differences the upper
+        return fwd_bias.tril() + bwd_bias.triu()
+
+class PerRowDataDependentAlibi(Module):
+    """ same as data dependent alibi from forgetting transformer, but the forgetting gates are also derived from queries and keys with a small head dimension """
+
+    def __init__(
+        self,
+        dim,
+        heads,
+        causal = True,
+        dim_head = 8,
+        post_log_scale = 1.
+    ):
+        super().__init__()
+        assert causal, 'bidirectional not supported yet'
+
+        self.scale = dim_head ** -0.5
+        self.post_log_scale = post_log_scale
+
+        # a single bias-less projection yields both the gate queries and the gate keys, split per head
+        to_gate_qk = nn.Linear(dim, heads * dim_head * 2, bias = False)
+        split_gate_qk = Rearrange('b n (qk h d) -> qk b h n d', qk = 2, d = dim_head)
+
+        self.to_forget_gates = nn.Sequential(to_gate_qk, split_gate_qk)
+
+    def forward(self, x):
+        """ returns the reversed running sum, along keys, of pairwise log forget gates with the diagonal and above zeroed """
+
+        n = x.shape[-2]
+        gate_q, gate_k = self.to_forget_gates(x)
+
+        # pairwise gate logits from the low dimensional queries and keys
+
+        sim = einsum('... i d, ... j d -> ... i j', gate_q, gate_k) * self.scale
+        log_gates = F.logsigmoid(sim) * self.post_log_scale
+
+        # mask out upper triangle + diagonal
+
+        on_or_above_diag = torch.ones((n, n), dtype = torch.bool, device = x.device).triu()
+        log_gates = log_gates.masked_fill(on_or_above_diag, 0.)
+
+        # reverse cumsum
+
+        reversed_gates = log_gates.flip(dims = (-1,))
+        reversed_sums = reversed_gates.cumsum(dim = -1)
+        forget_gates = reversed_sums.flip(dims = (-1,))
+        return forget_gates
+
+class RotaryEmbedding(Module):
+    """ rotary position embedding frequencies, with optional xpos decay scales """
+    def __init__(
+        self,
+        dim,
+        use_xpos = False,
+        scale_base = 512,
+        interpolation_factor = 1.,
+        base = 10000,
+        base_rescale_factor = 1.
+    ):
+        super().__init__()
+        # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
+        # has some connection to NTK literature
+        # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+        base *= base_rescale_factor ** (dim / (dim - 2))
+        exponents = arange(0, dim, 2).float() / dim
+        self.register_buffer('inv_freq', 1. / (base ** exponents))
+
+        assert interpolation_factor >= 1.
+        # NOTE(review): stored but not read anywhere in this class, `forward` uses its own `interpolation_factor` argument
+        self.interpolation_factor = interpolation_factor
+        # without xpos the scale buffer stays empty and `forward` returns a constant scale of 1.
+        scale = None
+
+        if use_xpos:
+            self.scale_base = scale_base
+            scale = (arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+
+        self.register_buffer('scale', scale)
+
+    def forward_from_seq_len(self, seq_len,interpolation_factor=1):
+        """ frequencies (and scale) for the positions 0 .. seq_len - 1 """
+        device = self.inv_freq.device
+        positions = arange(seq_len, device = device)
+        return self.forward(positions, interpolation_factor = interpolation_factor)
+
+    @autocast('cuda', enabled = False)
+    def forward(self, t,interpolation_factor=1):
+        """ returns (freqs, scale) for positions `t`, where scale is the constant 1. unless xpos is enabled """
+        max_pos = t.max() + 1
+
+        if t.ndim == 1:
+            t = rearrange(t, 'n -> 1 n')
+        # NOTE(review): the angles are multiplied by `interpolation_factor` here - confirm this is the intended direction
+        angles = torch.einsum('b i , j -> b i j', t.type_as(self.inv_freq), self.inv_freq) * interpolation_factor
+        # every frequency is repeated twice, adjacently, matching the feature pairing used by `rotate_half`
+        freqs = rearrange(stack((angles, angles), dim = -1), '... d r -> ... (d r)')
+
+        if not exists(self.scale):
+            return freqs, 1.
+        # xpos - positions are centered on the middle of the range before being used as the exponent
+        power = (t - (max_pos // 2)) / self.scale_base
+        scale = self.scale ** rearrange(power, '... n -> ... n 1')
+        scale = rearrange(stack((scale, scale), dim = -1), '... d r -> ... (d r)')
+
+        return freqs, scale
+
+def rotate_half(x):
+    """ maps every adjacent feature pair (a, b) of the last dimension to (-b, a) """
+    pairs = rearrange(x, '... (d r) -> ... d r', r = 2)
+    first, second = pairs.unbind(dim = -1)
+    return rearrange(stack((-second, first), dim = -1), '... d r -> ... (d r)')
+
+@autocast('cuda', enabled = False)
+def apply_rotary_pos_emb(t, freqs, scale = 1):
+    """ rotates the leading `freqs.shape[-1]` features of `t` by `freqs` (times `scale`), trailing features pass through """
+    rot_dim, seq_len, orig_dtype = freqs.shape[-1], t.shape[-2], t.dtype
+    # only the positions of the last `seq_len` tokens are needed
+    freqs = freqs[:, -seq_len:, :]
+    scale = scale[:, -seq_len:, :] if isinstance(scale, torch.Tensor) else scale
+    # insert a heads axis - the xpos scale needs it too, else a batched scale lines its batch up against heads
+    if t.ndim == 4 and freqs.ndim == 3:
+        freqs = rearrange(freqs, 'b n d -> b 1 n d')
+        scale = rearrange(scale, 'b n d -> b 1 n d') if isinstance(scale, torch.Tensor) and scale.ndim == 3 else scale
+    # partial rotary embeddings, Wang et al. GPT-J
+    t, t_unrotated = t[..., :rot_dim], t[..., rot_dim:]
+    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
+    out = cat((t, t_unrotated), dim = -1)
+    return out.type(orig_dtype)
+
+# norms
+
+class Scale(Module):
+    """ multiplies the output of `fn` by a constant, for tuple outputs only the first element is scaled """
+    def __init__(self, value, fn):
+        super().__init__()
+        self.value = value
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        out = self.fn(x, **kwargs)
+
+        if isinstance(out, tuple):
+            return (out[0] * self.value, *out[1:])
+
+        return out * self.value
+
+class LayerNorm(Module):
+    def __init__(
+        self,
+        dim,
+        unit_offset = False
+    ):
+        """
+        bias-less layernorm has been shown to be more stable. most newer models have moved towards rmsnorm, also bias-less
+        """
+        super().__init__()
+        self.unit_offset = unit_offset
+        self.ln = nn.LayerNorm(dim, elementwise_affine = False)
+
+        # with unit offset the parameter is stored as (gamma - 1), so it starts at zero
+        offset = float(unit_offset)
+        self.gamma = nn.Parameter(torch.full((dim,), 1. - offset))
+
+    def forward(self, x):
+        effective_gamma = self.gamma + float(self.unit_offset)
+        return self.ln(x) * effective_gamma
+
+class AdaptiveLayerNorm(Module):
+    """ layernorm whose gain is predicted from a condition, the zero initialized projection makes the gain start at 1 """
+    def __init__(
+        self,
+        dim,
+        dim_condition = None
+    ):
+        super().__init__()
+        self.ln = nn.LayerNorm(dim, elementwise_affine = False)
+
+        self.to_gamma = LinearNoBias(default(dim_condition, dim), dim)
+        nn.init.zeros_(self.to_gamma.weight)
+
+    def forward(self, x, *, condition):
+        # a 2d condition is given a length one sequence axis so it broadcasts over tokens
+        if condition.ndim == 2:
+            condition = rearrange(condition, 'b d -> b 1 d')
+
+        gain = self.to_gamma(condition) + 1.
+        return self.ln(x) * gain
+
+class ScaleNorm(Module):
+    """ l2 normalizes the last dimension, then rescales by sqrt(dim) and a single learned scalar gain """
+    def __init__(
+        self,
+        dim,
+        unit_offset = False
+    ):
+        super().__init__()
+        self.unit_offset = unit_offset
+        self.scale = dim ** 0.5
+        # with unit offset the parameter is stored as (gain - 1)
+        self.g = nn.Parameter(torch.full((1,), 1. - float(unit_offset)))
+
+    def forward(self, x):
+        gain = self.g + float(self.unit_offset)
+        return F.normalize(x, dim = -1) * self.scale * gain
+
+class RMSNorm(Module):
+    """ l2 normalizes the last dimension, then rescales by sqrt(dim) and a learned per feature gain """
+    def __init__(
+        self,
+        dim,
+        unit_offset = False
+    ):
+        super().__init__()
+        self.unit_offset = unit_offset
+        self.scale = dim ** 0.5
+        # with unit offset the parameter is stored as (gain - 1)
+        self.g = nn.Parameter(torch.full((dim,), 1. - float(unit_offset)))
+
+    def forward(self, x):
+        gain = self.g + float(self.unit_offset)
+        return F.normalize(x, dim = -1) * self.scale * gain
+
+class AdaptiveRMSNorm(Module):
+    """ rmsnorm whose gain is predicted from a condition, the zero initialized projection makes the gain start at 1 """
+    def __init__(
+        self,
+        dim,
+        dim_condition = None
+    ):
+        super().__init__()
+        self.scale = dim ** 0.5
+
+        self.to_gamma = LinearNoBias(default(dim_condition, dim), dim)
+        nn.init.zeros_(self.to_gamma.weight)
+
+    def forward(self, x, *, condition):
+        # a 2d condition is given a length one sequence axis so it broadcasts over tokens
+        if condition.ndim == 2:
+            condition = rearrange(condition, 'b d -> b 1 d')
+
+        gain = self.to_gamma(condition) + 1.
+        return F.normalize(x, dim = -1) * self.scale * gain
+
+class SimpleRMSNorm(Module):
+    """ rmsnorm without any learned gain """
+
+    def __init__(self, dim, **kwargs):
+        super().__init__()
+        # a unit l2 norm times sqrt(dim) gives a unit root mean square
+        self.scale = dim ** 0.5
+
+    def forward(self, x):
+        unit = F.normalize(x, dim = -1)
+        return unit * self.scale
+
+class MultiheadRMSNorm(Module):
+    """ gain-free rmsnorm followed by a per head, per feature gain that is stored as (gain - 1) """
+    def __init__(self, dim, heads):
+        super().__init__()
+        self.rmsnorm = SimpleRMSNorm(dim)
+        self.gamma = nn.Parameter(torch.zeros(heads, 1, dim))
+    def forward(self, x):
+        return self.rmsnorm(x) * (1. + self.gamma)
+
+class DynamicTanh(Module):
+    """ https://arxiv.org/abs/2503.10622 - tanh(alpha * x) * gamma + beta, with a learned scalar alpha and per feature gamma and beta """
+    def __init__(
+        self,
+        dim,
+        init_alpha = 1.,
+        gamma = 1.,
+        beta = 0.,
+        unit_offset = False
+    ):
+        super().__init__()
+        # with unit offset, alpha is stored relative to init_alpha and gamma relative to 1,
+        # the offsets are added back in `forward`
+        self.pre_tanh_scale_offset = init_alpha if unit_offset else 0.
+        self.gamma_offset = float(unit_offset)
+
+        # float() guards against an integer init_alpha, an integer tensor cannot become a parameter
+        self.pre_tanh_scale = nn.Parameter(tensor(0. if unit_offset else float(init_alpha)))
+        # `gamma` and `beta` are the initial gain and shift - they used to be accepted but silently ignored
+        self.gamma = nn.Parameter(torch.full((dim,), float(gamma) - self.gamma_offset))
+        self.beta = nn.Parameter(torch.full((dim,), float(beta)))
+
+    def forward(self, x):
+        pre_tanh_scale = self.pre_tanh_scale + self.pre_tanh_scale_offset
+        gamma = self.gamma + self.gamma_offset
+        return (x * pre_tanh_scale).tanh() * gamma + self.beta
+
+# residual and residual gates
+
+class Residual(Module):
+    """ plain residual connection, with an optional learned per feature scale and an optional constant scale on the residual """
+    def __init__(self, dim, scale_residual = False, scale_residual_constant = 1., **kwargs):
+        super().__init__()
+        self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None
+        self.scale_residual_constant = scale_residual_constant
+
+    def prepare(self, residual):
+        return residual, residual, dict()
+
+    def forward(self, x, residual, **kwargs):
+        scaled = residual
+        if exists(self.residual_scale):
+            scaled = scaled * self.residual_scale
+        if self.scale_residual_constant != 1:
+            scaled = scaled * self.scale_residual_constant
+        return x + scaled
+
+class GRUGating(Module):
+    """ combines branch output and residual with a GRU cell - the branch output is the cell input, the residual its hidden state """
+    def __init__(self, dim, scale_residual = False, **kwargs):
+        super().__init__()
+        self.gru = nn.GRUCell(dim, dim)
+        self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None
+
+    def prepare(self, residual):
+        return residual, residual, dict()
+
+    def forward(self, x, residual, **kwargs):
+        if exists(self.residual_scale):
+            residual = residual * self.residual_scale
+
+        # batch and sequence are flattened together before going through the cell
+        flat_input = rearrange(x, 'b n d -> (b n) d')
+        flat_hidden = rearrange(residual, 'b n d -> (b n) d')
+
+        return self.gru(flat_input, flat_hidden).reshape_as(x)
+
+# hyper connections
+
+class HyperConnection(Module):
+    def __init__(
+        self,
+        dim,
+        *,
+        layer_index,
+        num_residual_streams,
+        num_input_views = 1,
+        tanh = True,
+        **kwargs
+    ):
+        """
+        https://arxiv.org/abs/2409.19606
+        Appendix J - Algorithm 2, Dynamic only
+        """
+        super().__init__()
+        streams, views = num_residual_streams, num_input_views
+
+        self.layer_index = layer_index
+        self.num_residual_streams = streams
+        self.num_input_views = views
+
+        self.act = nn.Tanh() if tanh else nn.Identity()
+        self.norm = nn.LayerNorm(dim, bias = False)
+
+        # static weights - beta starts at one for every stream
+        self.static_beta = nn.Parameter(torch.ones(streams))
+
+        # static alpha is [ branch input columns | identity ], where initially every branch input
+        # view reads only from the stream at index (layer_index % streams)
+
+        to_branch_input = torch.zeros((streams, views))
+        to_branch_input[layer_index % streams, :] = 1.
+        keep_streams = torch.eye(streams)
+
+        self.static_alpha = nn.Parameter(cat([to_branch_input, keep_streams], dim = 1))
+
+        # dynamic weights - zero initialized projections with a small learned scale
+
+        self.dynamic_alpha_fn = nn.Parameter(torch.zeros(dim, streams + views))
+        self.dynamic_alpha_scale = nn.Parameter(torch.ones(()) * 1e-2)
+
+        self.dynamic_beta_fn = nn.Parameter(torch.zeros(dim))
+        self.dynamic_beta_scale = nn.Parameter(torch.ones(()) * 1e-2)
+
+    def prepare(self, residuals):
+        """ splits the stacked streams into the branch input(s) and the mixed residual streams, plus the `beta` that `forward` needs """
+        streams = rearrange(residuals, '(b s) n d -> b n s d', s = self.num_residual_streams)
+        normed = self.norm(streams)
+
+        # dynamic weights come from the normed streams and are added onto the static ones
+
+        alpha = self.act(normed @ self.dynamic_alpha_fn) * self.dynamic_alpha_scale + self.static_alpha
+        beta = self.act(normed @ self.dynamic_beta_fn) * self.dynamic_beta_scale + self.static_beta
+
+        # width connection
+
+        mix_h = einsum('... s t, ... s d -> ... t d', alpha, streams)
+        views = self.num_input_views
+
+        if views == 1:
+            return mix_h[..., 0, :], mix_h[..., 1:, :], dict(beta = beta)
+
+        branch_input = rearrange(mix_h[..., :views, :], '... v d -> v ... d')
+        return branch_input, mix_h[..., views:, :], dict(beta = beta)
+
+    def forward(self, x, residuals, *, beta):
+        """ adds the branch output to every stream, weighted by `beta`, then restacks the streams into the batch """
+        residuals = einsum('b n d, b n s -> b n s d', x, beta) + residuals
+        return rearrange(residuals, 'b n s d -> (b s) n d')
+
+# LIMe - layer integrated memory (dynamic version)
+
+class DynamicLIMe(Module):
+    """ mixes the hiddens of all layers into one or more views, using per token weights predicted from the current input """
+    def __init__(
+        self,
+        dim,
+        num_layers,
+        num_views = 1,
+        norm = True,
+        use_softmax = True
+    ):
+        super().__init__()
+        self.num_layers = num_layers
+        self.multiple_views = num_views > 1
+
+        # one weight per (view, layer), normalized over layers by a softmax, or else made non-negative by a relu
+        weight_activation = nn.Softmax(dim = -1) if use_softmax else nn.ReLU()
+
+        self.to_weights = Sequential(
+            RMSNorm(dim) if norm else None,
+            nn.Linear(dim, num_views * num_layers),
+            Rearrange('... (views layers) -> views ... layers', views = num_views),
+            weight_activation
+        )
+
+    def forward(self, x, hiddens):
+        # anything that is not already a tensor is stacked along a new leading layers dimension
+        hiddens = hiddens if is_tensor(hiddens) else stack(hiddens)
+
+        assert hiddens.shape[0] == self.num_layers, f'expected hiddens to have {self.num_layers} layers but received {tuple(hiddens.shape)} instead (first dimension must be layers)'
+
+        weights = self.to_weights(x)
+
+        mixed = einsum('l b n d, v b n l -> v b n d', hiddens, weights)
+
+        # a single view drops the leading views dimension
+        if not self.multiple_views:
+            mixed = rearrange(mixed, '1 ... -> ...')
+
+        return mixed
+
+# token shifting
+
+def shift(t, amount, mask = None):
+    """ shifts tokens by `amount` along dim -2 (forward if positive, backward if negative), zero filling the vacated slots """
+    if amount == 0:
+        return t
+    # clamp in both directions - a backward shift longer than the sequence would otherwise crop more than exists
+    seq_len = t.shape[-2]
+    amount = max(min(amount, seq_len), -seq_len)
+    if exists(mask):
+        t = t.masked_fill(~mask[..., None], 0.)
+    return pad_at_dim(t, (amount, -amount), dim = - 2, value = 0.)
+
+class ShiftTokens(Module):
+    """ splits the feature dimension into equal segments, shifts each by its own amount along the sequence, then calls `fn` """
+    def __init__(self, shifts, fn):
+        super().__init__()
+        self.fn = fn
+        self.shifts = tuple(shifts)
+
+    def forward(self, x, **kwargs):
+        mask = kwargs.get('mask', None)
+        num_segments = len(self.shifts)
+        chunks = x.split(x.shape[-1] // num_segments, dim = -1)
+        # any chunks beyond the number of shifts (left over features) are passed through unshifted
+        to_shift, leftover = chunks[:num_segments], chunks[num_segments:]
+        shifted = [shift(chunk, amount, mask = mask) for chunk, amount in zip(to_shift, self.shifts)]
+        x = cat((*shifted, *leftover), dim = -1)
+        return self.fn(x, **kwargs)
+
+class FoldAxially(Module):
+    """ runs `fn` on a sequence folded into `axial_dim` interleaved subsequences (stacked into the batch), then unfolds the result """
+    def __init__(
+        self,
+        axial_dim,
+        fn: Module
+    ):
+        super().__init__()
+        self.fn = fn
+        self.axial_dim = axial_dim # will fold the sequence as rearrange("b (n axial_dim) ... -> (b axial_dim) n ...")
+
+    def forward(
+        self,
+        x,
+        **kwargs
+    ):
+        axial_dim = self.axial_dim
+
+        if axial_dim == 1:
+            return self.fn(x, **kwargs)
+
+        # right pad the sequence to the next multiple of the axial dimension so that it folds evenly
+        seq_len = x.shape[1]
+        padded_len = math.ceil(seq_len / axial_dim) * axial_dim
+        x = pad_at_dim(x, (0, padded_len - seq_len), dim = 1)
+
+        folded = rearrange(x, 'b (n axial_dim) ... -> (b axial_dim) n ...', axial_dim = axial_dim)
+        result = self.fn(folded, **kwargs)
+
+        # only the first leaf of the (possibly nested) output is unfolded and cropped back, other leaves are left as is
+        (first, *others), tree_spec = tree_flatten(result)
+        first = rearrange(first, '(b axial_dim) n ... -> b (n axial_dim) ...', axial_dim = axial_dim)
+        first = first[:, :seq_len]
+
+        return tree_unflatten((first, *others), tree_spec)
+
+# post branch operator
+
+class LayerScale(Module):
+    """ scales the output of `fn` by a learned per feature gain, for non-tensor outputs only the first element is scaled """
+    def __init__(
+        self,
+        fn: Module,
+        dim,
+        init_value = 0.,
+        unit_offset = False
+    ):
+        super().__init__()
+        self.unit_offset = unit_offset
+        self.fn = fn
+
+        # with unit offset the parameter is stored as (gain - 1)
+        self.gamma = nn.Parameter(torch.full((dim,), init_value - float(unit_offset)))
+
+    def forward(self, x, **kwargs):
+        gain = self.gamma + float(self.unit_offset)
+        result = self.fn(x, **kwargs)
+
+        if not isinstance(result, Tensor):
+            first, *rest = result
+            return first * gain, *rest
+
+        return result * gain
+
+class AdaptiveLayerScale(Module):
+    """ scales the output of `fn` by a sigmoid gate predicted from a condition, for non-tensor outputs only the first element is scaled """
+    def __init__(
+        self,
+        fn: Module,
+        dim,
+        dim_condition = None,
+        init_bias_value = -2.
+    ):
+        super().__init__()
+        self.fn = fn
+        self.to_gamma = nn.Linear(default(dim_condition, dim), dim)
+        # zero weight and constant bias - the gate starts out as sigmoid(init_bias_value) regardless of the condition
+        nn.init.zeros_(self.to_gamma.weight)
+        nn.init.constant_(self.to_gamma.bias, init_bias_value)
+
+    def forward(self, x, *, condition, **kwargs):
+        # a 2d condition is given a length one sequence axis so it broadcasts over tokens
+        if condition.ndim == 2:
+            condition = rearrange(condition, 'b d -> b 1 d')
+
+        result = self.fn(x, **kwargs)
+        gate = self.to_gamma(condition).sigmoid()
+
+        if not isinstance(result, Tensor):
+            first, *rest = result
+            return first * gate, *rest
+
+        return result * gate
+
+# skip connection combining
+
+class ConcatCombine(Module):
+    """ concatenates the output of one chosen earlier layer with the input, then projects back down to `dim` """
+    def __init__(self, dim, prev_layer_ind):
+        super().__init__()
+        self.prev_layer_ind = prev_layer_ind
+        self.combine = LinearNoBias(dim * 2, dim)
+
+    def forward(self, x, prev_layers: list[Tensor]):
+        skip_and_x = cat((prev_layers[self.prev_layer_ind], x), dim = -1)
+        return self.combine(skip_and_x)
+
+# feedforward
+
+class GLU(Module):
+    """ gated linear unit - one projection yields both values and gates, the output is values * activation(gates) """
+
+    def __init__(self, dim_in, dim_out, activation: Callable, mult_bias = False):
+        super().__init__()
+        self.act = activation
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+        # optional learned per feature multiplier on the output, a constant 1. when disabled
+        self.mult_bias = nn.Parameter(torch.ones(dim_out)) if mult_bias else 1.
+
+    def forward(self, x):
+        # first half of the projection are the values, second half the gates
+        projected = self.proj(x)
+        values, gates = projected.chunk(2, dim = -1)
+
+        return values * self.act(gates) * self.mult_bias
+
+class FeedForward(Module):
+    """ feedforward block - input projection (optionally GLU gated) with activation, dropout, then the output projection """
+    def __init__(
+        self,
+        dim,
+        dim_out = None,
+        mult = 4,
+        glu = False,
+        glu_mult_bias = False,
+        swish = False,
+        relu_squared = False,
+        custom_activation = None,
+        post_act_ln = False,
+        dropout = 0.,
+        sublayer_dropout = 0.,
+        no_bias = False,
+        zero_init_output = False
+    ):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        # activation precedence - custom (deep copied), then relu squared, then swish, with gelu as the fallback
+        if exists(custom_activation):
+            activation = deepcopy(custom_activation)
+        elif relu_squared:
+            activation = ReluSquared()
+        elif swish:
+            activation = nn.SiLU()
+        else:
+            activation = nn.GELU()
+
+        if glu:
+            project_in = GLU(dim, inner_dim, activation, mult_bias = glu_mult_bias)
+        else:
+            project_in = nn.Sequential(nn.Linear(dim, inner_dim, bias = not no_bias), activation)
+
+        # a handle on the output projection is kept, as it is not the last module when sublayer dropout is present
+        project_out = nn.Linear(inner_dim, dim_out, bias = not no_bias)
+        self.ff = Sequential(
+            project_in,
+            LayerNorm(inner_dim) if post_act_ln else None,
+            nn.Dropout(dropout),
+            project_out,
+            nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
+        )
+
+        # init last linear layer to 0
+        if zero_init_output:
+            init_zero_(project_out)
+
+    def forward(self, x):
+        return self.ff(x)
+
+# attention. it is all we need
+
+class Attention(Module):
+    def __init__(
+        self,
+        dim,
+        dim_head = DEFAULT_DIM_HEAD,
+        dim_context = None,
+        heads = 8,
+        causal = False,
+        flash = False,
+        pre_talking_heads = False,
+        post_talking_heads = False,
+        pre_scale_post_talking_heads = False,
+        head_scale = False,
+        sparse_topk = None,
+        sparse_topk_straight_through = False,
+        num_mem_kv = 0,
+        dropout = 0.,
+        sublayer_dropout = 0.,
+        on_attn = False,
+        gate_value_heads = False,
+        swiglu_values = False,
+        gate_values = False,
+        zero_init_output = False,
+        hard = False,
+        max_attend_past = None,
+        qk_norm = False,
+        qk_norm_groups = 1,
+        qk_norm_scale = 10,
+        qk_norm_dim_scale = False,
+        l2_distance = False,
+        sigmoid = False,
+        selective = False,
+        custom_attn_fn: Callable | None = None,
+        hybrid_module: Module | None = None,
+        hybrid_mask_kwarg: str | None = None,
+        hybrid_fold_axial_dim: int | None = None,
+        hybrid_learned_mix = False,
+        one_kv_head = False,
+        kv_heads = None,
+        value_dim_head = None,
+        dim_out = None,
+        add_zero_kv = False,         # same as add_zero_attn in pytorch
+        rotate_num_heads = None,
+        data_dependent_alibi = False,
+        data_dependent_alibi_per_row = False,
+        data_dependent_alibi_per_row_dim_head = 8,
+        data_dependent_alibi_kwargs: dict = dict(),
+        use_cope = False,
+        cope_max_pos = 16,
+        cope_soft_onehot_pos = False,
+        cope_talking_heads = False,
+        softclamp_logits = False,
+        logit_softclamp_value = 50.,
+        learned_value_residual_mix = False,
+        laser = False,                # https://arxiv.org/abs/2411.03493v1
+        laser_softclamp_value = 15.,
+        qkv_receive_diff_residuals = False,
+        use_latent_q = False,
+        dim_latent_q = None,
+        use_latent_kv = False,
+        dim_latent_kv = None,
+        latent_rope_subheads = None,
+        onnxable = False,
+        attend_sdp_kwargs: dict = dict(
+            enable_flash = True,
+            enable_math = True,
+            enable_mem_efficient = True
+        )
+    ):
+        """ builds the projections, gates, positional bias and `Attend` modules selected by the flags, each section is commented below """
+        super().__init__()
+        dim_kv = default(dim_context, dim)
+        self.scale = dim_head ** -0.5
+
+        self.heads = heads
+        self.causal = causal
+        self.max_attend_past = max_attend_past
+
+        assert not (exists(kv_heads) and one_kv_head), 'either attn_one_kv_head is set to True (in which case kv_heads is set to 1), or attn_kv_heads is set, but not both'
+
+        value_dim_head = default(value_dim_head, dim_head)
+        kv_heads = default(kv_heads, heads)
+
+        kv_heads = 1 if one_kv_head else kv_heads
+        assert divisible_by(heads, kv_heads)
+
+        self.kv_heads = kv_heads
+
+        q_dim = dim_head * heads
+        k_dim = dim_head * kv_heads
+        v_dim = value_dim_head * kv_heads
+        out_dim = value_dim_head * heads
+
+        # determine input dimensions to qkv based on whether intermediate latent q and kv are being used
+        # for eventually supporting multi-latent attention (MLA)
+
+        self.to_latent_q = None
+        self.to_latent_kv = None
+        self.to_rotateable_k = None # for their "decoupled rope", subheads of keys that comes directly from base sequence (does not go through latents)
+
+        dim_q_input = dim
+        dim_kv_input = dim_kv
+
+        if use_latent_q:
+            assert exists(dim_latent_q)
+            self.to_latent_q = LinearNoBias(dim, dim_latent_q)
+            dim_q_input = dim_latent_q
+
+        if use_latent_kv:
+            assert exists(dim_latent_kv)
+            self.to_latent_kv = LinearNoBias(dim, dim_latent_kv)
+            dim_kv_input = dim_latent_kv
+
+        if exists(latent_rope_subheads):
+            assert not exists(rotate_num_heads), '`rotate_num_heads` cannot be set when multi-latent attention is being used'
+            rotate_num_heads = latent_rope_subheads
+
+            k_dim = dim_head * (kv_heads - latent_rope_subheads)
+
+            self.to_rotateable_k = LinearNoBias(dim, dim_head * latent_rope_subheads)
+            self.split_rotateable_k_heads = Rearrange('b n (h d) -> b h n d', h = latent_rope_subheads)
+
+        self.use_latent_q = use_latent_q
+        self.use_latent_kv = use_latent_kv
+
+        # query key projection
+
+        self.to_q = LinearNoBias(dim_q_input, q_dim)
+        self.to_k = LinearNoBias(dim_kv_input, k_dim)
+        self.to_v = LinearNoBias(dim_kv_input, v_dim)
+
+        # split and merge of attention heads
+
+        self.split_q_heads = Rearrange('b n (h d) -> b h n d', h = heads)
+        self.split_k_heads = Rearrange('b n (h d) -> b h n d', d = dim_head)
+        self.split_v_heads = Rearrange('b n (h d) -> b h n d', d = value_dim_head)
+
+        self.merge_heads = Rearrange('b h n d -> b n (h d)')
+
+        # whether qkv receives different residual stream combinations from hyper connections or lime
+
+        self.qkv_receive_diff_residuals = qkv_receive_diff_residuals
+
+        # enhancing gradients to attention through exponentiated values
+
+        self.laser = laser
+        self.laser_softclamp_value = laser_softclamp_value
+
+        # add GLU gating for aggregated values, from alphafold2
+
+        self.to_v_gate = None
+        if gate_values:
+            self.to_v_gate = nn.Linear(dim, out_dim)
+            self.to_v_gate_activation = F.silu if swiglu_values else F.sigmoid
+            nn.init.constant_(self.to_v_gate.weight, 0)
+            nn.init.constant_(self.to_v_gate.bias, 10)
+
+        # add per head gating of the output values, from 'Attend to nothing' paper
+
+        self.to_v_head_gate = None
+        if gate_value_heads:
+            self.to_v_head_gate = nn.Linear(dim, heads)
+            nn.init.constant_(self.to_v_head_gate.weight, 0)
+            nn.init.constant_(self.to_v_head_gate.bias, 10)
+
+        # cosine sim attention
+
+        self.qk_norm = qk_norm
+        self.qk_norm_groups = qk_norm_groups
+        self.qk_norm_scale = qk_norm_scale
+
+        # whether to use the rmsnorm (equivalent to cosine sim attention when scale is equal to 1) - https://arxiv.org/abs/2302.05442
+
+        self.qk_norm_dim_scale = qk_norm_dim_scale
+
+        self.qk_norm_q_scale = self.qk_norm_k_scale = 1
+        if qk_norm and qk_norm_dim_scale:
+            self.qk_norm_q_scale = nn.Parameter(torch.ones(heads, 1, dim_head))
+            self.qk_norm_k_scale = nn.Parameter(torch.ones(kv_heads, 1, dim_head))
+
+        assert (not qk_norm) or divisible_by(dim_head, qk_norm_groups), 'dimension per attention head must be divisible by the qk norm groups'
+        assert not (qk_norm and (dim_head // qk_norm_groups) <= 2), 'the group dimension may be too small (2 was too small in my tests, but 4 still works, surprisingly)'
+
+        # contextual positional encoding
+        # https://arxiv.org/html/2405.18719v2
+
+        cope = None
+
+        if use_cope:
+            assert causal, 'CoPE was designed for causal attention'
+            assert not flash, 'CoPE is not flash attention compatible'
+
+            cope = CoPE(
+                dim = dim_head,
+                heads = heads,
+                max_pos = cope_max_pos,
+                talking_heads = cope_talking_heads,
+                soft_onehot = cope_soft_onehot_pos
+            )
+
+        # data dependent alibi
+        # https://openreview.net/forum?id=q2Lnyegkr8
+
+        self.data_dependent_alibi = None
+
+        if data_dependent_alibi:
+
+            dda_klass = DataDependentAlibi if not data_dependent_alibi_per_row else PerRowDataDependentAlibi
+            dda_kwargs = dict(dim = dim, heads = heads, causal = causal)
+
+            if data_dependent_alibi_per_row:
+                dda_kwargs.update(dim_head = data_dependent_alibi_per_row_dim_head)
+
+            self.data_dependent_alibi = dda_klass(**dda_kwargs, **data_dependent_alibi_kwargs)
+
+        # attend class - includes core attention algorithm + talking heads
+
+        self.attend = Attend(
+            heads = heads,
+            causal = causal,
+            pre_talking_heads = pre_talking_heads,
+            post_talking_heads = post_talking_heads,
+            pre_scale_post_talking_heads = pre_scale_post_talking_heads,
+            dropout = dropout,
+            sparse_topk = sparse_topk,
+            sparse_topk_straight_through = sparse_topk_straight_through,
+            hard = hard,
+            qk_norm = qk_norm,
+            scale = qk_norm_scale if qk_norm else self.scale,
+            l2_distance = l2_distance,
+            sigmoid = sigmoid,
+            selective = selective,
+            custom_attn_fn = custom_attn_fn,
+            add_zero_kv = add_zero_kv,
+            flash = flash,
+            softclamp_logits = softclamp_logits,
+            logit_softclamp_value = logit_softclamp_value,
+            cope = cope,
+            onnxable = onnxable,
+            sdp_kwargs = attend_sdp_kwargs
+        )
+
+        # head scaling
+
+        self.head_scale = head_scale
+        if head_scale:
+            self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1))
+
+        # explicit topk sparse attention
+
+        self.sparse_topk = sparse_topk
+
+        # add memory key / values
+        # NOTE(review): memory values use value_dim_head, assuming forward concatenates them with the projected values - confirm
+        self.num_mem_kv = num_mem_kv
+        if num_mem_kv > 0:
+            self.mem_k = nn.Parameter(torch.randn(kv_heads, num_mem_kv, dim_head))
+            self.mem_v = nn.Parameter(torch.randn(kv_heads, num_mem_kv, value_dim_head))
+
+        # maybe learned value residual mixer per token
+
+        self.to_value_residual_mix = nn.Sequential(
+            nn.Linear(dim, heads),
+            nn.Sigmoid(),
+            Rearrange('b n h -> b h n 1')
+         ) if learned_value_residual_mix else always(0.5)
+
+        # attention on attention
+
+        self.attn_on_attn = on_attn
+
+        # hybrid module, in same vein as hymba https://www.arxiv.org/abs/2411.13676
+
+        hybrid_mix = None
+        hybrid_norms = None
+        hybrid_module = maybe(deepcopy)(hybrid_module)
+
+        if exists(hybrid_module) and exists(hybrid_fold_axial_dim):
+            hybrid_module = FoldAxially(axial_dim = hybrid_fold_axial_dim, fn = hybrid_module)
+            hybrid_mix = LinearNoBias(dim, heads) if hybrid_learned_mix else None
+
+            hybrid_norms = ModuleList([
+                MultiheadRMSNorm(dim_head, heads = heads),
+                MultiheadRMSNorm(dim_head, heads = heads)
+            ])
+
+        self.hybrid_module = hybrid_module
+        self.hybrid_norms = hybrid_norms
+        self.hybrid_mix = hybrid_mix
+        self.hybrid_mask_kwarg = hybrid_mask_kwarg # for bidirectional, can forward `mask` into the hybrid module and let it handle variable lengths
+
+        # output dimension by default same as input, but can be overridden
+
+        dim_out = default(dim_out, dim)
+        self.to_out = nn.Sequential(LinearNoBias(out_dim, dim_out * 2), nn.GLU()) if on_attn else LinearNoBias(out_dim, dim_out)
+
+        # sublayer dropout
+
+        self.sublayer_dropout = nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
+
+        # the number of attention heads to rotate, for decoupled rope in multi-latent attention
+
+        rotate_num_heads = default(rotate_num_heads, heads)
+
+        assert 0 < rotate_num_heads <= heads
+        is_partial_rotate_heads = rotate_num_heads < heads
+        assert not (is_partial_rotate_heads and kv_heads < heads), 'grouped query attention not compatible with partial rotate heads (decoupled rope for multi-latent attention), yet'
+
+        self.rotate_num_heads = rotate_num_heads
+
+        # whether parent can kv cache
+
+        self.can_cache_kv = not selective
+
+        # init output projection 0
+        # with attention on attention the linear projection is the first module of the sequential
+        if zero_init_output:
+            init_zero_(self.to_out[0] if on_attn else self.to_out)
+
+    def forward( # attention over `x` (optionally cross attending to `context`), returns `out` or `(out, intermediates)`
+        self,
+        x, # input sequence; a stack of 3 inputs (for q, k, v) along dim 0 when `self.qkv_receive_diff_residuals`
+        context = None, # keys / values are derived from this instead of `x` when given
+        mask = None, # boolean mask for `x`, True is kept; masked positions of the output are zeroed
+        context_mask = None, # boolean mask over key / value positions; `mask` is only used for this when it is absent and there is no context
+        attn_mask = None, # boolean attention mask with 2 to 4 dimensions, True means attend
+        rel_pos = None, # relative position bias module, called as `rel_pos(i, j)`; cannot be combined with `attn_bias`
+        attn_bias = None, # attention bias passed on to `self.attend`
+        rotary_pos_emb = None, # tuple (freqs, xpos_scale) for rotating queries, and keys when there is no context
+        context_rotary_pos_emb = None, # tuple (freqs, xpos_scale) for rotating keys when `context` is given
+        pos = None, # for custom alibi positions
+        prev_attn = None, # forwarded untouched to `self.attend`
+        mem = None, # memories prepended to the key / value input along the sequence dimension
+        mem_mask = None, # boolean mask for `mem`
+        return_intermediates = False, # also return the intermediates, with `cached_kv` and `values` set
+        cache: Intermediates | None = None, # intermediates of a previous call, its `cached_kv` is prepended to the keys / values
+        value_residual = None # values to mix into this layer's values (resformer value residual)
+    ):
+        b, n, h, kv_h, head_scale, num_mem_kv, device, has_context, qkv_receive_diff_residuals, is_multi_latent_attn = x.shape[0], x.shape[1], self.heads, self.kv_heads, self.head_scale, self.num_mem_kv, x.device, exists(context), self.qkv_receive_diff_residuals, self.use_latent_kv
+
+        # an interesting possibility with hyper connections
+        # having queries, keys, values be routed from different layers
+
+        assert not (qkv_receive_diff_residuals and has_context), 'qkv receiving different sequences can only be used for self attention'
+
+        if qkv_receive_diff_residuals:
+            assert x.ndim == 4 and x.shape[0] == 3
+            q_input, k_input, v_input = x
+            b, n = q_input.shape[:2] # `x` is stacked along dim 0 here, so batch and sequence length must come from an unstacked input
+        else:
+            kv_input = default(context, x)
+            q_input, k_input, v_input = x, kv_input, kv_input
+
+        if exists(mem):
+            k_input, mem_packed_shape = pack([mem, k_input], 'b * d') # packed shape is kept so the memories can be split off again when caching
+            v_input, _ = pack([mem, v_input], 'b * d')
+
+        # multi-latent attention logic
+        # https://arxiv.org/abs/2405.04434 - Deepseek-AI team
+
+        k_sub_heads = None # the rotateable subheads of keys derived from base sequence
+
+        if self.use_latent_q:
+            q_input = self.to_latent_q(q_input)
+
+        if is_multi_latent_attn:
+            assert not qkv_receive_diff_residuals
+            needs_k_sub_heads = exists(self.to_rotateable_k)
+
+            latent_kv_input = self.to_latent_kv(k_input)
+
+            if needs_k_sub_heads:
+                rotateable_k = self.to_rotateable_k(k_input)
+                k_sub_heads = self.split_rotateable_k_heads(rotateable_k)
+
+            if exists(cache):
+                cached_latent_kv, maybe_cached_k_sub_heads = cache.cached_kv # for multi-latent attention the cache holds (latent kv, rotateable key subheads)
+                latent_kv_input = cat((cached_latent_kv, latent_kv_input), dim = -2)
+
+                if exists(maybe_cached_k_sub_heads):
+                    k_sub_heads = cat((maybe_cached_k_sub_heads, k_sub_heads), dim = -2)
+
+            if return_intermediates:
+                cached_kv = (latent_kv_input, k_sub_heads)
+
+            k_input = v_input = latent_kv_input
+
+        # query, key, value projection
+
+        q = self.to_q(q_input)
+        k = self.to_k(k_input)
+        v = self.to_v(v_input)
+
+        q = self.split_q_heads(q)
+        k = self.split_k_heads(k)
+        v = self.split_v_heads(v)
+
+        # take care of decoupled rope from multi-latent attention
+
+        if exists(k_sub_heads):
+            k = cat((k, k_sub_heads), dim = 1)
+
+        # keep the values before any residual mixing, they are stored on the intermediates below for resformer
+
+        orig_values = v
+
+        # https://arxiv.org/abs/2410.17897v1
+
+        if exists(value_residual):
+            value_residual_mix = self.to_value_residual_mix(q_input)
+            v = value_residual.lerp(v, value_residual_mix) # mix of 0 gives `value_residual`, mix of 1 gives this layer's values
+
+        # qk normalization
+
+        if self.qk_norm:
+            qk_l2norm = partial(l2norm, groups = self.qk_norm_groups)
+            q, k = map(qk_l2norm, (q, k))
+
+            # scale the normalized queries and keys
+            q = q * self.qk_norm_q_scale
+            k = k * self.qk_norm_k_scale
+
+        # take care of caching
+
+        if not is_multi_latent_attn:
+            if exists(cache):
+                ck, cv = cache.cached_kv
+
+                if exists(mem):
+                    mk, k = unpack(k, mem_packed_shape, 'b h * d') # split the memories off so the cache goes between memories and new keys
+                    mv, v = unpack(v, mem_packed_shape, 'b h * d')
+
+                k = cat((ck, k), dim = -2)
+                v = cat((cv, v), dim = -2)
+
+                if exists(mem):
+                    k = cat((mk, k), dim = -2)
+                    v = cat((mv, v), dim = -2)
+
+            if return_intermediates:
+                mem_len = mem.shape[-2] if exists(mem) else 0
+                cached_kv = (k[..., mem_len:, :], v[..., mem_len:, :]) # memories are excluded from what gets cached
+
+        if exists(rotary_pos_emb):
+            rotate_num_heads = self.rotate_num_heads
+            partial_rotate_heads = rotate_num_heads < h
+
+            freqs, xpos_scale = rotary_pos_emb
+            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale ** -1.) if exists(xpos_scale) else (1., 1.)
+
+            if partial_rotate_heads:
+                q_rest, q = q[:, :-rotate_num_heads], q[:, -rotate_num_heads:] # only the last `rotate_num_heads` heads are rotated
+                k_rest, k = k[:, :-rotate_num_heads], k[:, -rotate_num_heads:]
+
+            q = apply_rotary_pos_emb(q, freqs, q_xpos_scale)
+
+            if has_context:
+                # keys use `context_rotary_pos_emb` when cross attending
+                # NOTE(review): this unpack raises if `context_rotary_pos_emb` is None while `context` is given - confirm callers always pass both
+                freqs, xpos_scale = context_rotary_pos_emb
+                _, k_xpos_scale = (xpos_scale, xpos_scale ** -1.) if exists(xpos_scale) else (1., 1.)
+
+            k = apply_rotary_pos_emb(k, freqs, k_xpos_scale)
+
+            if partial_rotate_heads:
+                q = cat((q_rest, q), dim = 1)
+                k = cat((k_rest, k), dim = 1)
+
+        input_mask = context_mask
+
+        if not exists(input_mask) and not has_context:
+            input_mask = mask
+
+            if (exists(input_mask) or exists(mem_mask)) and exists(mem):
+                seq_len, mem_len = n, mem.shape[-2]
+
+                # whichever of the two masks is missing is treated as all True
+                if not exists(mem_mask):
+                    input_mask = pad_at_dim(input_mask, (mem_len, 0), dim = -1, value = True)
+                elif not exists(input_mask):
+                    input_mask = pad_at_dim(mem_mask, (0, seq_len), dim = -1, value = True)
+                else:
+                    input_mask = cat((mem_mask, input_mask), dim = -1)
+
+        # i, j determined for relative positional bias, excluding memory key / values
+
+        i, j = tuple(t.shape[-2] for t in (q, k))
+
+        # maybe append memory key / values
+
+        if num_mem_kv > 0:
+            mem_k, mem_v = tuple(repeat(t, 'h n d -> b h n d', b = b) for t in (self.mem_k, self.mem_v))
+
+            if self.qk_norm:
+                mem_k = l2norm(mem_k)
+                mem_k = mem_k * self.qk_norm_k_scale
+
+            k = cat((mem_k, k), dim = -2)
+            v = cat((mem_v, v), dim = -2)
+
+            if exists(input_mask):
+                input_mask = pad_at_dim(input_mask, (self.num_mem_kv, 0), dim = -1, value = True)
+
+        # determine masking
+        # entries of `masks` are True where attention is blocked, the final mask is True where attention is allowed
+        mask_value = max_neg_value(q)
+        masks = []
+        final_attn_mask = None
+
+        if exists(input_mask):
+            input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
+            masks.append(~input_mask)
+
+        if exists(attn_mask):
+            assert 2 <= attn_mask.ndim <= 4, 'attention mask must have at least 2 dimensions and at most 4'
+            if attn_mask.ndim == 2:
+                attn_mask = rearrange(attn_mask, 'i j -> 1 1 i j')
+            elif attn_mask.ndim == 3:
+                attn_mask = rearrange(attn_mask, 'h i j -> 1 h i j')
+            masks.append(~attn_mask)
+
+        if exists(self.max_attend_past):
+            range_q = arange(j - i, j, device = device) # queries are aligned to the last `i` key positions
+            range_k = arange(j, device = device)
+            dist = einx.subtract('i, j -> 1 1 i j', range_q, range_k)
+            max_attend_past_mask = dist > self.max_attend_past
+            max_attend_past_mask = pad_at_dim(max_attend_past_mask, (num_mem_kv, 0), value = False, dim = -1) # handle memory key / values
+            masks.append(max_attend_past_mask)
+
+        if len(masks) > 0:
+            final_attn_mask = ~or_reduce(masks)
+
+        # prepare relative positional bias, if needed
+
+        if exists(rel_pos):
+            assert not exists(attn_bias)
+
+            if exists(pos):
+                assert isinstance(rel_pos, AlibiPositionalBias), 'only alibi allowed for custom positions at the moment'
+                # allow for custom positions to be passed in
+                attn_bias = rel_pos.forward_custom_pos(pos)
+            else:
+                attn_bias = rel_pos(i, j)
+
+            attn_bias = pad_at_dim(attn_bias, (num_mem_kv, 0)) # handle memory key / values
+
+        # prepare data dependent alibi from forgetting transformers paper, if needed
+        # NOTE(review): this replaces any `attn_bias` passed in or computed from `rel_pos` above
+        if exists(self.data_dependent_alibi):
+            attn_bias = self.data_dependent_alibi(x)
+
+            attn_bias = pad_at_dim(attn_bias, (num_mem_kv, 0))
+
+        if self.laser:
+            v = softclamp(v, self.laser_softclamp_value) # clamp before exponentiating, the log is taken after attention
+            v = v.exp()
+
+        # attention is all we need
+
+        out, intermediates = self.attend(
+            q, k, v,
+            mask = final_attn_mask,
+            attn_bias = attn_bias,
+            prev_attn = prev_attn
+        )
+
+        # laser
+
+        if self.laser:
+            out = log(out)
+
+        # store the values for resformer
+
+        intermediates.values = orig_values
+
+        # normformer scaling of heads
+
+        if head_scale:
+            out = out * self.head_scale_params
+
+        # per head gating, from https://arxiv.org/abs/2306.12929
+
+        if exists(self.to_v_head_gate):
+            head_gate = self.to_v_head_gate(x)
+            out = einx.multiply('b n h, b h n d ->b h n d', head_gate.sigmoid(), out)
+
+        # hybrid module
+        # the attention output and the hybrid output each go through their own norm from `self.hybrid_norms`, then are mixed
+        # (per token and head mix if `self.hybrid_mix` exists, else a plain average)
+
+        if exists(self.hybrid_module):
+
+            # hybrid input
+
+            hybrid_forward_kwargs = dict()
+
+            if not self.causal and exists(self.hybrid_mask_kwarg):
+                hybrid_forward_kwargs = {self.hybrid_mask_kwarg: mask}
+
+            # hybrid forward
+
+            hybrid_outputs = self.hybrid_module(x, **hybrid_forward_kwargs)
+
+            # handle hybrid out, only the first flattened output is used
+
+            (hybrid_out, *rest_hybrid_outs), _ = tree_flatten(hybrid_outputs)
+
+            # handle variable hybrid output and multi rmsnorm before summing to main attention output (also normed)
+
+            if hybrid_out.ndim == 3:
+                hybrid_out = rearrange(hybrid_out, 'b n (h d) -> b h n d', h = h)
+
+            out_norm, hybrid_out_norm = self.hybrid_norms
+
+            out = out_norm(out)
+            hybrid_out = hybrid_out_norm(hybrid_out)
+
+            if exists(self.hybrid_mix):
+                mix = self.hybrid_mix(x)
+                mix = rearrange(mix, 'b n h -> b h n 1')
+                out = out.lerp(hybrid_out, mix.sigmoid())
+            else:
+                out = 0.5 * (out + hybrid_out)
+
+        # merge heads
+
+        out = self.merge_heads(out)
+
+        # alphafold2 styled gating of the values
+
+        if exists(self.to_v_gate):
+            gates = self.to_v_gate(x)
+            out = out * self.to_v_gate_activation(gates)
+
+        # combine the heads
+
+        out = self.to_out(out)
+
+        # maybe sublayer dropout
+
+        out = maybe(self.sublayer_dropout)(out)
+
+        if exists(mask):
+            out = einx.where('b n, b n d, -> b n d', mask, out, 0.) # zero the output at masked out positions
+
+        if not return_intermediates:
+            return out
+
+        intermediates.cached_kv = cached_kv
+
+        return out, intermediates
+
+class AttentionLayers(Module):
+    def __init__( # builds the stack of self attention / cross attention / feedforward layers with their norms and residuals
+        self,
+        dim, # model dimension, passed to every layer, norm and residual
+        depth = None, # number of repeats of the default block, can be omitted when `custom_layers` is given
+        heads = 8,
+        causal = False, # passed to the self attention layers and the t5 relative position bias
+        cross_attend = False, # adds cross attention ('c') layers to the default block
+        only_cross = False, # with `cross_attend`, drops the self attention ('a') layers from the default block
+        use_scalenorm = False,
+        use_rmsnorm = False,
+        use_dynamic_tanh = False,
+        dynamic_tanh_init_alpha = 1.,
+        use_simple_rmsnorm = False,
+        use_adaptive_layernorm = False,
+        use_adaptive_rmsnorm = False,
+        use_adaptive_layerscale = False, # paired with use_adaptive_layernorm for ada-ln-zero from DiT paper
+        norm_add_unit_offset = True,
+        dim_condition = None, # defaults to `dim`
+        adaptive_condition_mlp = False,
+        adaptive_condition_mlp_expansion = 4,
+        alibi_pos_bias = False,
+        alibi_num_heads = None, # defaults to `heads`
+        rel_pos_bias = False,
+        rel_pos_num_buckets = 32,
+        rel_pos_max_distance = 128,
+        dynamic_pos_bias = False,
+        dynamic_pos_bias_log_distance = False,
+        dynamic_pos_bias_mlp_depth = 2,
+        dynamic_pos_bias_norm = False,
+        rotary_pos_emb = False,
+        rotary_emb_dim = None, # defaults to half of the attention head dimension
+        rotary_xpos = False, # implies `rotary_pos_emb`, only allowed with causal attention
+        rotary_interpolation_factor = 1.,
+        rotary_xpos_scale_base = 512,
+        rotary_base_rescale_factor = 1.,
+        rotate_num_heads = None,
+        weight_tie_layers = False, # builds one block and executes it `depth` times
+        custom_layers: tuple[str, ...] | None = None, # explicit layer types, each one of 'a', 'c', 'f'
+        layers_execute_order: tuple[int, ...] | None = None, # indices into the layer types, defaults to executing them in order
+        sandwich_coef = None,
+        par_ratio = None,
+        residual_attn = False,
+        cross_residual_attn = False,
+        macaron = False, # prepends a feedforward to the default block and wraps feedforwards in `Scale(0.5, ...)`
+        pre_norm = True,
+        pre_norm_has_final_norm = True, # NOTE(review): never read in this constructor - confirm whether it is meant to gate `self.final_norm`
+        gate_residual = False,
+        scale_residual = False,
+        scale_residual_constant = 1.,
+        shift_tokens = 0,
+        sandwich_norm = False,
+        softclamp_output = False,
+        softclamp_output_value = 30.,
+        zero_init_branch_output = False,
+        layer_dropout = 0.,
+        cross_attn_tokens_dropout = 0.,
+        disable_abs_pos_emb = None, # defaults to True when t5 relative position bias or rotary embeddings are used
+        use_layerscale = False,
+        layerscale_init_value = 0.,
+        unet_skips = False,
+        integrate_layers = False,
+        layer_integrate_use_softmax = True,
+        num_residual_streams = 1, # more than 1 enables hyper connections
+        qkv_receive_diff_residuals = False,
+        reinject_input = False,              # seen first in DEQ paper https://arxiv.org/abs/1909.01377, but later used in a number of papers trying to achieve depthwise generalization https://arxiv.org/abs/2410.03020v1
+        learned_reinject_input_gate = False,
+        add_value_residual = False,          # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1 - further corroboration by https://arxiv.org/abs/2412.15113 (faster emergence of ICL) - looks like this setting may becoming a necessity for every transformer soon
+        learned_value_residual_mix = True,   # seeing big improvements when the value residual mix value is learned per token - credit goes to @faresobeid for taking the first step with learned scalar mix, then @Blinkdl for taking it a step further with data dependent. here we will use per token learned
+        rel_pos_kwargs: dict = dict(), # extra kwargs for whichever relative position bias is built, only read, never mutated here
+        residual_fn_kwargs: dict = dict(), # extra kwargs for every residual module, only read, never mutated here
+        **kwargs # only `ff_`, `attn_` and `cross_attn_` prefixed kwargs are accepted
+    ):
+        super().__init__()
+        rotary_pos_emb = rotary_pos_emb or rotary_xpos
+
+        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
+        attn_kwargs, kwargs = groupby_prefix_and_trim('attn_', kwargs)
+        cross_attn_kwargs, kwargs = groupby_prefix_and_trim('cross_attn_', kwargs)
+
+        dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD)
+        data_dependent_alibi = attn_kwargs.get('data_dependent_alibi', False)
+
+        assert len(kwargs) == 0, f'unrecognized kwargs passed in {kwargs.keys()}'
+
+        self.dim = dim
+        self.causal = causal
+        self.layers = ModuleList([])
+
+        # routing related
+        # 1. greater than one residual stream, proposed in Hyper-Connections paper https://arxiv.org/abs/2409.19606
+        # 2. integrating more than one past layer, from LIMe paper https://arxiv.org/abs/2502.09245
+
+        qkv_receive_diff_residuals |= integrate_layers # qkv always receives different views if integrating layers
+
+        # hyper connections
+
+        assert num_residual_streams > 0
+        has_hyper_connections = num_residual_streams > 1
+
+        self.num_residual_streams = num_residual_streams
+        self.stream_emb = nn.Parameter(torch.zeros(num_residual_streams, dim)) if num_residual_streams > 1 else None
+
+        assert not (has_hyper_connections and gate_residual)
+
+        hyper_conn_produce_diff_views = qkv_receive_diff_residuals and not integrate_layers
+
+        # LIMe
+        # one (optional) integrator per layer, appended while the layers are constructed below
+
+        self.layer_integrators = ModuleList([])
+        # NOTE(review): given how the two flags are derived above this assert can never fail - confirm whether hyper connections were meant to be required
+        assert not (qkv_receive_diff_residuals and not (hyper_conn_produce_diff_views or integrate_layers))
+
+        # positions related
+
+        self.disable_abs_pos_emb = default(disable_abs_pos_emb, (rel_pos_bias or rotary_pos_emb))
+
+        rotary_emb_dim = default(rotary_emb_dim, dim_head // 2)
+
+        assert rotary_emb_dim <= dim_head, f'rotary emb dim {rotary_emb_dim} must be less than or equal to attention head dimension {dim_head}'
+
+        if rotary_pos_emb and rotary_emb_dim < 32: # only warn when rotary embeddings are actually in use
+            logger.warning('when training language model, rotary embedding dimension should be at least 32')
+
+        assert not (rotary_xpos and not causal), 'rotary xpos is not compatible with bidirectional attention'
+        self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim, use_xpos = rotary_xpos, scale_base = rotary_xpos_scale_base, interpolation_factor = rotary_interpolation_factor, base_rescale_factor = rotary_base_rescale_factor) if rotary_pos_emb else None
+
+        assert at_most_one_of(alibi_pos_bias, rel_pos_bias, data_dependent_alibi), 'you can only choose one of Alibi positional bias, data dependent Alibi (forgetting transformers), or T5 relative positional bias'
+        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
+
+        # relative positional bias
+
+        flash_attn = attn_kwargs.get('flash', False)
+        assert at_most_one_of(rel_pos_bias, dynamic_pos_bias, alibi_pos_bias), 'you can only choose up to one of t5, alibi, or dynamic positional bias'
+
+        self.rel_pos = None
+
+        if rel_pos_bias:
+            assert not flash_attn, 'flash attention not compatible with t5 relative positional bias'
+            self.rel_pos = RelativePositionBias(scale = dim_head ** 0.5, causal = causal, heads = heads, num_buckets = rel_pos_num_buckets, max_distance = rel_pos_max_distance, **rel_pos_kwargs)
+        elif dynamic_pos_bias:
+            assert not flash_attn, 'flash attention not compatible with dynamic positional bias'
+            self.rel_pos = DynamicPositionBias(dim = dim // 4, heads = heads, log_distance = dynamic_pos_bias_log_distance, depth = dynamic_pos_bias_mlp_depth, norm = dynamic_pos_bias_norm, **rel_pos_kwargs)
+        elif alibi_pos_bias:
+            alibi_num_heads = default(alibi_num_heads, heads)
+            assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads'
+            self.rel_pos = AlibiPositionalBias(heads = alibi_num_heads, total_heads = heads, **rel_pos_kwargs)
+
+        assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm'
+
+        self.pre_norm = pre_norm
+        self.sandwich_norm = sandwich_norm
+
+        self.residual_attn = residual_attn
+        self.cross_residual_attn = cross_residual_attn
+        assert not (flash_attn and (residual_attn or cross_residual_attn)), 'flash attention is not compatible with residual attention'
+
+        self.cross_attend = cross_attend
+
+        # determine norm
+
+        assert at_most_one_of(use_scalenorm, use_rmsnorm, use_dynamic_tanh, use_simple_rmsnorm, use_adaptive_layernorm, use_adaptive_rmsnorm), 'you can only use either scalenorm, rmsnorm, dynamic tanh, adaptive layernorm, adaptive rmsnorm, or simple rmsnorm'
+
+        norm_need_condition = False
+        dim_condition = default(dim_condition, dim)
+        dim_condition_mult = 1
+
+        if adaptive_condition_mlp:
+            dim_condition_mult = adaptive_condition_mlp_expansion
+
+        if use_scalenorm:
+            norm_class = ScaleNorm
+        elif use_rmsnorm:
+            norm_class = RMSNorm
+        elif use_simple_rmsnorm:
+            norm_class = SimpleRMSNorm
+        elif use_dynamic_tanh:
+            assert pre_norm, 'dynamic tanh norm only tested for pre-norm'
+            norm_class = partial(DynamicTanh, init_alpha = dynamic_tanh_init_alpha)
+        elif use_adaptive_layernorm:
+            norm_need_condition = True
+            norm_class = partial(AdaptiveLayerNorm, dim_condition = dim_condition * dim_condition_mult)
+        elif use_adaptive_rmsnorm:
+            norm_need_condition = True
+            norm_class = partial(AdaptiveRMSNorm, dim_condition = dim_condition * dim_condition_mult)
+        else:
+            norm_class = LayerNorm
+
+        norm_fn = partial(norm_class, dim)
+
+        if not norm_need_condition and norm_add_unit_offset:
+            # researcher Ohad Rubin shares in a blog post by adding an offset to gammas, they can be subjected to weight decay safely
+            norm_fn = partial(norm_fn, unit_offset = True)
+
+        self.norm_need_condition = norm_need_condition
+        self.dim_condition = dim_condition
+
+        # determine default block layer type order
+        # 'a' is self attention, 'c' is cross attention, 'f' is feedforward
+        if cross_attend and not only_cross:
+            default_block = ('a', 'c', 'f')
+        elif cross_attend and only_cross:
+            default_block = ('c', 'f')
+        else:
+            default_block = ('a', 'f')
+
+        if macaron:
+            default_block = ('f',) + default_block
+
+        # determine post branch wrapper
+
+        assert at_most_one_of(use_layerscale, use_adaptive_layerscale)
+
+        post_branch_fn = None
+        post_branch_fn_needs_condition = False
+
+        if use_layerscale:
+            post_branch_fn = partial(LayerScale, dim = dim, init_value = layerscale_init_value)
+        elif use_adaptive_layerscale:
+            post_branch_fn = partial(AdaptiveLayerScale, dim = dim, dim_condition = dim_condition * dim_condition_mult)
+            post_branch_fn_needs_condition = True
+
+        self.post_branch_fn_needs_condition = post_branch_fn_needs_condition
+
+        if exists(post_branch_fn) and not post_branch_fn_needs_condition and norm_add_unit_offset:
+            post_branch_fn = partial(post_branch_fn, unit_offset = True)
+
+        # setup mlp for conditioning
+
+        self.need_condition = norm_need_condition or post_branch_fn_needs_condition
+
+        self.adaptive_mlp = nn.Identity()
+
+        if self.need_condition and adaptive_condition_mlp:
+            self.adaptive_mlp = nn.Sequential(
+                LinearNoBias(dim_condition, dim_condition * dim_condition_mult),
+                nn.SiLU()
+            )
+
+        # zero init
+
+        if zero_init_branch_output:
+            attn_kwargs = {**attn_kwargs, 'zero_init_output':  True}
+            ff_kwargs = {**ff_kwargs, 'zero_init_output':  True}
+
+        # setup weight tying, which is a special case of `layer_execute_order`
+
+        assert not (exists(layers_execute_order) and exists(custom_layers) and exists(depth)), 'depth should not be passed in if using custom layers and custom layer execution order'
+
+        assert not (weight_tie_layers and any([*map(exists, (custom_layers, par_ratio, sandwich_coef))]))
+
+        if weight_tie_layers:
+            assert exists(depth), 'depth must be passed in with `weight_tie_layers` = True'
+            assert not exists(layers_execute_order)
+            layers_execute_order = tuple(range(len(default_block))) * depth # a single block is built, then executed `depth` times
+            depth = 1
+
+        # calculate layer block order
+
+        len_default_block = 1
+
+        if exists(custom_layers):
+            layer_types = custom_layers
+        elif exists(par_ratio):
+            par_depth = depth * len(default_block)
+            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
+            default_block = tuple(filter(not_equals('f'), default_block))
+            par_attn  = par_depth // par_ratio
+            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
+            par_width = (depth_cut + depth_cut // par_attn) // par_attn
+            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
+            par_block = default_block + ('f',) * (par_width - len(default_block))
+            par_head = par_block * par_attn
+            layer_types = par_head + ('f',) * (par_depth - len(par_head))
+        elif exists(sandwich_coef):
+            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
+            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
+        else:
+            assert exists(depth), '`depth` must be passed in for `Decoder` or `Encoder`'
+            layer_types = default_block * depth
+            len_default_block = len(default_block)
+
+        self.layer_types = layer_types
+        self.layers_execute_order = default(layers_execute_order, tuple(range(len(layer_types))))
+
+        assert all([i < len(self.layer_types) for i in self.layers_execute_order])
+
+        self.num_attn_layers = len(list(filter(equals('a'), layer_types)))
+
+        # set the depth
+
+        depth = default(depth, len(self.layers_execute_order))
+        self.depth = depth
+
+        # stochastic depth
+
+        self.layer_dropouts = cast_tuple(layer_dropout, len(layer_types))
+
+        # structured dropout for cross attending
+
+        self.cross_attn_tokens_dropout = cross_attn_tokens_dropout
+
+        # calculate token shifting
+
+        shift_tokens = cast_tuple(shift_tokens, len(layer_types))
+
+        # optional soft clamping just before the final norm
+        # used in gemma 2
+
+        self.softclamp_output = softclamp_output
+        self.softclamp_output_value = softclamp_output_value
+
+        # final norm, only created for pre-norm
+
+        self.final_norm = norm_fn() if pre_norm else nn.Identity()
+
+        # whether unet or not
+
+        self.unet_skips = unet_skips
+        num_skips = self.depth // len_default_block
+
+        assert not (unet_skips and num_skips == 0), 'must have depth of at least 2 for unet skip connections'
+
+        skip_indices = [i * len_default_block for i in range(num_skips)]
+
+        self.skip_combines = ModuleList([])
+
+        # whether there is reinjection of input at every layer
+
+        self.reinject_input = reinject_input
+        self.reinject_input_proj = nn.Linear(dim, dim, bias = False) if reinject_input else None
+        self.learned_reinject_input_gate = nn.Linear(dim, 1, bias = False) if learned_reinject_input_gate else None
+
+        # add the value from the first self attention block to all latter projected self attention values as a residual
+
+        self.add_value_residual = add_value_residual
+
+        is_first_self_attn = True
+        learned_value_residual_mix &= add_value_residual # the learned mix is only used together with the value residual
+
+        # iterate and construct layers
+        # each entry of `self.layers` is ModuleList([norms, layer, residual]), with parallel entries in `self.skip_combines` and `self.layer_integrators`
+
+        for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)):
+
+            # `ind` is the index of each module - attention, feedforward, cross attention
+            # but `block_ind` refers to the typical enumeration of a transformer block (attn + ff + [optional] cross attn)
+
+            block_begin = divisible_by(ind, len_default_block)
+            block_ind = ind // len_default_block
+
+            # attention, cross attention, feedforward
+
+            # only self attention layers receive separate q / k / v views,
+            # and not the first self attention layer when integrating layers
+            layer_qkv_receives_diff_view = layer_type == 'a' and qkv_receive_diff_residuals and not (is_first_self_attn and integrate_layers)
+
+            if layer_type == 'a':
+                self_attn_learned_value_residual = learned_value_residual_mix and not is_first_self_attn
+
+                layer = Attention(dim, heads = heads, causal = causal, qkv_receive_diff_residuals = layer_qkv_receives_diff_view, learned_value_residual_mix = self_attn_learned_value_residual, rotate_num_heads = rotate_num_heads, **attn_kwargs)
+                is_first_self_attn = False
+
+            elif layer_type == 'c':
+                # `cross_attn_` kwargs override the shared `attn_` kwargs
+                layer = Attention(dim, heads = heads, **{**attn_kwargs, **cross_attn_kwargs})
+
+            elif layer_type == 'f':
+                layer = FeedForward(dim, **ff_kwargs)
+                layer = layer if not macaron else Scale(0.5, layer)
+
+            else:
+                raise Exception(f'invalid layer type {layer_type}')
+
+            if layer_shift_tokens > 0:
+                shift_range_upper = layer_shift_tokens + 1
+                shift_range_lower = -layer_shift_tokens if not causal else 0 # causal layers get no negative shifts
+                layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer)
+
+            if exists(post_branch_fn):
+                layer = post_branch_fn(layer)
+
+            layer_integrate = None
+
+            if integrate_layers:
+                num_layer_hiddens = ind + 1
+                layer_integrate_num_view = 3 if layer_qkv_receives_diff_view else 1
+
+                layer_integrate = DynamicLIMe(dim, num_layer_hiddens, num_views = layer_integrate_num_view, use_softmax = layer_integrate_use_softmax)
+
+            if has_hyper_connections:
+                residual_fn = partial(HyperConnection, num_residual_streams = num_residual_streams)
+
+                if layer_type == 'a' and hyper_conn_produce_diff_views:
+                    residual_fn = partial(residual_fn, num_input_views = 3)
+
+            elif gate_residual:
+                residual_fn = GRUGating
+            else:
+                residual_fn = Residual
+
+            residual = residual_fn(dim, layer_index = ind, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant, **residual_fn_kwargs)
+
+            # handle unet skip connection
+
+            skip_combine = None
+            is_latter_half = block_begin and block_ind >= (self.depth / 2)
+
+            if self.unet_skips and is_latter_half:
+                skip_combine = ConcatCombine(dim, skip_indices.pop()) # pops from the end, so later blocks pair with earlier skip indices
+
+            # all normalizations of the layer
+
+            pre_branch_norm = norm_fn() if pre_norm else None
+            post_branch_norm = norm_fn() if sandwich_norm else None
+            post_main_norm = norm_fn() if not pre_norm else None
+
+            norms = ModuleList([
+                pre_branch_norm,
+                post_branch_norm,
+                post_main_norm
+            ])
+
+            self.skip_combines.append(skip_combine)
+
+            self.layer_integrators.append(layer_integrate)
+
+            self.layers.append(ModuleList([
+                norms,
+                layer,
+                residual
+            ]))
+
+        # determine whether can cache kv
+
+        self.can_cache_kv = all([module.can_cache_kv for module in self.modules() if isinstance(module, Attention)])
+
+    def forward(
+        self,
+        x,
+        context = None,
+        mask = None,
+        context_mask = None,
+        attn_mask = None,
+        self_attn_kv_mask = None,
+        mems = None,
+        mem_masks = None,
+        seq_start_pos: Tensor | None = None,
+        cache: LayerIntermediates | None = None,
+        cache_age = 1,
+        return_hiddens = False,
+        rotary_pos_emb = None,
+        pos = None,
+        context_pos = None,
+        attn_bias = None,
+        condition = None,
+        in_attn_cond = None, # https://arxiv.org/abs/2105.04090
+        layers_execute_order: tuple[int, ...] | None = None
+    ):
+        """
+        Run `x` through the configured sequence of self attention ('a'), cross attention ('c') and feedforward ('f') layers.
+
+        x                    - input to the layer stack; the sequence length is read from `x.shape[1]` / `x.shape[-2]` (presumably (batch, seq, dim) - TODO confirm)
+        context              - must be passed exactly when `self.cross_attend` is set, handed to the 'c' layers
+        mask                 - forwarded as `mask` to both 'a' and 'c' layers
+        context_mask         - forwarded to the 'c' layers
+        attn_mask, attn_bias - forwarded to the 'a' layers
+        self_attn_kv_mask    - handed to the 'a' layers as their `context_mask`, and-ed with the left padding mask when `seq_start_pos` is given
+        mems, mem_masks      - lists copied up front, then one entry is popped from the front per executed 'a' layer
+        seq_start_pos        - positions before this index are masked out through `self_attn_kv_mask`
+        cache                - previous `LayerIntermediates`; only accepted when causal and neither `mask` nor `attn_mask` is given
+        cache_age            - when a cache is given and this is > 0, only the last `cache_age` positions of `x` are processed
+        return_hiddens       - when True, returns `(x, LayerIntermediates)` instead of only `x`
+        rotary_pos_emb, pos  - rotary embedding (derived from `pos` through `self.rotary_pos_emb` when not supplied) and positions for the 'a' layers
+        context_pos          - positions used to derive rotary embeddings for the context of the 'c' layers
+        condition            - must be passed exactly when `self.need_condition` is set, projected by `self.adaptive_mlp`
+        in_attn_cond         - tensor summed onto the input of every layer, cannot be combined with `self.reinject_input`
+        layers_execute_order - overrides `self.layers_execute_order` for this call
+        """
+        assert not (self.cross_attend ^ exists(context)), 'context must be passed in if cross_attend is set to True'
+        assert not (exists(condition) ^ self.need_condition), 'condition needs to be passed in if using adaptive layernorm or vice versa'
+
+        # handle condition
+
+        if exists(condition):
+            assert condition.shape[-1] == self.dim_condition, f'expected condition dimension of {self.dim_condition} but received {condition.shape[-1]}'
+
+            assert condition.ndim in {2, 3}
+
+            # a 2 dimensional condition receives a singleton middle axis so both cases are 3 dimensional from here on
+
+            if condition.ndim == 2:
+                condition = rearrange(condition, 'b d -> b 1 d')
+
+            condition = self.adaptive_mlp(condition)
+
+        # setup maybe layernorm kwarg
+
+        norm_kwargs = dict()
+
+        if self.norm_need_condition:
+            norm_kwargs.update(condition = condition)
+
+        # maybe post branch fn conditioning (DiT paper's ada-ln-zero)
+
+        block_forward_kwargs = dict()
+
+        if self.post_branch_fn_needs_condition:
+            block_forward_kwargs.update(condition = condition)
+
+        # initialize accums
+
+        hiddens = []
+        layer_hiddens = []
+        intermediates = []
+
+        prev_attn = None
+        prev_cross_attn = None
+
+        # copies are taken because entries are popped off these lists inside the layer loop
+
+        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
+        mem_masks = mem_masks.copy() if exists(mem_masks) else [None] * self.num_attn_layers
+
+        # handle left padded sequences
+
+        if exists(seq_start_pos):
+            seq_arange = arange(x.shape[-2], device = x.device, dtype = torch.long)
+            left_pad_mask = seq_arange >= seq_start_pos[..., None]
+
+            if exists(self_attn_kv_mask):
+                self_attn_kv_mask = self_attn_kv_mask & left_pad_mask
+            else:
+                self_attn_kv_mask = left_pad_mask
+
+        # rotary positions
+
+        cross_attn_rotary_pos_emb = dict()
+
+        if exists(self.rotary_pos_emb):
+            if not exists(rotary_pos_emb):
+                maybe_mem = first(mems, None) # todo - handle edge case where different layers get different memory lengths. don't think this will ever come up but who knows
+                mem_len = maybe_mem.shape[1] if exists(maybe_mem) else 0
+
+                # default positions start at -mem_len, so the memory occupies the negative positions
+
+                if not exists(pos):
+                    pos = arange(x.shape[1] + mem_len, device = x.device) - mem_len
+
+                rotary_pos_emb = self.rotary_pos_emb(pos)
+
+            # allow for rotary positions for context if provided
+
+            if exists(context_pos):
+                assert self.cross_attend
+                context_rotary_pos_emb = self.rotary_pos_emb(context_pos)
+
+                cross_attn_rotary_pos_emb.update(
+                    rotary_pos_emb = rotary_pos_emb,
+                    context_rotary_pos_emb = context_rotary_pos_emb
+                )
+
+        # assume cached key / values
+
+        attn_cache = []
+
+        if exists(cache):
+            assert self.causal and not any([*map(exists, (mask, attn_mask))])
+
+            # with a cache the context is cut down to zero length along its second axis
+
+            if exists(context):
+                context = context[:, :0]
+
+            if cache_age > 0:
+                x = x[:, -cache_age:] # for spec decoding, may be greater than 1
+
+            attn_cache = cache.attn_intermediates
+
+        # one cached intermediate is drawn per executed attention layer, `None` once exhausted
+
+        iter_attn_cache = iter(attn_cache)
+
+        # setup multistreams if needed
+
+        streams = self.num_residual_streams
+        is_multistream = streams > 1
+
+        if is_multistream:
+            x = einx.add('b n d, s d -> (b s) n d', x, self.stream_emb)
+
+        # get layers to be executed
+
+        layer_variables = (
+            self.layer_types,
+            self.skip_combines,
+            self.layers,
+            self.layer_dropouts,
+            self.layer_integrators
+        )
+
+        # able to override the layers execution order on forward, for trying to depth extrapolate
+
+        layers_execute_order = default(layers_execute_order, self.layers_execute_order)
+        layer_variables = tuple(tuple(layer_variable[i] for i in layers_execute_order) for layer_variable in layer_variables)
+
+        # derived input for reinjection if needed
+
+        inp_inject = None
+
+        if self.reinject_input:
+            assert not exists(in_attn_cond)
+            inp_inject = self.reinject_input_proj(x)
+
+        elif exists(in_attn_cond):
+            # handle in-attention conditioning, which serves the same purpose of having the network learn the residual
+            inp_inject = in_attn_cond if in_attn_cond.ndim == 3 else rearrange(in_attn_cond, 'b d -> b 1 d')
+
+        if exists(inp_inject) and exists(self.learned_reinject_input_gate):
+            inp_inject_gate = self.learned_reinject_input_gate(x).sigmoid()
+            inp_inject = inp_inject * inp_inject_gate
+
+        # store all hiddens for skips
+
+        skip_hiddens = []
+
+        # for value residuals
+
+        first_self_attn_inter = None
+        first_cross_attn_inter = None
+
+        # go through the attention and feedforward layers
+
+        for ind, (layer_type, skip_combine, (norm, block, residual_fn), layer_dropout, layer_integrator) in enumerate(zip(*layer_variables)):
+            # NOTE(review): `is_last` is not read anywhere else in this loop body
+            is_last = ind == (len(self.layers) - 1)
+
+            # handle skip connections
+
+            skip_hiddens.append(x)
+
+            if exists(skip_combine):
+                x = skip_combine(x, skip_hiddens)
+
+            # layer dropout
+
+            # a dropped layer is skipped before any memory or cache entry is consumed for it
+
+            if self.training and layer_dropout > 0. and random() < layer_dropout:
+                continue
+
+            if layer_type == 'a':
+                if return_hiddens:
+                    hiddens.append(x)
+
+                layer_mem = mems.pop(0) if mems else None
+                layer_mem_mask = mem_masks.pop(0) if mem_masks else None
+
+            if layer_type == 'c':
+                if self.training and self.cross_attn_tokens_dropout > 0.:
+                    context, context_mask = dropout_seq(context, context_mask, self.cross_attn_tokens_dropout)
+
+            x, inner_residual, residual_kwargs = residual_fn.prepare(x)
+
+            layer_hiddens.append(x)
+
+            if exists(layer_integrator):
+                x = layer_integrator(x, layer_hiddens)
+
+            pre_norm, post_branch_norm, post_main_norm = norm
+
+            # bind the norm kwargs (the condition, when the norms need it) onto whichever norms are present
+
+            if self.need_condition:
+                pre_norm = maybe(partial)(pre_norm, **norm_kwargs)
+                post_branch_norm = maybe(partial)(post_branch_norm, **norm_kwargs)
+                post_main_norm = maybe(partial)(post_main_norm, **norm_kwargs)
+
+            if exists(inp_inject):
+                x = x + inp_inject
+
+            if exists(pre_norm):
+                x = pre_norm(x)
+
+                # the memory of a self attention layer goes through the same pre norm as the input
+
+                if layer_type == 'a' and exists(layer_mem):
+                    layer_mem = pre_norm(layer_mem)
+
+            block = partial(block, **block_forward_kwargs)
+
+            # handle maybe value residuals
+
+            maybe_self_attn_value_residual = None
+            maybe_cross_attn_value_residual = None
+
+            if self.add_value_residual:
+                if exists(first_self_attn_inter):
+                    maybe_self_attn_value_residual = first_self_attn_inter.values
+
+                if exists(first_cross_attn_inter):
+                    maybe_cross_attn_value_residual = first_cross_attn_inter.values
+
+            # forward depending on layer type
+
+            if layer_type == 'a':
+                out, inter = block(x, mask = mask, context_mask = self_attn_kv_mask, attn_mask = attn_mask, rel_pos = self.rel_pos, pos = pos, rotary_pos_emb = rotary_pos_emb, prev_attn = prev_attn, cache = next(iter_attn_cache, None), mem = layer_mem, mem_mask = layer_mem_mask, attn_bias = attn_bias, value_residual = maybe_self_attn_value_residual, return_intermediates = True)
+            elif layer_type == 'c':
+                out, inter = block(x, context = context, mask = mask, context_mask = context_mask, prev_attn = prev_cross_attn, cache = next(iter_attn_cache, None), value_residual = maybe_cross_attn_value_residual, **cross_attn_rotary_pos_emb, return_intermediates = True)
+            elif layer_type == 'f':
+                out = block(x)
+
+            # store first self or cross attention intermediate for value residual
+
+            if not exists(first_self_attn_inter) and layer_type == 'a':
+                first_self_attn_inter = inter
+
+            if not exists(first_cross_attn_inter) and layer_type == 'c':
+                first_cross_attn_inter = inter
+
+            if exists(post_branch_norm):
+                out = post_branch_norm(out)
+
+            x = residual_fn(out, inner_residual, **residual_kwargs)
+
+            if layer_type in ('a', 'c') and return_hiddens:
+                inter.layer_type = layer_type
+                intermediates.append(inter)
+
+            # keep the pre softmax attention around for the next layer of the same kind when residual attention is enabled
+
+            if layer_type == 'a' and self.residual_attn:
+                prev_attn = inter.pre_softmax_attn
+            elif layer_type == 'c' and self.cross_residual_attn:
+                prev_cross_attn = inter.pre_softmax_attn
+
+            if exists(post_main_norm):
+                x = post_main_norm(x)
+
+        # the output of the last layer is recorded as well, before soft clamping and the final norm
+
+        if return_hiddens:
+            layer_hiddens.append(x)
+
+        if self.softclamp_output:
+            x = softclamp(x, self.softclamp_output_value)
+
+        final_norm = self.final_norm
+
+        if self.need_condition:
+            final_norm = maybe(partial)(final_norm, **norm_kwargs)
+
+        # take care of multistreams if needed, use sum for now
+
+        if is_multistream:
+            x = reduce(x, '(b s) n d -> b n d', 'sum', s = streams)
+
+        x = final_norm(x)
+
+        if not return_hiddens:
+            return x
+
+        intermediates = LayerIntermediates(
+            hiddens = hiddens,
+            last_hidden = x,
+            attn_intermediates = intermediates,
+            layer_hiddens = layer_hiddens,
+        )
+
+        return x, intermediates
+
+class Encoder(AttentionLayers):
+    def __init__(self, **kwargs):
+        assert 'causal' not in kwargs, 'cannot set causality on encoder'
+        super().__init__(causal = False, **kwargs)
+
+class Decoder(AttentionLayers):
+    def __init__(self, **kwargs):
+        assert 'causal' not in kwargs, 'cannot set causality on decoder'
+        super().__init__(causal = True, **kwargs)
+
+class PrefixDecoder(AttentionLayers):
+    def __init__(self, **kwargs):
+        assert 'causal' not in kwargs, 'cannot set causality on decoder'
+        super().__init__(causal = False, **kwargs)
+
+    def forward(
+        self,
+        x,
+        *args,
+        attn_mask = None,
+        prefix_attn_len = None,
+        **kwargs
+    ):
+        b, n, device = x.shape[0], x.shape[1], x.device
+        causal_mask = torch.ones((n, n), device = device, dtype = torch.bool).triu(1)
+
+        forwarded_mask = ~causal_mask
+
+        if exists(prefix_attn_len):
+            if isinstance(prefix_attn_len, int):
+                prefix_attn_len = torch.full((b,), prefix_attn_len, device = device)
+
+            prefix_mask = arange(n, device = device) < rearrange(prefix_attn_len, 'b -> b 1 1 1')
+            forwarded_mask = forwarded_mask | prefix_mask
+
+        if exists(attn_mask):
+            forwarded_mask = forwarded_mask & attn_mask
+
+        return super().forward(x, *args, attn_mask = forwarded_mask, **kwargs)
+
+class CrossAttender(AttentionLayers):
+    def __init__(self, **kwargs):
+        super().__init__(cross_attend = True, only_cross = True, **kwargs)
+
+class ViTransformerWrapper(Module):
+    def __init__(
+        self,
+        *,
+        image_size,
+        patch_size,
+        attn_layers: Encoder,
+        channels = 3,
+        num_classes = None,
+        post_emb_norm = False,
+        num_register_tokens = 0,
+        emb_dropout = 0.
+    ):
+        super().__init__()
+        assert divisible_by(image_size, patch_size), 'image dimensions must be divisible by the patch size'
+        dim = attn_layers.dim
+        num_patches = (image_size // patch_size) ** 2
+        patch_dim = channels * patch_size ** 2
+
+        self.patch_size = patch_size
+
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
+
+        has_register_tokens = num_register_tokens > 0
+        self.has_register_tokens = has_register_tokens
+
+        if has_register_tokens:
+            self.register_tokens = nn.Parameter(torch.randn(num_register_tokens, dim))
+
+        self.patch_to_embedding = nn.Sequential(
+            LayerNorm(patch_dim),
+            nn.Linear(patch_dim, dim),
+            LayerNorm(dim)
+        )
+
+        self.post_emb_norm = LayerNorm(dim) if post_emb_norm else nn.Identity()
+        self.dropout = nn.Dropout(emb_dropout)
+
+        self.attn_layers = attn_layers
+
+        self.mlp_head = nn.Linear(dim, num_classes) if exists(num_classes) else nn.Identity()
+
+    def forward(
+        self,
+        img,
+        return_embeddings = False,
+        return_logits_and_embeddings = False
+    ):
+        b, p = img.shape[0], self.patch_size
+
+        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
+        x = self.patch_to_embedding(x)
+        n = x.shape[1]
+
+        x = x + self.pos_embedding[:, :n]
+
+        x = self.post_emb_norm(x)
+        x = self.dropout(x)
+
+        if self.has_register_tokens:
+            r = repeat(self.register_tokens, 'n d -> b n d', b = b)
+            x, ps = pack((x, r), 'b * d')
+
+        embed = self.attn_layers(x)
+
+        if self.has_register_tokens:
+            embed, _ = unpack(embed, ps, 'b * d')
+
+        assert at_most_one_of(return_embeddings, return_logits_and_embeddings)
+
+        if not exists(self.mlp_head) or return_embeddings:
+            return embed
+
+        pooled = embed.mean(dim = -2)
+        logits = self.mlp_head(pooled)
+
+        if not return_logits_and_embeddings:
+            return logits
+
+        return logits, embed
+
+class TransformerWrapper(Module):
+    def __init__(
+        self,
+        *,
+        num_tokens,
+        max_seq_len,
+        attn_layers: AttentionLayers,
+        embed_num_tokens: dict[str, int] = dict(),
+        emb_dim = None,
+        max_mem_len = 0,
+        shift_mem_down = 0,
+        emb_dropout = 0.,
+        post_emb_norm = False,
+        num_memory_tokens = None,
+        memory_tokens_interspersed_every = None,
+        tie_embedding = False,
+        logits_dim = None,
+        return_only_embed = False,
+        num_output_heads = 1,
+        use_abs_pos_emb = True,
+        scaled_sinu_pos_emb = False,
+        l2norm_embed = False,
+        recycling = False,            # from Jumper et al. - Alphafold2
+        train_max_recycle_steps = 4,  # saw a benefit for language modeling up to 3 recycling steps, so let's default this to 4
+        emb_frac_gradient = 1.,       # GLM-130B and Cogview successfully used this, set at 0.1
+        attn_z_loss_weight = 1e-4,
+        average_pool_embed = False,
+        use_cls_token = False,
+        num_cls_tokens = 1,
+        squeeze_out_last_dim = False,
+        token_emb: TokenEmbedding | None = None,
+        mixture_of_softmax = False,
+        mixture_of_softmax_k = 4,
+        sigsoftmax_logits = False,
+        to_logits: Module | None = None,
+    ):
+        """
+        Wraps `attn_layers` with token embedding, positional embedding and an output head.
+
+        num_tokens        - vocabulary size for the default `TokenEmbedding`, also the default `logits_dim`
+        max_seq_len       - stored on the module; a value of 0 turns the absolute positional embedding off
+        attn_layers       - the attention stack; its `dim`, `disable_abs_pos_emb` and `can_cache_kv` are read here
+        embed_num_tokens  - name -> number of embeddings, one extra `nn.Embedding` is created per entry under the key `{name}_embed`
+        emb_dim           - embedding dimension, defaults to `attn_layers.dim`; a linear projection is added when the two differ
+        tie_embedding     - logits are computed against the transposed token embedding weight, requires a `TokenEmbedding`
+        return_only_embed - no output head is created (`to_logits` is None)
+        num_output_heads  - more than one creates a `ModuleList` of bias-less linear heads
+        to_logits         - custom output head, only used in the single head, non tied case
+        use_cls_token, average_pool_embed - mutually exclusive pooling options
+        mixture_of_softmax, mixture_of_softmax_k - creates the mixture projections, only allowed with a single output head
+
+        NOTE(review): `attn_z_loss_weight` is accepted but never referenced in this constructor - `forward` takes its own argument of the same name
+        """
+        super().__init__()
+
+        dim = attn_layers.dim
+        emb_dim = default(emb_dim, dim)
+        self.emb_dim = emb_dim
+        self.num_tokens = num_tokens
+        self.num_cls_tokens = num_cls_tokens
+
+        self.max_seq_len = max_seq_len
+        self.max_mem_len = max_mem_len
+        self.shift_mem_down = shift_mem_down
+
+        self.l2norm_embed = l2norm_embed
+
+        # a token embedding passed in by the caller takes precedence over the default one
+
+        if not exists(token_emb):
+            token_emb = TokenEmbedding(emb_dim, num_tokens, l2norm_embed = l2norm_embed)
+
+        self.token_emb = token_emb
+
+        # no absolute positions when max_seq_len is 0, when the caller opted out, or when the attention layers disable them
+
+        no_abs_pos_emb = max_seq_len == 0 or not (use_abs_pos_emb and not attn_layers.disable_abs_pos_emb)
+
+        if no_abs_pos_emb:
+            self.pos_emb = always(0)
+        elif scaled_sinu_pos_emb:
+            self.pos_emb = ScaledSinusoidalEmbedding(emb_dim)
+        else:
+            self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len, l2norm_embed = l2norm_embed)
+
+        # additional embeddings - say type embedding from BERT
+
+        self.embeds = None
+
+        if len(embed_num_tokens) > 0:
+            self.embeds = ModuleDict({f'{name}_embed': nn.Embedding(num_tokens, emb_dim) for name, num_tokens in embed_num_tokens.items()})
+
+        # fraction of the gradient that should go to the embedding, https://arxiv.org/abs/2105.13290
+
+        self.emb_frac_gradient = emb_frac_gradient
+
+        self.post_emb_norm = LayerNorm(emb_dim) if post_emb_norm else nn.Identity()
+        self.emb_dropout = nn.Dropout(emb_dropout)
+
+        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+        self.attn_layers = attn_layers
+
+        # init_ reads token_emb, pos_emb and l2norm_embed, all of which are assigned above
+
+        self.init_()
+
+        assert num_output_heads > 0
+
+        assert at_most_one_of(average_pool_embed, use_cls_token)
+
+        # maybe recycling
+
+        self.recycling = recycling
+        self.recycled_proj = LinearNoBias(dim, dim) if recycling else None
+
+        self.train_max_recycle_steps = train_max_recycle_steps
+
+        # classic cls token from the bert days
+
+        self.cls_token = None
+
+        if use_cls_token:
+            self.cls_token = nn.Parameter(torch.zeros(num_cls_tokens, dim))
+            nn.init.normal_(self.cls_token, std = 0.02)
+
+        # whether to average pool the embed (`global average pool`)
+
+        self.average_pool_embed = average_pool_embed
+
+        # output type
+
+        self.output_is_log_prob = mixture_of_softmax
+
+        self.to_mixture = None
+        self.combine_mixture = None
+
+        if mixture_of_softmax:
+            assert num_output_heads == 1
+
+            self.to_mixture = Sequential(
+                LinearNoBias(dim, dim * mixture_of_softmax_k),
+                Rearrange('... (k d) -> ... k d', k = mixture_of_softmax_k)
+            )
+
+            self.combine_mixture = LinearNoBias(dim, mixture_of_softmax_k)
+
+        # sig softmax
+
+        self.sigsoftmax_logits = sigsoftmax_logits
+
+        # output head, usually to logits of num_tokens
+
+        logits_dim = default(logits_dim, num_tokens)
+
+        self.has_multiple_heads = num_output_heads > 1
+
+        # precedence - no head at all, then tied embedding, then multiple heads, then a single (possibly custom) head
+
+        if return_only_embed:
+            self.to_logits = None
+        elif tie_embedding:
+            assert isinstance(token_emb, TokenEmbedding), 'can only tie embedding if using `TokenEmbedding`'
+            self.to_logits = lambda t: t @ self.token_emb.emb.weight.t()
+        elif num_output_heads > 1:
+            self.to_logits = ModuleList([LinearNoBias(dim, logits_dim) for _ in range(num_output_heads)])
+        else:
+            self.to_logits = LinearNoBias(dim, logits_dim) if not exists(to_logits) else to_logits
+
+        # memory tokens (like [cls]) from Memory Transformers paper
+
+        num_memory_tokens = default(num_memory_tokens, 0)
+        self.num_memory_tokens = num_memory_tokens
+        if num_memory_tokens > 0:
+            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
+
+        self.memory_tokens_interspersed_every = memory_tokens_interspersed_every
+
+        # squeeze out last dimension if possible
+
+        self.squeeze_out_last_dim = squeeze_out_last_dim
+
+        # whether can do cached kv decoding
+
+        self.can_cache_kv = self.num_memory_tokens == 0 and not recycling and self.attn_layers.can_cache_kv
+        self.can_cache_kv_outside_max_seq_len = no_abs_pos_emb
+
+    def init_(self):
+        if hasattr(self.token_emb, 'init_'):
+            self.token_emb.init_()
+
+        if self.l2norm_embed:
+            if not isinstance(self.pos_emb, always):
+                nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
+
+    def forward(
+        self,
+        x,
+        return_embeddings = False,
+        return_logits_and_embeddings = False,
+        return_intermediates = False,
+        return_embeddings_and_intermediates = False,
+        return_logit_entropies = False,
+        mask = None,
+        return_mems = False,
+        return_attn = False,
+        mems = None,
+        mem_masks = None,
+        recycle_steps = None,
+        pos = None,
+        prepend_embeds = None,
+        prepend_mask = None,
+        embed_ids: dict[str, Tensor] = dict(),
+        sum_embeds = None,
+        return_attn_z_loss = False,
+        attn_z_loss_weight = 1e-4,
+        seq_start_pos = None,
+        cache: LayerIntermediates | None = None,
+        token_emb_kwargs = dict(),
+        to_logits_kwargs = dict(),
+        **kwargs,
+    ):
+
+        # if sequence is None, auto create an empty one if `prepend_embeds` was supplied
+
+        if not exists(x):
+            assert exists(prepend_embeds)
+            x = prepend_embeds.new_empty((prepend_embeds.shape[0], 0), dtype = torch.long)
+
+        # shapes and variables
+
+        b, n, device, num_mems, has_memory_tokens, emb_frac_gradient, orig_mask = x.shape[0], x.shape[1], x.device, self.num_memory_tokens, self.num_memory_tokens > 0, self.emb_frac_gradient, mask
+
+        return_hiddens = return_mems | return_attn | return_intermediates | return_attn_z_loss | return_embeddings_and_intermediates
+        return_embeddings = return_embeddings | (not exists(self.to_logits)) | return_embeddings_and_intermediates
+
+        # absolute positional embedding
+
+        external_pos_emb = exists(pos) and pos.dtype != torch.long
+        pos_emb = self.pos_emb(x, pos = pos, seq_start_pos = seq_start_pos) if not external_pos_emb else pos
+        x = self.token_emb(x, **token_emb_kwargs) + pos_emb
+
+        # add additional embeddings
+
+        assert not (exists(self.embeds) ^ (len(embed_ids) > 0)), '`embed_num_tokens` must be defined on `TransformerWrapper`'
+
+        if exists(self.embeds):
+            assert len(embed_ids) == len(self.embeds)
+
+            for name, embed_id in embed_ids.items():
+                embed_key = f'{name}_embed'
+
+                assert embed_key in self.embeds
+                embed = self.embeds[embed_key](embed_id)
+
+                x = x + embed
+
+        # for summing embeddings passed externally - needs this for self-conditioning in non-autoregressive training
+
+        if exists(sum_embeds):
+            x = x + sum_embeds
+
+        # post embedding norm, purportedly leads to greater stabilization
+
+        x = self.post_emb_norm(x)
+
+        # whether to append embeds, as in PaLI, for image embeddings
+
+        if exists(prepend_embeds):
+            prepend_seq, prepend_dim = prepend_embeds.shape[1:]
+            assert prepend_dim == x.shape[-1], 'prepended embeddings need to have same dimensions as text model dimensions'
+
+            x = cat((prepend_embeds, x), dim = -2)
+
+            if exists(prepend_mask) or exists(mask):
+                mask = default(mask, lambda: torch.ones((b, n), device = device, dtype = torch.bool))
+                prepend_mask = default(prepend_mask, lambda: torch.ones((b, prepend_seq), device = device, dtype = torch.bool))
+
+                mask = cat((prepend_mask, mask), dim = -1)
+
+        # whether to reduce the gradient going to the embedding, from cogview paper, corroborated by GLM-130B model
+
+        if emb_frac_gradient < 1:
+            assert emb_frac_gradient > 0
+            x = x * emb_frac_gradient + x.detach() * (1 - emb_frac_gradient)
+
+        # embedding dropout
+
+        x = self.emb_dropout(x)
+
+        x = self.project_emb(x)
+
+        # maybe cls token
+
+        if exists(self.cls_token):
+            cls_tokens = repeat(self.cls_token, '... -> b ...', b = b)
+            x, cls_packed_shape = pack([cls_tokens, x], 'b * d')
+
+            if exists(mask):
+                mask = F.pad(mask, (self.num_cls_tokens, 0), value = True)
+
+        # maybe memory / register tokens
+
+        if has_memory_tokens:
+            mem_seq = x.shape[-2]
+            mem_every = self.memory_tokens_interspersed_every
+
+            if exists(mem_every):
+                assert mem_every > 0
+                assert isinstance(self.attn_layers, Decoder), 'only for decoder'
+                next_seq_len = math.ceil(n / mem_every) * mem_every
+
+                x = pad_at_dim(x, (0, next_seq_len - n), dim = -2, value = 0.)
+                x = rearrange(x, 'b (n m) d -> (b n) m d', m = mem_every)
+
+            mem = repeat(self.memory_tokens, 'n d -> b n d', b = x.shape[0])
+            x, mem_packed_shape = pack((mem, x), 'b * d')
+
+            # auto-handle masking after appending memory tokens
+            if not exists(mem_every) and exists(mask):
+                mask = pad_at_dim(mask, (num_mems, 0), dim = -1, value = True)
+
+            if exists(mem_every):
+                x = rearrange(x, '(b n) m d -> b (n m) d', b = b)
+
+        # handle maybe shifting of memories
+
+        if self.shift_mem_down and exists(mems):
+            mems_l, mems_r = mems[:self.shift_mem_down], mems[self.shift_mem_down:]
+            mems = [*mems_r, *mems_l]
+
+        # attention layers
+
+        if not self.recycling:
+            assert not exists(recycle_steps) or recycle_steps == 1, 'you did not train with recycling'
+
+            # regular
+
+            attended, intermediates = self.attn_layers(x, mask = mask, mems = mems, mem_masks = mem_masks, cache = cache, return_hiddens = True, seq_start_pos = seq_start_pos, **kwargs)
+
+        else:
+            # recycling
+
+            recycle_steps = default(recycle_steps, (randrange(self.train_max_recycle_steps) + 1) if self.training else None)
+            assert exists(recycle_steps) and recycle_steps > 0, '`recycle_steps` must be provided on forward if recycling is turned on and not training'
+
+            for i in range(recycle_steps):
+                first_step = i == 0
+                last_step = i == (recycle_steps - 1)
+
+                context = nullcontext if last_step else torch.no_grad
+
+                with context():
+                    maybe_recycled = self.recycled_proj(attended.detach()) if not first_step else 0.
+
+                    attended, intermediates = self.attn_layers(x + maybe_recycled, mask = mask, mems = mems, mem_masks = mem_masks, cache = cache, return_hiddens = True, seq_start_pos = seq_start_pos, **kwargs)
+
+        x = attended
+
+        # handle memories post-attention
+
+        if has_memory_tokens:
+            if exists(mem_every):
+                x = rearrange(x, 'b (n m) d -> (b n) m d', m = (mem_every + num_mems))
+
+            mem, x = unpack(x, mem_packed_shape, 'b * d')
+
+            intermediates.memory_tokens = mem
+
+            if exists(mem_every):
+                x = rearrange(x, '(b n) m d -> b (n m) d', b = b)
+
+            x = x[:, :mem_seq]
+
+        # global average pool
+
+        if self.average_pool_embed:
+            x = masked_mean(x, mask = orig_mask, dim = 1)
+
+        if exists(self.cls_token):
+            x, _ = unpack(x, cls_packed_shape, 'b * d')
+            x = x.squeeze(1)  # Remove sequence dimension if num_cls_tokens=1 to keep previous behavior
+
+        # handle expansion to mixture if needed (for mixture of softmax)
+
+        combine_mixture = None
+
+        if exists(self.to_mixture):
+            combine_mixture = self.combine_mixture(x).softmax(dim = -1)
+            x = self.to_mixture(x)
+
+        # projecting to logits
+
+        if not return_embeddings:
+            if self.has_multiple_heads:
+                logits = tuple(fn(x, **to_logits_kwargs) for fn in self.to_logits)
+            else:
+                logits = self.to_logits(x, **to_logits_kwargs)
+
+        # maybe sig softmax
+
+        if self.sigsoftmax_logits:
+            logits = logits + logits.sigmoid().log()
+
+        # handle maybe combine mixture
+
+        if exists(combine_mixture):
+            with autocast('cuda', enabled = False):
+                prob = logits.softmax(dim = -1)
+                mos = einsum('... k d, ... k -> ... d', prob, combine_mixture)
+                logits = log(mos)
+
+        # maybe squeeze out last dimension of logits
+
+        if self.squeeze_out_last_dim:
+            logits = tuple((rearrange(t, '... 1 -> ...') if t.shape[-1] == 1 else t) for t in cast_tuple(logits))
+
+            if not self.has_multiple_heads:
+                logits = first(logits)
+
+        # different returns
+
+        if return_logits_and_embeddings:
+            out = (logits, x)
+        elif return_embeddings_and_intermediates:
+            out = (x, intermediates)
+        elif return_embeddings:
+            out = x
+        else:
+            out = logits
+
+        # logit entropies
+
+        if return_logit_entropies:
+            intermediates.logit_entropies = calc_entropy(logits)
+            return_intermediates = True
+
+        # aux loss
+
+        if return_attn_z_loss:
+            pre_softmax_attns = [t.pre_softmax_attn for t in  intermediates.attn_intermediates]
+            intermediates.attn_z_loss = calc_z_loss(pre_softmax_attns, weight = attn_z_loss_weight)
+            return_intermediates = True
+
+        if return_mems:
+            hiddens = intermediates.hiddens
+            new_mems = [cat(pair, dim = -2) for pair in zip(mems, hiddens)] if exists(mems) else hiddens
+            new_mems = [t[..., -self.max_mem_len:, :].detach() for t in new_mems]
+
+            if not return_intermediates:
+                return out, new_mems
+
+            intermediates.mems = new_mems
+
+        if return_intermediates:
+            return out, intermediates
+
+        if return_attn:
+            attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates]
+            return out, attn_maps
+
+        return out
+
+class XTransformer(Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        tie_token_emb = False,
+        ignore_index = -100,
+        pad_value = 0,
+        cross_attn_tokens_dropout = 0.,
+        **kwargs
+    ):
+        super().__init__()
+        enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
+        dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
+
+        assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
+        enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
+        enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
+        enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
+        enc_transformer_kwargs['scaled_sinu_pos_emb'] = enc_kwargs.pop('scaled_sinu_pos_emb', False)
+        enc_transformer_kwargs['use_abs_pos_emb'] = enc_kwargs.pop('use_abs_pos_emb', True)
+
+        dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
+        dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
+        dec_transformer_kwargs['scaled_sinu_pos_emb'] = dec_kwargs.pop('scaled_sinu_pos_emb', False)
+        dec_transformer_kwargs['use_abs_pos_emb'] = dec_kwargs.pop('use_abs_pos_emb', True)
+
+        self.cross_attn_tokens_dropout = cross_attn_tokens_dropout  # how many tokens from the encoder to dropout when cross attending from decoder - seen in a couple papers, including Perceiver AR - this will also be very effective regularization when cross attending to very long memories
+
+        self.encoder = TransformerWrapper(
+            **enc_transformer_kwargs,
+            return_only_embed = True,
+            attn_layers = Encoder(dim = dim, **enc_kwargs)
+        )
+
+        self.decoder = TransformerWrapper(
+            **dec_transformer_kwargs,
+            attn_layers = Decoder(dim = dim, cross_attend = True, **dec_kwargs)
+        )
+
+        if tie_token_emb:
+            self.decoder.token_emb = self.encoder.token_emb
+
+        self.decoder = AutoregressiveWrapper(self.decoder, ignore_index=ignore_index, pad_value=pad_value)
+
+    @torch.no_grad()
+    def generate(self, seq_in, seq_out_start, seq_len, mask = None, attn_mask = None, **kwargs):
+        encodings = self.encoder(seq_in, mask = mask, attn_mask = attn_mask, return_embeddings = True)
+        return self.decoder.generate(seq_out_start, seq_len, context = encodings, context_mask = mask, **kwargs)
+
+    def forward(self, src, tgt, mask = None, attn_mask = None, src_prepend_embeds = None):
+
+        enc = self.encoder(src, mask = mask, attn_mask = attn_mask, prepend_embeds = src_prepend_embeds, return_embeddings = True)
+
+        if exists(src_prepend_embeds) and exists(mask):
+            mask = pad_at_dim(mask, (src_prepend_embeds.shape[-2], 0), dim = -1, value = True)
+
+        if self.training and self.cross_attn_tokens_dropout > 0:
+            enc, mask = dropout_seq(enc, mask, self.cross_attn_tokens_dropout)
+
+        out = self.decoder(tgt, context = enc, context_mask = mask)
+        return out
diff --git a/src/models/dit/positional_embedding.py b/src/models/dit/positional_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef3d43aadfbe54520af5cc9e131d4cc121427928
--- /dev/null
+++ b/src/models/dit/positional_embedding.py
@@ -0,0 +1,64 @@
+#################################################################################
+#                   Sine/Cosine Positional Embedding Functions                  #
+#################################################################################
+# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+    """
+    Build a fixed 2D sine/cosine positional embedding for a square grid.
+
+    grid_size: int of the grid height and width
+    cls_token, extra_tokens: `extra_tokens` all-zero rows are prepended only when
+        `cls_token` is truthy AND `extra_tokens > 0`
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim], or
+        [extra_tokens+grid_size*grid_size, embed_dim] when zero rows are prepended
+    """
+    axis = np.arange(grid_size, dtype=np.float32)
+    # meshgrid receives the width coordinates first, so plane 0 of the stack is w
+    coords = np.stack(np.meshgrid(axis, axis), axis=0)
+    coords = coords.reshape([2, 1, grid_size, grid_size])
+
+    table = get_2d_sincos_pos_embed_from_grid(embed_dim, coords)
+    if not (cls_token and extra_tokens > 0):
+        return table
+    zero_rows = np.zeros([extra_tokens, embed_dim])
+    return np.concatenate([zero_rows, table], axis=0)
+
+import numpy as np
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode the two planes of `grid` with 1D sin/cos embeddings and join them.
+
+    embed_dim: total output width; must be even, half is spent on each plane
+    grid: indexable with 0 and 1; each plane is flattened by the 1D helper
+    out: (H*W, D), with the embedding of grid[0] in the first D/2 columns and the
+        embedding of grid[1] in the last D/2 columns
+    """
+    assert embed_dim % 2 == 0
+
+    half_dim = embed_dim // 2
+    per_plane = [
+        get_1d_sincos_pos_embed_from_grid(half_dim, grid[plane])  # (H*W, D/2)
+        for plane in (0, 1)
+    ]
+    return np.concatenate(per_plane, axis=1)  # (H*W, D)
+
+
+def get_1d_sincos_pos_embed(embed_dim, sequence_length, cls_token=False, extra_tokens=0):
+    """
+    Build a fixed 1D sine/cosine positional embedding for positions 0..sequence_length-1.
+
+    `extra_tokens` all-zero rows are prepended only when `cls_token` is truthy AND
+    `extra_tokens > 0`.
+    return: [sequence_length, embed_dim], or [extra_tokens+sequence_length, embed_dim]
+        when zero rows are prepended
+    """
+    positions = np.arange(sequence_length, dtype=np.float32)
+    table = get_1d_sincos_pos_embed_from_grid(embed_dim, positions)
+    if not (cls_token and extra_tokens > 0):
+        return table
+    zero_rows = np.zeros([extra_tokens, embed_dim])
+    return np.concatenate([zero_rows, table], axis=0)
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sine/cosine embedding of a set of scalar positions.
+
+    embed_dim: output dimension for each position; must be even
+    pos: array of positions to be encoded, flattened to size (M,)
+    out: (M, D) with the sines in the first D/2 columns and the cosines in the last D/2
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+
+    # frequencies 1 / 10000^(k / (D/2)) for k = 0 .. D/2 - 1
+    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.)
+    inv_freq = 1. / 10000**exponents  # (D/2,)
+
+    flat_pos = pos.reshape(-1)  # (M,)
+    angles = flat_pos[:, None] * inv_freq[None, :]  # (M, D/2), outer product
+
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
+
diff --git a/src/models/dit/spade_generator.py b/src/models/dit/spade_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..147a9aed0c7707fe6ae3d59ce1a30154ef75afcc
--- /dev/null
+++ b/src/models/dit/spade_generator.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+"""
+Spade decoder(G) defined in the paper, which input the warped feature to generate the animated image.
+"""
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from .util import SPADEResnetBlock
+
+
+class SPADEDecoder(nn.Module):
+    """
+    SPADE decoder: maps a warped feature map to a 3-channel image in (0, 1).
+
+    The input feature map is used twice: as the signal that is decoded, and as the
+    conditioning map (`seg`) handed to every SPADE residual block.
+    """
+    def __init__(self, upscale=1, max_features=256, block_expansion=64, out_channels=64, num_down_blocks=2):
+        """
+        upscale: None or <= 1 selects a plain 3-channel output conv; any larger value
+            selects a conv + PixelShuffle head whose factor is fixed at 2
+        max_features: cap on the input channel width
+        block_expansion, num_down_blocks: the input width is
+            block_expansion * 2**num_down_blocks, capped at max_features
+        out_channels: channel width fed to the output head
+        """
+        # Set up nn.Module bookkeeping before assigning any attribute or submodule.
+        super().__init__()
+        self.upscale = upscale
+
+        # Closed form of the value the former loop ended on; unlike the loop it is
+        # also defined when num_down_blocks == 0.
+        input_channels = min(max_features, block_expansion * (2 ** num_down_blocks))
+        norm_G = 'spadespectralinstance'
+        label_num_channels = input_channels  # 256
+
+        self.fc = nn.Conv2d(input_channels, 2 * input_channels, 3, padding=1)
+        self.G_middle_0 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_1 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_2 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_3 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_4 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_5 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.up_0 = SPADEResnetBlock(2 * input_channels, input_channels, norm_G, label_num_channels)
+        self.up_1 = SPADEResnetBlock(input_channels, out_channels, norm_G, label_num_channels)
+        self.up = nn.Upsample(scale_factor=2)
+
+        if self.upscale is None or self.upscale <= 1:
+            self.conv_img = nn.Conv2d(out_channels, 3, 3, padding=1)
+        else:
+            # NOTE(review): the shuffle factor is hard-coded to 2 whatever `upscale` is.
+            self.conv_img = nn.Sequential(
+                nn.Conv2d(out_channels, 3 * (2 * 2), kernel_size=3, padding=1),
+                nn.PixelShuffle(upscale_factor=2)
+            )
+
+    def forward(self, feature):
+        """
+        Decode `feature` into an image.
+
+        The shape comments below are for the default constructor arguments and a
+        Bx256x64x64 input; the result passes through a sigmoid before it is returned.
+        """
+        seg = feature  # Bx256x64x64
+        x = self.fc(feature)  # Bx512x64x64
+        x = self.G_middle_0(x, seg)
+        x = self.G_middle_1(x, seg)
+        x = self.G_middle_2(x, seg)
+        x = self.G_middle_3(x, seg)
+        x = self.G_middle_4(x, seg)
+        x = self.G_middle_5(x, seg)
+
+        x = self.up(x)  # Bx512x64x64 -> Bx512x128x128
+        x = self.up_0(x, seg)  # Bx512x128x128 -> Bx256x128x128
+        x = self.up(x)  # Bx256x128x128 -> Bx256x256x256
+        x = self.up_1(x, seg)  # Bx256x256x256 -> Bx64x256x256
+
+        x = self.conv_img(F.leaky_relu(x, 2e-1))  # Bx64x256x256 -> Bx3xHxW
+        x = torch.sigmoid(x)  # Bx3xHxW
+
+        return x
\ No newline at end of file
diff --git a/src/models/dit/stitching_retargeting_network.py b/src/models/dit/stitching_retargeting_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f50b7cf5a21cd71c70a7bbaaa4b6b68b4762ea3
--- /dev/null
+++ b/src/models/dit/stitching_retargeting_network.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+
+"""
+Stitching module(S) and two retargeting modules(R) defined in the paper.
+
+- The stitching module pastes the animated portrait back into the original image space without pixel misalignment, such as in
+the stitching region.
+
+- The eyes retargeting module is designed to address the issue of incomplete eye closure during cross-id reenactment, especially
+when a person with small eyes drives a person with larger eyes.
+
+- The lip retargeting module is designed similarly to the eye retargeting module, and can also normalize the input by ensuring that
+the lips are in a closed state, which facilitates better animation driving.
+"""
+from torch import nn
+
+
+class StitchingRetargetingNetwork(nn.Module):
+    """
+    Plain MLP: Linear + ReLU for every hidden width, then one output Linear.
+
+    The layers live in `self.mlp` (an nn.Sequential), so Linear layers sit at the
+    even indices 0, 2, 4, ... of that container.
+    """
+    def __init__(self, input_size, hidden_sizes, output_size):
+        """
+        input_size: width of the input vectors
+        hidden_sizes: iterable of hidden layer widths; may be empty, in which case
+            the network is a single Linear(input_size, output_size)
+        output_size: width of the output vectors
+        """
+        super(StitchingRetargetingNetwork, self).__init__()
+        layers = []
+        # Carry the running input width instead of indexing hidden_sizes[i - 1] /
+        # hidden_sizes[-1], which raised IndexError for an empty hidden_sizes.
+        in_features = input_size
+        for width in hidden_sizes:
+            layers.append(nn.Linear(in_features, width))
+            layers.append(nn.ReLU(inplace=True))
+            in_features = width
+        layers.append(nn.Linear(in_features, output_size))
+        self.mlp = nn.Sequential(*layers)
+
+    def initialize_weights_to_zero(self):
+        """Set the weight and bias of every nn.Linear in the module to zero."""
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.zeros_(m.weight)
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        """Apply the MLP to `x` and return its output."""
+        return self.mlp(x)
diff --git a/src/models/dit/talking_head_diffusion.py b/src/models/dit/talking_head_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..802e1f5f881aecd8d5df475e630f5c52bb9ce350
--- /dev/null
+++ b/src/models/dit/talking_head_diffusion.py
@@ -0,0 +1,301 @@
+# encoding = 'utf-8'
+import os.path as osp
+
+import math
+from rich.progress import track
+
+from omegaconf import OmegaConf
+
+import torch
+import torch.nn as nn
+
+from .talking_head_dit import TalkingHeadDiT_models
+import sys
+from ..schedulers.scheduling_ddim import DDIMScheduler
+from ..schedulers.flow_matching import ModelSamplingDiscreteFlow
+sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__))))))
+# Scheduler classes keyed by name; MotionDiffusion indexes this mapping with the
+# `noise_scheduler.type` value from its config.
+scheduler_map = {
+    "ddim": DDIMScheduler,
+    # "ddpm": DiffusionSchedule,
+    "flow_matching": ModelSamplingDiscreteFlow
+}
+# NOTE(review): presumably indices of lip-related motion dimensions; not referenced
+# anywhere in the visible part of this file — confirm usage before relying on it.
+lip_dims=[18, 19, 20, 36, 37, 38, 42, 43, 44, 51, 52, 53, 57, 58, 59, 60, 61, 62]
+
+class MotionDiffusion(nn.Module):
+    """
+    Audio-driven motion sampler built on a TalkingHeadDiT denoiser and a noise scheduler.
+
+    `sample` handles audio of any length: it cuts the audio into overlapping windows
+    of `n_pred_frames` frames, denoises each window with `sample_subclip`, and
+    cross-fades the overlapping frames of consecutive windows.
+    """
+    def __init__(self, config, device="cuda", dtype=torch.float32, smo_wsize=3, loss_type="l2"):
+        """
+        config: OmegaConf config; `model_name`, `motion_generator`, `audio_projector`,
+            `noise_scheduler` and `train` are read from it
+        device: device used for the scheduler timesteps and the sampling tensors
+        dtype: dtype the sampling tensors are cast to
+        smo_wsize: temporal smoothing window in frames; <= 1 disables smoothing
+        loss_type: stored on the instance, not used by the sampling code in this class
+        """
+        import warnings  # local import: only needed for the sample-mode check below
+
+        super().__init__()
+
+        self.config = config
+        self.smo_wsize = smo_wsize
+        print("================================== Init Motion GeneratorV2 ==================================")
+        print(OmegaConf.to_yaml(self.config))
+
+        motion_gen_config = config.motion_generator
+        motion_gen_params = motion_gen_config.params
+
+        audio_proj_config = config.audio_projector
+        audio_proj_params = audio_proj_config.params
+
+        scheduler_config = config.noise_scheduler
+        scheduler_params = scheduler_config.params
+
+        self.device = device
+
+        # init motion generator
+        # input_dim is doubled because the denoiser input concatenates the previous
+        # motion with the noisy latents along the feature axis (see sample_subclip).
+        self.talking_head_dit = TalkingHeadDiT_models[config.model_name](
+            input_dim           = motion_gen_params.input_dim * 2,
+            output_dim          = motion_gen_params.output_dim,
+            seq_len             = motion_gen_params.n_pred_frames,
+            audio_unit_len      = audio_proj_params.sequence_length,
+            audio_blocks        = audio_proj_params.blocks,
+            audio_dim           = audio_proj_params.audio_feat_dim,
+            audio_tokens        = audio_proj_params.context_tokens,
+            audio_embedder_type = audio_proj_params.audio_embedder_type,
+            audio_cond_dim      = audio_proj_params.audio_cond_dim,
+            norm_type           = motion_gen_params.norm_type,
+            qk_norm             = motion_gen_params.qk_norm,
+            exp_dim             = motion_gen_params.exp_dim
+        )
+        self.input_dim = motion_gen_params.input_dim
+        self.exp_dim = motion_gen_params.exp_dim
+
+        self.audio_feat_dim = audio_proj_params.audio_feat_dim
+        self.audio_seq_len = audio_proj_params.sequence_length
+        self.audio_blocks = audio_proj_params.blocks
+        self.audio_margin = (audio_proj_params.sequence_length - 1) // 2
+        # Offsets of one audio window around its centre frame, e.g. [-2, -1, 0, 1, 2]
+        # for a margin of 2; size 1 x (2*self.audio_margin+1)
+        self.indices = (
+            torch.arange(2 * self.audio_margin + 1) - self.audio_margin
+        ).unsqueeze(0)
+
+        self.n_prev_frames = motion_gen_params.n_prev_frames
+        self.n_pred_frames = motion_gen_params.n_pred_frames
+
+        # init diffusion schedule
+        self.scheduler = scheduler_map[scheduler_config.type](
+            num_train_timesteps = scheduler_params.num_train_timesteps,
+            beta_start          = scheduler_params.beta_start,
+            beta_end            = scheduler_params.beta_end,
+            beta_schedule       = scheduler_params.mode,
+            prediction_type     = scheduler_config.sample_mode,
+            time_shifting       = scheduler_params.time_shifting,
+        )
+        self.scheduler_type = scheduler_config.type
+        self.eta = scheduler_params.eta
+        self.scheduler.set_timesteps(scheduler_params.num_inference_steps, device=self.device)
+        self.timesteps = self.scheduler.timesteps
+        print(f"time steps: {self.timesteps}")
+
+        self.sample_mode = scheduler_config.sample_mode
+        # This used to be `assert (cond, msg)`: asserting a non-empty tuple is always
+        # true, so the check never ran. Warn instead of raising so that configs which
+        # loaded before keep loading.
+        if self.sample_mode not in ["noise", "sample"]:
+            warnings.warn(f"Unknown sample mode {self.sample_mode}, should be noise or sample")
+
+        # init other params
+        self.audio_drop_ratio = config.train.audio_drop_ratio
+        self.pre_drop_ratio = config.train.pre_drop_ratio
+
+        # NOTE(review): `.to()` on an nn.Parameter can return a plain tensor (when a
+        # copy is needed), in which case these two are neither registered as
+        # parameters nor present in the state_dict — confirm this is intended.
+        # Left unchanged to keep the state_dict layout stable.
+        self.null_audio_feat = nn.Parameter(
+            torch.randn(1, 1, 1, 1, self.audio_feat_dim),
+            requires_grad=True
+        ).to(device=self.device, dtype=dtype)
+
+        self.null_motion_feat = nn.Parameter(
+            torch.randn(1, 1, self.input_dim),
+            requires_grad=True
+        ).to(device=self.device, dtype=dtype)
+
+        # for segments fusion
+        # NOTE(review): overlap_len is <= 0 when n_pred_frames <= 16 — confirm such
+        # configs are not used.
+        self.overlap_len = min(16, self.n_pred_frames - 16)
+        # linear cross-fade weights 0, 1/overlap_len, ... applied to the newer segment
+        self.fuse_alpha = torch.arange(self.overlap_len, device=self.device, dtype=dtype).reshape(1, -1, 1) / self.overlap_len
+
+        self.dtype = dtype
+        self.loss_type = loss_type
+
+        total_params = sum(p.numel() for p in self.parameters())
+        print('Number of parameter: % .4fM' % (total_params / 1e6))
+        print("================================== init Motion GeneratorV2: Done ==================================")
+
+    def _smooth(self, motion):
+        """
+        Moving-average smoothing along the frame axis, applied in place.
+
+        motion: B x L x D. Only the channels from `self.exp_dim` onward are smoothed
+        (the original comment calls these the head pose motion); the window holds
+        `self.smo_wsize` frames and is truncated at the sequence boundaries.
+        Returns the same tensor object that was passed in.
+        """
+        if self.smo_wsize <= 1:
+            return motion
+        # average over an untouched copy so already-smoothed frames do not feed back
+        source = motion.clone()
+        n_frames = motion.shape[1]
+        half_k = self.smo_wsize // 2
+        for i in range(n_frames):
+            lo = max(0, i - half_k)
+            hi = min(n_frames, i + half_k + 1)
+            motion[:, i, self.exp_dim:] = torch.mean(source[:, lo:hi, self.exp_dim:], dim=1)
+
+        return motion
+
+    def _fuse(self, prev_motion, cur_motion):
+        """
+        Cross-fade the last `overlap_len` frames of `prev_motion` with the first
+        `overlap_len` frames of `cur_motion`.
+
+        `prev_motion` is modified in place and returned.
+        """
+        tail = prev_motion[:, -self.overlap_len:]
+        head = cur_motion[:, :self.overlap_len]
+        prev_motion[:, -self.overlap_len:] = tail * (1 - self.fuse_alpha) + head * self.fuse_alpha
+        return prev_motion
+
+    def _window_audio(self, audio, n_padding_frames, audio_pad_mode):
+        """
+        Pad `audio` along dim 0 and gather one window of audio frames per output frame.
+
+        The first and last frames are replicated `audio_margin` times so that every
+        frame has a full window. This is done unconditionally: it used to be skipped
+        when no tail padding was needed, which left the result 2 * audio_margin
+        windows short. `n_padding_frames` extra frames (zeros, or copies of the last
+        frame) are inserted after the real audio.
+
+        Returns `audio` indexed as [T, window, ...] with
+        T = audio.shape[0] + max(n_padding_frames, 0).
+        Raises ValueError for an unknown `audio_pad_mode` when tail padding is needed.
+        """
+        pieces = [audio[:1]] * self.audio_margin + [audio]
+        if n_padding_frames > 0:
+            if audio_pad_mode == 'zero':
+                padding_value = torch.zeros_like(audio[-1:])
+            elif audio_pad_mode == 'replicate':
+                padding_value = audio[-1:]
+            else:
+                raise ValueError(f'Unknown pad mode: {audio_pad_mode}')
+            pieces += [padding_value] * n_padding_frames
+        pieces += [audio[-1:]] * self.audio_margin
+        audio = torch.cat(pieces, dim=0)
+
+        center_indices = torch.arange(
+            self.audio_margin,
+            audio.shape[0] - self.audio_margin
+        ).unsqueeze(1) + self.indices
+        return audio[center_indices]   # T, L, b, aD
+
+    @torch.no_grad()
+    def sample_subclip(
+        self,
+        audio,
+        ref_kp,
+        prev_motion,
+        emo=None,
+        cfg_scale=1.15,
+        init_latents=None,
+        dynamic_threshold = None
+    ):
+        """
+        Denoise one window of `n_pred_frames` frames.
+
+        audio: windowed audio features for this segment; a 4-D input gets an extra
+            axis inserted at dim 2
+        ref_kp: reference keypoints, flattened to (batch, 1, -1) and repeated per frame
+        prev_motion: previous motion, expanded to the shape of the latents
+        emo: optional emotion label tensor
+        cfg_scale: classifier-free guidance scale; guidance is active only when > 1
+        init_latents: optional starting noise; drawn with torch.randn when None
+        dynamic_threshold: optional (ratio, min, max); when set, each prediction is
+            clamped to a per-sample quantile of its absolute values
+        Returns the latents after the last scheduler step.
+        """
+        # prepare audio feat
+        batch_size = audio.shape[0]
+        audio = audio.to(self.device)
+        if audio.ndim == 4:
+            audio = audio.unsqueeze(2)
+
+        # reference keypoints
+        ref_kp = ref_kp.view(batch_size, 1, -1)
+
+        # cfg: stack the unconditional branch in front of the conditional one
+        use_cfg = cfg_scale > 1
+        if use_cfg:
+            uncond_audio = self.null_audio_feat.expand(
+               batch_size, self.n_pred_frames, self.audio_seq_len, self.audio_blocks, -1
+            )
+            audio = torch.cat([uncond_audio,audio], dim=0)
+            ref_kp = torch.cat([ref_kp] * 2, dim=0)
+            if emo is not None:
+                # index `num_emo_class` is used as the label of the unconditional branch
+                uncond_emo = torch.Tensor([self.talking_head_dit.num_emo_class]).long().to(self.device)
+                emo = torch.cat([uncond_emo,emo], dim=0)
+        ref_kp = ref_kp.repeat(1, audio.shape[1], 1)  # B, L, kD
+
+        # prepare noisy motion
+        if init_latents is None:
+            latents = torch.randn((batch_size, self.n_pred_frames, self.input_dim)).to(self.device)
+        else:
+            latents = init_latents
+
+        prev_motion = prev_motion.expand_as(latents).to(dtype=self.dtype)
+        latents = latents.to(dtype=self.dtype)
+        audio = audio.to(dtype=self.dtype)
+        ref_kp = ref_kp.to(dtype=self.dtype)
+        for t in track(self.timesteps, description='🚀Denosing', total=len(self.timesteps)):
+            motion_in = torch.cat([prev_motion, latents], dim=-1)
+            step_in = torch.tensor([t] * batch_size, device=self.device, dtype=self.dtype)
+            if use_cfg:
+                motion_in = torch.cat([motion_in] * 2, dim=0)
+                step_in = torch.cat([step_in] * 2, dim=0)
+            # predict
+            pred = self.talking_head_dit(
+                motion     = motion_in,
+                times       = step_in,
+                audio      = audio,
+                emo        = emo,
+                audio_cond = ref_kp
+            )
+
+            if dynamic_threshold:
+                dt_ratio, dt_min, dt_max = dynamic_threshold
+                # Flatten per sample using the real batch size of `pred`: it is
+                # 2 * batch_size only while CFG is active, so the former hard-coded
+                # `batch_size * 2` mis-shaped the thresholds otherwise.
+                abs_results = pred.reshape(pred.shape[0], -1).abs()
+                s = torch.quantile(abs_results, dt_ratio, dim=1)
+                s = torch.clamp(s, min=dt_min, max=dt_max)
+                s = s[..., None, None]
+                pred = torch.clamp(pred, min=-s, max=s)
+
+            # CFG
+            if use_cfg:
+                uncond_pred, cond_pred = pred.chunk(2, dim=0)
+                pred = uncond_pred + cfg_scale * (cond_pred - uncond_pred)
+            # Step
+            latents = self.scheduler.step(pred, t, latents, eta=self.eta, return_dict=False)[0]
+        self.talking_head_dit.bank=[]
+        return latents
+
+    @torch.no_grad()
+    def sample(self, audio, ref_kp, prev_motion, cfg_scale=1.15, audio_pad_mode="zero", emo=None,dynamic_threshold=None):
+        """
+        Generate motion for audio of any length.
+
+        audio: per-frame audio features, frames along dim 0
+        ref_kp: reference keypoints, passed to every sub-clip
+        prev_motion: B, 1, D — motion preceding the clip; replaced after each
+            sub-clip by that sub-clip's frame at index stride - 1
+        cfg_scale, dynamic_threshold: forwarded to `sample_subclip`
+        audio_pad_mode: 'zero' or 'replicate', used for the frames appended to fill
+            the last window
+        emo: optional emotion label (scalar), wrapped into a long tensor
+        Returns the fused, smoothed motion with padded frames removed, squeezed and
+        cast to float32.
+        """
+        # crop audio into n_subdivision windows according to n_pred_frames
+        clip_len = audio.shape[0]
+        stride = self.n_pred_frames - self.overlap_len
+        if clip_len <= self.n_pred_frames:
+            n_subdivision = 1
+        else:
+            n_subdivision = math.ceil((clip_len - self.n_pred_frames) / stride) + 1
+
+        # padding: frames needed so that the last window is complete
+        n_padding_frames = self.n_pred_frames + stride * (n_subdivision - 1) - clip_len
+        audio_tensor = self._window_audio(audio, n_padding_frames, audio_pad_mode)
+
+        motion_lst = []
+        init_latents = None
+        # emotion label
+        if emo is not None:
+            emo = torch.Tensor([emo]).long().to(self.device)
+        start_idx = 0
+        for i in range(0, n_subdivision):
+            print(f"Sample subclip {i+1}/{n_subdivision}")
+            end_idx = start_idx + self.n_pred_frames
+            audio_segment = audio_tensor[start_idx: end_idx].unsqueeze(0)
+            start_idx += stride
+
+            motion_segment = self.sample_subclip(
+                audio             = audio_segment,
+                ref_kp            = ref_kp,
+                prev_motion       = prev_motion,
+                emo               = emo,
+                cfg_scale         = cfg_scale,
+                init_latents      = init_latents,
+                dynamic_threshold = dynamic_threshold
+            )
+            # smooth
+            motion_segment = self._smooth(motion_segment)
+            # update prev motion: the frame right before the next window starts
+            prev_motion = motion_segment[:, stride-1:stride].clone()
+
+            # save results
+            motion_coef = motion_segment
+            if i == n_subdivision - 1 and n_padding_frames > 0:
+                motion_coef = motion_coef[:, :-n_padding_frames]  # delete padded frames
+
+            if len(motion_lst) > 0:
+                # fuse segments
+                motion_lst[-1] = self._fuse(motion_lst[-1], motion_coef)
+                motion_lst.append(motion_coef[:, self.overlap_len:])
+            else:
+                motion_lst.append(motion_coef)
+
+        motion = torch.cat(motion_lst, dim=1)
+        # smooth for full clip
+        motion = self._smooth(motion)
+        motion = motion.squeeze()
+        return motion.float()
+
+
+    
\ No newline at end of file
diff --git a/src/models/dit/talking_head_dit.py b/src/models/dit/talking_head_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..436bc3dc195af86cbd648914d0f9135ff48ceaef
--- /dev/null
+++ b/src/models/dit/talking_head_dit.py
@@ -0,0 +1,244 @@
+# Reference: 
+# 1. DiT https://github.com/facebookresearch/DiT
+# 2. TIMM https://github.com/rwightman/pytorch-image-models
+
+import torch
+import torch.nn as nn
+import numpy as np
+import math
+import time
+from .blocks import FinalLayer
+from .blocks import MMDoubleStreamBlock as DiTBlock2
+from .blocks import MMSingleStreamBlock as DiTBlock
+from .blocks import CrossDiTBlock as DiTBlock3
+from .blocks import MMfourStreamBlock as DiTBlock4
+# from .positional_embedding import get_1d_sincos_pos_embed
+from .posemb_layers import apply_rotary_emb, get_1d_rotary_pos_embed
+from .embedders import TimestepEmbedder, MotionEmbedder, AudioEmbedder, ConditionAudioEmbedder, SimpleAudioEmbedder, LabelEmbedder
+from einops import rearrange, repeat
+# Audio embedder classes keyed by type name.
+# NOTE(review): TalkingHeadDiT.__init__ below always looks up the 'normal' entry,
+# whatever `audio_embedder_type` it receives — confirm whether that is intended.
+audio_embedder_map = {
+    "normal": AudioEmbedder,
+    "cond": ConditionAudioEmbedder,
+    "simple": SimpleAudioEmbedder
+}
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+class TalkingHeadDiT(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+
+    Motion, timestep, audio, emotion label and identity (`audio_cond`) inputs are
+    embedded separately, then processed by 3 four-stream blocks, 6 double-stream
+    blocks and 12 single-stream blocks, followed by a final projection layer.
+    """
+    def __init__(
+        self,
+        input_dim=265,
+        output_dim =265,
+        seq_len=80,
+        audio_unit_len=5,
+        audio_blocks=12,
+        audio_dim=768,
+        audio_tokens = 1,
+        hidden_size=1152,
+        depth=28,
+        num_heads=16,
+        mlp_ratio=4.0,
+        audio_embedder_type="normal",
+        audio_cond_dim = 63,
+        norm_type="rms_norm",
+        qk_norm="rms_norm",
+        **kwargs
+    ):
+        # NOTE(review): `depth`, `audio_embedder_type` and `**kwargs` are accepted but
+        # not used in this constructor: the block counts below are hard-coded
+        # (3 / 6 / 12) and the audio embedder is always the 'normal' one.
+        super().__init__()
+        
+        # number of emotion classes, and the drop probability given to the label embedder
+        self.num_emo_class = 8
+        self.emo_drop_prob = 0.1
+
+        self.num_heads = num_heads
+        self.out_channels = output_dim
+
+        self.motion_embedder = MotionEmbedder(input_dim, hidden_size)
+        # embeds `audio_cond` (after it is averaged over dim 1 in forward)
+        self.identity_embedder=MotionEmbedder(audio_cond_dim, hidden_size)
+        self.time_embedder = TimestepEmbedder(hidden_size)       
+        self.audio_embedder = audio_embedder_map['normal'](
+            seq_len          = audio_unit_len, 
+            blocks           = audio_blocks,
+            channels         = audio_dim,
+            intermediate_dim = hidden_size,
+            output_dim       = hidden_size,
+            context_tokens   = audio_tokens, 
+            input_len        = seq_len,
+            condition_dim    = audio_cond_dim, 
+            norm_type        = norm_type, 
+            # qk_norm          = qk_norm,
+            # n_heads          =num_heads
+        )
+        # per-head channel width, used as the rotary embedding dimension in forward
+        self.dim=hidden_size//num_heads
+        
+        self.emo_embedder = LabelEmbedder(num_classes=self.num_emo_class, hidden_size=hidden_size, dropout_prob=self.emo_drop_prob)
+        
+        # Will use fixed sin-cos embedding:
+        # self.pos_embed = nn.Parameter(torch.zeros(1, seq_len, hidden_size), requires_grad=False)
+        # four-stream blocks: motion / audio / emotion / identity streams
+        self.blocks4 = nn.ModuleList([
+            DiTBlock4(
+                hidden_size, num_heads, 
+                mlp_ratio=mlp_ratio, 
+                norm_type=norm_type, 
+                qk_norm=qk_norm
+            ) for _ in range(3)
+        ])
+        # double-stream blocks: motion stream + merged conditioning stream
+        self.blocks2 = nn.ModuleList([
+            DiTBlock2(
+                hidden_size, num_heads, 
+                mlp_ratio=mlp_ratio, 
+                norm_type=norm_type, 
+                qk_norm=qk_norm
+            ) for _ in range(6)
+        ])
+        # single-stream blocks over the concatenated motion + conditioning tokens
+        self.blocks=nn.ModuleList([
+            DiTBlock(
+                hidden_size, num_heads, 
+                mlp_ratio=mlp_ratio, 
+                norm_type=norm_type, 
+                qk_norm=qk_norm
+            ) for _ in range(12)
+        ])
+        self.final_layer = FinalLayer(hidden_size, self.out_channels, norm_type=norm_type)
+        self.initialize_weights()
+        # scratch list; nothing in this class appends to it (the appends in forward
+        # are commented out)
+        self.bank=[]
+    def initialize_weights(self):
+        """Run `initialize_weights()` on every embedder and block; the final layer is left as constructed."""
+        # Initialize (and freeze) pos_embed by sin-cos embedding:
+        # pos_embed = get_1d_sincos_pos_embed(self.pos_embed.shape[-1], self.pos_embed.shape[-2])
+        # self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+
+        # Initialize input layers nn.Linear
+        self.motion_embedder.initialize_weights()
+        self.identity_embedder.initialize_weights()
+        # Initialize audio embedding 
+        self.audio_embedder.initialize_weights()
+
+        # Initialize emotion embedding
+        self.emo_embedder.initialize_weights()
+
+        # Initialize timestep embedding MLP
+        self.time_embedder.initialize_weights()
+        
+        # Initialize DiT blocks:
+        for block in self.blocks:
+            block.initialize_weights()
+        for block in self.blocks2:
+            block.initialize_weights()
+        for block in self.blocks4:
+            block.initialize_weights()
+        # Initialize output layers:
+        # self.final_layer.initialize_weights()
+    def cal_sync_loss(self, audio_embedding, mouth_embedding, label):
+        """
+        Loss between `label` and the cosine similarity of the two embeddings.
+
+        label: tensor of targets, or a scalar broadcast to one target per row
+        Returns (loss, cosine similarity).
+        NOTE(review): `self.logloss` is not defined anywhere in this class, so
+        calling this method as-is raises AttributeError unless it is set elsewhere.
+        """
+        if isinstance(label, torch.Tensor):
+            gt_d = label.float().view(-1,1).to(audio_embedding.device)
+        else:
+            gt_d = (torch.ones([audio_embedding.shape[0],1]) * label).float().to(audio_embedding.device) # int
+        d = nn.functional.cosine_similarity(audio_embedding, mouth_embedding)
+        loss = self.logloss(d.unsqueeze(1), gt_d)
+        return loss, d
+
+    def forward(self, motion, times, audio, emo, audio_cond,mask=None):
+        """
+        Forward pass of Talking Head DiT.
+        motion: (B, N, xD) tensor of motion feature inputs (head motion, emotion, etc.)
+        times: (B,) tensor of diffusion timesteps
+        audio: (B, N, M, yD) tensor of audio features, (batch_size, video_length, blocks, channels).
+            NOTE(review): the caller in talking_head_diffusion.py passes a 5-D tensor — confirm the expected layout.
+        emo: emotion labels handed to the label embedder
+        audio_cond: (B, N, zD) tensor of conditioning features; averaged over dim 1 before embedding.
+            NOTE(review): the original docstring also allowed (B, zD), but `mean(1)` would then reduce the feature axis — confirm.
+        mask: forwarded unchanged to every block
+        returns: final-layer output for the first N (motion) tokens, (B, N, out_channels)
+        """
+        # bianma=time.time()                     # (B, D)
+        motion_embeds = self.motion_embedder(motion) # (B, N, D), N: seq length
+        _,seq_len,_=motion.shape
+        time_embeds = self.time_embedder(times)    
+        # always-true flag: the conditioning embeddings below are computed on every call
+        cache=True
+        if cache:
+            # emotion embedding
+            emo_embeds = self.emo_embedder(emo, self.training)# (B, D)
+            audio_cond=audio_cond.mean(1)
+            audio_cond_embeds = self.identity_embedder(audio_cond)
+    
+            # audio embedding
+            # rotary position tables for a sequence of seq_len positions
+            freqs_cos, freqs_sin = get_1d_rotary_pos_embed(self.dim, seq_len,theta=256, use_real=True, theta_rescale_factor=1)
+            freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+            audio_embeds = self.audio_embedder(audio)  # (B, N, M, D)
+            # self.bank.append(audio_embeds)
+            M=audio_embeds.shape[2]
+            # flatten the per-frame audio tokens into one token axis
+            audio_embeds = rearrange(audio_embeds, "b n m d -> b (n m) d")
+            # print(audio_embeds.shape)
+            # global conditioning vector: timestep embedding + emotion embedding
+            c = time_embeds+emo_embeds
+            # motion embedding
+
+            # rotary tables tiled M times, for the flattened audio tokens
+            freqs_cos2=rearrange(freqs_cos.unsqueeze(0).repeat(M,1,1), "n m d -> (n m) d")
+            freqs_sin2=rearrange(freqs_sin.unsqueeze(0).repeat(M,1,1),"n m d -> (n m) d")
+            freqs_cis2 = (freqs_cos2, freqs_sin2) if freqs_cos2 is not None else None
+
+            # rotary tables tiled 3*M times, for the merged audio + emotion + identity tokens
+            freqs_cos3=rearrange(freqs_cos.unsqueeze(0).repeat(3*M,1,1), "n m d -> (n m) d")
+            freqs_sin3=rearrange(freqs_sin.unsqueeze(0).repeat(3*M,1,1),"n m d -> (n m) d")
+            # NOTE(review): the guard tests freqs_cos2, not freqs_cos3 — looks like a copy/paste slip
+            freqs_cis3 = (freqs_cos3, freqs_sin3) if freqs_cos2 is not None else None
+            
+            # self.bank.append(emo_embeds)
+            # self.bank.append(audio_cond_embeds)
+            # broadcast the per-sample emotion / identity embeddings to one token per frame
+            emo_embeds=emo_embeds.unsqueeze(1).repeat(1,seq_len,1)
+            audio_cond_embeds=audio_cond_embeds.unsqueeze(1).repeat(1,seq_len,1)
+        # stage 1: four separate streams
+        for block in (self.blocks4):
+            motion_embeds,audio_embeds,emo_embeds,audio_cond_embeds = block(motion_embeds, c, audio_embeds,emo_embeds,audio_cond_embeds,mask,freqs_cis,freqs_cis2,causal=False)  
+        # stage 2: merge the three conditioning streams along the token axis
+        audio_embeds=torch.cat((audio_embeds,emo_embeds,audio_cond_embeds), 1)
+        for block in self.blocks2:
+            motion_embeds,audio_embeds= block(seq_len,motion_embeds, c, audio_embeds,mask,freqs_cis,freqs_cis3,causal=False)
+        # stage 3: a single stream over motion tokens followed by conditioning tokens
+        motion_embeds=torch.cat((motion_embeds, audio_embeds), 1)
+        for block in self.blocks:
+            motion_embeds = block(seq_len,motion_embeds, c,mask,freqs_cis,freqs_cis3,causal=False)
+        # keep only the motion tokens for the output projection
+        motion_embeds=motion_embeds[:,:seq_len,:]
+        out = self.final_layer(motion_embeds, c)                          # (B, N, out_channels)
+        # print("dit",time.time()-b)
+        return out
+
+    def forward_with_cfg(self, motion, time, audio, cfg_scale, emo=None, audio_cond=None):
+        """
+        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
+
+        NOTE(review): not implemented — the body is `pass`, so this returns None. The
+        commented-out code below is the reference implementation from the upstream
+        DiT repository and does not match this model's inputs.
+        """
+        pass
+        # # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+        # half = x[: len(x) // 2]
+        # combined = torch.cat([half, half], dim=0)
+        # model_out = self.forward(combined, t, y)
+        # # For exact reproducibility reasons, we apply classifier-free guidance on only
+        # # three channels by default. The standard approach to cfg applies it to all channels.
+        # # This can be done by uncommenting the following line and commenting-out the line following that.
+        # # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
+        # eps, rest = model_out[:, :3], model_out[:, 3:]
+        # cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        # half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+        # eps = torch.cat([half_eps, half_eps], dim=0)
+        # return torch.cat([eps, rest], dim=1)
+
+
+
+def TalkingHeadDiT_XL(**kwargs):
+    """Build a TalkingHeadDiT with the XL preset: depth=28, hidden_size=1152, num_heads=16."""
+    preset = dict(depth=28, hidden_size=1152, num_heads=16)
+    return TalkingHeadDiT(**preset, **kwargs)
+
+def TalkingHeadDiT_L(**kwargs):
+    """Build a TalkingHeadDiT with the L preset: depth=24, hidden_size=1024, num_heads=16."""
+    preset = dict(depth=24, hidden_size=1024, num_heads=16)
+    return TalkingHeadDiT(**preset, **kwargs)
+
+def TalkingHeadDiT_B(**kwargs):
+    """Build a TalkingHeadDiT with the B preset: depth=12, hidden_size=768, num_heads=12."""
+    preset = dict(depth=12, hidden_size=768, num_heads=12)
+    return TalkingHeadDiT(**preset, **kwargs)
+def TalkingHeadDiT_MM(**kwargs):
+    """Build a TalkingHeadDiT with the MM preset: depth=6, hidden_size=768, num_heads=12."""
+    preset = dict(depth=6, hidden_size=768, num_heads=12)
+    return TalkingHeadDiT(**preset, **kwargs)
+def TalkingHeadDiT_S(**kwargs):
+    """Build a TalkingHeadDiT with the S preset: depth=12, hidden_size=384, num_heads=6."""
+    preset = dict(depth=12, hidden_size=384, num_heads=6)
+    return TalkingHeadDiT(**preset, **kwargs)
+
+def TalkingHeadDiT_T(**kwargs):
+    """Build a TalkingHeadDiT with the T preset: depth=6, hidden_size=256, num_heads=4."""
+    preset = dict(depth=6, hidden_size=256, num_heads=4)
+    return TalkingHeadDiT(**preset, **kwargs)
+
+
+
+
+# Registry of the size-preset factory functions defined above, keyed by model name.
+TalkingHeadDiT_models = {
+    'TalkingHeadDiT-XL': TalkingHeadDiT_XL, 
+    'TalkingHeadDiT-L':  TalkingHeadDiT_L, 
+    'TalkingHeadDiT-MM': TalkingHeadDiT_MM, 
+    'TalkingHeadDiT-B':  TalkingHeadDiT_B, 
+    'TalkingHeadDiT-S':  TalkingHeadDiT_S, 
+    'TalkingHeadDiT-T':  TalkingHeadDiT_T,
+}
\ No newline at end of file
diff --git a/src/models/dit/util.py b/src/models/dit/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc6b925ff4d93dbb89d0d1e593bee15c888c39ee
--- /dev/null
+++ b/src/models/dit/util.py
@@ -0,0 +1,452 @@
+# coding: utf-8
+
+"""
+This file defines various neural network modules and utility functions, including convolutional and residual blocks,
+normalizations, and functions for spatial transformation and tensor manipulation.
+"""
+
+from torch import nn
+import torch.nn.functional as F
+import torch
+import torch.nn.utils.spectral_norm as spectral_norm
+import math
+import warnings
+import collections.abc
+from itertools import repeat
+
+def kp2gaussian(kp, spatial_size, kp_variance):
+    """
+    Transform a keypoint into gaussian like representation
+    """
+    mean = kp
+
+    coordinate_grid = make_coordinate_grid(spatial_size, mean)
+    number_of_leading_dimensions = len(mean.shape) - 1
+    shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape
+    coordinate_grid = coordinate_grid.view(*shape)
+    repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 1)
+    coordinate_grid = coordinate_grid.repeat(*repeats)
+
+    # Preprocess kp shape
+    shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 3)
+    mean = mean.view(*shape)
+
+    mean_sub = (coordinate_grid - mean)
+
+    out = torch.exp(-0.5 * (mean_sub ** 2).sum(-1) / kp_variance)
+
+    return out
+
+
+def make_coordinate_grid(spatial_size, ref, **kwargs):
+    d, h, w = spatial_size
+    x = torch.arange(w).type(ref.dtype).to(ref.device)
+    y = torch.arange(h).type(ref.dtype).to(ref.device)
+    z = torch.arange(d).type(ref.dtype).to(ref.device)
+
+    # NOTE: must be right-down-in
+    x = (2 * (x / (w - 1)) - 1)  # the x axis faces to the right
+    y = (2 * (y / (h - 1)) - 1)  # the y axis faces to the bottom
+    z = (2 * (z / (d - 1)) - 1)  # the z axis faces to the inner
+
+    yy = y.view(1, -1, 1).repeat(d, 1, w)
+    xx = x.view(1, 1, -1).repeat(d, h, 1)
+    zz = z.view(-1, 1, 1).repeat(1, h, w)
+
+    meshed = torch.cat([xx.unsqueeze_(3), yy.unsqueeze_(3), zz.unsqueeze_(3)], 3)
+
+    return meshed
+
+
+class ConvT2d(nn.Module):
+    """
+    Upsampling block for use in decoder.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, stride=2, padding=1, output_padding=1):
+        super(ConvT2d, self).__init__()
+
+        self.convT = nn.ConvTranspose2d(in_features, out_features, kernel_size=kernel_size, stride=stride,
+                                        padding=padding, output_padding=output_padding)
+        self.norm = nn.InstanceNorm2d(out_features)
+
+    def forward(self, x):
+        out = self.convT(x)
+        out = self.norm(out)
+        out = F.leaky_relu(out)
+        return out
+
+
+class ResBlock3d(nn.Module):
+    """
+    Res block, preserve spatial resolution.
+    """
+
+    def __init__(self, in_features, kernel_size, padding):
+        super(ResBlock3d, self).__init__()
+        self.conv1 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
+        self.conv2 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
+        self.norm1 = nn.BatchNorm3d(in_features, affine=True)
+        self.norm2 = nn.BatchNorm3d(in_features, affine=True)
+
+    def forward(self, x):
+        out = self.norm1(x)
+        out = F.relu(out)
+        out = self.conv1(out)
+        out = self.norm2(out)
+        out = F.relu(out)
+        out = self.conv2(out)
+        out += x
+        return out
+
+
+class UpBlock3d(nn.Module):
+    """
+    Upsampling block for use in decoder.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super(UpBlock3d, self).__init__()
+
+        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
+                              padding=padding, groups=groups)
+        self.norm = nn.BatchNorm3d(out_features, affine=True)
+
+    def forward(self, x):
+        out = F.interpolate(x, scale_factor=(1, 2, 2))
+        out = self.conv(out)
+        out = self.norm(out)
+        out = F.relu(out)
+        return out
+
+
+class DownBlock2d(nn.Module):
+    """
+    Downsampling block for use in encoder.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super(DownBlock2d, self).__init__()
+        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)
+        self.norm = nn.BatchNorm2d(out_features, affine=True)
+        self.pool = nn.AvgPool2d(kernel_size=(2, 2))
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.norm(out)
+        out = F.relu(out)
+        out = self.pool(out)
+        return out
+
+
+class DownBlock3d(nn.Module):
+    """
+    Downsampling block for use in encoder.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super(DownBlock3d, self).__init__()
+        '''
+        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
+                                padding=padding, groups=groups, stride=(1, 2, 2))
+        '''
+        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
+                              padding=padding, groups=groups)
+        self.norm = nn.BatchNorm3d(out_features, affine=True)
+        self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.norm(out)
+        out = F.relu(out)
+        out = self.pool(out)
+        return out
+
+
+class SameBlock2d(nn.Module):
+    """
+    Simple block, preserve spatial resolution.
+    """
+
+    def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1, lrelu=False):
+        super(SameBlock2d, self).__init__()
+        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)
+        self.norm = nn.BatchNorm2d(out_features, affine=True)
+        if lrelu:
+            self.ac = nn.LeakyReLU()
+        else:
+            self.ac = nn.ReLU()
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.norm(out)
+        out = self.ac(out)
+        return out
+
+
+class Encoder(nn.Module):
+    """
+    Hourglass Encoder
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super(Encoder, self).__init__()
+
+        down_blocks = []
+        for i in range(num_blocks):
+            down_blocks.append(DownBlock3d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)), min(max_features, block_expansion * (2 ** (i + 1))), kernel_size=3, padding=1))
+        self.down_blocks = nn.ModuleList(down_blocks)
+
+    def forward(self, x):
+        outs = [x]
+        for down_block in self.down_blocks:
+            outs.append(down_block(outs[-1]))
+        return outs
+
+
+class Decoder(nn.Module):
+    """
+    Hourglass Decoder
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super(Decoder, self).__init__()
+
+        up_blocks = []
+
+        for i in range(num_blocks)[::-1]:
+            in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
+            out_filters = min(max_features, block_expansion * (2 ** i))
+            up_blocks.append(UpBlock3d(in_filters, out_filters, kernel_size=3, padding=1))
+
+        self.up_blocks = nn.ModuleList(up_blocks)
+        self.out_filters = block_expansion + in_features
+
+        self.conv = nn.Conv3d(in_channels=self.out_filters, out_channels=self.out_filters, kernel_size=3, padding=1)
+        self.norm = nn.BatchNorm3d(self.out_filters, affine=True)
+
+    def forward(self, x):
+        out = x.pop()
+        for up_block in self.up_blocks:
+            out = up_block(out)
+            skip = x.pop()
+            out = torch.cat([out, skip], dim=1)
+        out = self.conv(out)
+        out = self.norm(out)
+        out = F.relu(out)
+        return out
+
+
+class Hourglass(nn.Module):
+    """
+    Hourglass architecture.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super(Hourglass, self).__init__()
+        self.encoder = Encoder(block_expansion, in_features, num_blocks, max_features)
+        self.decoder = Decoder(block_expansion, in_features, num_blocks, max_features)
+        self.out_filters = self.decoder.out_filters
+
+    def forward(self, x):
+        return self.decoder(self.encoder(x))
+
+
+class SPADE(nn.Module):
+    def __init__(self, norm_nc, label_nc):
+        super().__init__()
+
+        self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)
+        nhidden = 128
+
+        self.mlp_shared = nn.Sequential(
+            nn.Conv2d(label_nc, nhidden, kernel_size=3, padding=1),
+            nn.ReLU())
+        self.mlp_gamma = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
+        self.mlp_beta = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
+
+    def forward(self, x, segmap):
+        normalized = self.param_free_norm(x)
+        segmap = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
+        actv = self.mlp_shared(segmap)
+        gamma = self.mlp_gamma(actv)
+        beta = self.mlp_beta(actv)
+        out = normalized * (1 + gamma) + beta
+        return out
+
+
+class SPADEResnetBlock(nn.Module):
+    def __init__(self, fin, fout, norm_G, label_nc, use_se=False, dilation=1):
+        super().__init__()
+        # Attributes
+        self.learned_shortcut = (fin != fout)
+        fmiddle = min(fin, fout)
+        self.use_se = use_se
+        # create conv layers
+        self.conv_0 = nn.Conv2d(fin, fmiddle, kernel_size=3, padding=dilation, dilation=dilation)
+        self.conv_1 = nn.Conv2d(fmiddle, fout, kernel_size=3, padding=dilation, dilation=dilation)
+        if self.learned_shortcut:
+            self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)
+        # apply spectral norm if specified
+        if 'spectral' in norm_G:
+            self.conv_0 = spectral_norm(self.conv_0)
+            self.conv_1 = spectral_norm(self.conv_1)
+            if self.learned_shortcut:
+                self.conv_s = spectral_norm(self.conv_s)
+        # define normalization layers
+        self.norm_0 = SPADE(fin, label_nc)
+        self.norm_1 = SPADE(fmiddle, label_nc)
+        if self.learned_shortcut:
+            self.norm_s = SPADE(fin, label_nc)
+
+    def forward(self, x, seg1):
+        x_s = self.shortcut(x, seg1)
+        dx = self.conv_0(self.actvn(self.norm_0(x, seg1)))
+        dx = self.conv_1(self.actvn(self.norm_1(dx, seg1)))
+        out = x_s + dx
+        return out
+
+    def shortcut(self, x, seg1):
+        if self.learned_shortcut:
+            x_s = self.conv_s(self.norm_s(x, seg1))
+        else:
+            x_s = x
+        return x_s
+
+    def actvn(self, x):
+        return F.leaky_relu(x, 2e-1)
+
+
+def filter_state_dict(state_dict, remove_name='fc'):
+    new_state_dict = {}
+    for key in state_dict:
+        if remove_name in key:
+            continue
+        new_state_dict[key] = state_dict[key]
+    return new_state_dict
+
+
+class GRN(nn.Module):
+    """ GRN (Global Response Normalization) layer
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
+
+    def forward(self, x):
+        Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * Nx) + self.beta + x
+
+
+class LayerNorm(nn.Module):
+    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+    with shape (batch_size, channels, height, width).
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        elif self.data_format == "channels_first":
+            u = x.mean(1, keepdim=True)
+            s = (x - u).pow(2).mean(1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.eps)
+            x = self.weight[:, None, None] * x + self.bias[:, None, None]
+            return x
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                      "The distribution of values may be incorrect.",
+                      stacklevel=2)
+
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+
+
+def drop_path(x, drop_prob=0., training=False, scale_by_keep=True):
+    """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """ Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+# From PyTorch internals
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return tuple(x)
+        return tuple(repeat(x, n))
+    return parse
+
+to_2tuple = _ntuple(2)
diff --git a/src/models/dit/warping_network.py b/src/models/dit/warping_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..9191a197055a954272ee8ed86c5e34f3f33f9ad5
--- /dev/null
+++ b/src/models/dit/warping_network.py
@@ -0,0 +1,77 @@
+# coding: utf-8
+
+"""
+Warping field estimator(W) defined in the paper, which generates a warping field using the implicit
+keypoint representations x_s and x_d, and employs this flow field to warp the source feature volume f_s.
+"""
+
+from torch import nn
+import torch.nn.functional as F
+from .util import SameBlock2d
+from .dense_motion import DenseMotionNetwork
+
+
+class WarpingNetwork(nn.Module):
+    def __init__(
+        self,
+        num_kp,
+        block_expansion,
+        max_features,
+        num_down_blocks,
+        reshape_channel,
+        estimate_occlusion_map=False,
+        dense_motion_params=None,
+        **kwargs
+    ):
+        super(WarpingNetwork, self).__init__()
+
+        self.upscale = kwargs.get('upscale', 1)
+        self.flag_use_occlusion_map = kwargs.get('flag_use_occlusion_map', True)
+
+        if dense_motion_params is not None:
+            self.dense_motion_network = DenseMotionNetwork(
+                num_kp=num_kp,
+                feature_channel=reshape_channel,
+                estimate_occlusion_map=estimate_occlusion_map,
+                **dense_motion_params
+            )
+        else:
+            self.dense_motion_network = None
+
+        self.third = SameBlock2d(max_features, block_expansion * (2 ** num_down_blocks), kernel_size=(3, 3), padding=(1, 1), lrelu=True)
+        self.fourth = nn.Conv2d(in_channels=block_expansion * (2 ** num_down_blocks), out_channels=block_expansion * (2 ** num_down_blocks), kernel_size=1, stride=1)
+
+        self.estimate_occlusion_map = estimate_occlusion_map
+
+    def deform_input(self, inp, deformation):
+        return F.grid_sample(inp, deformation, align_corners=False)
+
+    def forward(self, feature_3d, kp_driving, kp_source):
+        if self.dense_motion_network is not None:
+            # Feature warper, Transforming feature representation according to deformation and occlusion
+            dense_motion = self.dense_motion_network(
+                feature=feature_3d, kp_driving=kp_driving, kp_source=kp_source
+            )
+            if 'occlusion_map' in dense_motion:
+                occlusion_map = dense_motion['occlusion_map']  # Bx1x64x64
+            else:
+                occlusion_map = None
+
+            deformation = dense_motion['deformation']  # Bx16x64x64x3
+            out = self.deform_input(feature_3d, deformation)  # Bx32x16x64x64
+
+            bs, c, d, h, w = out.shape  # Bx32x16x64x64
+            out = out.view(bs, c * d, h, w)  # -> Bx512x64x64
+            out = self.third(out)  # -> Bx256x64x64
+            out = self.fourth(out)  # -> Bx256x64x64
+
+            if self.flag_use_occlusion_map and (occlusion_map is not None):
+                out = out * occlusion_map
+
+        ret_dct = {
+            'occlusion_map': occlusion_map,
+            'deformation': deformation,
+            'out': out,
+        }
+
+        return ret_dct
diff --git a/src/models/dit/wav2vec2.py b/src/models/dit/wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..499140bbe90d147d07ba180b261ec8ea6f752df2
--- /dev/null
+++ b/src/models/dit/wav2vec2.py
@@ -0,0 +1,119 @@
+from packaging import version
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from transformers import Wav2Vec2Model
+from transformers.modeling_outputs import BaseModelOutput
+
+_CONFIG_FOR_DOC = 'Wav2Vec2Config'
+
+
+# the implementation of Wav2Vec2Model is borrowed from
+# https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
+# initialize our encoder with the pre-trained wav2vec 2.0 weights.
+def _compute_mask_indices(shape: Tuple[int, int], mask_prob: float, mask_length: int,
+                          attention_mask: Optional[torch.Tensor] = None, min_masks: int = 0, ) -> np.ndarray:
+    """Build a boolean (bsz, all_sz) mask of randomly placed spans for SpecAugment-style masking.
+
+    Args:
+        shape: (batch size, sequence length) of the mask to create.
+        mask_prob: used with `mask_length` to derive the number of spans per row.
+        mask_length: length of every span.
+        attention_mask: optional tensor; positions whose value is not 1 are
+            counted as padding and shorten the usable length of that row.
+        min_masks: lower bound on the number of spans per row.
+
+    Returns:
+        NumPy bool array in which every row has the same number of True entries
+        (rows with more candidates are randomly subsampled down to the minimum).
+
+    Draws from NumPy's global RNG, so results depend on `np.random` state.
+    """
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    # Number of spans for an unpadded row; the added uniform sample in [0, 1)
+    # makes the int() truncation round up or down at random.
+    all_num_mask = int(mask_prob * all_sz / float(mask_length) + np.random.rand())
+    all_num_mask = max(min_masks, all_num_mask)
+    mask_idcs = []
+    padding_mask = attention_mask.ne(1) if attention_mask is not None else None
+    for i in range(bsz):
+        if padding_mask is not None:
+            # Usable length of this row excludes padded positions.
+            sz = all_sz - padding_mask[i].long().sum().item()
+            num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask
+
+        lengths = np.full(num_mask, mask_length)
+
+        # NOTE(review): when num_mask is 0, `lengths` is empty, so `lengths[0]` below
+        # would raise IndexError; callers passing min_masks >= 1 avoid this case.
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+
+        # Keep the pool of start positions larger than the number of spans to draw.
+        min_len = min(lengths)
+        if sz - min_len <= num_mask:
+            min_len = sz - num_mask - 1
+
+        # Distinct start positions, each expanded into a run of `lengths[j]` indices.
+        mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+        mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
+        # Drop indices past the usable length and de-duplicate overlapping spans.
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+
+    # Equalise the number of masked positions across rows.
+    min_len = min([len(m) for m in mask_idcs])
+    for i, mask_idc in enumerate(mask_idcs):
+        if len(mask_idc) > min_len:
+            mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+        mask[i, mask_idc] = True
+    return mask
+
+
+# linear interpolation layer
+def linear_interpolation(features, input_fps, output_fps, output_len=None):
+    # features: (N, C, L)
+    seq_len = features.shape[2] / float(input_fps)
+    if output_len is None:
+        output_len = int(seq_len * output_fps)
+    output_features = F.interpolate(features, size=output_len, align_corners=False, mode='linear')
+    return output_features
+
+
+class Wav2Vec2Model(Wav2Vec2Model):
+    """Wav2Vec2 encoder whose convolutional features are resampled to `output_fps` before the transformer.
+
+    Subclasses (and shadows the name of) the `Wav2Vec2Model` imported from transformers.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        # Selects how the return value of `feature_projection` is unpacked in forward().
+        self.is_old_version = version.parse(transformers.__version__) < version.parse('4.7.0')
+
+    def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
+                output_hidden_states=None, return_dict=None, frame_num=None):
+        """Encode `input_values` and return features resampled to `output_fps`.
+
+        Args:
+            input_values: input passed to the convolutional feature extractor.
+            output_fps: target frame rate; the extractor output is treated as 50 fps.
+            attention_mask: optional mask over the input samples.
+            output_attentions, output_hidden_states, return_dict: fall back to the
+                config values when None.
+            frame_num: optional exact number of output frames.
+
+        Returns:
+            A `BaseModelOutput`, or a tuple starting with the last hidden state
+            when `return_dict` is false.
+        """
+        # NOTE(review): this mutates the shared config, so attentions are returned
+        # whenever the caller leaves `output_attentions` as None.
+        self.config.output_attentions = True
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = self.feature_extractor(input_values)  # (N, C, L)
+        # Resample the audio feature @ 50 fps to `output_fps`.
+        if frame_num is not None:
+            # Keep only the 50 fps frames that correspond to `frame_num` output frames.
+            hidden_states_len = round(frame_num * 50 / output_fps)
+            hidden_states = hidden_states[:, :, :hidden_states_len]
+        hidden_states = linear_interpolation(hidden_states, 50, output_fps, output_len=frame_num)
+        hidden_states = hidden_states.transpose(1, 2)  # (N, L, C)
+
+        if attention_mask is not None:
+            # NOTE(review): output_lengths come from the un-resampled extractor length,
+            # while hidden_states has already been resampled — confirm the indices stay in range.
+            output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+            attention_mask = torch.zeros(hidden_states.shape[:2], dtype=hidden_states.dtype,
+                                         device=hidden_states.device)
+            # Mark the last valid frame of each row, then back-fill every earlier position.
+            attention_mask[(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)] = 1
+            attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+
+        if self.is_old_version:
+            hidden_states = self.feature_projection(hidden_states)
+        else:
+            # Newer transformers versions return a tuple; the projected states come first.
+            hidden_states = self.feature_projection(hidden_states)[0]
+
+        # SpecAugment-style masking, only during training and when enabled in the config.
+        if self.config.apply_spec_augment and self.training:
+            batch_size, sequence_length, hidden_size = hidden_states.size()
+            if self.config.mask_time_prob > 0:
+                # Replace randomly chosen time steps with the learned mask embedding.
+                mask_time_indices = _compute_mask_indices((batch_size, sequence_length), self.config.mask_time_prob,
+                                                          self.config.mask_time_length, attention_mask=attention_mask,
+                                                          min_masks=2, )
+                hidden_states[torch.from_numpy(mask_time_indices)] = self.masked_spec_embed.to(hidden_states.dtype)
+            if self.config.mask_feature_prob > 0:
+                # Zero out randomly chosen feature channels across all time steps.
+                mask_feature_indices = _compute_mask_indices((batch_size, hidden_size), self.config.mask_feature_prob,
+                                                             self.config.mask_feature_length, )
+                mask_feature_indices = torch.from_numpy(mask_feature_indices).to(hidden_states.device)
+                hidden_states[mask_feature_indices[:, None].expand(-1, sequence_length, -1)] = 0
+        encoder_outputs = self.encoder(hidden_states, attention_mask=attention_mask,
+                                       output_attentions=output_attentions, output_hidden_states=output_hidden_states,
+                                       return_dict=return_dict, )
+        hidden_states = encoder_outputs[0]
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
+                               attentions=encoder_outputs.attentions, )
diff --git a/src/models/inference/__pycache__/LiveVASAPipeline.cpython-310.pyc b/src/models/inference/__pycache__/LiveVASAPipeline.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a879fe6fd2a7bc37ae0249f4236413ab920c1af
Binary files /dev/null and b/src/models/inference/__pycache__/LiveVASAPipeline.cpython-310.pyc differ
diff --git a/src/models/inference/__pycache__/LiveVASAPipeline2.cpython-310.pyc b/src/models/inference/__pycache__/LiveVASAPipeline2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3187c75174ac34310858d341185b0d203b43db5
Binary files /dev/null and b/src/models/inference/__pycache__/LiveVASAPipeline2.cpython-310.pyc differ
diff --git a/src/models/inference/__pycache__/LiveVASAPipeline4.cpython-310.pyc b/src/models/inference/__pycache__/LiveVASAPipeline4.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2bfb788cf4a73a37f27587ae0acb2db0ff54f80
Binary files /dev/null and b/src/models/inference/__pycache__/LiveVASAPipeline4.cpython-310.pyc differ
diff --git a/src/models/inference/__pycache__/Motion2VideoPipeline.cpython-310.pyc b/src/models/inference/__pycache__/Motion2VideoPipeline.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee6166e9124be6f96e8ef54d960d81381545973f
Binary files /dev/null and b/src/models/inference/__pycache__/Motion2VideoPipeline.cpython-310.pyc differ
diff --git a/src/models/inference/moda_test.py b/src/models/inference/moda_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..524d1ec610edb558438f0028ebf7fbf0dce2d23e
--- /dev/null
+++ b/src/models/inference/moda_test.py
@@ -0,0 +1,293 @@
+# encoding = 'utf-8'
+import os
+import os.path as osp
+import sys
+from omegaconf import OmegaConf
+
+
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+
+import torch
+torch.backends.cudnn.benchmark = True # disable CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR warning
+
+sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__))))))
+
+from src.datasets.preprocess.extract_features.audio_processer import AudioProcessor
+from src.datasets.preprocess.extract_features.motion_processer import MotionProcesser
+from src.models.dit.talking_head_diffusion import MotionDiffusion
+
+
+from src.utils.rprint import rlog as log
+import time
+
+# Lookup table from integer emotion id to its display name.
+# Id 8 ('None') is the default `emo` value of `LiveVASAPipeline.driven_sample` and of the
+# CLI entry point at the bottom of this file; presumably it means "no emotion conditioning"
+# (TODO confirm against MotionDiffusion.sample).
+emo_map = {
+    0: 'Anger', 
+    1: 'Contempt', 
+    2: 'Disgust', 
+    3: 'Fear', 
+    4: 'Happiness', 
+    5: 'Neutral', 
+    6: 'Sadness', 
+    7: 'Surprise',
+    8: 'None'
+}
+# import torch
+import random
+import numpy as np
+
+def set_seed(seed: int = 42):
+    """Seed every random number generator used here and force deterministic cuDNN.
+
+    Args:
+        seed (int): Value applied to Python's `random`, NumPy and PyTorch (CPU and CUDA).
+    """
+    # Python and NumPy generators
+    random.seed(seed)
+    np.random.seed(seed)
+
+    # PyTorch generators: CPU, the current GPU, and every GPU (multi-GPU setups)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+    # Disable cuDNN auto-tuning and select deterministic kernels for reproducibility
+    cudnn = torch.backends.cudnn
+    cudnn.benchmark = False
+    cudnn.deterministic = True
+
+# Call once before inference so that sampling is reproducible.
+set_seed(42)
+
+class NullableArgs:
+    """Plain attribute container populated from another object's instance attributes."""
+
+    def __init__(self, namespace):
+        # Mirror each entry of `namespace.__dict__` as an attribute on this instance.
+        source_attrs = namespace.__dict__
+        for attr_name in source_attrs:
+            setattr(self, attr_name, source_attrs[attr_name])
+
+
+class LiveVASAPipeline(object):
+    """Audio-driven portrait animation pipeline (audio -> motion -> video).
+
+    Holds an `AudioProcessor`, an optional `MotionDiffusion` motion generator and a
+    `MotionProcesser`, plus optional mean/std statistics used to (de)normalize motion vectors.
+
+    Motion vectors use the column layout read by `get_motion_sequence`:
+    `[0:63]` expression (21 keypoints x 3), `[63:64]` scale, `[64:67]` translation, followed by
+    pitch / yaw / roll - one column each (70-dim layout) or 66 columns each (265-dim layout).
+    """
+
+    # Indices (within the 21 expression keypoints) that `modulate_lip` treats as lip keypoints.
+    _LIP_KP_INDICES = (6, 12, 14, 17, 19, 20)
+
+    def __init__(self, cfg_path: str, load_motion_generator: bool = True, motion_mean_std_path=None):
+        """Build the pipeline from a YAML config.
+
+        Args:
+            cfg_path (str): YAML config file path of LiveVASA.
+            load_motion_generator (bool): When False, or when `motion_models_config` is unset in
+                the config, no generator is created (`self.motion_generator` is None); one can be
+                attached later with `set_motion_generator`.
+            motion_mean_std_path (str, optional): File read with `torch.load`; it must provide
+                "mean" and "std" tensors, which are moved to the pipeline device.
+        """
+        cfg = OmegaConf.load(cfg_path)
+        self.device_id = cfg.device_id
+        self.device = f"cuda:{self.device_id}"
+
+        # 1. load audio processor
+        self.audio_processor: AudioProcessor = AudioProcessor(cfg_path=cfg.audio_model_config, is_training=False)
+        log("Load audio_processor done.")
+
+        # 2. load motion generator (optional)
+        if cfg.motion_models_config is not None and load_motion_generator:
+            motion_models_config = OmegaConf.load(cfg.motion_models_config)
+            log(f"Load motion_models_config from {osp.realpath(cfg.motion_models_config)} done.")
+            self.motion_generator = MotionDiffusion(motion_models_config, device=self.device)
+            self.load_motion_generator(self.motion_generator, cfg.motion_generator_path)
+        else:
+            self.motion_generator = None
+            log("Init motion_generator as None.")
+
+        # 3. load motion processer
+        self.motion_processer: MotionProcesser = MotionProcesser(cfg_path=cfg.motion_processer_config, device_id=cfg.device_id)
+        log("Load motion_processor done.")
+
+        # 4. optional normalization statistics
+        self.motion_mean_std = None
+        if motion_mean_std_path is not None:
+            self.motion_mean_std = torch.load(motion_mean_std_path)
+            self.motion_mean_std["mean"] = self.motion_mean_std["mean"].to(self.device)
+            self.motion_mean_std["std"] = self.motion_mean_std["std"].to(self.device)
+            mean, std = self.motion_mean_std["mean"], self.motion_mean_std["std"]
+            # Debug dump of the pose statistics, using the column positions of the 70-dim layout.
+            for label, lo, hi in (("scale", 63, 64), ("t", 64, 67), ("pitch", 67, 68), ("yaw", 68, 69), ("roll", 69, 70)):
+                print(f"{label} mean: {mean[0, lo:hi]}, std: {std[0, lo:hi]}")
+
+        self.cfg = cfg
+
+    def set_motion_generator(self, motion_generator: MotionDiffusion):
+        """Attach an already constructed motion generator and move it to the pipeline device."""
+        self.motion_generator = motion_generator
+        self.motion_generator.to(self.device)
+
+    def load_motion_generator(self, model, motion_generator_path: str):
+        """Load checkpoint weights into `model`, move it to the pipeline device and set eval mode.
+
+        Args:
+            model: Object exposing `load_state_dict`, `to` and `eval`.
+            motion_generator_path (str): Checkpoint path passed to `torch.load`.
+        """
+        print(motion_generator_path)
+        model_data = torch.load(motion_generator_path, map_location=self.device)
+        # strict=False tolerates key mismatches; report them instead of dropping them silently.
+        load_result = model.load_state_dict(model_data, strict=False)
+        missing_keys = getattr(load_result, "missing_keys", None)
+        unexpected_keys = getattr(load_result, "unexpected_keys", None)
+        if missing_keys:
+            print(f"[load_motion_generator] missing keys in checkpoint: {list(missing_keys)}")
+        if unexpected_keys:
+            print(f"[load_motion_generator] unexpected keys in checkpoint: {list(unexpected_keys)}")
+
+        model.to(self.device)
+        model.eval()
+
+    def modulate_lip(self, standard_motion: torch.Tensor, motions: torch.Tensor, alpha=5, beta=0.1):
+        """Amplify lip keypoint offsets relative to a reference frame.
+
+        For every frame the squared distance `d` of the lip keypoints from the reference is turned
+        into a gain `w = (sigmoid(alpha * d) - 0.5) / (beta * max(d) + 0.05)` and each lip offset is
+        scaled by `1 + w`. Frames identical to the reference are left unchanged (`w == 0`).
+
+        Args:
+            standard_motion (torch.Tensor): Reference motion vector; only its first 63 values are used.
+            motions (torch.Tensor): `(T, D)` motion vectors with `D >= 63`. Modified in place.
+            alpha: Slope applied to the squared distance inside the sigmoid.
+            beta: Weight of the maximum squared distance in the gain denominator.
+
+        Returns:
+            torch.Tensor: `motions` itself, with the first 63 columns updated.
+        """
+        lip_idx = list(self._LIP_KP_INDICES)
+        standard_exp = standard_motion[:63].reshape(1, 21, 3)
+        exps = motions[:, :63].reshape(-1, 21, 3)
+        exp_deltas = exps - standard_exp
+
+        # calc weights
+        lip_deltas = exp_deltas[:, lip_idx, :]                           # T, 6, 3
+        lip_deltas = lip_deltas.reshape(lip_deltas.shape[0], -1)         # T, 18
+        lip_dist = torch.sum(lip_deltas ** 2, dim=-1, keepdim=True)      # T, 1
+        max_dist = torch.max(lip_dist, dim=0)[0].squeeze()               # scalar
+        weight = (torch.sigmoid(lip_dist * alpha) - 0.5) / (max_dist * beta + 0.05)
+
+        # modulation: the right-hand side is fully evaluated before the indexed write
+        exps[:, lip_idx, :] = standard_exp[:, lip_idx, :] + exp_deltas[:, lip_idx, :] * (1 + weight).unsqueeze(1)
+        motions[:, :63] = exps.flatten(-2, -1)
+
+        return motions
+
+    def get_motion_sequence(self, motion_data: torch.Tensor, rescale_ratio=1.0):
+        """Denormalize generated motion vectors and split them into named components.
+
+        Args:
+            motion_data (torch.Tensor): `(T, D)` motion vectors. `D > 70` selects the wide layout
+                (66 columns per rotation), otherwise one column per rotation is read. The caller's
+                tensor is not modified.
+            rescale_ratio: Factor applied to the per-frame scale and translation slices.
+
+        Returns:
+            dict: keys "exp", "scale", "t", "pitch", "yaw", "roll"; each value is the per-frame
+            slices stacked along a new leading dimension.
+        """
+        n_frames = motion_data.shape[0]
+        is_wide = motion_data.shape[1] > 70
+
+        # denorm
+        if self.motion_mean_std is not None:
+            mean, std = self.motion_mean_std["mean"], self.motion_mean_std["std"]
+            if is_wide:
+                # Work on a copy: the slice assignments below would otherwise overwrite the
+                # caller's tensor, unlike the narrow branch which already builds a new tensor.
+                motion_data = motion_data.clone()
+                motion_data[:, :63] = motion_data[:, :63] * (std[:, :63] + 1e-5) + mean[:, :63]
+                # denorm pose: these columns were only mean-shifted (see get_prev_motion)
+                motion_data[:, 63:] = motion_data[:, 63:] + mean[:, 63:]
+            else:
+                motion_data = motion_data * (std + 1e-5) + mean
+
+        if is_wide:
+            pose_cols = {"pitch": slice(67, 133), "yaw": slice(133, 199), "roll": slice(199, 265)}
+        else:
+            pose_cols = {"pitch": slice(67, 68), "yaw": slice(68, 69), "roll": slice(69, 70)}
+
+        kp_infos = {"exp": [], "scale": [], "t": [], "pitch": [], "yaw": [], "roll": []}
+        # Per-frame loop kept on purpose: when `rescale_ratio` is a tensor its broadcasting
+        # determines the per-frame shape of "scale" and "t".
+        for idx in range(n_frames):
+            frame = motion_data[idx]
+            kp_infos["exp"].append(frame[:63])
+            kp_infos["scale"].append(frame[63:64] * rescale_ratio)
+            kp_infos["t"].append(frame[64:67] * rescale_ratio)
+            for name, cols in pose_cols.items():
+                kp_infos[name].append(frame[cols])
+
+        for k, v in kp_infos.items():
+            kp_infos[k] = torch.stack(v)
+
+        return kp_infos
+
+    def get_prev_motion(self, x_s_info):
+        """Build the normalized initial motion vector from the source image keypoint info.
+
+        Args:
+            x_s_info (dict): Must contain "exp", "scale", "t", "pitch", "yaw" and "roll".
+                NOTE: `x_s_info["t"][:, 2]` is zeroed in place on the caller's data.
+
+        Returns:
+            tuple: `(kp_infos, rescale_ratio)`. `kp_infos` is the concatenated (and, when
+            statistics are loaded, normalized) motion vector with a singleton axis inserted at
+            dim 1. `rescale_ratio` is 1.0 without statistics, otherwise source scale / mean scale.
+        """
+        kp_infos = []
+        x_s_info["t"][:, 2] = 0  # zero tz
+        if self.motion_generator is not None and self.motion_generator.input_dim == 70:
+            x_s_info = self.motion_processer.refine_kp(x_s_info)
+            for k, v in x_s_info.items():
+                x_s_info[k] = v.reshape(1, -1)
+
+        rescale_ratio = 1.0 if self.motion_mean_std is None else (x_s_info["scale"] + 1e-5) / (self.motion_mean_std["mean"][:, 63:64] + 1e-5)
+
+        for feat_name in ["exp", "scale", "t", "pitch", "yaw", "roll"]:
+            if feat_name in ["scale", "t"]:
+                # set scale as the mean scale
+                kp_infos.append(x_s_info[feat_name] / rescale_ratio)
+            else:
+                kp_infos.append(x_s_info[feat_name])
+        kp_infos = torch.cat(kp_infos, dim=-1)   # B, D
+
+        # normalize
+        if self.motion_mean_std is not None:
+            mean, std = self.motion_mean_std["mean"], self.motion_mean_std["std"]
+            if self.motion_generator is not None and self.motion_generator.input_dim > 70:
+                # normalize exp
+                kp_infos[:, :63] = (kp_infos[:, :63] - mean[:, :63]) / (std[:, :63] + 1e-5)
+                # normalize pose (mean shift only)
+                kp_infos[:, 63:] = kp_infos[:, 63:] - mean[:, 63:]
+            else:
+                kp_infos = (kp_infos - mean) / (std + 1e-5)
+
+        kp_infos = kp_infos.unsqueeze(1)    # B, 1, D
+        return kp_infos, rescale_ratio
+
+    def process_audio(self, audio_path: str, silent_audio_path = None, mode="post"):
+        """Pad the audio with silence and compute its embedding.
+
+        Args:
+            audio_path (str): Driving audio file.
+            silent_audio_path (str, optional): Silent clip forwarded to `add_silent_audio`.
+            mode (str): Padding mode forwarded to `add_silent_audio`.
+
+        Returns:
+            tuple: `(audio_emb, padded_audio_path, add_frames, original_audio_path)`.
+        """
+        # add silent audio to pad short input
+        ori_audio_path = audio_path
+        audio_path, add_frames = self.audio_processor.add_silent_audio(audio_path, silent_audio_path, add_duration=2, linear_fusion=False, mode=mode)
+        audio_emb = self.audio_processor.get_long_audio_emb(audio_path)
+        return audio_emb, audio_path, add_frames, ori_audio_path
+
+    def driven_sample(self, image_path: str, audio_path: str, cfg_scale: float=1., emo: int=8, save_dir=None, smooth=False, silent_audio_path = None, silent_mode="post"):
+        """Animate a reference image with a driving audio clip and write the result video.
+
+        Args:
+            image_path (str): Reference image.
+            audio_path (str): Driving audio.
+            cfg_scale (float): Forwarded to the motion generator's `sample`.
+            emo (int): Emotion id forwarded to the motion generator's `sample` (see `emo_map`).
+            save_dir (str, optional): Output directory; defaults to `cfg.output_dir`. Created if missing.
+            smooth (bool): Forwarded to `MotionProcesser.driven_by_audio`.
+            silent_audio_path (str, optional): Silent clip used for padding.
+            silent_mode (str): "both" and "pre" are handled explicitly when trimming padded
+                frames; any other value trims frames from the end only.
+
+        Returns:
+            str: Path of the written video, `<save_dir>/<image name up to its first dot>.mp4`.
+        """
+        assert self.motion_generator is not None, "Motion Generator is not set"
+        reference_name = osp.basename(image_path).split('.')[0]
+        # get audio embeddings
+        audio_emb, audio_path, add_frames, ori_audio_path = self.process_audio(audio_path, silent_audio_path, mode=silent_mode)
+
+        # get src image infos
+        source_rgb_lst = self.motion_processer.read_image(image_path)
+        src_img_256x256, s_lmk, crop_info = self.motion_processer.crop_image(source_rgb_lst[0], do_crop=True)
+        f_s, x_s_info = self.motion_processer.prepare_source(src_img_256x256)
+        prev_motion, rescale_ratio = self.get_prev_motion(x_s_info)
+        # generate motions
+        motion = self.motion_generator.sample(audio_emb, x_s_info["kp"], prev_motion=prev_motion, cfg_scale=cfg_scale, emo=emo)
+        if add_frames > 0:
+            # Reference frame for lip modulation is taken from the tail of the sequence.
+            # NOTE(review): with silent_mode == "pre" the padding is presumably at the start,
+            # so a tail frame may not be silent - confirm against add_silent_audio.
+            standard_motion = motion[-max(add_frames*3//4, 1)]
+            motion = self.modulate_lip(standard_motion, motion, alpha=5)
+            # drop the frames that correspond to the padded silence
+            if silent_mode == "both":
+                motion = motion[add_frames:-add_frames]
+            elif silent_mode == "pre":
+                motion = motion[add_frames:]
+            else:
+                motion = motion[:-add_frames]
+
+        print(f"length of motion: {len(motion)}")
+        kp_infos = self.get_motion_sequence(motion, rescale_ratio=rescale_ratio)
+
+        # driven results
+        if save_dir is None:
+            save_dir = self.cfg.output_dir
+        os.makedirs(save_dir, exist_ok=True)
+        save_path = osp.join(save_dir, f'{reference_name}.mp4')
+
+        # the original (unpadded) audio is passed to the renderer
+        self.motion_processer.driven_by_audio(source_rgb_lst[0], kp_infos, save_path, ori_audio_path, smooth=smooth)
+        return save_path
+
+    def viz_motion(self, motion_data):
+        """Placeholder; not implemented."""
+        pass
+
+    def __call__(self):
+        """Placeholder; not implemented."""
+        pass
+
+
+if __name__ == "__main__":
+    # Command-line entry point: animate one reference image with one driving audio clip.
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Arguments for the task")
+    # NOTE: --task is parsed but not used below; kept for command-line compatibility.
+    parser.add_argument('--task', type=str, default="test", help='Task to perform')
+    parser.add_argument('--cfg_path', type=str, default="configs/audio2motion/inference/inference.yaml", help='Path to configuration file')
+    parser.add_argument('--image_path', type=str, default="src/examples/reference_images/6.jpg", help='Path to the input image')
+    parser.add_argument('--audio_path', type=str, default="src/examples/driving_audios/5.wav", help='Path to the driving audio')
+    parser.add_argument('--silent_audio_path', type=str, default="src/examples/silent-audio.wav", help='Path to silent audio file')
+    parser.add_argument('--save_dir', type=str, default="output/", help='Directory to save results')
+    parser.add_argument('--motion_mean_std_path', type=str, default="src/datasets/mean.pt", help='Path to motion mean and standard deviation file')
+    parser.add_argument('--cfg_scale', type=float, default=1.2, help='Scaling factor for the configuration')
+    args = parser.parse_args()
+
+    pipeline = LiveVASAPipeline(cfg_path=args.cfg_path, motion_mean_std_path=args.motion_mean_std_path)
+    emo = 8  # 'None' in emo_map
+
+    # One output sub-directory per (cfg_scale, emotion) setting. makedirs creates the parent
+    # `args.save_dir` as well, and exist_ok avoids the check-then-create race.
+    save_dir = osp.join(args.save_dir, f"cfg-{args.cfg_scale}-emo-{emo_map[emo]}")
+    os.makedirs(save_dir, exist_ok=True)
+
+    video_path = pipeline.driven_sample(
+        args.image_path,
+        args.audio_path,
+        cfg_scale=args.cfg_scale,
+        emo=emo,
+        save_dir=save_dir,
+        smooth=False,
+        silent_audio_path=args.silent_audio_path,
+    )
+    print(f"Video Result has been saved into: {video_path}")
+
+
diff --git a/src/models/schedulers/__pycache__/flow_matching.cpython-310.pyc b/src/models/schedulers/__pycache__/flow_matching.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23d878fe77aa19b48fd976b09ecbfcc8324ca874
Binary files /dev/null and b/src/models/schedulers/__pycache__/flow_matching.cpython-310.pyc differ
diff --git a/src/models/schedulers/__pycache__/flow_matching2.cpython-310.pyc b/src/models/schedulers/__pycache__/flow_matching2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bbfb2e12158c72390f1d0222ec0ac82bc400861
Binary files /dev/null and b/src/models/schedulers/__pycache__/flow_matching2.cpython-310.pyc differ
diff --git a/src/models/schedulers/__pycache__/scheduling_ddim.cpython-310.pyc b/src/models/schedulers/__pycache__/scheduling_ddim.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8de90f6f917c719881d2de6fa1689cfac1b680e8
Binary files /dev/null and b/src/models/schedulers/__pycache__/scheduling_ddim.cpython-310.pyc differ
diff --git a/src/models/schedulers/flow_matching.py b/src/models/schedulers/flow_matching.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3a677cd4f87586c25172eb91685dbb72a19bdba
--- /dev/null
+++ b/src/models/schedulers/flow_matching.py
@@ -0,0 +1,135 @@
+# modified from https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L23
+
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+class ModelSamplingDiscreteFlow(nn.Module):
+    """Helper for sampler scheduling (ie timestep/sigma calculations) for Discrete Flow models.
+
+    Sigmas for the training timesteps `1..num_train_timesteps` are stored in the `sigmas` buffer.
+    Noisy samples follow `x_sigma = sigma * noise + (1 - sigma) * x_0`, and the regression
+    target returned by `add_noise` is `noise - x_0`.
+    """
+
+    def __init__(self, num_train_timesteps=1000, shift=1.0, **kwargs):
+        """
+        Args:
+            num_train_timesteps (int): Number of discrete training timesteps.
+            shift (float): Time-shift factor used by `to_sigma`; 1.0 means no shift.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        self.num_train_timesteps = num_train_timesteps
+        self.shift = shift
+        # Filled in by `set_timesteps`. Initialised here so that `step` can detect a missing
+        # `set_timesteps` call and raise its ValueError instead of an AttributeError.
+        self.num_inference_steps = None
+        self.timesteps = None
+        ts = self.to_sigma(torch.arange(1, num_train_timesteps + 1, 1))  # [1/1000, 1]
+        self.register_buffer("sigmas", ts)
+
+    @property
+    def sigma_min(self):
+        """Sigma of the first training timestep."""
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        """Sigma of the last training timestep."""
+        return self.sigmas[-1]
+
+    def to_timestep(self, sigma):
+        """Scale a sigma by `num_train_timesteps`."""
+        return sigma * self.num_train_timesteps
+
+    def to_sigma(self, timestep: torch.Tensor):
+        """Map a timestep to its sigma, applying the time shift when `shift != 1.0`."""
+        timestep = timestep / self.num_train_timesteps
+        if self.shift == 1.0:
+            return timestep
+        return self.shift * timestep / (1 + (self.shift - 1) * timestep)
+
+    def uniform_sample_t(self, batch_size, device):
+        """Draw `batch_size` sigmas uniformly from `[sigma_min, sigma_max)`."""
+        ts = (self.sigma_max - self.sigma_min) * torch.rand(batch_size, device=device) + self.sigma_min
+        return ts
+
+    def calculate_denoised(self, sigma, model_output, model_input):
+        """Return `model_input - model_output * sigma`, with `sigma` broadcast over trailing dims."""
+        # model output, vector field, v = dx = (x_1 - x_0)
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input - model_output * sigma
+
+    def noise_scaling(self, sigma, noise, latent_image):
+        """Interpolate between `latent_image` (sigma = 0) and `noise` (sigma = 1)."""
+        return sigma * noise + (1.0 - sigma) * latent_image
+
+    def add_noise(self, sample, noise=None, timesteps=None):
+        """Create noisy training samples.
+
+        Args:
+            sample (torch.Tensor): Clean samples, `(B, L, D)`.
+            noise (torch.Tensor, optional): Noise to mix in; drawn with `torch.randn_like(sample)`
+                when omitted.
+            timesteps (torch.Tensor, optional): Per-sample timesteps `(B,)`; when omitted, sigmas
+                are drawn with `uniform_sample_t` and converted with `to_timestep`.
+
+        Returns:
+            tuple: `(noisy_samples, noise, noise - sample, timesteps)`.
+        """
+        if timesteps is None:
+            # Sample time step
+            batch_size = sample.shape[0]
+            sigmas = self.uniform_sample_t(batch_size, device=sample.device).to(dtype=sample.dtype)  # (B,)
+            timesteps = self.to_timestep(sigmas)
+        else:
+            timesteps = timesteps.to(device=sample.device, dtype=sample.dtype)
+            sigmas = self.to_sigma(timesteps)
+
+        sigmas = sigmas.view(-1, 1, 1)            # (B, 1, 1)
+        # Honour caller-supplied noise; previously the argument was always overwritten.
+        if noise is None:
+            noise = torch.randn_like(sample)
+        noisy_samples = sigmas * noise + (1.0 - sigmas) * sample
+        return noisy_samples, noise, noise - sample, timesteps
+
+    def set_timesteps(self, num_inference_steps, device=None):
+        """Set `self.timesteps` to `num_inference_steps` values evenly spaced from the largest
+        to the smallest training timestep.
+
+        Raises:
+            ValueError: If `num_inference_steps` exceeds `num_train_timesteps`.
+        """
+        if num_inference_steps > self.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.num_train_timesteps`:"
+                f" {self.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # Plain Python floats keep linspace independent of the device of the `sigmas` buffer.
+        start = float(self.to_timestep(self.sigma_max))
+        end = float(self.to_timestep(self.sigma_min))
+        self.timesteps = torch.linspace(start, end, num_inference_steps).to(device)
+
+    def append_dims(self, x, target_dims):
+        """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+        dims_to_append = target_dims - x.ndim
+        return x[(...,) + (None,) * dims_to_append]
+
+    def to_d(self, x, sigma, denoised):
+        """Converts a denoiser output to a Karras ODE derivative."""
+        return (x - denoised) / self.append_dims(sigma, x.ndim)
+
+    @torch.no_grad()
+    def step(self, model_output, timestep, sample, method="euler", **kwargs):
+        """
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model, direction (noise - x_0).
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process, x_t.
+            method (`str`):
+                ODE solver, `euler` or `dpmpp_2m` (the latter is not implemented).
+
+        Returns:
+            `tuple`:
+                `(prev_sample, pred_original_sample)`.
+
+        Raises:
+            ValueError: If `set_timesteps` has not been called, or `method` is unknown.
+            NotImplementedError: For `method="dpmpp_2m"`.
+        """
+
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+        sigma = self.to_sigma(timestep)
+        # The next sigma is one uniform sigma-step below the current one, floored at zero.
+        prev_sigma = sigma - (self.sigma_max - self.sigma_min) / (self.num_inference_steps - 1)
+        prev_sigma = 0.0 if prev_sigma < 0.0 else prev_sigma
+
+        if method == "euler":
+            # Implements Algorithm 2 (Euler steps) from Karras et al. (2022).
+            dt = prev_sigma - sigma
+            prev_sample = sample + model_output * dt
+        elif method == "dpmpp_2m":
+            # DPM-Solver++(2M).
+            raise NotImplementedError
+        else:
+            raise ValueError(f"Unsupported ode solver: {method}, only supports `euler` or `dpmpp_2m`")
+
+        pred_original_sample = sample - model_output * sigma
+
+        return (
+            prev_sample,
+            pred_original_sample
+        )
+
+    def get_pred_original_sample(self, model_output, timestep, sample):
+        """Return `sample - model_output * sigma(timestep)` for a batch of tensor timesteps."""
+        sigma = self.to_sigma(timestep).view(-1, 1, 1)
+        pred_original_sample = sample - model_output * sigma
+
+        return pred_original_sample
\ No newline at end of file
diff --git a/src/models/schedulers/scheduling_ddim.py b/src/models/schedulers/scheduling_ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..41f884c191bea9ea62285ef37ffb3628da14baa8
--- /dev/null
+++ b/src/models/schedulers/scheduling_ddim.py
@@ -0,0 +1,645 @@
+# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+# Modified from diffusers.schedulers.scheduling_ddpm
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from collections import OrderedDict
+
+class BaseOutput(OrderedDict):
+    """
+    Base class for scheduler outputs; appears to be a trimmed copy of the diffusers `BaseOutput` helper.
+
+    As written here it is a plain `OrderedDict` subclass whose only addition is the pytree registration
+    performed in `__init_subclass__`. It defines no `__getitem__` override, `__post_init__` or `to_tuple`
+    of its own, so it behaves like a regular ordered dictionary.
+
+    NOTE(review): `@dataclass` subclasses therefore seem to keep their fields as attributes only, leaving the
+    dictionary itself empty - confirm before relying on dict-style access or on pytree flattening of the fields.
+    """
+
+    def __init_subclass__(cls) -> None:
+        """Register subclasses as pytree nodes.
+
+        This is necessary to synchronize gradients when using `torch.nn.parallel.DistributedDataParallel` with
+        `static_graph=True` with modules that output `ModelOutput` subclasses.
+        """
+        
+        # Imported lazily; note that `_dict_flatten` / `_dict_unflatten` are private torch helpers.
+        import torch.utils._pytree
+        torch.utils._pytree.register_pytree_node(
+            cls,
+            torch.utils._pytree._dict_flatten,
+            lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)),
+        )
+
+
+def randn_tensor(
+    shape: Union[Tuple, List],
+    generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
+    device: Optional["torch.device"] = None,
+    dtype: Optional["torch.dtype"] = None,
+    layout: Optional["torch.layout"] = None,
+):
+    """A helper function to create random tensors on the desired `device` with the desired `dtype`. When
+    passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor
+    is always created on the CPU.
+
+    Args:
+        shape: Output shape as a tuple or list; `shape[0]` is the batch size.
+        generator: A single generator, or a list with one generator per batch element.
+        device: Target device (`torch.device` or device string); defaults to CPU.
+        dtype: Data type forwarded to `torch.randn`.
+        layout: Tensor layout; defaults to `torch.strided`.
+
+    Returns:
+        `torch.Tensor`: Standard-normal samples of the requested shape on `device`.
+
+    Raises:
+        ValueError: If a CUDA generator is combined with a non-CUDA target device.
+    """
+    # device on which tensor is created defaults to device
+    rand_device = device
+    batch_size = shape[0]
+
+    layout = layout or torch.strided
+    # Device strings are normalised so the `.type` comparisons below work for them as well.
+    if isinstance(device, str):
+        device = torch.device(device)
+    device = device or torch.device("cpu")
+
+    if generator is not None:
+        gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type
+        if gen_device_type != device.type and gen_device_type == "cpu":
+            rand_device = "cpu"
+            # Compare device *types*: a torch.device never equals the plain string "mps".
+            if device.type != "mps":
+                print(
+                    f"The passed generator was created on 'cpu' even though a tensor on {device} was expected."
+                    f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably"
+                    f" slighly speed up this function by passing a generator that was created on the {device} device."
+                )
+        elif gen_device_type != device.type and gen_device_type == "cuda":
+            raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.")
+
+    # make sure generator list of length 1 is treated like a non-list
+    if isinstance(generator, list) and len(generator) == 1:
+        generator = generator[0]
+
+    if isinstance(generator, list):
+        # One draw per batch element, each with its own generator. `tuple(...)` lets a
+        # list-typed `shape` work too (tuple + list would raise TypeError).
+        shape = (1,) + tuple(shape[1:])
+        latents = [
+            torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout)
+            for i in range(batch_size)
+        ]
+        latents = torch.cat(latents, dim=0).to(device)
+    else:
+        latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device)
+
+    return latents
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class DDIMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+
+    NOTE(review): the image shapes above come from the upstream docstring; tensors produced in this project
+    presumably have the shape of the motion samples being denoised - confirm against the caller.
+    """
+
+    # Required: the sample to feed into the next denoising step.
+    prev_sample: torch.Tensor
+    # Optional: defaults to None when the scheduler does not provide an x_0 estimate.
+    pred_original_sample: Optional[torch.Tensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Discretize a continuous `alpha_bar(t)` curve, t in [0, 1], into per-step betas.
+
+    `alpha_bar(t)` is the cumulative product of `(1 - beta)` up to time `t`. Step `i` gets
+    `beta_i = min(1 - alpha_bar((i + 1) / N) / alpha_bar(i / N), max_beta)` with
+    `N = num_diffusion_timesteps`.
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): upper bound applied to every beta; values below 1 prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): shape of the `alpha_bar` curve,
+            either `cosine` or `exp`.
+
+    Returns:
+        betas (`torch.Tensor`): float32 tensor of length `num_diffusion_timesteps`.
+
+    Raises:
+        ValueError: for any other `alpha_transform_type`.
+    """
+
+    def _cosine_alpha_bar(t):
+        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    def _exp_alpha_bar(t):
+        return math.exp(t * -12.0)
+
+    if alpha_transform_type == "cosine":
+        alpha_bar_fn = _cosine_alpha_bar
+    elif alpha_transform_type == "exp":
+        alpha_bar_fn = _exp_alpha_bar
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    n_steps = num_diffusion_timesteps
+    beta_values = [
+        min(1 - alpha_bar_fn((step + 1) / n_steps) / alpha_bar_fn(step / n_steps), max_beta)
+        for step in range(n_steps)
+    ]
+    return torch.tensor(beta_values, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescale a beta schedule so that the final timestep has zero signal-to-noise ratio, following
+    https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1).
+
+    Args:
+        betas (`torch.Tensor`):
+            the betas that the scheduler is being initialized with. Not modified.
+
+    Returns:
+        `torch.Tensor`: rescaled betas with zero terminal SNR
+    """
+    # betas -> sqrt(alpha_bar)
+    sqrt_alpha_bar = torch.cumprod(1.0 - betas, dim=0).sqrt()
+
+    # Remember both endpoints before transforming.
+    first = sqrt_alpha_bar[0].clone()
+    last = sqrt_alpha_bar[-1].clone()
+
+    # Shift so the last value becomes zero, then scale so the first value is restored.
+    sqrt_alpha_bar = (sqrt_alpha_bar - last) * (first / (first - last))
+
+    # sqrt(alpha_bar) -> alpha_bar -> per-step alphas (undo the cumulative product) -> betas
+    alpha_bar = sqrt_alpha_bar**2
+    step_alphas = torch.cat([alpha_bar[0:1], alpha_bar[1:] / alpha_bar[:-1]])
+    return 1 - step_alphas
+
+
+class DDIMScheduler():
+    """
+    `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+        time_shifting: bool = True,
+    ):
+        """Store the configuration verbatim and precompute the beta / alpha schedule tensors."""
+        self.num_train_timesteps = num_train_timesteps
+        self.beta_start = beta_start
+        self.beta_end = beta_end
+        self.beta_schedule = beta_schedule
+        self.trained_betas = trained_betas
+        self.clip_sample = clip_sample
+        self.set_alpha_to_one = set_alpha_to_one
+        self.steps_offset = steps_offset
+        self.prediction_type = prediction_type
+        self.thresholding = thresholding
+        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
+        self.clip_sample_range = clip_sample_range
+        self.sample_max_value = sample_max_value
+        self.timestep_spacing = timestep_spacing
+        self.rescale_betas_zero_snr = rescale_betas_zero_snr
+        self.time_shifting = time_shifting
+
+        # Pick the beta schedule; explicitly supplied betas take precedence over any named schedule.
+        if trained_betas is not None:
+            betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # linear in sqrt(beta), then squared; this schedule is very specific to the latent diffusion model
+            sqrt_betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32)
+            betas = sqrt_betas**2
+        elif beta_schedule == "squaredcos_cap_v2":
+            betas = betas_for_alpha_bar(num_train_timesteps)  # Glide cosine schedule
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+        # Optionally rescale the betas for zero terminal SNR.
+        self.betas = rescale_zero_terminal_snr(betas) if rescale_betas_zero_snr else betas
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # DDIM looks up the previous alphas_cumprod at every step. The final step has no predecessor, so
+        # `set_alpha_to_one` chooses between exactly 1.0 and the first cumulative alpha as its stand-in.
+        if set_alpha_to_one:
+            self.final_alpha_cumprod = torch.tensor(1.0)
+        else:
+            self.final_alpha_cumprod = self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+        # settable values: until set_timesteps() runs, the chain is every training step in descending order
+        self.num_inference_steps = None
+        self.timesteps = torch.arange(num_train_timesteps - 1, -1, -1, dtype=torch.int64)
+
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Return `sample` unchanged.
+
+        This scheduler applies no input scaling; the method exists for interchangeability with schedulers that do.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain; not used by this scheduler.
+
+        Returns:
+            `torch.Tensor`: the same `sample` object that was passed in.
+        """
+        return sample
+
+    def _get_variance(self, timestep, prev_timestep):
+        """Return the DDIM variance term for the transition from `timestep` to `prev_timestep`."""
+        cumprod_now = self.alphas_cumprod[timestep]
+        if prev_timestep >= 0:
+            cumprod_prev = self.alphas_cumprod[prev_timestep]
+        else:
+            cumprod_prev = self.final_alpha_cumprod  # negative index: no earlier entry in the schedule
+        beta_ratio = (1 - cumprod_prev) / (1 - cumprod_now)
+        return beta_ratio * (1 - cumprod_now / cumprod_prev)
+
+    def uniform_sample_t(self, batch_size):
+        """Draw `batch_size` integer timesteps uniformly from [1, num_train_timesteps)."""
+        return torch.randint(1, self.num_train_timesteps, (batch_size,))
+
+    def log_normal_sample_t(self, batch_size):
+        """Placeholder: log-normal sampling is still TODO, so this draws uniformly from [1, num_train_timesteps)."""
+        # TODO: replace the uniform draw below with an actual log-normal timestep sampler
+        return torch.randint(1, self.num_train_timesteps, (batch_size,))
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+
+        Args:
+            sample (`torch.Tensor`): predicted x_0 with at least two dimensions, `(batch, channels, ...)`.
+
+        Returns:
+            `torch.Tensor`: the thresholded sample, with the shape and dtype of the input.
+        """
+        dtype = sample.dtype
+        original_shape = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # One row per batch element. `flatten` also covers inputs without trailing dims, for which the former
+        # `channels * np.prod([])` evaluated to a float and made `reshape` raise.
+        sample = sample.flatten(start_dim=1)
+
+        # "a certain percentile absolute pixel value"; clamping to min=1 reduces to plain clipping to [-1, 1]
+        s = torch.quantile(sample.abs(), self.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(s, min=1, max=self.sample_max_value).unsqueeze(1)  # (batch_size, 1) for broadcasting
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        return sample.reshape(original_shape).to(dtype)
+
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                Device passed to `.to(...)` on the resulting `self.timesteps` tensor.
+
+        Raises:
+            ValueError: if `num_inference_steps` exceeds `self.num_train_timesteps`, or if
+                `self.timestep_spacing` is not one of `"linspace"`, `"leading"` or `"trailing"`.
+        """
+        if num_inference_steps > self.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.num_train_timesteps`:"
+                f" {self.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.timestep_spacing == "linspace":
+            # evenly spaced over [0, num_train_timesteps - 1], both ends included, rounded and reversed
+            timesteps = np.linspace(0, self.num_train_timesteps - 1, num_inference_steps).round()[::-1]
+            timesteps = timesteps.copy().astype(np.int64)
+        elif self.timestep_spacing == "leading":
+            step_ratio = self.num_train_timesteps // self.num_inference_steps
+            # integer multiples of the stride starting at 0, reversed, then shifted by `steps_offset`
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps += self.steps_offset
+        elif self.timestep_spacing == "trailing":
+            step_ratio = self.num_train_timesteps / self.num_inference_steps
+            # count down from num_train_timesteps with a fractional stride, round, then shift to 0-based indices
+            timesteps = np.round(np.arange(self.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.timestep_spacing} is not supported. Please make sure to choose one of 'linspace',"
+                " 'leading' or 'trailing'."
+            )
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain; used to index `self.alphas_cumprod`.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                Weight η of the noise term (`std_dev_t = eta * sqrt(variance)`); noise is only added when `eta > 0`.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                Random number generator handed to `randn_tensor` when `variance_noise` is not given.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise
+                the tuple `(prev_sample, pred_original_sample)` is returned.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read the DDIM paper for an in-detail understanding
+
+        # Notation (<variable name> -> <name in paper>)
+        # - pred_epsilon -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - prev_sample -> "x_t-1"
+
+        # 1. get previous step value (= t minus the stride num_train_timesteps // num_inference_steps)
+        prev_timestep = timestep - self.num_train_timesteps // self.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            pred_epsilon = model_output
+        elif self.prediction_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+        elif self.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
+
+        # 4. Clip or threshold "predicted x_0" (thresholding takes precedence over plain clipping)
+        if self.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.clip_sample_range, self.clip_sample_range
+            )
+
+        # 5. compute variance: "sigma_t(η)" -> see formula (16)
+        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+        variance = self._get_variance(timestep, prev_timestep)
+        std_dev_t = eta * variance ** (0.5)
+
+        if use_clipped_model_output:
+            # the pred_epsilon is always re-derived from the clipped x_0 in Glide
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
+
+        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+        if eta > 0:
+            if variance_noise is not None and generator is not None:
+                raise ValueError(
+                    "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
+                    " `variance_noise` stays `None`."
+                )
+
+            if variance_noise is None:
+                variance_noise = randn_tensor(
+                    model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+                )
+            variance = std_dev_t * variance_noise  # the name `variance` is reused for the scaled noise term
+
+            prev_sample = prev_sample + variance
+
+        if not return_dict:
+            return (
+                prev_sample,
+                pred_original_sample,
+            )
+
+        return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    # Adapted from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise (samples missing noise/timesteps, returns a tuple)
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: Optional[torch.Tensor] = None,
+        timesteps: Optional[torch.IntTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.IntTensor]]:
+        """
+        Diffuse `original_samples` forward to the given timesteps.
+
+        Missing `timesteps` are drawn with `uniform_sample_t` (one per batch element) and missing `noise` is drawn
+        from a standard normal with the shape of `original_samples`.
+
+        Returns:
+            `tuple`: `(noisy_samples, noise, timesteps)`, where `timesteps` has been cast to the dtype of
+            `original_samples`.
+        """
+        if timesteps is None:
+            timesteps = self.uniform_sample_t(original_samples.shape[0])
+        if noise is None:
+            noise = torch.randn_like(original_samples)
+
+        # Keep the cached schedule on the samples' device so later calls skip the CPU -> GPU copy.
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
+        timesteps = timesteps.to(original_samples.device)
+        alpha_bar = self.alphas_cumprod.to(dtype=original_samples.dtype)[timesteps].flatten()
+
+        # One coefficient per timestep, reshaped so it broadcasts over every trailing sample dimension.
+        trailing = (1,) * (original_samples.dim() - 1)
+        signal_scale = (alpha_bar**0.5).reshape(-1, *trailing)
+        noise_scale = ((1 - alpha_bar) ** 0.5).reshape(-1, *trailing)
+
+        return signal_scale * original_samples + noise_scale * noise, noise, timesteps.to(original_samples.dtype)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity `sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample`.
+
+        `timesteps` indexes `self.alphas_cumprod`; each looked-up coefficient is broadcast over the trailing
+        dimensions of `sample`.
+        """
+        # Cache the schedule on the sample's device; the dtype cast stays local to this call.
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        timesteps = timesteps.to(sample.device)
+        alpha_bar = self.alphas_cumprod.to(dtype=sample.dtype)[timesteps].flatten()
+
+        # (batch,) -> (batch, 1, ..., 1) so the coefficients line up with `sample`
+        trailing = (1,) * (sample.dim() - 1)
+        noise_coeff = (alpha_bar**0.5).reshape(-1, *trailing)
+        sample_coeff = ((1 - alpha_bar) ** 0.5).reshape(-1, *trailing)
+
+        return noise_coeff * noise - sample_coeff * sample
+
+    def get_pred_x(self, sample, noise, timesteps):
+        """
+        Invert the forward noising formula: `(sample - sqrt(1 - alpha_bar_t) * noise) / sqrt(alpha_bar_t)`.
+
+        Args:
+            sample: noisy tensor; its device, dtype and rank drive the casts and the broadcasting below.
+            noise: noise tensor to remove from `sample` (presumably the model's noise prediction - confirm).
+            timesteps: integer tensor indexing `self.alphas_cumprod`.
+
+        Returns:
+            Tensor obtained by removing the scaled `noise` from `sample` and dividing by `sqrt(alpha_bar_t)`.
+        """
+        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
+        timesteps = timesteps.to(sample.device)
+        alpha_bar = self.alphas_cumprod.to(dtype=sample.dtype)[timesteps].flatten()
+
+        # (batch,) -> (batch, 1, ..., 1) so each coefficient broadcasts over its own sample
+        trailing = (1,) * (sample.dim() - 1)
+        sqrt_alpha = (alpha_bar**0.5).reshape(-1, *trailing)
+        sqrt_one_minus_alpha = ((1 - alpha_bar) ** 0.5).reshape(-1, *trailing)
+
+        return (1 / sqrt_alpha) * (sample - sqrt_one_minus_alpha * noise)
+
+    def __len__(self):
+        return self.num_train_timesteps  # len(scheduler) is the number of training timesteps
diff --git a/src/thirdparty/liveportrait/speed.py b/src/thirdparty/liveportrait/speed.py
new file mode 100644
index 0000000000000000000000000000000000000000..20a3482dfd9d486491e58316a24b74a70ed412cf
--- /dev/null
+++ b/src/thirdparty/liveportrait/speed.py
@@ -0,0 +1,195 @@
+# coding: utf-8
+
+"""
+Benchmark the inference speed of each module in LivePortrait.
+
+TODO: the code is heavily GPT-styled and needs refactoring
+"""
+
+import torch
+torch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution
+
+import yaml
+import time
+import numpy as np
+
+from src.utils.helper import load_model, concat_feat
+from src.config.inference_config import InferenceConfig
+
+
+def initialize_inputs(batch_size=1, device_id=0):
+    """Build the random FP16 tensors the benchmark feeds to each module, keyed by input name."""
+
+    def random_half(*shape):
+        # standard-normal sample moved to `device_id`, then cast to half precision
+        return torch.randn(batch_size, *shape).to(device_id).half()
+
+    # The draw order below is fixed, so seeding the global RNG beforehand reproduces the same tensors.
+    # eye_close_ratio / lip_close_ratio only feed the concatenated features and are not returned.
+    feature_3d = random_half(32, 16, 64, 64)
+    kp_source = random_half(21, 3)
+    kp_driving = random_half(21, 3)
+    source_image = random_half(3, 256, 256)
+    generator_input = random_half(256, 64, 64)
+    eye_close_ratio = random_half(3)
+    lip_close_ratio = random_half(2)
+
+    return {
+        'feature_3d': feature_3d,
+        'kp_source': kp_source,
+        'kp_driving': kp_driving,
+        'source_image': source_image,
+        'generator_input': generator_input,
+        'feat_stitching': concat_feat(kp_source, kp_driving).half(),
+        'feat_eye': concat_feat(kp_source, eye_close_ratio).half(),
+        'feat_lip': concat_feat(kp_source, lip_close_ratio).half(),
+    }
+
+
+def load_and_compile_models(cfg, model_config):
+    """
+    Load every benchmarked sub-network, cast it to FP16 and wrap it with `torch.compile`.
+
+    Returns:
+        `(compiled_models, stitching_retargeting_module)`: a dict of the four main networks keyed by display
+        name, and the loaded retargeting container whose 'stitching', 'eye' and 'lip' entries have been
+        replaced by their compiled counterparts.
+    """
+
+    def to_compiled_half(net):
+        # cast to FP16, compile with max-autotune, then switch the compiled wrapper to evaluation mode
+        net = torch.compile(net.half(), mode='max-autotune')
+        net.eval()
+        return net
+
+    # All five checkpoints are loaded before anything is compiled.
+    loaded = {
+        'Appearance Feature Extractor': load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor'),
+        'Motion Extractor': load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor'),
+        'Warping Network': load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module'),
+        'SPADE Decoder': load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator'),
+    }
+    stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module')
+
+    # Main networks keep their display names as dict keys.
+    compiled_models = {name: to_compiled_half(net) for name, net in loaded.items()}
+    # Retargeting heads are compiled in place inside their container.
+    for part in ('stitching', 'eye', 'lip'):
+        stitching_retargeting_module[part] = to_compiled_half(stitching_retargeting_module[part])
+
+    return compiled_models, stitching_retargeting_module
+
+
+def warm_up_models(compiled_models, stitching_retargeting_module, inputs):
+    """Run every benchmarked module 10 times under `torch.no_grad()` to prepare it for benchmarking."""
+    retargeting = stitching_retargeting_module
+    print("Warm up start!")
+    with torch.no_grad():
+        for _warmup_round in range(10):
+            # the four compiled main networks
+            compiled_models['Appearance Feature Extractor'](inputs['source_image'])
+            compiled_models['Motion Extractor'](inputs['source_image'])
+            compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])
+            compiled_models['SPADE Decoder'](inputs['generator_input'])
+            # the three retargeting heads, each fed its own `feat_*` input
+            for part, feat_key in (('stitching', 'feat_stitching'), ('eye', 'feat_eye'), ('lip', 'feat_lip')):
+                retargeting[part](inputs[feat_key])
+    print("Warm up end!")
+
+
+def measure_inference_times(compiled_models, stitching_retargeting_module, inputs):
+    """
+    Time 100 no-grad forward passes of every benchmarked module.
+
+    Intervals use `time.perf_counter()` (monotonic, high resolution); `torch.cuda.synchronize()` runs before
+    each reading so queued GPU work is counted in the stage that launched it.
+
+    Returns:
+        `(times, overall_times)`: `times` maps each key of `compiled_models` plus
+        'Stitching and Retargeting Modules' to a list of per-iteration durations in seconds; `overall_times`
+        holds the duration of each complete iteration in seconds.
+    """
+    retargeting_key = 'Stitching and Retargeting Modules'
+    times = {name: [] for name in compiled_models.keys()}
+    times[retargeting_key] = []
+    overall_times = []
+
+    def run_retargeting():
+        # the three retargeting heads are timed together as a single stage
+        stitching_retargeting_module['stitching'](inputs['feat_stitching'])
+        stitching_retargeting_module['eye'](inputs['feat_eye'])
+        stitching_retargeting_module['lip'](inputs['feat_lip'])
+
+    # (key in `times`, zero-argument callable running that stage once); lookups stay lazy inside the lambdas
+    stages = [
+        ('Appearance Feature Extractor', lambda: compiled_models['Appearance Feature Extractor'](inputs['source_image'])),
+        ('Motion Extractor', lambda: compiled_models['Motion Extractor'](inputs['source_image'])),
+        ('Warping Network', lambda: compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])),
+        ('SPADE Decoder', lambda: compiled_models['SPADE Decoder'](inputs['generator_input'])),
+        (retargeting_key, run_retargeting),
+    ]
+
+    with torch.no_grad():
+        for _ in range(100):
+            torch.cuda.synchronize()  # drain pending GPU work so it is not billed to the first stage
+            overall_start = time.perf_counter()
+            for key, run_stage in stages:
+                start = time.perf_counter()
+                run_stage()
+                torch.cuda.synchronize()
+                times[key].append(time.perf_counter() - start)
+            overall_times.append(time.perf_counter() - overall_start)
+    return times, overall_times
+
+
+def print_benchmark_results(compiled_models, stitching_retargeting_module, retargeting_models, times, overall_times):
+    """
+    Print parameter counts (in millions) and mean/std latency (in ms) per timed stage, using the actual number
+    of recorded samples as the run count; the end-to-end latency from `overall_times` is printed last if present.
+    """
+    for name, model in compiled_models.items():
+        num_params_in_millions = sum(p.numel() for p in model.parameters()) / 1e6
+        print(f"Number of parameters for {name}: {num_params_in_millions:.2f} M")
+
+    for index, retarget in enumerate(retargeting_models):
+        num_params_in_millions = sum(p.numel() for p in stitching_retargeting_module[retarget].parameters()) / 1e6
+        print(f"Number of parameters for part_{index} in Stitching and Retargeting Modules: {num_params_in_millions:.2f} M")
+
+    # per-stage latencies first, then the whole-iteration latency (collected by the caller, formerly never shown)
+    stage_samples = list(times.items())
+    if len(overall_times) > 0:
+        stage_samples.append(('the full pipeline', overall_times))
+    for name, samples in stage_samples:
+        avg_ms, std_ms = np.mean(samples) * 1000, np.std(samples) * 1000
+        print(f"Average inference time for {name} over {len(samples)} runs: {avg_ms:.2f} ms (std: {std_ms:.2f} ms)")
+
+
+def main():
+    """
+    Benchmark entry point: load the config, build inputs and models, warm up, time, then print the report.
+    """
+    cfg = InferenceConfig()
+    # The model configuration is read from the YAML file that `cfg.models_config` points to.
+    with open(cfg.models_config, 'r') as config_file:
+        model_config = yaml.safe_load(config_file)
+
+    # Random inputs are created on the configured device before any model is loaded.
+    inputs = initialize_inputs(device_id=cfg.device_id)
+    compiled_models, stitching_retargeting_module = load_and_compile_models(cfg, model_config)
+
+    # Warm-up passes run before the timed passes.
+    warm_up_models(compiled_models, stitching_retargeting_module, inputs)
+    times, overall_times = measure_inference_times(compiled_models, stitching_retargeting_module, inputs)
+
+    print_benchmark_results(
+        compiled_models,
+        stitching_retargeting_module,
+        ['stitching', 'eye', 'lip'],
+        times,
+        overall_times,
+    )
+
+
+if __name__ == "__main__":
+    main()  # run the benchmark only when this file is executed as a script, not when it is imported
diff --git a/src/thirdparty/liveportrait/src/config/__init__.py b/src/thirdparty/liveportrait/src/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/thirdparty/liveportrait/src/config/argument_config.py b/src/thirdparty/liveportrait/src/config/argument_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..055f5f83e49225e0ce9f702b5d3851e4da19e6b0
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/config/argument_config.py
@@ -0,0 +1,57 @@
+# coding: utf-8
+
+"""
+All configs for user
+"""
+from dataclasses import dataclass
+import tyro
+from typing_extensions import Annotated
+from typing import Optional, Literal
+from .base_config import PrintableConfig, make_abs_path
+
+
+@dataclass(repr=False)  # use repr from PrintableConfig
+class ArgumentConfig(PrintableConfig):
+    ########## input arguments ##########
+    source: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/source/s0.jpg')  # path to the source portrait (human/animal) or video (human)
+    driving:  Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d0.mp4')  # path to driving video or template (.pkl format)
+    output_dir: Annotated[str, tyro.conf.arg(aliases=["-o"])] = 'animations/'  # directory to save output video
+
+    ########## inference arguments ##########
+    flag_use_half_precision: bool = True  # whether to use half precision (FP16). If black boxes appear, it might be due to GPU incompatibility; set to False.
+    flag_crop_driving_video: bool = False  # whether to crop the driving video, if the given driving info is a video
+    device_id: int = 0  # gpu device id
+    flag_force_cpu: bool = False  # force cpu inference, WIP!
+    flag_normalize_lip: bool = False  # whether to set the lip to a closed state before animation; only takes effect when flag_eye_retargeting and flag_lip_retargeting are False
+    flag_source_video_eye_retargeting: bool = False  # when the input is a source video, whether to make the eye-open scalar of each frame the same as that of the first source frame before the animation; only takes effect when flag_eye_retargeting and flag_lip_retargeting are False; may cause inter-frame jittering
+    flag_eye_retargeting: bool = False  # not recommended to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame
+    flag_lip_retargeting: bool = False  # not recommended to be True, WIP; whether to transfer the lip-open ratio of each driving frame to the source image or the corresponding source frame
+    flag_stitching: bool = True  # recommended to be True if head movement is small, False if head movement is large or the source image is an animal
+    flag_relative_motion: bool = True # whether to use relative motion
+    flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
+    flag_do_crop: bool = True  # whether to crop the source portrait or video to the face-cropping space
+    driving_option: Literal["expression-friendly", "pose-friendly"] = "expression-friendly" # "expression-friendly" or "pose-friendly"; "expression-friendly" would adapt the driving motion with the global multiplier, and could be used when the source is a human image
+    driving_multiplier: float = 1.0 # used only when driving_option is "expression-friendly"
+    driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+    audio_priority: Literal['source', 'driving'] = 'driving'  # whether to use the audio from source or driving video
+    animation_region: Literal["exp", "pose", "lip", "eyes", "all"] = "all" # the region where the animation was performed, "exp" means the expression, "pose" means the head pose, "all" means all regions
+    ########## source crop arguments ##########
+    det_thresh: float = 0.15 # detection threshold
+    scale: float = 2.3  # the ratio of face area is smaller if scale is larger
+    vx_ratio: float = 0  # the ratio to move the face to left or right in cropping space
+    vy_ratio: float = -0.125  # the ratio to move the face to up or down in cropping space
+    flag_do_rot: bool = True  # whether to conduct the rotation when flag_do_crop is True
+    source_max_dim: int = 1280 # the max dim of height and width of source image or video, you can change it to a larger number, e.g., 1920
+    source_division: int = 2 # make sure the height and width of source image or video can be divided by this number
+
+    ########## driving crop arguments ##########
+    scale_crop_driving_video: float = 2.2  # scale factor for cropping driving video
+    vx_ratio_crop_driving_video: float = 0.  # adjust x offset
+    vy_ratio_crop_driving_video: float = -0.1  # adjust y offset
+
+    ########## gradio arguments ##########
+    server_port: Annotated[int, tyro.conf.arg(aliases=["-p"])] = 8890  # port for gradio server
+    share: bool = False  # whether to share the server to public
+    server_name: Optional[str] = "127.0.0.1"  # set the local server name, "0.0.0.0" to broadcast all
+    flag_do_torch_compile: bool = False  # whether to use torch.compile to accelerate generation
+    gradio_temp_dir: Optional[str] = None  # directory to save gradio temp files
diff --git a/src/thirdparty/liveportrait/src/config/base_config.py b/src/thirdparty/liveportrait/src/config/base_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e1c75ae358d173587c9ebd4afaa4b18ff70b464
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/config/base_config.py
@@ -0,0 +1,29 @@
+# coding: utf-8
+
+"""
+pretty printing class
+"""
+
+from __future__ import annotations
+import os.path as osp
+from typing import Tuple
+
+
+def make_abs_path(fn):  # join `fn` onto the directory of this module's real (symlink-resolved) path
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+class PrintableConfig:
+    """Mixin whose ``__repr__`` prints the class name followed by one indented ``key: value`` line per attribute."""
+
+    def __repr__(self):
+        # Header first; every later entry is joined with a newline plus four spaces.
+        rendered = [f"{self.__class__.__name__}:"]
+        for name, value in vars(self).items():
+            if isinstance(value, tuple):
+                # One item per line inside brackets; newlines left at the very end are stripped.
+                pieces = "".join(str(item) + "\n" for item in value)
+                value = ("[" + pieces).rstrip("\n") + "]"
+            # Split multi-line values so each physical line receives the same indent.
+            rendered.extend(f"{name}: {value!s}".split("\n"))
+        return "\n    ".join(rendered)
diff --git a/src/thirdparty/liveportrait/src/config/crop_config.py b/src/thirdparty/liveportrait/src/config/crop_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaf6cc2020088a2c512a965d45a17f570c442d7f
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/config/crop_config.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+"""
+parameters used for crop faces
+"""
+
+from dataclasses import dataclass
+
+from .base_config import PrintableConfig, make_abs_path
+
+
+@dataclass(repr=False)  # use repr from PrintableConfig
+class CropConfig(PrintableConfig):
+    """Default model paths and numeric options used when cropping faces from the source and the driving video."""
+    insightface_root: str = make_abs_path("../../pretrained_weights/insightface")
+    landmark_ckpt_path: str = make_abs_path("../../pretrained_weights/liveportrait/landmark.onnx")
+    xpose_config_file_path: str = make_abs_path("../utils/dependencies/XPose/config_model/UniPose_SwinT.py")
+    xpose_embedding_cache_path: str = make_abs_path('../utils/resources/clip_embedding')
+    xpose_ckpt_path: str = make_abs_path("../../pretrained_weights/liveportrait_animals/xpose.pth")
+    device_id: int = 0  # gpu device id
+    flag_force_cpu: bool = False  # force cpu inference, WIP
+    det_thresh: float = 0.1 # detection threshold
+    ########## source image or video cropping option ##########
+    dsize: int = 512  # crop size
+    scale: float = 2.3  # scale factor
+    vx_ratio: float = 0  # vx ratio
+    vy_ratio: float = -0.125  # vy ratio +up, -down
+    max_face_num: int = 0  # max face number, 0 means no limit
+    flag_do_rot: bool = True # whether to conduct the rotation when flag_do_crop is True
+    animal_face_type: str = "animal_face_9"  # animal_face_68 -> 68 landmark points, animal_face_9 -> 9 landmarks
+    ########## driving video auto cropping option ##########
+    scale_crop_driving_video: float = 2.2  # 2.0 # scale factor for cropping driving video
+    vx_ratio_crop_driving_video: float = 0.0  # adjust x (horizontal) offset
+    vy_ratio_crop_driving_video: float = -0.1  # adjust y (vertical) offset
+    direction: str = "large-small"  # direction of cropping
diff --git a/src/thirdparty/liveportrait/src/config/inference_config.py b/src/thirdparty/liveportrait/src/config/inference_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ed197a7eb0398220b7b83e093615e7814ad745
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/config/inference_config.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+
+"""
+config dataclass used for inference
+"""
+
+import cv2
+from numpy import ndarray
+import pickle as pkl
+from dataclasses import dataclass, field
+from typing import Literal, Tuple
+from .base_config import PrintableConfig, make_abs_path
+
+def load_lip_array():  # unpickle the bundled lip_array.pkl; pickle executes code, so this file must stay trusted
+    with open(make_abs_path('../utils/resources/lip_array.pkl'), 'rb') as f:
+        return pkl.load(f)
+
+@dataclass(repr=False)  # use repr from PrintableConfig
+class InferenceConfig(PrintableConfig):
+    """Inference defaults: checkpoint paths (human and animal), runtime flags, and output/video settings."""
+    # HUMAN MODEL CONFIG, NOT EXPORTED PARAMS
+    models_config: str = make_abs_path('./models.yaml')  # portrait animation config
+    checkpoint_F: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/appearance_feature_extractor.pth')  # path to checkpoint of F
+    checkpoint_M: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/motion_extractor.pth')  # path to checkpoint of M
+    checkpoint_G: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/spade_generator.pth')  # path to checkpoint of G
+    checkpoint_W: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/warping_module.pth')  # path to checkpoint of W
+    checkpoint_S: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth')  # path to checkpoint of S and R_eyes, R_lip
+
+    # ANIMAL MODEL CONFIG, NOT EXPORTED PARAMS
+    checkpoint_F_animal: str = make_abs_path('../../pretrained_weights/liveportrait_animals/base_models/appearance_feature_extractor.pth')  # path to checkpoint of F
+    checkpoint_M_animal: str = make_abs_path('../../pretrained_weights/liveportrait_animals/base_models/motion_extractor.pth')  # path to checkpoint of M
+    checkpoint_G_animal: str = make_abs_path('../../pretrained_weights/liveportrait_animals/base_models/spade_generator.pth')  # path to checkpoint of G
+    checkpoint_W_animal: str = make_abs_path('../../pretrained_weights/liveportrait_animals/base_models/warping_module.pth')  # path to checkpoint of W
+    checkpoint_S_animal: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth')  # path to checkpoint of S and R_eyes, R_lip, NOTE: use human temporarily!
+
+    # EXPORTED PARAMS
+    flag_use_half_precision: bool = True
+    flag_crop_driving_video: bool = False
+    device_id: int = 0
+    flag_normalize_lip: bool = True
+    flag_source_video_eye_retargeting: bool = False
+    flag_eye_retargeting: bool = False
+    flag_lip_retargeting: bool = False
+    flag_stitching: bool = True
+    flag_relative_motion: bool = True
+    flag_pasteback: bool = True
+    flag_do_crop: bool = True
+    flag_do_rot: bool = True
+    flag_force_cpu: bool = False
+    flag_do_torch_compile: bool = False
+    driving_option: str = "pose-friendly" # "expression-friendly" or "pose-friendly"
+    driving_multiplier: float = 1.0
+    driving_smooth_observation_variance: float = 3e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+    source_max_dim: int = 1280 # the max dim of height and width of source image or video
+    source_division: int = 2 # make sure the height and width of source image or video can be divided by this number
+    animation_region: Literal["exp", "pose", "lip", "eyes", "all"] = "all" # the region where the animation was performed, "exp" means the expression, "pose" means the head pose
+
+    # NOT EXPORTED PARAMS
+    lip_normalize_threshold: float = 0.03 # threshold for flag_normalize_lip
+    source_video_eye_retargeting_threshold: float = 0.18 # threshold for eyes retargeting if the input is a source video
+    anchor_frame: int = 0 # TO IMPLEMENT
+
+    input_shape: Tuple[int, int] = (256, 256)  # input shape
+    output_format: Literal['mp4', 'gif'] = 'mp4'  # output video format
+    crf: int = 15  # crf for output video
+    output_fps: int = 25 # default output fps
+    mask_crop: ndarray = field(default_factory=lambda: cv2.imread(make_abs_path('../utils/resources/mask_template.png'), cv2.IMREAD_COLOR))
+    lip_array: ndarray = field(default_factory=load_lip_array)
+    size_gif: int = 256 # default gif size, TO IMPLEMENT
diff --git a/src/thirdparty/liveportrait/src/config/models.yaml b/src/thirdparty/liveportrait/src/config/models.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..131d1c65025c31e37af9239e211ea14454128a2e
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/config/models.yaml
@@ -0,0 +1,43 @@
+model_params:
+  appearance_feature_extractor_params: # the F in the paper
+    image_channel: 3
+    block_expansion: 64
+    num_down_blocks: 2
+    max_features: 512
+    reshape_channel: 32
+    reshape_depth: 16
+    num_resblocks: 6
+  motion_extractor_params: # the M in the paper
+    num_kp: 21
+    backbone: convnextv2_tiny
+  warping_module_params: # the W in the paper
+    num_kp: 21
+    block_expansion: 64
+    max_features: 512
+    num_down_blocks: 2
+    reshape_channel: 32
+    estimate_occlusion_map: True
+    dense_motion_params:
+      block_expansion: 32
+      max_features: 1024
+      num_blocks: 5
+      reshape_depth: 16
+      compress: 4
+  spade_generator_params: # the G in the paper
+    upscale: 2 # represents upsample factor 256x256 -> 512x512
+    block_expansion: 64
+    max_features: 512
+    num_down_blocks: 2
+  stitching_retargeting_module_params: # the S in the paper
+    stitching:
+      input_size: 126 # (21*3)*2
+      hidden_sizes: [128, 128, 64]
+      output_size: 65 # (21*3)+2(tx,ty)
+    lip:
+      input_size: 65 # (21*3)+2
+      hidden_sizes: [128, 128, 64]
+      output_size: 63 # (21*3)
+    eye:
+      input_size: 66 # (21*3)+3
+      hidden_sizes: [256, 256, 128, 128, 64]
+      output_size: 63 # (21*3)
diff --git a/src/thirdparty/liveportrait/src/gradio_pipeline.py b/src/thirdparty/liveportrait/src/gradio_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ccd9517ace91ffc16054cc59ab7c83d00f02eb5
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/gradio_pipeline.py
@@ -0,0 +1,665 @@
+# coding: utf-8
+
+"""
+Pipeline for gradio
+"""
+
+import os.path as osp
+import os
+import cv2
+from rich.progress import track
+import gradio as gr
+import numpy as np
+import torch
+
+from .config.argument_config import ArgumentConfig
+from .live_portrait_pipeline import LivePortraitPipeline
+from .live_portrait_pipeline_animal import LivePortraitPipelineAnimal
+from .utils.io import load_img_online, load_video, resize_to_limit
+from .utils.filter import smooth
+from .utils.rprint import rlog as log
+from .utils.crop import prepare_paste_back, paste_back
+from .utils.camera import get_rotation_matrix
+from .utils.video import get_fps, has_audio_stream, concat_frames, images2video, add_audio_to_video
+from .utils.helper import is_square_video, mkdir, dct2device, basename
+from .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
+
+
+def update_args(args, user_args):
+    """Copy each `user_args` entry onto `args` when `args` already has an attribute of that name;
+    other keys are ignored. `args` is modified in place and returned."""
+    known = ((name, value) for name, value in user_args.items() if hasattr(args, name))
+    for name, value in known:
+        setattr(args, name, value)
+    return args
+
+
+class GradioPipeline(LivePortraitPipeline):
+    """gradio for human
+    """
+
+    def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):
+        super().__init__(inference_cfg, crop_cfg)
+        # keep the argument object: the gradio callbacks overwrite its fields through update_args
+        self.args = args
+
+    @torch.no_grad()
+    def update_delta_new_eyeball_direction(self, eyeball_direction_x, eyeball_direction_y, delta_new, **kwargs):
+        """Offset batch item 0 of `delta_new` in place from the two eyeball-direction inputs and return it.
+
+        Extra keyword arguments are accepted and ignored.
+        """
+        # The x weights of entries 11 and 15 swap depending on the sign of the horizontal input.
+        weight_11, weight_15 = (0.0007, 0.001) if eyeball_direction_x > 0 else (0.001, 0.0007)
+        delta_new[0, 11, 0] += eyeball_direction_x * weight_11
+        delta_new[0, 15, 0] += eyeball_direction_x * weight_15
+
+        # The vertical input first moves entries 11 and 15 along last-axis index 1 ...
+        for idx in (11, 15):
+            delta_new[0, idx, 1] += eyeball_direction_y * -0.001
+        # ... then a derived `blink` term (minus half of y) is added to four entries.
+        blink = -eyeball_direction_y / 2.
+        for idx, weight in ((11, -0.001), (13, 0.0003), (15, -0.001), (16, 0.0003)):
+            delta_new[0, idx, 1] += blink * weight
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_smile(self, smile, delta_new, **kwargs):
+        """Add `smile`-scaled offsets to batch item 0 of `delta_new` in place and return it."""
+        # (index on axis 1, index on axis 2, weight), applied in this order
+        smile_terms = (
+            (20, 1, -0.01), (14, 1, -0.02), (17, 1, 0.0065), (17, 2, 0.003),
+            (13, 1, -0.00275), (16, 1, -0.00275), (3, 1, -0.0035), (7, 1, -0.0035),
+        )
+        for row, col, weight in smile_terms:
+            delta_new[0, row, col] += smile * weight
+
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_wink(self, wink, delta_new, **kwargs):
+        """Add `wink`-scaled offsets to batch item 0 of `delta_new` in place and return it."""
+        # (index on axis 1, index on axis 2, weight), applied in this order
+        wink_terms = ((11, 1, 0.001), (13, 1, -0.0003), (17, 0, 0.0003), (17, 1, 0.0003), (3, 1, -0.0003))
+        for row, col, weight in wink_terms:
+            delta_new[0, row, col] += wink * weight
+
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_eyebrow(self, eyebrow, delta_new, **kwargs):
+        """Add `eyebrow`-scaled offsets to entries 1 and 2 of `delta_new[0]` in place and return it."""
+        if eyebrow > 0:
+            eyebrow_terms = ((1, 1, 0.001), (2, 1, -0.001))
+        else:
+            # non-positive input also adjusts last-axis index 0 and uses smaller weights on index 1
+            eyebrow_terms = ((1, 0, -0.001), (2, 0, 0.001), (1, 1, 0.0003), (2, 1, -0.0003))
+        for row, col, weight in eyebrow_terms:
+            delta_new[0, row, col] += eyebrow * weight
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_lip_variation_zero(self, lip_variation_zero, delta_new, **kwargs):
+        """Add the unscaled `lip_variation_zero` value to `delta_new[0, 19, 0]` in place and return `delta_new`."""
+        delta_new[0, 19, 0] += lip_variation_zero
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_lip_variation_one(self, lip_variation_one, delta_new, **kwargs):
+        """Add `lip_variation_one`-scaled offsets to batch item 0 of `delta_new` in place and return it."""
+        # (index on axis 1, index on axis 2, weight), applied in this order
+        lip_terms = ((14, 1, 0.001), (3, 1, -0.0005), (7, 1, -0.0005), (17, 2, -0.0005))
+        for row, col, weight in lip_terms:
+            delta_new[0, row, col] += lip_variation_one * weight
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_lip_variation_two(self, lip_variation_two, delta_new, **kwargs):
+        """Add `lip_variation_two`-scaled offsets to batch item 0 of `delta_new` in place and return it."""
+        lip_terms = ((20, 2, -0.001), (20, 1, -0.001), (14, 1, -0.001))  # (axis-1 index, axis-2 index, weight)
+        for row, col, weight in lip_terms:
+            delta_new[0, row, col] += lip_variation_two * weight
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_lip_variation_three(self, lip_variation_three, delta_new, **kwargs):
+        """Add `lip_variation_three`-scaled offsets to batch item 0 of `delta_new` in place and return it."""
+        lip_terms = ((19, 1, 0.001), (19, 2, 0.0001), (17, 1, -0.0001))  # (axis-1 index, axis-2 index, weight)
+        for row, col, weight in lip_terms:
+            delta_new[0, row, col] += lip_variation_three * weight
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_mov_x(self, mov_x, delta_new, **kwargs):
+        """Add the unscaled `mov_x` value to `delta_new[0, 5, 0]` in place and return `delta_new`."""
+        delta_new[0, 5, 0] += mov_x
+        return delta_new
+
+    @torch.no_grad()
+    def update_delta_new_mov_y(self, mov_y, delta_new, **kwargs):
+        """Add the unscaled `mov_y` value to `delta_new[0, 5, 1]` in place and return `delta_new`."""
+        delta_new[0, 5, 1] += mov_y
+        return delta_new
+
+    @torch.no_grad()
+    def execute_video(
+        self,
+        input_source_image_path=None,
+        input_source_video_path=None,
+        input_driving_video_path=None,
+        input_driving_image_path=None,
+        input_driving_video_pickle_path=None,
+        flag_normalize_lip=False,
+        flag_relative_input=True,
+        flag_do_crop_input=True,
+        flag_remap_input=True,
+        flag_stitching_input=True,
+        animation_region="all",
+        driving_option_input="pose-friendly",
+        driving_multiplier=1.0,
+        flag_crop_driving_video_input=True,
+        # flag_video_editing_head_rotation=False,
+        scale=2.3,
+        vx_ratio=0.0,
+        vy_ratio=-0.125,
+        scale_crop_driving_video=2.2,
+        vx_ratio_crop_driving_video=0.0,
+        vy_ratio_crop_driving_video=-0.1,
+        driving_smooth_observation_variance=3e-7,
+        tab_selection=None,
+        v_tab_selection=None
+    ):
+        """Gradio callback for video-driven animation/editing: pick source and driving inputs by tab, push the UI
+        options into the configs, run `self.execute`, and return 8 outputs; raises gr.Error if an input is missing."""
+        if tab_selection == 'Image':
+            input_source_path = input_source_image_path
+        elif tab_selection == 'Video':
+            input_source_path = input_source_video_path
+        else:  # any other tab value falls back to the image input
+            input_source_path = input_source_image_path
+
+        if v_tab_selection == 'Video':
+            input_driving_path = input_driving_video_path
+        elif v_tab_selection == 'Image':
+            input_driving_path = input_driving_image_path
+        elif v_tab_selection == 'Pickle':
+            input_driving_path = input_driving_video_pickle_path
+        else:  # any other tab value falls back to the driving video input
+            input_driving_path = input_driving_video_path
+
+        if input_source_path is not None and input_driving_path is not None:
+            if osp.exists(input_driving_path) and v_tab_selection == 'Video' and not flag_crop_driving_video_input and is_square_video(input_driving_path) is False:
+                flag_crop_driving_video_input = True
+                log("The driving video is not square, it will be cropped to square automatically.")
+                gr.Info("The driving video is not square, it will be cropped to square automatically.", duration=2)
+
+            args_user = {
+                'source': input_source_path,
+                'driving': input_driving_path,
+                'flag_normalize_lip' : flag_normalize_lip,
+                'flag_relative_motion': flag_relative_input,
+                'flag_do_crop': flag_do_crop_input,
+                'flag_pasteback': flag_remap_input,
+                'flag_stitching': flag_stitching_input,
+                'animation_region': animation_region,
+                'driving_option': driving_option_input,
+                'driving_multiplier': driving_multiplier,
+                'flag_crop_driving_video': flag_crop_driving_video_input,
+                'scale': scale,
+                'vx_ratio': vx_ratio,
+                'vy_ratio': vy_ratio,
+                'scale_crop_driving_video': scale_crop_driving_video,
+                'vx_ratio_crop_driving_video': vx_ratio_crop_driving_video,
+                'vy_ratio_crop_driving_video': vy_ratio_crop_driving_video,
+                'driving_smooth_observation_variance': driving_smooth_observation_variance,
+            }
+            # copy matching keys onto self.args, then hand the whole attribute dict to the wrapper and the cropper
+            self.args = update_args(self.args, args_user)
+            self.live_portrait_wrapper.update_config(self.args.__dict__)
+            self.cropper.update_config(self.args.__dict__)
+
+            output_path, output_path_concat = self.execute(self.args)
+            gr.Info("Run successfully!", duration=2)
+            if output_path.endswith(".jpg"):  # image result: first four outputs hidden, paths go to slots 5 and 7
+                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), output_path, gr.update(visible=True), output_path_concat, gr.update(visible=True)
+            else:  # any other result: paths go to slots 1 and 3, last four outputs hidden
+                return output_path, gr.update(visible=True), output_path_concat, gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+        else:
+            raise gr.Error("Please upload the source portrait or source video, and driving video 🤗🤗🤗", duration=5)
+
+    @torch.no_grad()
+    def execute_image_retargeting(
+        self,
+        input_eye_ratio: float,
+        input_lip_ratio: float,
+        input_head_pitch_variation: float,
+        input_head_yaw_variation: float,
+        input_head_roll_variation: float,
+        mov_x: float,
+        mov_y: float,
+        mov_z: float,
+        lip_variation_zero: float,
+        lip_variation_one: float,
+        lip_variation_two: float,
+        lip_variation_three: float,
+        smile: float,
+        wink: float,
+        eyebrow: float,
+        eyeball_direction_x: float,
+        eyeball_direction_y: float,
+        input_image,
+        retargeting_source_scale: float,
+        flag_stitching_retargeting_input=True,
+        flag_do_crop_input_retargeting_image=True):
+        """Apply the slider-driven pose/expression edits to a single portrait and return `(out, out_to_ori_blend)`;
+        raises gr.Error when a head-pose variation or the eye/lip ratio is None."""
+        if input_head_pitch_variation is None or input_head_yaw_variation is None or input_head_roll_variation is None:
+            raise gr.Error("Invalid relative pose input 💥!", duration=5)
+        # disposable feature
+        f_s_user, x_s_user, R_s_user, R_d_user, x_s_info, source_lmk_user, crop_M_c2o, mask_ori, img_rgb = \
+            self.prepare_retargeting_image(
+                input_image, input_head_pitch_variation, input_head_yaw_variation, input_head_roll_variation, retargeting_source_scale, flag_do_crop=flag_do_crop_input_retargeting_image)
+
+        if input_eye_ratio is None or input_lip_ratio is None:
+            raise gr.Error("Invalid ratio input 💥!", duration=5)
+        else:
+            device = self.live_portrait_wrapper.device
+            # move every tensor and slider value onto the wrapper's device before combining them
+            x_s_user = x_s_user.to(device)
+            f_s_user = f_s_user.to(device)
+            R_s_user = R_s_user.to(device)
+            R_d_user = R_d_user.to(device)
+            mov_x = torch.tensor(mov_x).to(device)
+            mov_y = torch.tensor(mov_y).to(device)
+            mov_z = torch.tensor(mov_z).to(device)
+            eyeball_direction_x = torch.tensor(eyeball_direction_x).to(device)
+            eyeball_direction_y = torch.tensor(eyeball_direction_y).to(device)
+            smile = torch.tensor(smile).to(device)
+            wink = torch.tensor(wink).to(device)
+            eyebrow = torch.tensor(eyebrow).to(device)
+            lip_variation_zero = torch.tensor(lip_variation_zero).to(device)
+            lip_variation_one = torch.tensor(lip_variation_one).to(device)
+            lip_variation_two = torch.tensor(lip_variation_two).to(device)
+            lip_variation_three = torch.tensor(lip_variation_three).to(device)
+
+            x_c_s = x_s_info['kp'].to(device)
+            delta_new = x_s_info['exp'].to(device)
+            scale_new = x_s_info['scale'].to(device)
+            t_new = x_s_info['t'].to(device)
+            R_d_new = (R_d_user @ R_s_user.permute(0, 2, 1)) @ R_s_user
+
+            # each non-zero slider adds its offsets to delta_new in place
+            if eyeball_direction_x != 0 or eyeball_direction_y != 0:
+                delta_new = self.update_delta_new_eyeball_direction(eyeball_direction_x, eyeball_direction_y, delta_new)
+            if smile != 0:
+                delta_new = self.update_delta_new_smile(smile, delta_new)
+            if wink != 0:
+                delta_new = self.update_delta_new_wink(wink, delta_new)
+            if eyebrow != 0:
+                delta_new = self.update_delta_new_eyebrow(eyebrow, delta_new)
+            if lip_variation_zero != 0:
+                delta_new = self.update_delta_new_lip_variation_zero(lip_variation_zero, delta_new)
+            if lip_variation_one != 0:
+                delta_new = self.update_delta_new_lip_variation_one(lip_variation_one, delta_new)
+            if lip_variation_two != 0:
+                delta_new = self.update_delta_new_lip_variation_two(lip_variation_two, delta_new)
+            if lip_variation_three != 0:
+                delta_new = self.update_delta_new_lip_variation_three(lip_variation_three, delta_new)
+            if mov_x != 0:
+                delta_new = self.update_delta_new_mov_x(-mov_x, delta_new)
+            if mov_y != 0:
+                delta_new = self.update_delta_new_mov_y(mov_y, delta_new)
+
+            x_d_new = mov_z * scale_new * (x_c_s @ R_d_new + delta_new) + t_new
+            eyes_delta, lip_delta = None, None
+            if input_eye_ratio != self.source_eye_ratio:
+                combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio([[float(input_eye_ratio)]], source_lmk_user)
+                eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s_user, combined_eye_ratio_tensor)
+            if input_lip_ratio != self.source_lip_ratio:
+                combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio([[float(input_lip_ratio)]], source_lmk_user)
+                lip_delta = self.live_portrait_wrapper.retarget_lip(x_s_user, combined_lip_ratio_tensor)
+            # a delta that was not recomputed above stays None and contributes nothing
+            x_d_new = x_d_new + \
+                    (eyes_delta if eyes_delta is not None else 0) + \
+                    (lip_delta if lip_delta is not None else 0)
+
+            if flag_stitching_retargeting_input:
+                x_d_new = self.live_portrait_wrapper.stitching(x_s_user, x_d_new)
+            out = self.live_portrait_wrapper.warp_decode(f_s_user, x_s_user, x_d_new)
+            out = self.live_portrait_wrapper.parse_output(out['out'])[0]
+            if flag_do_crop_input_retargeting_image:
+                out_to_ori_blend = paste_back(out, crop_M_c2o, img_rgb, mask_ori)
+            else:
+                out_to_ori_blend = out
+            return out, out_to_ori_blend
+
+    @torch.no_grad()
+    def prepare_retargeting_image(
+        self,
+        input_image,
+        input_head_pitch_variation, input_head_yaw_variation, input_head_roll_variation,
+        retargeting_source_scale,
+        flag_do_crop=True):
+        """Load a portrait and build the inputs of single-image retargeting (features, keypoints, rotations, landmarks,
+        paste-back data); raises gr.Error if no image is given or, when cropping, no crop result is returned."""
+        if input_image is not None:
+            args_user = {'scale': retargeting_source_scale}
+            self.args = update_args(self.args, args_user)
+            self.cropper.update_config(self.args.__dict__)
+            inference_cfg = self.live_portrait_wrapper.inference_cfg
+            ######## process source portrait ########
+            img_rgb = load_img_online(input_image, mode='rgb', max_dim=1280, n=2)
+            if flag_do_crop:
+                crop_info = self.cropper.crop_source_image(img_rgb, self.cropper.crop_cfg)
+                if crop_info is None:  # same guard as init_retargeting_image; avoids indexing None below
+                    raise gr.Error("Source portrait NO face detected", duration=2)
+                I_s = self.live_portrait_wrapper.prepare_source(crop_info['img_crop_256x256'])
+                source_lmk_user = crop_info['lmk_crop']
+                crop_M_c2o = crop_info['M_c2o']
+                mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
+            else:
+                I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
+                source_lmk_user = self.cropper.calc_lmk_from_cropped_image(img_rgb)
+                crop_M_c2o = None
+                mask_ori = None
+            x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
+            x_d_info_user_pitch = x_s_info['pitch'] + input_head_pitch_variation
+            x_d_info_user_yaw = x_s_info['yaw'] + input_head_yaw_variation
+            x_d_info_user_roll = x_s_info['roll'] + input_head_roll_variation
+            R_s_user = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+            R_d_user = get_rotation_matrix(x_d_info_user_pitch, x_d_info_user_yaw, x_d_info_user_roll)
+            f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)
+            x_s_user = self.live_portrait_wrapper.transform_keypoint(x_s_info)
+            return f_s_user, x_s_user, R_s_user, R_d_user, x_s_info, source_lmk_user, crop_M_c2o, mask_ori, img_rgb
+        else:
+            raise gr.Error("Please upload a source portrait as the retargeting input 🤗🤗🤗", duration=5)
+
+    @torch.no_grad()
+    def init_retargeting_image(self, retargeting_source_scale: float, source_eye_ratio: float, source_lip_ratio:float, input_image = None):
+        """Compute the eye/lip ratios of `input_image`, cache them on `self`, and return them; when no image
+        is given the two ratios passed in are returned unchanged. Raises gr.Error if cropping returns None."""
+        if input_image is not None:  # identity test: `!= None` is ambiguous for array-like inputs
+            args_user = {'scale': retargeting_source_scale}
+            self.args = update_args(self.args, args_user)
+            self.cropper.update_config(self.args.__dict__)
+            # only the cropper is needed here; the inference config is not consulted
+            ######## process source portrait ########
+            img_rgb = load_img_online(input_image, mode='rgb', max_dim=1280, n=16)
+            log(f"Load source image from {input_image}.")
+            crop_info = self.cropper.crop_source_image(img_rgb, self.cropper.crop_cfg)
+            if crop_info is None:
+                raise gr.Error("Source portrait NO face detected", duration=2)
+            source_eye_ratio = calc_eye_close_ratio(crop_info['lmk_crop'][None])
+            source_lip_ratio = calc_lip_close_ratio(crop_info['lmk_crop'][None])
+            self.source_eye_ratio = round(float(source_eye_ratio.mean()), 2)
+            self.source_lip_ratio = round(float(source_lip_ratio[0][0]), 2)
+            log("Calculating eyes-open and lip-open ratios successfully!")
+            return self.source_eye_ratio, self.source_lip_ratio
+        else:
+            return source_eye_ratio, source_lip_ratio
+
+    @torch.no_grad()
+    def execute_video_retargeting(self, input_lip_ratio: float, input_video, retargeting_source_scale: float, driving_smooth_observation_variance_retargeting: float, video_retargeting_silence=False, flag_do_crop_input_retargeting_video=True):
+        """Re-render every frame of `input_video` with a retargeted (or silenced) lip opening, write the videos under
+        `self.args.output_dir`, and return `(wfp_concat, wfp)`; raises gr.Error if the ratio is None outside silence mode."""
+        # disposable feature
+        device = self.live_portrait_wrapper.device
+
+        if not video_retargeting_silence:
+            # validate before the ratio is handed to prepare_retargeting_video, not after
+            if input_lip_ratio is None:
+                raise gr.Error("Invalid ratio input 💥!", duration=5)
+            f_s_user_lst, x_s_user_lst, source_lmk_crop_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, lip_delta_retargeting_lst_smooth, source_fps, n_frames = \
+                self.prepare_retargeting_video(input_video, retargeting_source_scale, device, input_lip_ratio, driving_smooth_observation_variance_retargeting, flag_do_crop=flag_do_crop_input_retargeting_video)
+
+            # paste-back frames are only collected when cropping was requested
+            I_p_pstbk_lst = None
+            if flag_do_crop_input_retargeting_video:
+                I_p_pstbk_lst = []
+            I_p_lst = []
+            for i in track(range(n_frames), description='Retargeting video...', total=n_frames):
+                x_s_user_i = x_s_user_lst[i].to(device)
+                f_s_user_i = f_s_user_lst[i].to(device)
+
+                lip_delta_retargeting = lip_delta_retargeting_lst_smooth[i]
+                x_d_i_new = x_s_user_i + lip_delta_retargeting
+                x_d_i_new = self.live_portrait_wrapper.stitching(x_s_user_i, x_d_i_new)
+                out = self.live_portrait_wrapper.warp_decode(f_s_user_i, x_s_user_i, x_d_i_new)
+                I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]
+                I_p_lst.append(I_p_i)
+
+                if flag_do_crop_input_retargeting_video:
+                    I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_lst[i])
+                    I_p_pstbk_lst.append(I_p_pstbk)
+        else:
+            # lip-silence mode does not use input_lip_ratio; per-frame targets come from prepare_video_lip_silence
+            f_s_user_lst, x_s_user_lst, x_d_i_new_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, source_fps, n_frames = \
+                self.prepare_video_lip_silence(input_video, device, flag_do_crop=flag_do_crop_input_retargeting_video)
+
+            I_p_pstbk_lst = None
+            if flag_do_crop_input_retargeting_video:
+                I_p_pstbk_lst = []
+            I_p_lst = []
+            for i in track(range(n_frames), description='Silencing lip...', total=n_frames):
+                x_s_user_i = x_s_user_lst[i].to(device)
+                f_s_user_i = f_s_user_lst[i].to(device)
+                x_d_i_new = x_d_i_new_lst[i]
+                x_d_i_new = self.live_portrait_wrapper.stitching(x_s_user_i, x_d_i_new)
+                out = self.live_portrait_wrapper.warp_decode(f_s_user_i, x_s_user_i, x_d_i_new)
+                I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]
+                I_p_lst.append(I_p_i)
+
+                if flag_do_crop_input_retargeting_video:
+                    I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_lst[i])
+                    I_p_pstbk_lst.append(I_p_pstbk)
+
+        mkdir(self.args.output_dir)
+        flag_source_has_audio = has_audio_stream(input_video)
+
+        ######### build the final concatenation result #########
+        # source frame | generation
+        frames_concatenated = concat_frames(driving_image_lst=None, source_image_lst=img_crop_256x256_lst, I_p_lst=I_p_lst)
+        wfp_concat = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting_concat.mp4')
+        images2video(frames_concatenated, wfp=wfp_concat, fps=source_fps)
+
+        if flag_source_has_audio:
+            # mux the source audio into a temporary file, then move it over the silent concat video
+            wfp_concat_with_audio = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting_concat_with_audio.mp4')
+            add_audio_to_video(wfp_concat, input_video, wfp_concat_with_audio)
+            os.replace(wfp_concat_with_audio, wfp_concat)
+            log(f"Replace {wfp_concat} with {wfp_concat_with_audio}")
+
+        # save the animated result
+        wfp = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting.mp4')
+        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+            images2video(I_p_pstbk_lst, wfp=wfp, fps=source_fps)
+        else:
+            images2video(I_p_lst, wfp=wfp, fps=source_fps)
+
+        ######### build the final result #########
+        if flag_source_has_audio:
+            wfp_with_audio = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting_with_audio.mp4')
+            add_audio_to_video(wfp, input_video, wfp_with_audio)
+            os.replace(wfp_with_audio, wfp)
+            log(f"Replace {wfp} with {wfp_with_audio}")
+        gr.Info("Run successfully!", duration=2)
+        return wfp_concat, wfp
+
+    @torch.no_grad()
+    def prepare_retargeting_video(self, input_video, retargeting_source_scale, device, input_lip_ratio, driving_smooth_observation_variance_retargeting, flag_do_crop=True):
+        """ for video retargeting
+        """
+        if input_video is not None:
+            # gr.Info("Upload successfully!", duration=2)
+            args_user = {'scale': retargeting_source_scale}
+            self.args = update_args(self.args, args_user)
+            self.cropper.update_config(self.args.__dict__)
+            inference_cfg = self.live_portrait_wrapper.inference_cfg
+            ######## process source video ########
+            source_rgb_lst = load_video(input_video)
+            source_rgb_lst = [resize_to_limit(img, inference_cfg.source_max_dim, inference_cfg.source_division) for img in source_rgb_lst]
+            source_fps = int(get_fps(input_video))
+            n_frames = len(source_rgb_lst)
+            log(f"Load source video from {input_video}. FPS is {source_fps}")
+
+            if flag_do_crop:
+                ret_s = self.cropper.crop_source_video(source_rgb_lst, self.cropper.crop_cfg)
+                log(f'Source video is cropped, {len(ret_s["frame_crop_lst"])} frames are processed.')
+                if len(ret_s["frame_crop_lst"]) != n_frames:
+                    n_frames = min(len(source_rgb_lst), len(ret_s["frame_crop_lst"]))
+                img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']
+                mask_ori_lst = [prepare_paste_back(inference_cfg.mask_crop, source_M_c2o, dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0])) for source_M_c2o in source_M_c2o_lst]
+            else:
+                source_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(source_rgb_lst)
+                img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in source_rgb_lst]  # force to resize to 256x256
+                source_M_c2o_lst, mask_ori_lst = None, None
+
+            c_s_eyes_lst, c_s_lip_lst = self.live_portrait_wrapper.calc_ratio(source_lmk_crop_lst)
+            # save the motion template
+            I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)
+            source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)
+
+            c_d_lip_retargeting = [input_lip_ratio]
+            f_s_user_lst, x_s_user_lst, lip_delta_retargeting_lst = [], [], []
+            for i in track(range(n_frames), description='Preparing retargeting video...', total=n_frames):
+                x_s_info = source_template_dct['motion'][i]
+                x_s_info = dct2device(x_s_info, device)
+                x_s_user = x_s_info['x_s']
+
+                source_lmk = source_lmk_crop_lst[i]
+                img_crop_256x256 = img_crop_256x256_lst[i]
+                I_s = I_s_lst[i]
+                f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)
+
+                combined_lip_ratio_tensor_retargeting = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_retargeting, source_lmk)
+                lip_delta_retargeting = self.live_portrait_wrapper.retarget_lip(x_s_user, combined_lip_ratio_tensor_retargeting)
+                f_s_user_lst.append(f_s_user); x_s_user_lst.append(x_s_user); lip_delta_retargeting_lst.append(lip_delta_retargeting.cpu().numpy().astype(np.float32))
+            lip_delta_retargeting_lst_smooth = smooth(lip_delta_retargeting_lst, lip_delta_retargeting_lst[0].shape, device, driving_smooth_observation_variance_retargeting)
+
+            return f_s_user_lst, x_s_user_lst, source_lmk_crop_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, lip_delta_retargeting_lst_smooth, source_fps, n_frames
+        else:
+            # when press the clear button, go here
+            raise gr.Error("Please upload a source video as the retargeting input 🤗🤗🤗", duration=5)
+
+    @torch.no_grad()
+    def prepare_video_lip_silence(self, input_video, device, flag_do_crop=True):
+        """ for keeping lips in the source video silent
+        """
+        if input_video is not None:
+            inference_cfg = self.live_portrait_wrapper.inference_cfg
+            ######## process source video ########
+            source_rgb_lst = load_video(input_video)
+            source_rgb_lst = [resize_to_limit(img, inference_cfg.source_max_dim, inference_cfg.source_division) for img in source_rgb_lst]
+            source_fps = int(get_fps(input_video))
+            n_frames = len(source_rgb_lst)
+            log(f"Load source video from {input_video}. FPS is {source_fps}")
+
+            if flag_do_crop:
+                ret_s = self.cropper.crop_source_video(source_rgb_lst, self.cropper.crop_cfg)
+                log(f'Source video is cropped, {len(ret_s["frame_crop_lst"])} frames are processed.')
+                if len(ret_s["frame_crop_lst"]) != n_frames:
+                    n_frames = min(len(source_rgb_lst), len(ret_s["frame_crop_lst"]))
+                img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']
+                mask_ori_lst = [prepare_paste_back(inference_cfg.mask_crop, source_M_c2o, dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0])) for source_M_c2o in source_M_c2o_lst]
+            else:
+                source_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(source_rgb_lst)
+                img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in source_rgb_lst]  # force to resize to 256x256
+                source_M_c2o_lst, mask_ori_lst = None, None
+
+            c_s_eyes_lst, c_s_lip_lst = self.live_portrait_wrapper.calc_ratio(source_lmk_crop_lst)
+            # save the motion template
+            I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)
+            source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)
+
+            f_s_user_lst, x_s_user_lst, x_d_i_new_lst = [], [], []
+            for i in track(range(n_frames), description='Preparing silencing lip...', total=n_frames):
+                x_s_info = source_template_dct['motion'][i]
+                x_s_info = dct2device(x_s_info, device)
+                scale_s = x_s_info['scale']
+                x_s_user = x_s_info['x_s']
+                x_c_s = x_s_info['kp']
+                R_s = x_s_info['R']
+                t_s = x_s_info['t']
+                delta_new = torch.zeros_like(x_s_info['exp']) + torch.from_numpy(inference_cfg.lip_array).to(dtype=torch.float32, device=device)
+                for eyes_idx in [11, 13, 15, 16, 18]:
+                    delta_new[:, eyes_idx, :] = x_s_info['exp'][:, eyes_idx, :]
+                source_lmk = source_lmk_crop_lst[i]
+                img_crop_256x256 = img_crop_256x256_lst[i]
+                I_s = I_s_lst[i]
+                f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)
+                x_d_i_new = scale_s * (x_c_s @ R_s + delta_new) + t_s
+                f_s_user_lst.append(f_s_user); x_s_user_lst.append(x_s_user); x_d_i_new_lst.append(x_d_i_new)
+            return f_s_user_lst, x_s_user_lst, x_d_i_new_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, source_fps, n_frames
+        else:
+            # when press the clear button, go here
+            raise gr.Error("Please upload a source video as the input 🤗🤗🤗", duration=5)
+
+class GradioPipelineAnimal(LivePortraitPipelineAnimal):
+    """gradio for animal
+    """
+    def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):
+        inference_cfg.flag_crop_driving_video = True # ensure the face_analysis_wrapper is enabled
+        super().__init__(inference_cfg, crop_cfg)
+        # self.live_portrait_wrapper_animal = self.live_portrait_wrapper_animal
+        self.args = args
+
+    @torch.no_grad()
+    def execute_video(
+        self,
+        input_source_image_path=None,
+        input_driving_video_path=None,
+        input_driving_video_pickle_path=None,
+        flag_do_crop_input=False,
+        flag_remap_input=False,
+        driving_multiplier=1.0,
+        flag_stitching=False,
+        flag_crop_driving_video_input=False,
+        scale=2.3,
+        vx_ratio=0.0,
+        vy_ratio=-0.125,
+        scale_crop_driving_video=2.2,
+        vx_ratio_crop_driving_video=0.0,
+        vy_ratio_crop_driving_video=-0.1,
+        tab_selection=None,
+    ):
+        """ for video-driven potrait animation
+        """
+        input_source_path = input_source_image_path
+
+        if tab_selection == 'Video':
+            input_driving_path = input_driving_video_path
+        elif tab_selection == 'Pickle':
+            input_driving_path = input_driving_video_pickle_path
+        else:
+            input_driving_path = input_driving_video_pickle_path
+
+        if input_source_path is not None and input_driving_path is not None:
+            if osp.exists(input_driving_path) and tab_selection == 'Video' and is_square_video(input_driving_path) is False:
+                flag_crop_driving_video_input = True
+                log("The driving video is not square, it will be cropped to square automatically.")
+                gr.Info("The driving video is not square, it will be cropped to square automatically.", duration=2)
+
+            args_user = {
+                'source': input_source_path,
+                'driving': input_driving_path,
+                'flag_do_crop': flag_do_crop_input,
+                'flag_pasteback': flag_remap_input,
+                'driving_multiplier': driving_multiplier,
+                'flag_stitching': flag_stitching,
+                'flag_crop_driving_video': flag_crop_driving_video_input,
+                'scale': scale,
+                'vx_ratio': vx_ratio,
+                'vy_ratio': vy_ratio,
+                'scale_crop_driving_video': scale_crop_driving_video,
+                'vx_ratio_crop_driving_video': vx_ratio_crop_driving_video,
+                'vy_ratio_crop_driving_video': vy_ratio_crop_driving_video,
+            }
+            # update config from user input
+            self.args = update_args(self.args, args_user)
+            self.live_portrait_wrapper_animal.update_config(self.args.__dict__)
+            self.cropper.update_config(self.args.__dict__)
+            # video driven animation
+            video_path, video_path_concat, video_gif_path = self.execute(self.args)
+            gr.Info("Run successfully!", duration=2)
+            return video_path, video_path_concat, video_gif_path
+        else:
+            raise gr.Error("Please upload the source animal image, and driving video 🤗🤗🤗", duration=5)
diff --git a/src/thirdparty/liveportrait/src/live_portrait_pipeline.py b/src/thirdparty/liveportrait/src/live_portrait_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a58d4034d7d1f103572e3a0fc40692bed202696
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/live_portrait_pipeline.py
@@ -0,0 +1,518 @@
+# coding: utf-8
+
+"""
+Pipeline of LivePortrait (Human)
+"""
+
+import torch
+torch.backends.cudnn.benchmark = True # disable CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR warning
+
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+import numpy as np
+import os
+import os.path as osp
+from rich.progress import track
+
+from .config.argument_config import ArgumentConfig
+from .config.inference_config import InferenceConfig
+from .config.crop_config import CropConfig
+from .utils.cropper import Cropper
+from .utils.camera import get_rotation_matrix
+from .utils.video import images2video, concat_frames, get_fps, add_audio_to_video, has_audio_stream
+from .utils.crop import prepare_paste_back, paste_back
+from .utils.io import load_image_rgb, load_video, resize_to_limit, dump, load
+from .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image, is_square_video, calc_motion_multiplier
+from .utils.filter import smooth
+from .utils.rprint import rlog as log
+# from .utils.viz import viz_lmk
+from .live_portrait_wrapper import LivePortraitWrapper
+
+
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)  # join fn onto the directory of this module's realpath-resolved file
+
+
+class LivePortraitPipeline(object):
+
+    def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):  # construct the model wrapper and the cropper from their configs
+        self.live_portrait_wrapper: LivePortraitWrapper = LivePortraitWrapper(inference_cfg=inference_cfg)  # built first, from inference_cfg
+        self.cropper: Cropper = Cropper(crop_cfg=crop_cfg)  # built second, from crop_cfg
+
+    def make_motion_template(self, I_lst, c_eyes_lst, c_lip_lst, **kwargs):
+        n_frames = I_lst.shape[0]
+        template_dct = {
+            'n_frames': n_frames,
+            'output_fps': kwargs.get('output_fps', 25),
+            'motion': [],
+            'c_eyes_lst': [],
+            'c_lip_lst': [],
+        }
+
+        for i in track(range(n_frames), description='Making motion templates...', total=n_frames):
+            # collect s, R, δ and t for inference
+            I_i = I_lst[i]
+            x_i_info = self.live_portrait_wrapper.get_kp_info(I_i)
+            x_s = self.live_portrait_wrapper.transform_keypoint(x_i_info)
+            R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])
+
+            item_dct = {
+                'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),
+                'R': R_i.cpu().numpy().astype(np.float32),
+                'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),
+                't': x_i_info['t'].cpu().numpy().astype(np.float32),
+                'kp': x_i_info['kp'].cpu().numpy().astype(np.float32),
+                'x_s': x_s.cpu().numpy().astype(np.float32),
+            }
+
+            template_dct['motion'].append(item_dct)
+
+            c_eyes = c_eyes_lst[i].astype(np.float32)
+            template_dct['c_eyes_lst'].append(c_eyes)
+
+            c_lip = c_lip_lst[i].astype(np.float32)
+            template_dct['c_lip_lst'].append(c_lip)
+
+        return template_dct
+
+    def execute(self, args: ArgumentConfig):
+        # for convenience
+        inf_cfg = self.live_portrait_wrapper.inference_cfg
+        device = self.live_portrait_wrapper.device
+        crop_cfg = self.cropper.crop_cfg
+
+        ######## load source input ########
+        flag_is_source_video = False
+        source_fps = None
+        if is_image(args.source):
+            flag_is_source_video = False
+            img_rgb = load_image_rgb(args.source)
+            img_rgb = resize_to_limit(img_rgb, inf_cfg.source_max_dim, inf_cfg.source_division)
+            log(f"Load source image from {args.source}")
+            source_rgb_lst = [img_rgb]
+        elif is_video(args.source):
+            flag_is_source_video = True
+            source_rgb_lst = load_video(args.source)
+            source_rgb_lst = [resize_to_limit(img, inf_cfg.source_max_dim, inf_cfg.source_division) for img in source_rgb_lst]
+            source_fps = int(get_fps(args.source))
+            log(f"Load source video from {args.source}, FPS is {source_fps}")
+        else:  # source input is an unknown format
+            raise Exception(f"Unknown source format: {args.source}")
+
+        ######## process driving info ########
+        flag_load_from_template = is_template(args.driving)
+        driving_rgb_crop_256x256_lst = None
+        wfp_template = None
+
+        if flag_load_from_template:
+            # NOTE: load from template, it is fast, but the cropping video is None
+            log(f"Load from template: {args.driving}, NOT the video, so the cropping video and audio are both NULL.", style='bold green')
+            driving_template_dct = load(args.driving)
+            c_d_eyes_lst = driving_template_dct['c_eyes_lst'] if 'c_eyes_lst' in driving_template_dct.keys() else driving_template_dct['c_d_eyes_lst'] # compatible with previous keys
+            c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst']
+            driving_n_frames = driving_template_dct['n_frames']
+            flag_is_driving_video = True if driving_n_frames > 1 else False
+            if flag_is_source_video and flag_is_driving_video:
+                n_frames = min(len(source_rgb_lst), driving_n_frames)  # minimum number as the number of the animated frames
+            elif flag_is_source_video and not flag_is_driving_video:
+                n_frames = len(source_rgb_lst)
+            else:
+                n_frames = driving_n_frames
+
+            # set output_fps
+            output_fps = driving_template_dct.get('output_fps', inf_cfg.output_fps)
+            log(f'The FPS of template: {output_fps}')
+
+            if args.flag_crop_driving_video:
+                log("Warning: flag_crop_driving_video is True, but the driving info is a template, so it is ignored.")
+
+        elif osp.exists(args.driving):
+            if is_video(args.driving):
+                flag_is_driving_video = True
+                # load from video file, AND make motion template
+                output_fps = int(get_fps(args.driving))
+                log(f"Load driving video from: {args.driving}, FPS is {output_fps}")
+                driving_rgb_lst = load_video(args.driving)
+            elif is_image(args.driving):
+                flag_is_driving_video = False
+                driving_img_rgb = load_image_rgb(args.driving)
+                output_fps = 25
+                log(f"Load driving image from {args.driving}")
+                driving_rgb_lst = [driving_img_rgb]
+            else:
+                raise Exception(f"{args.driving} is not a supported type!")
+            ######## make motion template ########
+            log("Start making driving motion template...")
+            driving_n_frames = len(driving_rgb_lst)
+            if flag_is_source_video and flag_is_driving_video:
+                n_frames = min(len(source_rgb_lst), driving_n_frames)  # minimum number as the number of the animated frames
+                driving_rgb_lst = driving_rgb_lst[:n_frames]
+            elif flag_is_source_video and not flag_is_driving_video:
+                n_frames = len(source_rgb_lst)
+            else:
+                n_frames = driving_n_frames
+            if inf_cfg.flag_crop_driving_video or (not is_square_video(args.driving)):
+                ret_d = self.cropper.crop_driving_video(driving_rgb_lst)
+                log(f'Driving video is cropped, {len(ret_d["frame_crop_lst"])} frames are processed.')
+                if len(ret_d["frame_crop_lst"]) is not n_frames and flag_is_driving_video:
+                    n_frames = min(n_frames, len(ret_d["frame_crop_lst"]))
+                driving_rgb_crop_lst, driving_lmk_crop_lst = ret_d['frame_crop_lst'], ret_d['lmk_crop_lst']
+                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_crop_lst]
+            else:
+                driving_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(driving_rgb_lst)
+                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256
+            #######################################
+
+            c_d_eyes_lst, c_d_lip_lst = self.live_portrait_wrapper.calc_ratio(driving_lmk_crop_lst)
+            # save the motion template
+            I_d_lst = self.live_portrait_wrapper.prepare_videos(driving_rgb_crop_256x256_lst)
+            driving_template_dct = self.make_motion_template(I_d_lst, c_d_eyes_lst, c_d_lip_lst, output_fps=output_fps)
+
+            wfp_template = remove_suffix(args.driving) + '.pkl'
+            dump(wfp_template, driving_template_dct)
+            log(f"Dump motion template to {wfp_template}")
+        else:
+            raise Exception(f"{args.driving} does not exist!")
+        if not flag_is_driving_video:
+            c_d_eyes_lst = c_d_eyes_lst*n_frames
+            c_d_lip_lst = c_d_lip_lst*n_frames
+
+        ######## prepare for pasteback ########
+        I_p_pstbk_lst = None
+        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
+            I_p_pstbk_lst = []
+            log("Prepared pasteback mask done.")
+
+        I_p_lst = []
+        R_d_0, x_d_0_info = None, None
+        flag_normalize_lip = inf_cfg.flag_normalize_lip  # not overwrite
+        flag_source_video_eye_retargeting = inf_cfg.flag_source_video_eye_retargeting  # not overwrite
+        lip_delta_before_animation, eye_delta_before_animation = None, None
+
+        ######## process source info ########
+        if flag_is_source_video:
+            log(f"Start making source motion template...")
+
+            source_rgb_lst = source_rgb_lst[:n_frames]
+            if inf_cfg.flag_do_crop:
+                ret_s = self.cropper.crop_source_video(source_rgb_lst, crop_cfg)
+                log(f'Source video is cropped, {len(ret_s["frame_crop_lst"])} frames are processed.')
+                if len(ret_s["frame_crop_lst"]) is not n_frames:
+                    n_frames = min(n_frames, len(ret_s["frame_crop_lst"]))
+                img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']
+            else:
+                source_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(source_rgb_lst)
+                img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in source_rgb_lst]  # force to resize to 256x256
+
+            c_s_eyes_lst, c_s_lip_lst = self.live_portrait_wrapper.calc_ratio(source_lmk_crop_lst)
+            # save the motion template
+            I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)
+            source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)
+
+            key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'  # compatible with previous keys
+            if inf_cfg.flag_relative_motion:
+                if flag_is_driving_video:
+                    x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
+                    x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                else:
+                    x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + (driving_template_dct['motion'][0]['exp'] - inf_cfg.lip_array) for i in range(n_frames)]
+                    x_d_exp_lst_smooth = [torch.tensor(x_d_exp[0], dtype=torch.float32, device=device) for x_d_exp in x_d_exp_lst]
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
+                    if flag_is_driving_video:
+                        x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
+                        x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                    else:
+                        x_d_r_lst = [source_template_dct['motion'][i]['R'] for i in range(n_frames)]
+                        x_d_r_lst_smooth = [torch.tensor(x_d_r[0], dtype=torch.float32, device=device) for x_d_r in x_d_r_lst]
+            else:
+                if flag_is_driving_video:
+                    x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]
+                    x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                else:
+                    x_d_exp_lst = [driving_template_dct['motion'][0]['exp']]
+                    x_d_exp_lst_smooth = [torch.tensor(x_d_exp[0], dtype=torch.float32, device=device) for x_d_exp in x_d_exp_lst]*n_frames
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
+                    if flag_is_driving_video:
+                        x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]
+                        x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                    else:
+                        x_d_r_lst = [driving_template_dct['motion'][0][key_r]]
+                        x_d_r_lst_smooth = [torch.tensor(x_d_r[0], dtype=torch.float32, device=device) for x_d_r in x_d_r_lst]*n_frames
+
+        else:  # if the input is a source image, process it only once
+            if inf_cfg.flag_do_crop:
+                crop_info = self.cropper.crop_source_image(source_rgb_lst[0], crop_cfg)
+                if crop_info is None:
+                    raise Exception("No face detected in the source image!")
+                source_lmk = crop_info['lmk_crop']
+                img_crop_256x256 = crop_info['img_crop_256x256']
+            else:
+                source_lmk = self.cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])
+                img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256))  # force to resize to 256x256
+            I_s = self.live_portrait_wrapper.prepare_source(img_crop_256x256)
+            x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
+            x_c_s = x_s_info['kp']
+            R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+            f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)
+            x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)
+
+            # let lip-open scalar to be 0 at first
+            if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
+                c_d_lip_before_animation = [0.]
+                combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
+                if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
+                    lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+
+            if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
+                mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))
+
+        ######## animate ########
+        if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):
+            log(f"The animated video consists of {n_frames} frames.")
+        else:
+            log(f"The output of image-driven portrait animation is an image.")
+        for i in track(range(n_frames), description='🚀Animating...', total=n_frames):
+            if flag_is_source_video:  # source video
+                x_s_info = source_template_dct['motion'][i]
+                x_s_info = dct2device(x_s_info, device)
+
+                source_lmk = source_lmk_crop_lst[i]
+                img_crop_256x256 = img_crop_256x256_lst[i]
+                I_s = I_s_lst[i]
+                f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)
+
+                x_c_s = x_s_info['kp']
+                R_s = x_s_info['R']
+                x_s =x_s_info['x_s']
+
+                # let lip-open scalar to be 0 at first if the input is a video
+                if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
+                    c_d_lip_before_animation = [0.]
+                    combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
+                    if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
+                        lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+                    else:
+                        lip_delta_before_animation = None
+
+                # let eye-open scalar to be the same as the first frame if the latter is eye-open state
+                if flag_source_video_eye_retargeting and source_lmk is not None:
+                    if i == 0:
+                        combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]
+                        c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
+                        if c_d_eye_before_animation_frame_zero[0][0] < inf_cfg.source_video_eye_retargeting_threshold:
+                            c_d_eye_before_animation_frame_zero = [[0.39]]
+                    combined_eye_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, source_lmk)
+                    eye_delta_before_animation = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
+
+                if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:  # prepare for paste back
+                    mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, source_M_c2o_lst[i], dsize=(source_rgb_lst[i].shape[1], source_rgb_lst[i].shape[0]))
+            if flag_is_source_video and not flag_is_driving_video:
+                x_d_i_info = driving_template_dct['motion'][0]
+            else:
+                x_d_i_info = driving_template_dct['motion'][i]
+            x_d_i_info = dct2device(x_d_i_info, device)
+            R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys
+
+            if i == 0:  # cache the first frame
+                R_d_0 = R_d_i
+                x_d_0_info = x_d_i_info.copy()
+
+            delta_new = x_s_info['exp'].clone()
+            if inf_cfg.flag_relative_motion:
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
+                    R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
+                else:
+                    R_new = R_s
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "exp":
+                    if flag_is_source_video:
+                        for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                            delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]
+                        delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]
+                        delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]
+                        delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]
+                        delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]
+                    else:
+                        if flag_is_driving_video:
+                            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
+                        else:
+                            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(inf_cfg.lip_array).to(dtype=torch.float32, device=device))
+                elif inf_cfg.animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        if flag_is_source_video:
+                            delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :]
+                        elif flag_is_driving_video:
+                            delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]
+                        else:
+                            delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(inf_cfg.lip_array).to(dtype=torch.float32, device=device)))[:, lip_idx, :]
+                elif inf_cfg.animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        if flag_is_source_video:
+                            delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :]
+                        elif flag_is_driving_video:
+                            delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]
+                        else:
+                            delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - 0))[:, eyes_idx, :]
+                if inf_cfg.animation_region == "all":
+                    scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
+                else:
+                    scale_new = x_s_info['scale']
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
+                    t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
+                else:
+                    t_new = x_s_info['t']
+            else:
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
+                    R_new = x_d_r_lst_smooth[i] if flag_is_source_video else R_d_i
+                else:
+                    R_new = R_s
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "exp":
+                    for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :] if flag_is_source_video else x_d_i_info['exp'][:, idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1] if flag_is_source_video else x_d_i_info['exp'][:, 3:5, 1]
+                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2] if flag_is_source_video else x_d_i_info['exp'][:, 5, 2]
+                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2] if flag_is_source_video else x_d_i_info['exp'][:, 8, 2]
+                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:] if flag_is_source_video else x_d_i_info['exp'][:, 9, 1:]
+                elif inf_cfg.animation_region == "lip":
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, lip_idx, :]
+                elif inf_cfg.animation_region == "eyes":
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, eyes_idx, :]
+                scale_new = x_s_info['scale']
+                if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
+                    t_new = x_d_i_info['t']
+                else:
+                    t_new = x_s_info['t']
+
+            t_new[..., 2].fill_(0)  # zero tz
+            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+
+            if inf_cfg.flag_relative_motion and inf_cfg.driving_option == "expression-friendly" and not flag_is_source_video and flag_is_driving_video:
+                if i == 0:
+                    x_d_0_new = x_d_i_new
+                    motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
+                    # motion_multiplier *= inf_cfg.driving_multiplier
+                x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
+                x_d_i_new = x_d_diff + x_s
+
+            # Algorithm 1:
+            if not inf_cfg.flag_stitching and not inf_cfg.flag_eye_retargeting and not inf_cfg.flag_lip_retargeting:
+                # without stitching or retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new += lip_delta_before_animation
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+                else:
+                    pass
+            elif inf_cfg.flag_stitching and not inf_cfg.flag_eye_retargeting and not inf_cfg.flag_lip_retargeting:
+                # with stitching and without retargeting
+                if flag_normalize_lip and lip_delta_before_animation is not None:
+                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new) + lip_delta_before_animation
+                else:
+                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
+                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                    x_d_i_new += eye_delta_before_animation
+            else:
+                eyes_delta, lip_delta = None, None
+                if inf_cfg.flag_eye_retargeting and source_lmk is not None:
+                    c_d_eyes_i = c_d_eyes_lst[i]
+                    combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)
+                    # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
+                    eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor)
+                if inf_cfg.flag_lip_retargeting and source_lmk is not None:
+                    c_d_lip_i = c_d_lip_lst[i]
+                    combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
+                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
+                    lip_delta = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor)
+
+                if inf_cfg.flag_relative_motion:  # use x_s
+                    x_d_i_new = x_s + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+                else:  # use x_d,i
+                    x_d_i_new = x_d_i_new + \
+                        (eyes_delta if eyes_delta is not None else 0) + \
+                        (lip_delta if lip_delta is not None else 0)
+
+                if inf_cfg.flag_stitching:
+                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
+
+            x_d_i_new = x_s + (x_d_i_new - x_s) * inf_cfg.driving_multiplier
+            out = self.live_portrait_wrapper.warp_decode(f_s, x_s, x_d_i_new)
+            I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]
+            I_p_lst.append(I_p_i)
+
+            if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
+                # TODO: the paste back procedure is slow, considering optimize it using multi-threading or GPU
+                if flag_is_source_video:
+                    I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_float)
+                else:
+                    I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], source_rgb_lst[0], mask_ori_float)
+                I_p_pstbk_lst.append(I_p_pstbk)
+
+        mkdir(args.output_dir)
+        wfp_concat = None
+        ######### build the final concatenation result #########
+        # driving frame | source frame | generation
+        if flag_is_source_video and flag_is_driving_video:
+            frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, img_crop_256x256_lst, I_p_lst)
+        elif flag_is_source_video and not flag_is_driving_video:
+            if flag_load_from_template:
+                frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, img_crop_256x256_lst, I_p_lst)
+            else:
+                frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst*n_frames, img_crop_256x256_lst, I_p_lst)
+        else:
+            frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)
+
+        if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):
+            flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source)
+            flag_driving_has_audio = (not flag_load_from_template) and has_audio_stream(args.driving)
+
+            wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')
+
+            # NOTE: update output fps
+            output_fps = source_fps if flag_is_source_video else output_fps
+            images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)
+
+            if flag_source_has_audio or flag_driving_has_audio:
+                # final result with concatenation
+                wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
+                audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
+                log(f"Audio is selected from {audio_from_which_video}, concat mode")
+                add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
+                os.replace(wfp_concat_with_audio, wfp_concat)
+                log(f"Replace {wfp_concat_with_audio} with {wfp_concat}")
+
+            # save the animated result
+            wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')
+            if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+                images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)
+            else:
+                images2video(I_p_lst, wfp=wfp, fps=output_fps)
+
+            ######### build the final result #########
+            if flag_source_has_audio or flag_driving_has_audio:
+                wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
+                audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
+                log(f"Audio is selected from {audio_from_which_video}")
+                add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
+                os.replace(wfp_with_audio, wfp)
+                log(f"Replace {wfp_with_audio} with {wfp}")
+
+            # final log
+            if wfp_template not in (None, ''):
+                log(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')
+            log(f'Animated video: {wfp}')
+            log(f'Animated video with concat: {wfp_concat}')
+        else:
+            wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.jpg')
+            cv2.imwrite(wfp_concat, frames_concatenated[0][..., ::-1])
+            wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.jpg')
+            if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+                cv2.imwrite(wfp, I_p_pstbk_lst[0][..., ::-1])
+            else:
+                cv2.imwrite(wfp, frames_concatenated[0][..., ::-1])
+            # final log
+            log(f'Animated image: {wfp}')
+            log(f'Animated image with concat: {wfp_concat}')
+
+        return wfp, wfp_concat
diff --git a/src/thirdparty/liveportrait/src/live_portrait_pipeline_animal.py b/src/thirdparty/liveportrait/src/live_portrait_pipeline_animal.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e8227862abdec68f2add53c0dd1aa2cb6ecba77
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/live_portrait_pipeline_animal.py
@@ -0,0 +1,237 @@
+# coding: utf-8
+
+"""
+Pipeline of LivePortrait (Animal)
+"""
+
+import warnings
+warnings.filterwarnings("ignore", message="torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument.")
+warnings.filterwarnings("ignore", message="torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly.")
+warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")
+
+import torch
+torch.backends.cudnn.benchmark = True # disable CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR warning
+
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+import numpy as np
+import os
+import os.path as osp
+from rich.progress import track
+
+from .config.argument_config import ArgumentConfig
+from .config.inference_config import InferenceConfig
+from .config.crop_config import CropConfig
+from .utils.cropper import Cropper
+from .utils.camera import get_rotation_matrix
+from .utils.video import images2video, concat_frames, get_fps, add_audio_to_video, has_audio_stream, video2gif
+from .utils.crop import _transform_img, prepare_paste_back, paste_back
+from .utils.io import load_image_rgb, load_video, resize_to_limit, dump, load
+from .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image, calc_motion_multiplier
+from .utils.rprint import rlog as log
+# from .utils.viz import viz_lmk
+from .live_portrait_wrapper import LivePortraitWrapperAnimal
+
+
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)  # fn joined onto the directory of this file's real (symlink-resolved) path
+
+class LivePortraitPipelineAnimal(object):
+    """Animate a single animal source image with a driving video or a saved motion template."""
+    def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):
+        self.live_portrait_wrapper_animal: LivePortraitWrapperAnimal = LivePortraitWrapperAnimal(inference_cfg=inference_cfg)
+        self.cropper: Cropper = Cropper(crop_cfg=crop_cfg, image_type='animal_face', flag_use_half_precision=inference_cfg.flag_use_half_precision)
+
+    def make_motion_template(self, I_lst, **kwargs):
+        """Build a motion template dict from prepared frames: per-frame 'scale', 'R', 'exp', 't' as float32 numpy arrays, plus 'n_frames' and 'output_fps' (kwarg, default 25)."""
+        n_frames = I_lst.shape[0]
+        template_dct = {
+            'n_frames': n_frames,
+            'output_fps': kwargs.get('output_fps', 25),
+            'motion': [],
+        }
+        for i in track(range(n_frames), description='Making driving motion templates...', total=n_frames):
+            # collect s, R, δ and t for inference
+            I_i = I_lst[i]
+            x_i_info = self.live_portrait_wrapper_animal.get_kp_info(I_i)
+            R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])
+
+            item_dct = {
+                'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),
+                'R': R_i.cpu().numpy().astype(np.float32),
+                'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),
+                't': x_i_info['t'].cpu().numpy().astype(np.float32),
+            }
+
+            template_dct['motion'].append(item_dct)
+
+        return template_dct
+
+    def execute(self, args: ArgumentConfig):
+        """Animate args.source (must be an image) with args.driving (video or motion template), write the mp4/gif outputs and return (wfp, wfp_concat, wfp_gif)."""
+        inf_cfg = self.live_portrait_wrapper_animal.inference_cfg
+        device = self.live_portrait_wrapper_animal.device
+        crop_cfg = self.cropper.crop_cfg
+
+        ######## load source input ########
+        if is_image(args.source):
+            img_rgb = load_image_rgb(args.source)
+            img_rgb = resize_to_limit(img_rgb, inf_cfg.source_max_dim, inf_cfg.source_division)
+            log(f"Load source image from {args.source}")
+        else:  # source input is an unknown format
+            raise Exception(f"Unknown source format: {args.source}")
+
+        ######## process driving info ########
+        flag_load_from_template = is_template(args.driving)
+        driving_rgb_crop_256x256_lst = None  # stays None on the template path (no driving frames available)
+        wfp_template = None
+
+        if flag_load_from_template:
+            # NOTE: load from template, it is fast, but the cropping video is None
+            log(f"Load from template: {args.driving}, NOT the video, so the cropping video and audio are both NULL.", style='bold green')
+            driving_template_dct = load(args.driving)
+            n_frames = driving_template_dct['n_frames']
+
+            # set output_fps
+            output_fps = driving_template_dct.get('output_fps', inf_cfg.output_fps)
+            log(f'The FPS of template: {output_fps}')
+
+            if args.flag_crop_driving_video:
+                log("Warning: flag_crop_driving_video is True, but the driving info is a template, so it is ignored.")
+
+        elif osp.exists(args.driving) and is_video(args.driving):
+            # load from video file, AND make motion template
+            output_fps = int(get_fps(args.driving))
+            log(f"Load driving video from: {args.driving}, FPS is {output_fps}")
+
+            driving_rgb_lst = load_video(args.driving)
+            n_frames = len(driving_rgb_lst)
+
+            ######## make motion template ########
+            log("Start making driving motion template...")
+            if inf_cfg.flag_crop_driving_video:
+                ret_d = self.cropper.crop_driving_video(driving_rgb_lst)
+                log(f'Driving video is cropped, {len(ret_d["frame_crop_lst"])} frames are processed.')
+                if len(ret_d["frame_crop_lst"]) != n_frames:  # compare by value; `is not` would test int object identity
+                    n_frames = min(n_frames, len(ret_d["frame_crop_lst"]))
+                driving_rgb_crop_lst = ret_d['frame_crop_lst']
+                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_crop_lst]
+            else:
+                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256
+            #######################################
+
+            # save the motion template
+            I_d_lst = self.live_portrait_wrapper_animal.prepare_videos(driving_rgb_crop_256x256_lst)
+            driving_template_dct = self.make_motion_template(I_d_lst, output_fps=output_fps)
+
+            wfp_template = remove_suffix(args.driving) + '.pkl'
+            dump(wfp_template, driving_template_dct)
+            log(f"Dump motion template to {wfp_template}")
+
+        else:
+            raise Exception(f"{args.driving} not exists or unsupported driving info types!")
+
+        ######## prepare for pasteback ########
+        I_p_pstbk_lst = None
+        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
+            I_p_pstbk_lst = []
+            log("Prepared pasteback mask done.")
+
+        ######## process source info ########
+        if inf_cfg.flag_do_crop:
+            crop_info = self.cropper.crop_source_image(img_rgb, crop_cfg)
+            if crop_info is None:
+                raise Exception("No animal face detected in the source image!")
+            img_crop_256x256 = crop_info['img_crop_256x256']
+        else:
+            img_crop_256x256 = cv2.resize(img_rgb, (256, 256))  # force to resize to 256x256
+        I_s = self.live_portrait_wrapper_animal.prepare_source(img_crop_256x256)
+        x_s_info = self.live_portrait_wrapper_animal.get_kp_info(I_s)
+        x_c_s = x_s_info['kp']
+        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+        f_s = self.live_portrait_wrapper_animal.extract_feature_3d(I_s)
+        x_s = self.live_portrait_wrapper_animal.transform_keypoint(x_s_info)
+
+        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
+            mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
+
+        ######## animate ########
+        I_p_lst = []
+        for i in track(range(n_frames), description='🚀Animating...', total=n_frames):
+
+            x_d_i_info = driving_template_dct['motion'][i]
+            x_d_i_info = dct2device(x_d_i_info, device)
+
+            R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys
+            delta_new = x_d_i_info['exp']
+            t_new = x_d_i_info['t']
+            t_new[..., 2].fill_(0)  # zero tz
+            scale_new = x_s_info['scale']
+
+            x_d_i = scale_new * (x_c_s @ R_d_i + delta_new) + t_new
+
+            if i == 0:  # cache the first driving frame; later frames are expressed relative to it
+                x_d_0 = x_d_i
+                motion_multiplier = calc_motion_multiplier(x_s, x_d_0)
+
+            x_d_diff = (x_d_i - x_d_0) * motion_multiplier
+            x_d_i = x_d_diff + x_s
+
+            if not inf_cfg.flag_stitching:
+                pass
+            else:
+                x_d_i = self.live_portrait_wrapper_animal.stitching(x_s, x_d_i)
+
+            x_d_i = x_s + (x_d_i - x_s) * inf_cfg.driving_multiplier
+            out = self.live_portrait_wrapper_animal.warp_decode(f_s, x_s, x_d_i)
+            I_p_i = self.live_portrait_wrapper_animal.parse_output(out['out'])[0]
+            I_p_lst.append(I_p_i)
+
+            if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
+                I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], img_rgb, mask_ori_float)
+                I_p_pstbk_lst.append(I_p_pstbk)
+
+        mkdir(args.output_dir)
+        wfp_concat = None
+        flag_driving_has_audio = (not flag_load_from_template) and has_audio_stream(args.driving)
+
+        ######### build the final concatenation result #########
+        # driving frame | source image | generation
+        frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)
+        wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')
+        images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)
+
+        if flag_driving_has_audio:
+            # final result with concatenation
+            wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
+            audio_from_which_video = args.driving
+            add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
+            os.replace(wfp_concat_with_audio, wfp_concat)  # the muxed file takes over the original output path
+            log(f"Replace {wfp_concat_with_audio} with {wfp_concat}")
+
+        # save the animated result
+        wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')
+        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+            images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)
+        else:
+            images2video(I_p_lst, wfp=wfp, fps=output_fps)
+
+        ######### build the final result #########
+        if flag_driving_has_audio:
+            wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
+            audio_from_which_video = args.driving
+            add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
+            os.replace(wfp_with_audio, wfp)
+            log(f"Replace {wfp_with_audio} with {wfp}")
+
+        # final log
+        if wfp_template not in (None, ''):
+            log(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')
+        log(f'Animated video: {wfp}')
+        log(f'Animated video with concat: {wfp_concat}')
+
+        # build the gif
+        wfp_gif = video2gif(wfp)
+        log(f'Animated gif: {wfp_gif}')
+
+
+        return wfp, wfp_concat, wfp_gif
diff --git a/src/thirdparty/liveportrait/src/live_portrait_wrapper.py b/src/thirdparty/liveportrait/src/live_portrait_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..af95812b5ea8c567b2780fe5723767d7f335c572
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/live_portrait_wrapper.py
@@ -0,0 +1,384 @@
+# coding: utf-8
+
+"""
+Wrappers for LivePortrait core functions
+"""
+
+import contextlib
+import os.path as osp
+import numpy as np
+import cv2
+import torch
+import yaml
+
+from .utils.timer import Timer
+from .utils.helper import load_model, concat_feat
+from .utils.camera import headpose_pred_to_degree, get_rotation_matrix
+from .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
+from .config.inference_config import InferenceConfig
+from .utils.rprint import rlog as log
+
+
+class LivePortraitWrapper(object):
+    """
+    Wrapper for Human
+    """
+
+    def __init__(self, inference_cfg: InferenceConfig):
+        """Pick the device, load the F/M/W/G models (and S/R when its checkpoint exists), optionally torch.compile W and G."""
+        self.inference_cfg = inference_cfg
+        self.device_id = inference_cfg.device_id
+        self.compile = inference_cfg.flag_do_torch_compile
+        if inference_cfg.flag_force_cpu:
+            self.device = 'cpu'
+        else:
+            try:
+                if torch.backends.mps.is_available():
+                    self.device = 'mps'
+                else:
+                    self.device = 'cuda:' + str(self.device_id)
+            except Exception:  # best-effort MPS probe (presumably for torch builds without torch.backends.mps); Ctrl-C still propagates
+                self.device = 'cuda:' + str(self.device_id)
+
+        with open(inference_cfg.models_config, 'r') as f:  # close the config file once parsed
+            model_config = yaml.load(f, Loader=yaml.SafeLoader)
+        # init F
+        self.appearance_feature_extractor = load_model(inference_cfg.checkpoint_F, model_config, self.device, 'appearance_feature_extractor')
+        log(f'Load appearance_feature_extractor from {osp.realpath(inference_cfg.checkpoint_F)} done.')
+        # init M
+        self.motion_extractor = load_model(inference_cfg.checkpoint_M, model_config, self.device, 'motion_extractor')
+        log(f'Load motion_extractor from {osp.realpath(inference_cfg.checkpoint_M)} done.')
+        # init W
+        self.warping_module = load_model(inference_cfg.checkpoint_W, model_config, self.device, 'warping_module')
+        log(f'Load warping_module from {osp.realpath(inference_cfg.checkpoint_W)} done.')
+        # init G
+        self.spade_generator = load_model(inference_cfg.checkpoint_G, model_config, self.device, 'spade_generator')
+        log(f'Load spade_generator from {osp.realpath(inference_cfg.checkpoint_G)} done.')
+        # init S and R
+        if inference_cfg.checkpoint_S is not None and osp.exists(inference_cfg.checkpoint_S):
+            self.stitching_retargeting_module = load_model(inference_cfg.checkpoint_S, model_config, self.device, 'stitching_retargeting_module')
+            log(f'Load stitching_retargeting_module from {osp.realpath(inference_cfg.checkpoint_S)} done.')
+        else:
+            self.stitching_retargeting_module = None  # no checkpoint configured or file missing
+        # Optimize for inference
+        if self.compile:
+            torch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution
+            self.warping_module = torch.compile(self.warping_module, mode='max-autotune')
+            self.spade_generator = torch.compile(self.spade_generator, mode='max-autotune')
+        self.timer = Timer()
+
+    def inference_ctx(self):
+        # 'mps' gets a no-op context; any other device gets torch.autocast (float16, toggled by flag_use_half_precision)
+        if self.device == "mps":
+            return contextlib.nullcontext()
+        device_kind = self.device[:4]  # first four characters, e.g. 'cuda' out of 'cuda:0'
+        half_enabled = self.inference_cfg.flag_use_half_precision
+        return torch.autocast(device_type=device_kind, dtype=torch.float16, enabled=half_enabled)
+
+    def update_config(self, user_args):
+        cfg = self.inference_cfg  # only attributes that already exist on the config are overwritten
+        for name, value in filter(lambda kv: hasattr(cfg, kv[0]), user_args.items()):
+            setattr(cfg, name, value)
+
+    def prepare_source(self, img: np.ndarray) -> torch.Tensor:
+        """ construct the input as standard: float32 in 0~1, channels moved to axis 1, placed on self.device
+        img: HxWx3, uint8; resized when its HxW differs from inference_cfg.input_shape (compared as (H, W))
+        """
+        h, w = img.shape[:2]
+        if h != self.inference_cfg.input_shape[0] or w != self.inference_cfg.input_shape[1]:
+            x = cv2.resize(img, (self.inference_cfg.input_shape[1], self.inference_cfg.input_shape[0]))  # cv2 dsize is (width, height)
+        else:
+            x = img.copy()
+
+        if x.ndim == 3:
+            x = x[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1
+        elif x.ndim == 4:
+            x = x.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1
+        else:
+            raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
+        x = np.clip(x, 0, 1)  # clip to 0~1
+        x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW
+        x = x.to(self.device)
+        return x
+
+    def prepare_videos(self, imgs) -> torch.Tensor:
+        """ turn driving frames into a float32 tensor scaled to 0~1 on self.device
+        imgs: a list of HxWx3 frames (stacked to TxHxWx3x1), or an ndarray used as-is (the permute needs it 5-D)
+        """
+        is_frame_list = isinstance(imgs, list)
+        if not is_frame_list and not isinstance(imgs, np.ndarray):
+            raise ValueError(f'imgs type error: {type(imgs)}')
+        # a list is stacked and gets a trailing singleton axis (TxHxWx3x1); an ndarray is taken unchanged
+        frames = np.array(imgs)[..., np.newaxis] if is_frame_list else imgs
+
+        # scale by 1/255 in float32, then clip to 0~1
+        scaled = np.clip(frames.astype(np.float32) / 255., 0, 1)
+
+        # bring axis 4 and axis 3 in front of H and W: TxHxWx3x1 -> Tx1x3xHxW
+        video = torch.from_numpy(scaled).permute(0, 4, 3, 1, 2)
+
+        return video.to(self.device)
+
+    def extract_feature_3d(self, x: torch.Tensor) -> torch.Tensor:
+        """ run the appearance feature extractor F on x without gradients
+        x: Bx3xHxW, normalized to 0~1
+        return: the extractor output converted with .float()
+        """
+        with torch.no_grad(), self.inference_ctx():
+            appearance = self.appearance_feature_extractor(x)
+        return appearance.float()
+
+    def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:
+        """ get the implicit keypoint information from the motion extractor M
+        x: Bx3xHxW, normalized to 0~1
+        flag_refine_info (kwarg, default True): pass pitch/yaw/roll through headpose_pred_to_degree and reshape kp/exp to BxNx3
+        return: A dict contains keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
+        """
+        with torch.no_grad(), self.inference_ctx():
+            kp_info = self.motion_extractor(x)
+
+            if self.inference_cfg.flag_use_half_precision:
+                # convert every tensor entry with .float(); non-tensor entries are left alone
+                for name in kp_info:
+                    if isinstance(kp_info[name], torch.Tensor):
+                        kp_info[name] = kp_info[name].float()
+
+        if not kwargs.get('flag_refine_info', True):
+            return kp_info
+
+        batch = kp_info['kp'].shape[0]
+        for angle in ('pitch', 'yaw', 'roll'):
+            kp_info[angle] = headpose_pred_to_degree(kp_info[angle])[:, None]  # Bx1
+        for field in ('kp', 'exp'):
+            kp_info[field] = kp_info[field].reshape(batch, -1, 3)  # BxNx3
+
+        return kp_info
+
+    def get_pose_dct(self, kp_info: dict) -> dict:
+        """Return 'pitch', 'yaw' and 'roll' of kp_info as Python scalars (.item() needs one value each)."""
+        # keys are inserted in the order pitch, yaw, roll
+        return {
+            angle: headpose_pred_to_degree(kp_info[angle]).item()
+            for angle in ('pitch', 'yaw', 'roll')
+        }
+
+    def get_fs_and_kp_info(self, source_prepared, driving_first_frame):
+        """ get keypoint info and rotation of the source and of the first driving frame, plus the source feature volume """
+        # get the canonical keypoints of source image by M
+        source_kp_info = self.get_kp_info(source_prepared, flag_refine_info=True)
+        source_rotation = get_rotation_matrix(source_kp_info['pitch'], source_kp_info['yaw'], source_kp_info['roll'])
+
+        # get the canonical keypoints of first driving frame by M
+        driving_first_frame_kp_info = self.get_kp_info(driving_first_frame, flag_refine_info=True)
+        driving_first_frame_rotation = get_rotation_matrix(
+            driving_first_frame_kp_info['pitch'],
+            driving_first_frame_kp_info['yaw'],
+            driving_first_frame_kp_info['roll']
+        )
+
+        # get feature volume by F
+        source_feature_3d = self.extract_feature_3d(source_prepared)
+
+        return source_kp_info, source_rotation, source_feature_3d, driving_first_frame_kp_info, driving_first_frame_rotation
+
+    def transform_keypoint(self, kp_info: dict):
+        """
+        transform the implicit keypoints with the pose, shift, and expression deformation
+        kp_info: dict with 'kp' (BxNx3 or Bx(N*3)), 'pitch', 'yaw', 'roll', 't', 'exp' and 'scale'; return: BxNx3
+        """
+        kp = kp_info['kp']    # (bs, k, 3)
+        pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']
+
+        t, exp = kp_info['t'], kp_info['exp']
+        scale = kp_info['scale']
+
+        pitch = headpose_pred_to_degree(pitch)
+        yaw = headpose_pred_to_degree(yaw)
+        roll = headpose_pred_to_degree(roll)
+
+        bs = kp.shape[0]
+        if kp.ndim == 2:
+            num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
+        else:
+            num_kp = kp.shape[1]  # Bxnum_kpx3
+
+        rot_mat = get_rotation_matrix(pitch, yaw, roll)    # (bs, 3, 3)
+
+        # Eqn.2: s * (R * x_c,s + exp) + t
+        kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
+        kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
+        kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty
+
+        return kp_transformed
+
+    def retarget_eye(self, kp_source: torch.Tensor, eye_close_ratio: torch.Tensor) -> torch.Tensor:
+        """ predict the keypoint delta for eye retargeting
+        kp_source: BxNx3
+        eye_close_ratio: Bx3
+        Return: BxNx3 (the module output reshaped to N keypoints x 3)
+        """
+        feat_eye = concat_feat(kp_source, eye_close_ratio)
+
+        with torch.no_grad():
+            delta = self.stitching_retargeting_module['eye'](feat_eye)
+
+        return delta.reshape(-1, kp_source.shape[1], 3)
+
+    def retarget_lip(self, kp_source: torch.Tensor, lip_close_ratio: torch.Tensor) -> torch.Tensor:
+        """ predict the keypoint delta for lip retargeting
+        kp_source: BxNx3
+        lip_close_ratio: Bx2
+        Return: BxNx3 (the module output reshaped to N keypoints x 3)
+        """
+        feat_lip = concat_feat(kp_source, lip_close_ratio)
+
+        with torch.no_grad():
+            delta = self.stitching_retargeting_module['lip'](feat_lip)
+
+        return delta.reshape(-1, kp_source.shape[1], 3)
+
+    def stitch(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+        """ run the 'stitching' network on the two keypoint sets
+        kp_source: BxNx3
+        kp_driving: BxNx3
+        Return: Bx(3*num_kp+2)
+        """
+        # build the network input from both keypoint sets (via concat_feat)
+        stitch_input = concat_feat(kp_source, kp_driving)
+
+        with torch.no_grad():
+            stitch_net = self.stitching_retargeting_module['stitching']
+            return stitch_net(stitch_input)
+
+    def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+        """ conduct the stitching; kp_driving is returned as-is when no stitching module is loaded
+        kp_source: Bxnum_kpx3
+        kp_driving: Bxnum_kpx3
+        """
+        # guard clause: nothing to apply without the stitching module
+        if self.stitching_retargeting_module is None:
+            return kp_driving
+
+        batch, n_kp = kp_source.shape[:2]
+        split = 3 * n_kp
+
+        # the offsets are added to a clone, so the kp_driving argument is not modified
+        stitched = kp_driving.clone()
+        offsets = self.stitch(kp_source, stitched)
+
+        # first 3*num_kp values: per-keypoint offsets (Bxnum_kpx3)
+        stitched += offsets[..., :split].reshape(batch, n_kp, 3)
+        # next 2 values: one x/y shift shared by all keypoints (Bx1x2)
+        stitched[..., :2] += offsets[..., split:split + 2].reshape(batch, 1, 2)
+
+        return stitched
+
+    def warp_decode(self, feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> dict:
+        """ warp the feature volume with the implicit keypoints and decode it
+        feature_3d: Bx32x16x64x64, feature volume
+        kp_source, kp_driving: BxNx3
+        return: the warping module's output dict, with 'out' replaced by the spade generator's output
+        """
+        # The line 18 in Algorithm 1: D(W(f_s; x_s, x′_d,i))
+        with torch.no_grad(), self.inference_ctx():
+            if self.compile:
+                # Mark the beginning of a new CUDA Graph step
+                torch.compiler.cudagraph_mark_step_begin()
+            # get decoder input
+            ret_dct = self.warping_module(feature_3d, kp_source=kp_source, kp_driving=kp_driving)
+            # decode
+            ret_dct['out'] = self.spade_generator(feature=ret_dct['out'])
+
+            # cast every tensor value of the dict to float32
+            if self.inference_cfg.flag_use_half_precision:
+                for k, v in ret_dct.items():
+                    if isinstance(v, torch.Tensor):
+                        ret_dct[k] = v.float()
+
+        return ret_dct
+
+    def parse_output(self, out: torch.Tensor) -> np.ndarray:
+        """ convert the network output to uint8 images
+        out: Bx3xHxW, values outside 0~1 are clipped
+        return: BxHxWx3, uint8
+        """
+        frames = out.data.cpu().numpy().transpose(0, 2, 3, 1)  # Bx3xHxW -> BxHxWx3
+        frames = np.clip(frames, 0, 1) * 255  # clip to 0~1, then 0~1 -> 0~255
+
+        return np.clip(frames, 0, 255).astype(np.uint8)
+
+    def calc_ratio(self, lmk_lst):
+        """Return (eye_ratio_lst, lip_ratio_lst) holding one entry per landmark set in lmk_lst."""
+        eye_ratios, lip_ratios = [], []
+        for landmarks in lmk_lst:
+            # for eyes retargeting
+            eye_ratios.append(calc_eye_close_ratio(landmarks[None]))
+            # for lip retargeting
+            lip_ratios.append(calc_lip_close_ratio(landmarks[None]))
+        return eye_ratios, lip_ratios
+
+    def calc_combined_eye_ratio(self, c_d_eyes_i, source_lmk):
+        """Return the source eye-close ratio concatenated (dim=1) with the driving value c_d_eyes_i[0][0]."""
+        source_ratio = torch.from_numpy(calc_eye_close_ratio(source_lmk[None]))
+        source_ratio = source_ratio.float().to(self.device)
+        driving_ratio = torch.Tensor([c_d_eyes_i[0][0]]).reshape(1, 1).to(self.device)
+        # [c_s,eyes, c_d,eyes,i]
+        return torch.cat([source_ratio, driving_ratio], dim=1)
+
+    def calc_combined_lip_ratio(self, c_d_lip_i, source_lmk):
+        """Return the source lip-close ratio concatenated (dim=1) with the driving value c_d_lip_i[0]."""
+        source_ratio = torch.from_numpy(calc_lip_close_ratio(source_lmk[None]))
+        source_ratio = source_ratio.float().to(self.device)
+        driving_ratio = torch.Tensor([c_d_lip_i[0]]).to(self.device).reshape(1, 1)  # 1x1
+        # [c_s,lip, c_d,lip,i]
+        return torch.cat([source_ratio, driving_ratio], dim=1)  # 1x2
+
+
+class LivePortraitWrapperAnimal(LivePortraitWrapper):
+    """
+    Wrapper for Animal: loads the *_animal checkpoints (F, M, W, G and, when the file exists, S/R)
+    """
+    def __init__(self, inference_cfg: InferenceConfig):
+        # NOTE: super().__init__(inference_cfg) is deliberately not called; the setup is done below
+
+        self.inference_cfg = inference_cfg
+        self.device_id = inference_cfg.device_id
+        self.compile = inference_cfg.flag_do_torch_compile
+        if inference_cfg.flag_force_cpu:
+            self.device = 'cpu'
+        else:
+            try:
+                use_mps = torch.backends.mps.is_available()
+            except Exception:  # e.g. a torch build without the mps backend: fall back to cuda
+                use_mps = False
+            self.device = 'mps' if use_mps else 'cuda:' + str(self.device_id)
+
+        # read the model config inside a context manager so the file handle is closed
+        with open(inference_cfg.models_config, 'r') as f:
+            model_config = yaml.load(f, Loader=yaml.SafeLoader)
+        # init F
+        self.appearance_feature_extractor = load_model(inference_cfg.checkpoint_F_animal, model_config, self.device, 'appearance_feature_extractor')
+        log(f'Load appearance_feature_extractor from {osp.realpath(inference_cfg.checkpoint_F_animal)} done.')
+        # init M
+        self.motion_extractor = load_model(inference_cfg.checkpoint_M_animal, model_config, self.device, 'motion_extractor')
+        log(f'Load motion_extractor from {osp.realpath(inference_cfg.checkpoint_M_animal)} done.')
+        # init W
+        self.warping_module = load_model(inference_cfg.checkpoint_W_animal, model_config, self.device, 'warping_module')
+        log(f'Load warping_module from {osp.realpath(inference_cfg.checkpoint_W_animal)} done.')
+        # init G
+        self.spade_generator = load_model(inference_cfg.checkpoint_G_animal, model_config, self.device, 'spade_generator')
+        log(f'Load spade_generator from {osp.realpath(inference_cfg.checkpoint_G_animal)} done.')
+        # init S and R
+        if inference_cfg.checkpoint_S_animal is not None and osp.exists(inference_cfg.checkpoint_S_animal):
+            self.stitching_retargeting_module = load_model(inference_cfg.checkpoint_S_animal, model_config, self.device, 'stitching_retargeting_module')
+            log(f'Load stitching_retargeting_module from {osp.realpath(inference_cfg.checkpoint_S_animal)} done.')
+        else:
+            self.stitching_retargeting_module = None
+
+        # Optimize for inference
+        if self.compile:
+            torch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution
+            self.warping_module = torch.compile(self.warping_module, mode='max-autotune')
+            self.spade_generator = torch.compile(self.spade_generator, mode='max-autotune')
+
+        self.timer = Timer()
diff --git a/src/thirdparty/liveportrait/src/modules/__init__.py b/src/thirdparty/liveportrait/src/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/thirdparty/liveportrait/src/modules/appearance_feature_extractor.py b/src/thirdparty/liveportrait/src/modules/appearance_feature_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d89e4f18a2fbe58447f52ab4c5e3f2011a4ec80
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/appearance_feature_extractor.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+
+"""
+Appearance extractor(F) defined in paper, which maps the source image s to a 3D appearance feature volume.
+"""
+
+import torch
+from torch import nn
+from .util import SameBlock2d, DownBlock2d, ResBlock3d
+
+
+class AppearanceFeatureExtractor(nn.Module):
+    """Appearance feature extractor (F): 2D conv encoder whose output is viewed as a 3D volume and refined by ResBlock3d."""
+    def __init__(self, image_channel, block_expansion, num_down_blocks, max_features, reshape_channel, reshape_depth, num_resblocks):
+        super().__init__()
+        self.image_channel = image_channel
+        self.block_expansion = block_expansion
+        self.num_down_blocks = num_down_blocks
+        self.max_features = max_features
+        self.reshape_channel = reshape_channel
+        self.reshape_depth = reshape_depth
+
+        self.first = SameBlock2d(image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1))
+
+        stages = []
+        for level in range(num_down_blocks):
+            in_features = min(max_features, block_expansion * (2 ** level))
+            out_features = min(max_features, block_expansion * (2 ** (level + 1)))
+            stages.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))
+        self.down_blocks = nn.ModuleList(stages)
+
+        self.second = nn.Conv2d(in_channels=out_features, out_channels=max_features, kernel_size=1, stride=1)
+
+        self.resblocks_3d = nn.Sequential()
+        for idx in range(num_resblocks):
+            self.resblocks_3d.add_module(f'3dr{idx}', ResBlock3d(reshape_channel, kernel_size=3, padding=1))
+
+    def forward(self, source_image):
+        feat = self.first(source_image)  # Bx3x256x256 -> Bx64x256x256
+        for down_block in self.down_blocks:
+            feat = down_block(feat)
+        feat = self.second(feat)
+        bs, _, h, w = feat.shape  # ->Bx512x64x64
+
+        # split the channel axis into (reshape_channel, reshape_depth)
+        volume = feat.view(bs, self.reshape_channel, self.reshape_depth, h, w)  # ->Bx32x16x64x64
+        volume = self.resblocks_3d(volume)  # ->Bx32x16x64x64
+        return volume
diff --git a/src/thirdparty/liveportrait/src/modules/convnextv2.py b/src/thirdparty/liveportrait/src/modules/convnextv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..83ea12662b607854915df8c7abb160b588d330b1
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/convnextv2.py
@@ -0,0 +1,149 @@
+# coding: utf-8
+
+"""
+This module is adapted from ConvNeXtV2 for the extraction of implicit keypoints, poses, and expression deformation.
+"""
+
+import torch
+import torch.nn as nn
+# from timm.models.layers import trunc_normal_, DropPath
+from .util import LayerNorm, DropPath, trunc_normal_, GRN
+
+__all__ = ['convnextv2_tiny']
+
+
+class Block(nn.Module):
+    """ ConvNeXtV2 Block.
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+    """
+
+    def __init__(self, dim, drop_path=0.):
+        super().__init__()
+        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+        self.norm = LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.grn = GRN(4 * dim)
+        self.pwconv2 = nn.Linear(4 * dim, dim)
+        self.drop_path = nn.Identity() if not drop_path > 0. else DropPath(drop_path)
+
+    def forward(self, x):
+        shortcut = x  # kept for the residual sum at the end
+        x = self.dwconv(x)
+        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.grn(x)
+        x = self.pwconv2(x)
+        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+
+        # residual connection around the (optionally drop-path'ed) branch
+        return shortcut + self.drop_path(x)
+
+
+class ConvNeXtV2(nn.Module):
+    """ ConvNeXt V2 backbone with regression/classification heads for keypoints, pose, translation, expression and scale
+
+    Args:
+        in_chans (int): Number of input image channels. Default: 3
+        depths (list(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+        dims (list(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        num_bins (int, via kwargs): Output size of each of the pitch/yaw/roll heads. Default: 66
+        num_kp (int, via kwargs): Number of implicit keypoints. Default: 24
+    """
+
+    def __init__(
+        self,
+        in_chans=3,
+        depths=[3, 3, 9, 3],
+        dims=[96, 192, 384, 768],
+        drop_path_rate=0.,
+        **kwargs
+    ):
+        super().__init__()
+        self.depths = depths
+        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers
+        stem = nn.Sequential(
+            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
+            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
+        )
+        self.downsample_layers.append(stem)
+        for i in range(3):
+            downsample_layer = nn.Sequential(
+                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
+                nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
+            )
+            self.downsample_layers.append(downsample_layer)
+
+        self.stages = nn.ModuleList()  # 4 feature resolution stages, each consisting of multiple residual blocks
+        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        cur = 0
+        for i in range(4):
+            stage = nn.Sequential(
+                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
+            )
+            self.stages.append(stage)
+            cur += depths[i]
+
+        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)  # final norm layer
+
+        # NOTE: the output semantic items
+        num_bins = kwargs.get('num_bins', 66)
+        num_kp = kwargs.get('num_kp', 24)  # the number of implicit keypoints
+        self.fc_kp = nn.Linear(dims[-1], 3 * num_kp)  # implicit keypoints
+
+        # the remaining heads: scale, pose bins, translation and expression
+        self.fc_scale = nn.Linear(dims[-1], 1)  # scale
+        self.fc_pitch = nn.Linear(dims[-1], num_bins)  # pitch bins
+        self.fc_yaw = nn.Linear(dims[-1], num_bins)  # yaw bins
+        self.fc_roll = nn.Linear(dims[-1], num_bins)  # roll bins
+        self.fc_t = nn.Linear(dims[-1], 3)  # translation
+        self.fc_exp = nn.Linear(dims[-1], 3 * num_kp)  # expression / delta
+
+    def _init_weights(self, m):  # NOTE: not applied anywhere inside this class
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            trunc_normal_(m.weight, std=.02)
+            nn.init.constant_(m.bias, 0)
+
+    def forward_features(self, x):
+        for i in range(4):
+            x = self.downsample_layers[i](x)
+            x = self.stages[i](x)
+        return self.norm(x.mean([-2, -1]))  # global average pooling, (N, C, H, W) -> (N, C)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+
+        # implicit keypoints
+        kp = self.fc_kp(x)
+
+        # pose and expression deformation
+        pitch = self.fc_pitch(x)
+        yaw = self.fc_yaw(x)
+        roll = self.fc_roll(x)
+        t = self.fc_t(x)
+        exp = self.fc_exp(x)
+        scale = self.fc_scale(x)
+
+        ret_dct = {
+            'pitch': pitch,
+            'yaw': yaw,
+            'roll': roll,
+            't': t,
+            'exp': exp,
+            'scale': scale,
+
+            'kp': kp,  # canonical keypoint
+        }
+
+        return ret_dct
+
+
+def convnextv2_tiny(**kwargs):
+    """Build a ConvNeXtV2 with depths [3, 3, 9, 3] and dims [96, 192, 384, 768]; kwargs are forwarded."""
+    return ConvNeXtV2(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
diff --git a/src/thirdparty/liveportrait/src/modules/dense_motion.py b/src/thirdparty/liveportrait/src/modules/dense_motion.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1a7f9a5a1b5463c4a1ddcee2bf36dcd34735706
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/dense_motion.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+
+"""
+The module that predicts a dense motion field from the sparse motion representation given by kp_source and kp_driving
+"""
+
+from torch import nn
+import torch.nn.functional as F
+import torch
+from .util import Hourglass, make_coordinate_grid, kp2gaussian
+
+
+class DenseMotionNetwork(nn.Module):
+    """Predicts a dense deformation field (and optionally an occlusion map) from kp_source and kp_driving."""
+    def __init__(self, block_expansion, num_blocks, max_features, num_kp, feature_channel, reshape_depth, compress, estimate_occlusion_map=True):
+        super(DenseMotionNetwork, self).__init__()
+        self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp+1)*(compress+1), max_features=max_features, num_blocks=num_blocks)  # ~60+G
+        self.mask = nn.Conv3d(self.hourglass.out_filters, num_kp + 1, kernel_size=7, padding=3)  # 65G! NOTE: computation cost is large
+        self.compress = nn.Conv3d(feature_channel, compress, kernel_size=1)  # 0.8G
+        self.norm = nn.BatchNorm3d(compress, affine=True)
+        self.num_kp = num_kp
+        self.flag_estimate_occlusion_map = estimate_occlusion_map
+
+        if self.flag_estimate_occlusion_map:
+            self.occlusion = nn.Conv2d(self.hourglass.out_filters*reshape_depth, 1, kernel_size=7, padding=3)
+        else:
+            self.occlusion = None
+
+    def create_sparse_motions(self, feature, kp_driving, kp_source):
+        """Return (bs, 1+num_kp, d, h, w, 3) sampling grids: the identity grid plus one shifted grid per keypoint."""
+        bs, _, d, h, w = feature.shape  # (bs, 4, 16, 64, 64)
+        identity_grid = make_coordinate_grid((d, h, w), ref=kp_source)  # (16, 64, 64, 3)
+        identity_grid = identity_grid.view(1, 1, d, h, w, 3)  # (1, 1, d=16, h=64, w=64, 3)
+        # offset of every grid position from each driving keypoint
+        coordinate_grid = identity_grid - kp_driving.view(bs, self.num_kp, 1, 1, 1, 3)
+
+        # NOTE: there lacks an one-order flow
+        driving_to_source = coordinate_grid + kp_source.view(bs, self.num_kp, 1, 1, 1, 3)    # (bs, num_kp, d, h, w, 3)
+
+        # adding background feature
+        identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1, 1)
+        sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1)  # (bs, 1+num_kp, d, h, w, 3)
+        return sparse_motions
+
+    def create_deformed_feature(self, feature, sparse_motions):
+        bs, _, d, h, w = feature.shape
+        feature_repeat = feature.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp+1, 1, 1, 1, 1, 1)      # (bs, num_kp+1, 1, c, d, h, w)
+        feature_repeat = feature_repeat.view(bs * (self.num_kp+1), -1, d, h, w)                         # (bs*(num_kp+1), c, d, h, w)
+        sparse_motions = sparse_motions.view((bs * (self.num_kp+1), d, h, w, -1))                       # (bs*(num_kp+1), d, h, w, 3)
+        sparse_deformed = F.grid_sample(feature_repeat, sparse_motions, align_corners=False)
+        sparse_deformed = sparse_deformed.view((bs, self.num_kp+1, -1, d, h, w))                        # (bs, num_kp+1, c, d, h, w)
+
+        return sparse_deformed
+
+    def create_heatmap_representations(self, feature, kp_driving, kp_source):
+        spatial_size = feature.shape[3:]  # (d=16, h=64, w=64)
+        gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)
+        gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)
+        heatmap = gaussian_driving - gaussian_source  # (bs, num_kp, d, h, w)
+
+        # adding background feature (zeros are allocated directly with the heatmap's dtype and device)
+        zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1], spatial_size[2], dtype=heatmap.dtype, device=heatmap.device)
+        heatmap = torch.cat([zeros, heatmap], dim=1)
+        heatmap = heatmap.unsqueeze(2)         # (bs, 1+num_kp, 1, d, h, w)
+        return heatmap
+
+    def forward(self, feature, kp_driving, kp_source):
+        bs, _, d, h, w = feature.shape  # (bs, 32, 16, 64, 64)
+
+        feature = self.compress(feature)  # (bs, 4, 16, 64, 64)
+        feature = self.norm(feature)  # (bs, 4, 16, 64, 64)
+        feature = F.relu(feature)  # (bs, 4, 16, 64, 64)
+
+        out_dict = dict()
+
+        # 1. deform 3d feature
+        sparse_motion = self.create_sparse_motions(feature, kp_driving, kp_source)  # (bs, 1+num_kp, d, h, w, 3)
+        deformed_feature = self.create_deformed_feature(feature, sparse_motion)  # (bs, 1+num_kp, c=4, d=16, h=64, w=64)
+
+        # 2. (bs, 1+num_kp, d, h, w)
+        heatmap = self.create_heatmap_representations(deformed_feature, kp_driving, kp_source)  # (bs, 1+num_kp, 1, d, h, w)
+
+        hourglass_input = torch.cat([heatmap, deformed_feature], dim=2)  # (bs, 1+num_kp, c=5, d=16, h=64, w=64)
+        hourglass_input = hourglass_input.view(bs, -1, d, h, w)  # (bs, (1+num_kp)*c=105, d=16, h=64, w=64)
+
+        prediction = self.hourglass(hourglass_input)
+
+        mask = self.mask(prediction)
+        mask = F.softmax(mask, dim=1)  # (bs, 1+num_kp, d=16, h=64, w=64)
+        out_dict['mask'] = mask
+        mask = mask.unsqueeze(2)                                   # (bs, num_kp+1, 1, d, h, w)
+        sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4)    # (bs, num_kp+1, 3, d, h, w)
+        deformation = (sparse_motion * mask).sum(dim=1)            # (bs, 3, d, h, w)  mask take effect in this place
+        deformation = deformation.permute(0, 2, 3, 4, 1)           # (bs, d, h, w, 3)
+
+        out_dict['deformation'] = deformation
+
+        if self.flag_estimate_occlusion_map:
+            bs, _, d, h, w = prediction.shape
+            prediction_reshape = prediction.view(bs, -1, h, w)
+            occlusion_map = torch.sigmoid(self.occlusion(prediction_reshape))  # Bx1x64x64
+            out_dict['occlusion_map'] = occlusion_map
+
+        return out_dict
diff --git a/src/thirdparty/liveportrait/src/modules/motion_extractor.py b/src/thirdparty/liveportrait/src/modules/motion_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2982e53c52d9ec1e0bec0453cc05edb51a15d23
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/motion_extractor.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+"""
+Motion extractor(M), which directly predicts the canonical keypoints, head pose and expression deformation of the input image
+"""
+
+from torch import nn
+import torch
+
+from .convnextv2 import convnextv2_tiny
+from .util import filter_state_dict
+
+model_dict = {
+    'convnextv2_tiny': convnextv2_tiny,
+}
+
+
+class MotionExtractor(nn.Module):
+    """Builds the backbone selected by the `backbone` kwarg (default: 'convnextv2_tiny') and delegates to it."""
+    def __init__(self, **kwargs):
+        super().__init__()
+        backbone = kwargs.get('backbone', 'convnextv2_tiny')
+        if backbone not in model_dict:
+            raise ValueError(f'unknown backbone {backbone!r}; available: {sorted(model_dict)}')
+        self.detector = model_dict[backbone](**kwargs)
+
+    def load_pretrained(self, init_path: str):
+        if init_path not in (None, ''):
+            state_dict = torch.load(init_path, map_location=lambda storage, loc: storage)['model']  # NOTE: unpickles the file; use trusted checkpoints only
+            state_dict = filter_state_dict(state_dict, remove_name='head')
+            ret = self.detector.load_state_dict(state_dict, strict=False)
+            print(f'Load pretrained model from {init_path}, ret: {ret}')
+
+    def forward(self, x):
+        return self.detector(x)
diff --git a/src/thirdparty/liveportrait/src/modules/spade_generator.py b/src/thirdparty/liveportrait/src/modules/spade_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..147a9aed0c7707fe6ae3d59ce1a30154ef75afcc
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/spade_generator.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+"""
+Spade decoder(G) defined in the paper, which takes the warped feature as input to generate the animated image.
+"""
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from .util import SPADEResnetBlock
+
+
+class SPADEDecoder(nn.Module):
+    """SPADE decoder (G): decodes a feature map into a 3-channel image in 0~1 (sigmoid output)."""
+    def __init__(self, upscale=1, max_features=256, block_expansion=64, out_channels=64, num_down_blocks=2):
+        super().__init__()
+        # channel count after num_down_blocks doublings of block_expansion, capped at max_features
+        input_channels = min(max_features, block_expansion * (2 ** num_down_blocks))
+        self.upscale = upscale
+        norm_G = 'spadespectralinstance'
+        label_num_channels = input_channels  # 256
+        self.fc = nn.Conv2d(input_channels, 2 * input_channels, 3, padding=1)
+        self.G_middle_0 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_1 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_2 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_3 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_4 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.G_middle_5 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
+        self.up_0 = SPADEResnetBlock(2 * input_channels, input_channels, norm_G, label_num_channels)
+        self.up_1 = SPADEResnetBlock(input_channels, out_channels, norm_G, label_num_channels)
+        self.up = nn.Upsample(scale_factor=2)
+
+        if self.upscale is None or self.upscale <= 1:
+            self.conv_img = nn.Conv2d(out_channels, 3, 3, padding=1)
+        else:  # NOTE: every upscale > 1 maps to the same fixed 2x PixelShuffle head
+            self.conv_img = nn.Sequential(
+                nn.Conv2d(out_channels, 3 * (2 * 2), kernel_size=3, padding=1),
+                nn.PixelShuffle(upscale_factor=2)
+            )
+
+    def forward(self, feature):
+        seg = feature  # Bx256x64x64
+        x = self.fc(feature)  # Bx512x64x64
+        x = self.G_middle_0(x, seg)
+        x = self.G_middle_1(x, seg)
+        x = self.G_middle_2(x, seg)
+        x = self.G_middle_3(x, seg)
+        x = self.G_middle_4(x, seg)
+        x = self.G_middle_5(x, seg)
+
+        x = self.up(x)  # Bx512x64x64 -> Bx512x128x128
+        x = self.up_0(x, seg)  # Bx512x128x128 -> Bx256x128x128
+        x = self.up(x)  # Bx256x128x128 -> Bx256x256x256
+        x = self.up_1(x, seg)  # Bx256x256x256 -> Bx64x256x256
+
+        x = self.conv_img(F.leaky_relu(x, 2e-1))  # Bx64x256x256 -> Bx3xHxW
+        x = torch.sigmoid(x)  # Bx3xHxW
+
+        return x
\ No newline at end of file
diff --git a/src/thirdparty/liveportrait/src/modules/stitching_retargeting_network.py b/src/thirdparty/liveportrait/src/modules/stitching_retargeting_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f50b7cf5a21cd71c70a7bbaaa4b6b68b4762ea3
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/stitching_retargeting_network.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+
+"""
+Stitching module(S) and two retargeting modules(R) defined in the paper.
+
+- The stitching module pastes the animated portrait back into the original image space without pixel misalignment, such as in
+the stitching region.
+
+- The eyes retargeting module is designed to address the issue of incomplete eye closure during cross-id reenactment, especially
+when a person with small eyes drives a person with larger eyes.
+
+- The lip retargeting module is designed similarly to the eye retargeting module, and can also normalize the input by ensuring that
+the lips are in a closed state, which facilitates better animation driving.
+"""
+from torch import nn
+
+
+class StitchingRetargetingNetwork(nn.Module):
+    """MLP: Linear+ReLU for each entry of hidden_sizes, followed by a final Linear to output_size."""
+    def __init__(self, input_size, hidden_sizes, output_size):
+        super().__init__()
+        # consecutive pairs of `widths` are the (in, out) sizes of the hidden Linear layers
+        widths = [input_size, *hidden_sizes]
+        layers = []
+        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
+            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(inplace=True)]
+        layers.append(nn.Linear(hidden_sizes[-1], output_size))
+        self.mlp = nn.Sequential(*layers)
+
+    def initialize_weights_to_zero(self):
+        """Zero the weight and bias of every nn.Linear in this module."""
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.zeros_(module.weight)
+                nn.init.zeros_(module.bias)
+
+    def forward(self, x):
+        return self.mlp(x)
diff --git a/src/thirdparty/liveportrait/src/modules/util.py b/src/thirdparty/liveportrait/src/modules/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc6b925ff4d93dbb89d0d1e593bee15c888c39ee
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/util.py
@@ -0,0 +1,452 @@
+# coding: utf-8
+
+"""
+This file defines various neural network modules and utility functions, including convolutional and residual blocks,
+normalizations, and functions for spatial transformation and tensor manipulation.
+"""
+
+from torch import nn
+import torch.nn.functional as F
+import torch
+import torch.nn.utils.spectral_norm as spectral_norm
+import math
+import warnings
+import collections.abc
+from itertools import repeat
+
+def kp2gaussian(kp, spatial_size, kp_variance):
+    """
+    Turn keypoints into gaussian-like heatmaps evaluated on the coordinate
+    grid produced by make_coordinate_grid for `spatial_size`.
+    """
+    n_lead = len(kp.shape) - 1  # every dim of kp except the trailing coordinate dim
+    lead_shape = kp.shape[:n_lead]
+
+    # Coordinate grid, tiled once per leading index of kp
+    grid = make_coordinate_grid(spatial_size, kp)
+    grid = grid.view(*((1,) * n_lead + grid.shape))
+    grid = grid.repeat(*(lead_shape + (1, 1, 1, 1)))
+
+    # Give kp singleton spatial dims so it broadcasts against the grid
+    centers = kp.view(*(lead_shape + (1, 1, 1, 3)))
+
+    # Squared distance from every grid cell to its keypoint
+    sq_dist = ((grid - centers) ** 2).sum(-1)
+
+    heatmap = torch.exp(-0.5 * sq_dist / kp_variance)
+
+    return heatmap
+
+
+def make_coordinate_grid(spatial_size, ref, **kwargs):
+    """
+    Build a (d, h, w, 3) grid of (x, y, z) coordinates, each axis scaled to [-1, 1],
+    using the dtype and device of `ref`. Extra keyword arguments are ignored.
+    """
+    d, h, w = spatial_size
+
+    def axis(n):
+        # integer positions 0..n-1 mapped linearly onto [-1, 1]
+        pos = torch.arange(n).type(ref.dtype).to(ref.device)
+        return 2 * (pos / (n - 1)) - 1
+
+    # NOTE: must be right-down-in (x to the right, y to the bottom, z inward)
+    xx = axis(w).view(1, 1, -1).repeat(d, h, 1)
+    yy = axis(h).view(1, -1, 1).repeat(d, 1, w)
+    zz = axis(d).view(-1, 1, 1).repeat(1, h, w)
+
+    return torch.stack([xx, yy, zz], dim=3)
+
+
+class ConvT2d(nn.Module):
+    """
+    Transposed-convolution block: ConvTranspose2d -> InstanceNorm2d -> leaky ReLU.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, stride=2, padding=1, output_padding=1):
+        super().__init__()
+
+        deconv_kwargs = dict(kernel_size=kernel_size, stride=stride,
+                             padding=padding, output_padding=output_padding)
+        self.convT = nn.ConvTranspose2d(in_features, out_features, **deconv_kwargs)
+        self.norm = nn.InstanceNorm2d(out_features)
+
+    def forward(self, x):
+        # F.leaky_relu is applied with its default negative slope
+        activated = F.leaky_relu(self.norm(self.convT(x)))
+        return activated
+
+
+class ResBlock3d(nn.Module):
+    """
+    Residual block: two BatchNorm3d -> ReLU -> Conv3d stages whose result is added to the input.
+    """
+
+    def __init__(self, in_features, kernel_size, padding):
+        super().__init__()
+
+        # both convolutions keep the channel count so the input can be added back
+        conv_kwargs = dict(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
+        self.conv1 = nn.Conv3d(**conv_kwargs)
+        self.conv2 = nn.Conv3d(**conv_kwargs)
+        self.norm1 = nn.BatchNorm3d(in_features, affine=True)
+        self.norm2 = nn.BatchNorm3d(in_features, affine=True)
+
+    def forward(self, x):
+        residual = self.conv1(F.relu(self.norm1(x)))
+        residual = self.conv2(F.relu(self.norm2(residual)))
+        # skip connection, accumulated in place on the branch output
+        residual += x
+        return residual
+
+
+class UpBlock3d(nn.Module):
+    """
+    Decoder block: upsampling by scale factor (1, 2, 2), then Conv3d -> BatchNorm3d -> ReLU.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super().__init__()
+
+        self.conv = nn.Conv3d(in_features, out_features, kernel_size=kernel_size,
+                              padding=padding, groups=groups)
+        self.norm = nn.BatchNorm3d(out_features, affine=True)
+
+    def forward(self, x):
+        # only the last two dims are doubled; the first spatial dim (scale 1) is kept
+        upsampled = F.interpolate(x, scale_factor=(1, 2, 2))
+        features = self.norm(self.conv(upsampled))
+        activated = F.relu(features)
+        return activated
+
+
+class DownBlock2d(nn.Module):
+    """
+    Encoder block: Conv2d -> BatchNorm2d -> ReLU -> 2x2 average pooling.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super().__init__()
+        self.conv = nn.Conv2d(in_features, out_features, kernel_size=kernel_size, padding=padding, groups=groups)
+        self.norm = nn.BatchNorm2d(out_features, affine=True)
+        self.pool = nn.AvgPool2d(kernel_size=(2, 2))
+
+    def forward(self, x):
+        # convolution and normalization first, activation next, pooling last
+        features = self.norm(self.conv(x))
+        activated = F.relu(features)
+        pooled = self.pool(activated)
+        return pooled
+
+
+class DownBlock3d(nn.Module):
+    """
+    Encoder block: Conv3d -> BatchNorm3d -> ReLU -> average pooling with kernel (1, 2, 2).
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super().__init__()
+
+        # NOTE: disabled alternative kept for reference, a strided convolution:
+        #   nn.Conv3d(..., padding=padding, groups=groups, stride=(1, 2, 2))
+        # The active design is a stride-1 convolution followed by average pooling.
+        self.conv = nn.Conv3d(in_features, out_features, kernel_size=kernel_size,
+                              padding=padding, groups=groups)
+        self.norm = nn.BatchNorm3d(out_features, affine=True)
+        # pools only over the last two dims; the first spatial dim is left untouched
+        self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))
+
+    def forward(self, x):
+        features = self.conv(x)
+        features = self.norm(features)
+        pooled = self.pool(F.relu(features))
+        return pooled
+
+
+class SameBlock2d(nn.Module):
+    """
+    Conv2d -> BatchNorm2d -> activation block without any pooling or striding.
+    The activation is LeakyReLU when `lrelu` is true, otherwise ReLU.
+    """
+
+    def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1, lrelu=False):
+        super().__init__()
+        self.conv = nn.Conv2d(in_features, out_features, kernel_size=kernel_size, padding=padding, groups=groups)
+        self.norm = nn.BatchNorm2d(out_features, affine=True)
+
+        # both activations are built with their default arguments
+        self.ac = nn.LeakyReLU() if lrelu else nn.ReLU()
+
+    def forward(self, x):
+        features = self.conv(x)
+        features = self.norm(features)
+        activated = self.ac(features)
+        return activated
+
+
+class Encoder(nn.Module):
+    """
+    Hourglass encoder: chained DownBlock3d stages; forward returns the input plus every stage output.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super().__init__()
+        def width(level):  # channels after `level` stages, capped at max_features
+            return min(max_features, block_expansion * (2 ** level))
+        self.down_blocks = nn.ModuleList([
+            DownBlock3d(in_features if i == 0 else width(i), width(i + 1), kernel_size=3, padding=1)
+            for i in range(num_blocks)])
+
+    def forward(self, x):
+        outs = [x]
+        for stage in self.down_blocks:
+            outs.append(stage(outs[-1]))
+        return outs
+
+
+class Decoder(nn.Module):
+    """
+    Hourglass decoder: UpBlock3d stages that consume a list of feature tensors from last to first,
+    concatenating each upsampled tensor with the next popped tensor along dim 1.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super().__init__()
+
+        def width(level):
+            # channel count at a given depth, capped at max_features
+            return min(max_features, block_expansion * (2 ** level))
+
+        stages = []
+        for i in reversed(range(num_blocks)):
+            # every stage but the deepest one also receives the concatenated skip tensor
+            multiplier = 1 if i == num_blocks - 1 else 2
+            stages.append(UpBlock3d(multiplier * width(i + 1), width(i), kernel_size=3, padding=1))
+        self.up_blocks = nn.ModuleList(stages)
+
+        self.out_filters = block_expansion + in_features
+        self.conv = nn.Conv3d(in_channels=self.out_filters, out_channels=self.out_filters, kernel_size=3, padding=1)
+        self.norm = nn.BatchNorm3d(self.out_filters, affine=True)
+
+    def forward(self, x):
+        # NOTE: `x` is a list and is consumed by the pops below (the caller's list is mutated)
+        out = x.pop()
+        for stage in self.up_blocks:
+            out = torch.cat([stage(out), x.pop()], dim=1)
+        out = self.norm(self.conv(out))
+        return F.relu(out)
+
+
+class Hourglass(nn.Module):
+    """
+    Encoder followed by decoder; `out_filters` mirrors the decoder's `out_filters`.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super().__init__()
+        shared = (block_expansion, in_features, num_blocks, max_features)
+        self.encoder, self.decoder = Encoder(*shared), Decoder(*shared)
+        self.out_filters = self.decoder.out_filters
+
+    def forward(self, x):
+        return self.decoder(self.encoder(x))
+
+
+class SPADE(nn.Module):
+    """Instance-normalize `x`, then modulate it with a scale and shift predicted from `segmap`."""
+
+    def __init__(self, norm_nc, label_nc):
+        super().__init__()
+
+        hidden_channels = 128
+        self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)
+        self.mlp_shared = nn.Sequential(
+            nn.Conv2d(label_nc, hidden_channels, kernel_size=3, padding=1),
+            nn.ReLU())
+        self.mlp_gamma = nn.Conv2d(hidden_channels, norm_nc, kernel_size=3, padding=1)
+        self.mlp_beta = nn.Conv2d(hidden_channels, norm_nc, kernel_size=3, padding=1)
+
+    def forward(self, x, segmap):
+        # resize the conditioning map to the spatial size of x before predicting gamma/beta
+        resized = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
+        shared = self.mlp_shared(resized)
+        scale = 1 + self.mlp_gamma(shared)
+        shift = self.mlp_beta(shared)
+        return self.param_free_norm(x) * scale + shift
+
+
+class SPADEResnetBlock(nn.Module):
+    """
+    Residual block with SPADE normalization: two (SPADE -> leaky ReLU -> conv) stages plus a shortcut
+    that is learned (SPADE -> 1x1 conv) only when the input and output channel counts differ.
+    """
+    def __init__(self, fin, fout, norm_G, label_nc, use_se=False, dilation=1):
+        super().__init__()
+        self.learned_shortcut = (fin != fout)
+        self.use_se = use_se  # stored only; not read anywhere inside this class
+        fmiddle = min(fin, fout)
+        conv_kwargs = dict(kernel_size=3, padding=dilation, dilation=dilation)  # shared by both 3x3 convs
+        self.conv_0 = nn.Conv2d(fin, fmiddle, **conv_kwargs)
+        self.conv_1 = nn.Conv2d(fmiddle, fout, **conv_kwargs)
+        if self.learned_shortcut:
+            self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)
+        # wrap every convolution in spectral normalization when norm_G asks for it
+        if 'spectral' in norm_G:
+            self.conv_0 = spectral_norm(self.conv_0)
+            self.conv_1 = spectral_norm(self.conv_1)
+            if self.learned_shortcut:
+                self.conv_s = spectral_norm(self.conv_s)
+        # SPADE normalization layers, one in front of each convolution
+        self.norm_0 = SPADE(fin, label_nc)
+        self.norm_1 = SPADE(fmiddle, label_nc)
+        if self.learned_shortcut:
+            self.norm_s = SPADE(fin, label_nc)
+
+    def forward(self, x, seg1):
+        skip = self.shortcut(x, seg1)
+        branch = self.conv_0(self.actvn(self.norm_0(x, seg1)))
+        branch = self.conv_1(self.actvn(self.norm_1(branch, seg1)))
+        return skip + branch
+
+    def shortcut(self, x, seg1):
+        if not self.learned_shortcut:
+            return x
+        return self.conv_s(self.norm_s(x, seg1))
+
+    def actvn(self, x):
+        return F.leaky_relu(x, 2e-1)
+
+
+def filter_state_dict(state_dict, remove_name='fc'):
+    """
+    Return a new dict holding every entry of `state_dict` whose key does not
+    contain the substring `remove_name`; the input mapping is left untouched.
+    """
+    kept_keys = (key for key in state_dict if remove_name not in key)
+    return {key: state_dict[key] for key in kept_keys}
+
+
+class GRN(nn.Module):
+    """ GRN (Global Response Normalization) layer; gamma and beta are initialised to zero. """
+
+    def __init__(self, dim):
+        super().__init__()
+        param_shape = (1, 1, 1, dim)
+        self.gamma = nn.Parameter(torch.zeros(*param_shape))
+        self.beta = nn.Parameter(torch.zeros(*param_shape))
+
+    def forward(self, x):
+        global_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        relative = global_norm / (global_norm.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * relative) + self.beta + x
+
+
+class LayerNorm(nn.Module):
+    r""" Layer normalization over the channel dimension for two tensor layouts.
+    "channels_last" (default) expects (batch_size, height, width, channels) and defers to
+    F.layer_norm; "channels_first" expects (batch_size, channels, height, width) and
+    normalizes over dim 1 by hand. Any other `data_format` raises NotImplementedError.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if data_format not in ("channels_last", "channels_first"):
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        if self.data_format == "channels_first":
+            mean = x.mean(1, keepdim=True)
+            centered = x - mean
+            var = centered.pow(2).mean(1, keepdim=True)
+            normed = centered / torch.sqrt(var + self.eps)
+            return self.weight[:, None, None] * normed + self.bias[:, None, None]
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    """Fill `tensor` in place with samples from N(mean, std^2) truncated to [a, b]; returns `tensor`."""
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    sqrt2 = math.sqrt(2.)
+
+    def standard_normal_cdf(value):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(value / sqrt2)) / 2.
+
+    mean_far_outside = (mean < a - 2 * std) or (mean > b + 2 * std)
+    if mean_far_outside:
+        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                      "The distribution of values may be incorrect.",
+                      stacklevel=2)
+
+    with torch.no_grad():
+        # CDF values of the lower and upper truncation bounds
+        cdf_lo = standard_normal_cdf((a - mean) / std)
+        cdf_hi = standard_normal_cdf((b - mean) / std)
+
+        # Sample uniformly from [2*cdf_lo - 1, 2*cdf_hi - 1] ...
+        tensor.uniform_(2 * cdf_lo - 1, 2 * cdf_hi - 1)
+
+        # ... then apply the inverse error function to obtain a truncated
+        # standard normal
+        tensor.erfinv_()
+
+        # Rescale to the requested std and shift to the requested mean
+        tensor.mul_(std * sqrt2)
+        tensor.add_(mean)
+
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+
+
+def drop_path(x, drop_prob=0., training=False, scale_by_keep=True):
+    """ Stochastic depth: zero out whole samples of `x` (one decision per index of dim 0).
+
+    Each sample is kept with probability 1 - drop_prob; when `scale_by_keep` is set, kept
+    samples are divided by that probability. With drop_prob == 0 or training False the
+    input is returned as is.
+
+    """
+    active = training and drop_prob != 0.
+    if not active:
+        return x
+    survival = 1 - drop_prob
+    # one mask entry per sample, broadcast over all remaining dims
+    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    mask = x.new_empty(mask_shape).bernoulli_(survival)
+    if scale_by_keep and survival > 0.0:
+        mask.div_(survival)
+    return x * mask
+
+
+class DropPath(nn.Module):
+    """ Stochastic depth per sample (main path of residual blocks); drop_prob=None disables dropping. """
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        if self.drop_prob is None:  # the default; drop_path would fail on `1 - None` in training mode
+            return x
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)  # thin wrapper supplying defaults: mean 0, std 1, bounds [-2, 2]
+
+# From PyTorch internals
+def _ntuple(n):
+    """Return a parser that turns iterables (except str) into tuples and repeats anything else `n` times."""
+    def parse(x):
+        is_sequence_like = isinstance(x, collections.abc.Iterable) and not isinstance(x, str)
+        return tuple(x) if is_sequence_like else tuple(repeat(x, n))
+    return parse
+
+to_2tuple = _ntuple(2)
diff --git a/src/thirdparty/liveportrait/src/modules/warping_network.py b/src/thirdparty/liveportrait/src/modules/warping_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..9191a197055a954272ee8ed86c5e34f3f33f9ad5
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/modules/warping_network.py
@@ -0,0 +1,77 @@
+# coding: utf-8
+
+"""
+Warping field estimator(W) defined in the paper, which generates a warping field using the implicit
+keypoint representations x_s and x_d, and employs this flow field to warp the source feature volume f_s.
+"""
+
+from torch import nn
+import torch.nn.functional as F
+from .util import SameBlock2d
+from .dense_motion import DenseMotionNetwork
+
+
+class WarpingNetwork(nn.Module):
+    """Warp a 3D feature volume with the deformation from DenseMotionNetwork, then refine it with 2D convs."""
+    def __init__(
+        self,
+        num_kp,
+        block_expansion,
+        max_features,
+        num_down_blocks,
+        reshape_channel,
+        estimate_occlusion_map=False,
+        dense_motion_params=None,
+        **kwargs
+    ):
+        super(WarpingNetwork, self).__init__()
+
+        self.upscale = kwargs.get('upscale', 1)
+        self.flag_use_occlusion_map = kwargs.get('flag_use_occlusion_map', True)
+
+        if dense_motion_params is not None:
+            self.dense_motion_network = DenseMotionNetwork(
+                num_kp=num_kp,
+                feature_channel=reshape_channel,
+                estimate_occlusion_map=estimate_occlusion_map,
+                **dense_motion_params
+            )
+        else:
+            self.dense_motion_network = None
+
+        out_channels = block_expansion * (2 ** num_down_blocks)
+        self.third = SameBlock2d(max_features, out_channels, kernel_size=(3, 3), padding=(1, 1), lrelu=True)
+        self.fourth = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=1, stride=1)
+
+        self.estimate_occlusion_map = estimate_occlusion_map
+
+    def deform_input(self, inp, deformation):
+        """Sample `inp` at the locations given by `deformation` (F.grid_sample, align_corners=False)."""
+        return F.grid_sample(inp, deformation, align_corners=False)
+
+    def forward(self, feature_3d, kp_driving, kp_source):
+        """Return {'occlusion_map', 'deformation', 'out'}; raises RuntimeError without a dense motion network."""
+        if self.dense_motion_network is None:
+            raise RuntimeError("WarpingNetwork.forward requires a dense motion network; "
+                               "pass dense_motion_params when constructing WarpingNetwork.")
+
+        # Feature warper, Transforming feature representation according to deformation and occlusion
+        dense_motion = self.dense_motion_network(
+            feature=feature_3d, kp_driving=kp_driving, kp_source=kp_source
+        )
+        occlusion_map = dense_motion['occlusion_map'] if 'occlusion_map' in dense_motion else None  # Bx1x64x64
+        deformation = dense_motion['deformation']  # Bx16x64x64x3
+        out = self.deform_input(feature_3d, deformation)  # Bx32x16x64x64
+
+        bs, c, d, h, w = out.shape  # Bx32x16x64x64
+        out = out.view(bs, c * d, h, w)  # -> Bx512x64x64
+        out = self.third(out)  # -> Bx256x64x64
+        out = self.fourth(out)  # -> Bx256x64x64
+
+        if self.flag_use_occlusion_map and (occlusion_map is not None):
+            out = out * occlusion_map
+        return {
+            'occlusion_map': occlusion_map,
+            'deformation': deformation,
+            'out': out,
+        }
diff --git a/src/thirdparty/liveportrait/src/utils/__init__.py b/src/thirdparty/liveportrait/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/thirdparty/liveportrait/src/utils/animal_landmark_runner.py b/src/thirdparty/liveportrait/src/utils/animal_landmark_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..c66efe493b08bf77467d6b920183d640eb1940ec
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/animal_landmark_runner.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+
+"""
+face detection and alignment using XPose
+"""
+
+import os
+import pickle
+import torch
+import numpy as np
+from PIL import Image
+from torchvision.ops import nms
+
+from .timer import Timer
+from .rprint import rlog as log
+from .helper import clean_state_dict
+
+from .dependencies.XPose import transforms as T
+from .dependencies.XPose.models import build_model
+from .dependencies.XPose.predefined_keypoints import *
+from .dependencies.XPose.util import box_ops
+from .dependencies.XPose.util.config import Config
+
+
+class XPoseRunner(object):
+    """Runs the XPose model to locate keypoints, using text embeddings cached on disk as pickle files."""
+    def __init__(self, model_config_path, model_checkpoint_path, embeddings_cache_path=None, cpu_only=False, **kwargs):
+        self.device_id = kwargs.get("device_id", 0)
+        self.flag_use_half_precision = kwargs.get("flag_use_half_precision", True)
+        self.device = f"cuda:{self.device_id}" if not cpu_only else "cpu"
+        self.model = self.load_animal_model(model_config_path, model_checkpoint_path, self.device)
+        self.timer = Timer()
+        # Load the cached text embeddings. NOTE: pickle.load can execute arbitrary code - trusted files only.
+        try:
+            with open(f'{embeddings_cache_path}_9.pkl', 'rb') as f:
+                self.ins_text_embeddings_9, self.kpt_text_embeddings_9 = pickle.load(f)
+            with open(f'{embeddings_cache_path}_68.pkl', 'rb') as f:
+                self.ins_text_embeddings_68, self.kpt_text_embeddings_68 = pickle.load(f)
+            print("Loaded cached embeddings from file.")
+        except Exception as err:
+            raise ValueError("Could not load clip embeddings from file, please check your file path.") from err
+
+    def load_animal_model(self, model_config_path, model_checkpoint_path, device):
+        """Build the model from its config, load the checkpoint's "model" weights (non-strict), set eval mode."""
+        args = Config.fromfile(model_config_path)
+        args.device = device
+        model = build_model(args)
+        checkpoint = torch.load(model_checkpoint_path, map_location=lambda storage, loc: storage)
+        model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+        model.eval()
+        return model
+
+    def load_image(self, input_image):
+        """Convert `input_image` to RGB and return (rgb_image, transformed_image)."""
+        image_pil = input_image.convert("RGB")
+        transform = T.Compose([
+            T.RandomResize([800], max_size=1333),  # NOTE: fixed size to 800
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ])
+        image, _ = transform(image_pil, None)
+        return image_pil, image
+
+    def get_unipose_output(self, image, instance_text_prompt, keypoint_text_prompt, box_threshold, IoU_threshold):
+        """Run the model on one image; return (boxes, keypoints) kept after score thresholding and NMS."""
+        instance_list = instance_text_prompt.split(',')
+
+        if len(keypoint_text_prompt) == 9:
+            # torch.Size([1, 512]) torch.Size([9, 512])
+            ins_text_embeddings, kpt_text_embeddings = self.ins_text_embeddings_9, self.kpt_text_embeddings_9
+        elif len(keypoint_text_prompt) == 68:
+            # torch.Size([1, 512]) torch.Size([68, 512])
+            ins_text_embeddings, kpt_text_embeddings = self.ins_text_embeddings_68, self.kpt_text_embeddings_68
+        else:
+            raise ValueError("Invalid number of keypoint embeddings.")
+        target = {
+            "instance_text_prompt": instance_list,
+            "keypoint_text_prompt": keypoint_text_prompt,
+            "object_embeddings_text": ins_text_embeddings.float(),
+            "kpts_embeddings_text": torch.cat((kpt_text_embeddings.float(), torch.zeros(100 - kpt_text_embeddings.shape[0], 512, device=self.device)), dim=0),
+            "kpt_vis_text": torch.cat((torch.ones(kpt_text_embeddings.shape[0], device=self.device), torch.zeros(100 - kpt_text_embeddings.shape[0], device=self.device)), dim=0)
+        }
+
+        self.model = self.model.to(self.device)
+        image = image.to(self.device)
+
+        with torch.no_grad():
+            with torch.autocast(device_type=self.device[:4], dtype=torch.float16, enabled=self.flag_use_half_precision):
+                outputs = self.model(image[None], [target])
+
+        logits = outputs["pred_logits"].sigmoid()[0]
+        boxes = outputs["pred_boxes"][0]
+        keypoints = outputs["pred_keypoints"][0][:, :2 * len(keypoint_text_prompt)]
+
+        logits_filt = logits.cpu().clone()
+        boxes_filt = boxes.cpu().clone()
+        keypoints_filt = keypoints.cpu().clone()
+        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+        logits_filt = logits_filt[filt_mask]
+        boxes_filt = boxes_filt[filt_mask]
+        keypoints_filt = keypoints_filt[filt_mask]
+
+        keep_indices = nms(box_ops.box_cxcywh_to_xyxy(boxes_filt), logits_filt.max(dim=1)[0], iou_threshold=IoU_threshold)
+
+        filtered_boxes = boxes_filt[keep_indices]
+        filtered_keypoints = keypoints_filt[keep_indices]
+
+        return filtered_boxes, filtered_keypoints
+
+    def run(self, input_image, instance_text_prompt, keypoint_text_example, box_threshold, IoU_threshold):
+        """Return a (num_kpts, 2) array of (x, y) keypoints for the first instance kept by NMS."""
+        if keypoint_text_example in globals():
+            keypoint_dict = globals()[keypoint_text_example]
+        elif instance_text_prompt in globals():
+            keypoint_dict = globals()[instance_text_prompt]
+        else:
+            keypoint_dict = globals()["animal"]
+
+        keypoint_text_prompt = keypoint_dict.get("keypoints")
+
+        image_pil, image = self.load_image(input_image)
+        boxes_filt, keypoints_filt = self.get_unipose_output(image, instance_text_prompt, keypoint_text_prompt, box_threshold, IoU_threshold)
+
+        W, H = image_pil.size
+        num_kpts = len(keypoint_text_prompt)
+        kp = np.array(keypoints_filt[0].squeeze(0).cpu())
+        # values are interleaved (x0, y0, x1, y1, ...); scale x by W and y by H, then pair them up
+        Z = kp[:num_kpts * 2] * np.array([W, H] * num_kpts)
+        return Z.reshape(num_kpts, 2)
+
+    def warmup(self):
+        self.timer.tic()
+
+        img_rgb = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))
+        self.run(img_rgb, 'face', 'face', box_threshold=0.0, IoU_threshold=0.0)
+
+        elapse = self.timer.toc()
+        log(f'XPoseRunner warmup time: {elapse:.3f}s')
diff --git a/src/thirdparty/liveportrait/src/utils/camera.py b/src/thirdparty/liveportrait/src/utils/camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3dd942697e1f00a96dc3efc75b883d98b52e525
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/camera.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+
+"""
+functions for processing and transforming 3D facial keypoints
+"""
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+PI = np.pi
+
+
+def headpose_pred_to_degree(pred):
+    """
+    Convert a (bs, 66) tensor of bin logits into degrees via the softmax expectation over bin
+    indices (expectation * 3 - 97.5); inputs of any other shape are returned unchanged.
+    """
+    is_bin_logits = pred.ndim > 1 and pred.shape[1] == 66
+    if not is_bin_logits:
+        return pred
+
+    # NOTE: note that the average is modified to 97.5
+    bin_indices = torch.arange(66, dtype=torch.float32, device=pred.device)
+    probs = F.softmax(pred, dim=1)
+    expected_bin = torch.sum(probs * bin_indices, axis=1)
+
+    return expected_bin * 3 - 97.5
+
+
+def get_rotation_matrix(pitch_, yaw_, roll_):
+    """ Build batched rotation matrices from Euler angles given in degrees.
+    Returns the transpose of rot_z @ rot_y @ rot_x with shape (bs, 3, 3).
+    """
+    def to_column_radians(angle_deg):
+        # degrees -> radians, with 1-D inputs reshaped to (bs, 1)
+        angle = angle_deg / 180 * PI
+        return angle.unsqueeze(1) if angle.ndim == 1 else angle
+
+    x, y, z = (to_column_radians(a) for a in (pitch_, yaw_, roll_))
+
+    device = x.device
+    bs = x.shape[0]
+    ones = torch.ones([bs, 1]).to(device)
+    zeros = torch.zeros([bs, 1]).to(device)
+
+    def as_matrix(entries):
+        # nine (bs, 1) columns in row-major order -> (bs, 3, 3)
+        return torch.cat(entries, dim=1).reshape([bs, 3, 3])
+
+    # calculate the euler matrix
+    cos_x, sin_x = torch.cos(x), torch.sin(x)
+    cos_y, sin_y = torch.cos(y), torch.sin(y)
+    cos_z, sin_z = torch.cos(z), torch.sin(z)
+
+    rot_x = as_matrix([
+        ones, zeros, zeros,
+        zeros, cos_x, -sin_x,
+        zeros, sin_x, cos_x,
+    ])
+    rot_y = as_matrix([
+        cos_y, zeros, sin_y,
+        zeros, ones, zeros,
+        -sin_y, zeros, cos_y,
+    ])
+    rot_z = as_matrix([
+        cos_z, -sin_z, zeros,
+        sin_z, cos_z, zeros,
+        zeros, zeros, ones,
+    ])
+
+    composed = rot_z @ rot_y @ rot_x
+    return composed.permute(0, 2, 1)  # transpose
diff --git a/src/thirdparty/liveportrait/src/utils/check_windows_port.py b/src/thirdparty/liveportrait/src/utils/check_windows_port.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2f9728b59c18454ea330b146bc80f27d92936be
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/check_windows_port.py
@@ -0,0 +1,18 @@
+import socket
+import sys
+
+if len(sys.argv) != 2:
+    print("Usage: python check_port.py <port>")
+    sys.exit(1)
+
+port = int(sys.argv[1])
+
+# Probe the local TCP port; the `with` block closes the socket (the old `sock.close` was never called).
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+    sock.settimeout(1)
+    result = sock.connect_ex(('127.0.0.1', port))  # 0 means the connection succeeded
+
+if result == 0:
+    print("LISTENING")
+else:
+    print("NOT LISTENING")
diff --git a/src/thirdparty/liveportrait/src/utils/crop.py b/src/thirdparty/liveportrait/src/utils/crop.py
new file mode 100644
index 0000000000000000000000000000000000000000..724df55b663acb84753b0cdcc2db3f7feaf977f9
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/crop.py
@@ -0,0 +1,565 @@
+# coding: utf-8
+
+"""
+cropping function and the related preprocess functions for cropping
+"""
+
+import numpy as np
+import os.path as osp
+from math import sin, cos, acos, degrees
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) # NOTE: enforce single thread
+
+import torch
+from PIL import Image
+
+from .rprint import rprint as print
+
+DTYPE = np.float32
+CV2_INTERP = cv2.INTER_LINEAR
+
+def make_abs_path(fn):
+    """ join `fn` onto the directory holding this source file (real path, symlinks resolved) """
+    module_dir = osp.dirname(osp.realpath(__file__))
+    return osp.join(module_dir, fn)
+
+def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
+    """ warp the image with cv2.warpAffine using the first two rows of M
+    img: image array accepted by cv2.warpAffine
+    M: 2x3 matrix or 3x3 matrix (only M[:2, :] is used)
+    dsize: target shape, either a (width, height) tuple/list or a single value used for both sides
+    flags: interpolation flag forwarded to cv2.warpAffine
+    borderMode: when not None it is forwarded together with a (0, 0, 0) borderValue,
+        otherwise the cv2 border defaults are used
+    """
+    out_size = tuple(dsize) if isinstance(dsize, (tuple, list)) else (dsize, dsize)
+    affine = M[:2, :]
+
+    if borderMode is None:
+        return cv2.warpAffine(img, affine, dsize=out_size, flags=flags)
+    return cv2.warpAffine(img, affine, dsize=out_size, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
+
+
+def _transform_pts(pts, M):
+    """ apply the affine map stored in M to a set of 2D points
+    pts: Nx2 ndarray
+    M: 2x3 matrix or 3x3 matrix (only the top-left 2x2 block and the first two entries of the last column are used)
+    return: Nx2
+    """
+    linear = M[:2, :2]
+    shift = M[:2, 2]
+    return pts @ linear.T + shift
+
+
+def parse_pt2_from_pt101(pt101, use_lip=True):
+    """
+    reduce the 101 landmarks to 2 reference points
+    use_lip=True: [midpoint of the two eye centers, midpoint of landmarks 75 and 81]
+    use_lip=False: [left eye center, right eye center]
+    """
+    # each eye center is the mean of four landmarks instead of a single point
+    left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0)
+    right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0)
+
+    if not use_lip:
+        return np.stack([left_eye, right_eye], axis=0)
+
+    eye_mid = (left_eye + right_eye) / 2
+    lip_mid = (pt101[75] + pt101[81]) / 2
+    return np.stack([eye_mid, lip_mid], axis=0)
+
+
+def parse_pt2_from_pt106(pt106, use_lip=True):
+    """
+    reduce the 106 landmarks to 2 reference points
+    use_lip=True: [midpoint of the two eye centers, midpoint of landmarks 52 and 61]
+    use_lip=False: [left eye center, right eye center]
+    """
+    left_eye = np.mean(pt106[[33, 35, 40, 39]], axis=0)
+    right_eye = np.mean(pt106[[87, 89, 94, 93]], axis=0)
+
+    if not use_lip:
+        return np.stack([left_eye, right_eye], axis=0)
+
+    eye_mid = (left_eye + right_eye) / 2
+    lip_mid = (pt106[52] + pt106[61]) / 2
+    return np.stack([eye_mid, lip_mid], axis=0)
+
+
+def parse_pt2_from_pt203(pt203, use_lip=True):
+    """
+    reduce the 203 landmarks to 2 reference points
+    use_lip=True: [midpoint of the two eye centers, midpoint of landmarks 48 and 66]
+    use_lip=False: [left eye center, right eye center]
+    """
+    left_eye = np.mean(pt203[[0, 6, 12, 18]], axis=0)
+    right_eye = np.mean(pt203[[24, 30, 36, 42]], axis=0)
+
+    if not use_lip:
+        return np.stack([left_eye, right_eye], axis=0)
+
+    eye_mid = (left_eye + right_eye) / 2
+    lip_mid = (pt203[48] + pt203[66]) / 2
+    return np.stack([eye_mid, lip_mid], axis=0)
+
+
+def parse_pt2_from_pt68(pt68, use_lip=True):
+    """
+    reduce the 68 landmarks to 2 reference points
+    use_lip=True: [midpoint of the two eye centers, midpoint of the two lip landmarks]
+    use_lip=False: [left eye center, right eye center]
+    """
+    # 1-based landmark numbers (nose, 2x left eye, 2x right eye, 2x lip) shifted to 0-based indices
+    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1
+    left_eye = np.mean(pt68[lm_idx[[1, 2]], :], 0)
+    right_eye = np.mean(pt68[lm_idx[[3, 4]], :], 0)
+
+    if not use_lip:
+        return np.stack([left_eye, right_eye], axis=0)
+
+    lip_a = pt68[lm_idx[5], :]
+    lip_b = pt68[lm_idx[6], :]
+    return np.stack([
+        (left_eye + right_eye) / 2,
+        (lip_a + lip_b) / 2
+    ], axis=0)
+
+
+def parse_pt2_from_pt5(pt5, use_lip=True):
+    """
+    reduce the 5 landmarks to 2 reference points
+    use_lip=True: [midpoint of points 0 and 1, midpoint of points 3 and 4]
+    use_lip=False: [point 0, point 1]
+    """
+    if not use_lip:
+        return np.stack([pt5[0], pt5[1]], axis=0)
+
+    upper_mid = (pt5[0] + pt5[1]) / 2
+    lower_mid = (pt5[3] + pt5[4]) / 2
+    return np.stack([upper_mid, lower_mid], axis=0)
+
+def parse_pt2_from_pt9(pt9, use_lip=True):
+    '''
+    reduce the 9 landmarks to 2 reference points
+    landmark order:
+    ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip']
+    use_lip=True: [midpoint of the two eye centers, midpoint of the two lip corners]
+    use_lip=False: [left eye center, right eye center]
+    '''
+    right_eye = (pt9[0] + pt9[1]) / 2
+    left_eye = (pt9[2] + pt9[3]) / 2
+
+    if not use_lip:
+        return np.stack([left_eye, right_eye], axis=0)
+
+    lip = (pt9[5] + pt9[6]) / 2
+    eye_mid = (left_eye + right_eye) / 2
+    return np.stack([eye_mid, lip], axis=0)
+
+def parse_pt2_from_pt_x(pts, use_lip=True):
+    """
+    pick the landmark parser from the number of points (pts.shape[0]) and return its 2 reference points
+    supported counts: 101, 106, 68, 5, 203, 9; any other count above 101 is parsed from its first 101 points
+    raises Exception for every other count
+    """
+    n_pts = pts.shape[0]
+    exact_parsers = {
+        101: parse_pt2_from_pt101,
+        106: parse_pt2_from_pt106,
+        68: parse_pt2_from_pt68,
+        5: parse_pt2_from_pt5,
+        203: parse_pt2_from_pt203,
+        9: parse_pt2_from_pt9,
+    }
+
+    if n_pts in exact_parsers:
+        pt2 = exact_parsers[n_pts](pts, use_lip=use_lip)
+    elif n_pts > 101:
+        # take the first 101 points
+        pt2 = parse_pt2_from_pt101(pts[:101], use_lip=use_lip)
+    else:
+        raise Exception(f'Unknow shape: {pts.shape}')
+
+    if not use_lip:
+        # NOTE: the later code expects pt2[1] - pt2[0] to play the role of the eye->lip axis,
+        # so the second point is rotated by 90 degrees around the first: (dx, dy) -> (-dy, dx)
+        dx, dy = pt2[1] - pt2[0]
+        pt2[1, 0] = pt2[0, 0] - dy
+        pt2[1, 1] = pt2[0, 1] + dx
+
+    return pt2
+
+
+def parse_rect_from_landmark(
+    pts,
+    scale=1.5,
+    need_square=True,
+    vx_ratio=0,
+    vy_ratio=0,
+    use_deg_flag=False,
+    **kwargs
+):
+    """parsing center, size, angle from 101/68/5/x landmarks
+    pts: Nx2 landmarks, the layout is judged with pts.shape by parse_pt2_from_pt_x
+    scale: multiplier applied to the measured box size
+    need_square: if true, both sides are set to the longer one before scaling
+    vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size
+    vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size, which is used to contain more forehead area
+    use_deg_flag: if true, the returned angle is converted from radians to degrees
+    kwargs: only 'use_lip' (default True) is read, any other keyword is silently ignored
+
+    return: center (2,), size (2,), angle
+    """
+    pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))
+
+    # uy: unit vector from the first reference point to the second one
+    uy = pt2[1] - pt2[0]
+    l = np.linalg.norm(uy)
+    if l <= 1e-3:
+        # the two reference points (almost) coincide: fall back to the image y-axis
+        uy = np.array([0, 1], dtype=DTYPE)
+    else:
+        uy /= l
+    # ux: uy rotated by 90 degrees, (x, y) -> (y, -x)
+    ux = np.array((uy[1], -uy[0]), dtype=DTYPE)
+
+    # the rotation degree of the x-axis, the clockwise is positive, the counterclockwise is negative (image coordinate system)
+    # acos only yields [0, pi]; the sign of ux[1] decides the sign of the angle
+    angle = acos(ux[0])
+    if ux[1] < 0:
+        angle = -angle
+
+    # rotation matrix
+    M = np.array([ux, uy])
+
+    # calculate the size which contains the angle degree of the bbox, and the center
+    center0 = np.mean(pts, axis=0)
+    rpts = (pts - center0) @ M.T  # (M @ P.T).T = P @ M.T
+    lt_pt = np.min(rpts, axis=0)
+    rb_pt = np.max(rpts, axis=0)
+    center1 = (lt_pt + rb_pt) / 2
+
+    size = rb_pt - lt_pt
+    if need_square:
+        m = max(size[0], size[1])
+        size[0] = m
+        size[1] = m
+
+    size *= scale  # scale size
+    center = center0 + ux * center1[0] + uy * center1[1]  # counterclockwise rotation, equivalent to M.T @ center1.T
+    # NOTE(review): size is a 2-vector here, so these offsets are scaled element-wise by (size[0], size[1]);
+    # both entries are equal when need_square is true
+    center = center + ux * (vx_ratio * size) + uy * \
+        (vy_ratio * size)  # considering the offset in vx and vy direction
+
+    if use_deg_flag:
+        angle = degrees(angle)
+
+    return center, size, angle
+
+
+def parse_bbox_from_landmark(pts, **kwargs):
+    """ build an axis-aligned box and its rotated version around the landmarks
+    pts: Nx2 landmarks
+    kwargs: forwarded unchanged to parse_rect_from_landmark
+    return: dict with 'center', 'size', 'angle' (as returned by parse_rect_from_landmark),
+        'bbox' (4x2 corners before rotation) and 'bbox_rot' (4x2 corners rotated by angle around center)
+    """
+    center, size, angle = parse_rect_from_landmark(pts, **kwargs)
+    cx, cy = center
+    w, h = size
+
+    # corners before rotation, in the order left-top, right-top, right-bottom, left-bottom
+    corners = [
+        [cx-w/2, cy-h/2],
+        [cx+w/2, cy-h/2],
+        [cx+w/2, cy+h/2],
+        [cx-w/2, cy+h/2],
+    ]
+    bbox = np.array(corners, dtype=DTYPE)
+
+    # 2D rotation matrix; NOTE(review): angle is used as radians here, so passing use_deg_flag=True
+    # through kwargs would presumably give a wrong rotation - confirm with callers
+    cos_a, sin_a = np.cos(angle), np.sin(angle)
+    R = np.array([
+        [cos_a, -sin_a],
+        [sin_a,  cos_a]
+    ], dtype=DTYPE)
+
+    # move the corners to the rotation center, rotate them, then move them back
+    bbox_rot = (bbox.copy() - center) @ R.T + center
+
+    return {
+        'center': center,  # (2,)
+        'size': size,  # (2,), width and height
+        'angle': angle,  # as returned by parse_rect_from_landmark
+        'bbox': bbox,  # 4x2
+        'bbox_rot': bbox_rot,  # 4x2
+    }
+
+
+def crop_image_by_bbox(img, bbox, lmk=None, dsize=512, angle=None, flag_rot=False, **kwargs):
+    """ crop the image with a given box and map the optional landmarks into the crop
+    img: source image
+    bbox: (left, top, right, bottom); only the width (right - left) defines the scale
+    lmk: optional Nx2 landmarks in the source image, transformed with the same matrix
+    dsize: side length of the square output crop
+    angle: rotation angle passed to math.cos / math.sin, only used when flag_rot is true
+    flag_rot: if true and angle is given, the crop is rotated around the box center
+    kwargs: only 'borderMode' is read (forwarded to _transform_img), other keywords are ignored
+    return: dict with 'img_crop', 'lmk_crop' (None without lmk), 'M_o2c' and 'M_c2o' (3x3, original<->crop)
+    """
+    left, top, right, bot = bbox
+    # a non-square box is only reported, the width still decides the scale
+    if int(right - left) != int(bot - top):
+        print(f'right-left {right-left} != bot-top {bot-top}')
+    size = right - left
+
+    src_center = np.array([(left + right) / 2, (top + bot) / 2], dtype=DTYPE)
+    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)
+
+    s = dsize / size  # scale
+    if flag_rot and angle is not None:
+        costheta, sintheta = cos(angle), sin(angle)
+        cx, cy = src_center[0], src_center[1]  # ori center
+        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
+        # scale + rotation, with the translation chosen so that the box center lands on the crop center
+        M_o2c = np.array(
+            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
+             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
+            dtype=DTYPE
+        )
+    else:
+        # pure scale + translation; also taken when flag_rot is true but angle is None
+        M_o2c = np.array(
+            [[s, 0, tgt_center[0] - s * src_center[0]],
+             [0, s, tgt_center[1] - s * src_center[1]]],
+            dtype=DTYPE
+        )
+
+    img_crop = _transform_img(img, M_o2c, dsize=dsize, borderMode=kwargs.get('borderMode', None))
+    lmk_crop = _transform_pts(lmk, M_o2c) if lmk is not None else None
+
+    # promote to a 3x3 homogeneous matrix so that it can be inverted
+    M_o2c = np.vstack([M_o2c, np.array([0, 0, 1], dtype=DTYPE)])
+    M_c2o = np.linalg.inv(M_o2c)
+
+    return {
+        'img_crop': img_crop,
+        'lmk_crop': lmk_crop,
+        'M_o2c': M_o2c,
+        'M_c2o': M_c2o,
+    }
+
+
+def _estimate_similar_transform_from_pts(
+    pts,
+    dsize,
+    scale=1.5,
+    vx_ratio=0,
+    vy_ratio=-0.1,
+    flag_do_rot=True,
+    **kwargs
+):
+    """ calculate the affine matrix of the cropped image from sparse points, the original image to the cropped image, the inverse is the cropped image to the original image
+    pts: landmark, 101 or 68 points or other points, Nx2
+    dsize: side length of the square crop
+    scale: the larger scale factor, the smaller face ratio
+    vx_ratio: x shift
+    vy_ratio: y shift, the smaller the y shift, the lower the face region
+    flag_do_rot: if it is true, conduct correction (the crop is rotated by the angle parsed from the landmarks)
+    kwargs: only 'use_lip' (default True) is read
+    return: (M_INV, M), M_INV is the 2x3 original->crop matrix, M is the first two rows of its 3x3 inverse
+    """
+    center, size, angle = parse_rect_from_landmark(
+        pts, scale=scale, vx_ratio=vx_ratio, vy_ratio=vy_ratio,
+        use_lip=kwargs.get('use_lip', True)
+    )
+
+    s = dsize / size[0]  # scale
+    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)  # center of dsize
+
+    if flag_do_rot:
+        costheta, sintheta = cos(angle), sin(angle)
+        cx, cy = center[0], center[1]  # ori center
+        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
+        # scale + rotation, with the translation chosen so that center lands on the crop center
+        M_INV = np.array(
+            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
+             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
+            dtype=DTYPE
+        )
+    else:
+        # scale + translation only
+        M_INV = np.array(
+            [[s, 0, tgt_center[0] - s * center[0]],
+             [0, s, tgt_center[1] - s * center[1]]],
+            dtype=DTYPE
+        )
+
+    # append the homogeneous row so that the matrix can be inverted
+    M_INV_H = np.vstack([M_INV, np.array([0, 0, 1])])
+    M = np.linalg.inv(M_INV_H)
+
+    # M_INV is from the original image to the cropped image, M is from the cropped image to the original image
+    return M_INV, M[:2, ...]
+
+
+def crop_image(img, pts: np.ndarray, **kwargs):
+    """ crop the image around the landmarks with a similarity transform
+    img: source image
+    pts: Nx2 landmarks in the source image
+    kwargs:
+        dsize: side length of the square crop (default 224)
+        scale: crop scale factor (default 1.5)
+        vx_ratio: horizontal offset ratio (default 0)
+        vy_ratio: vertical offset ratio (default -0.1)
+        flag_do_rot: whether to rotate the crop by the parsed angle (default True)
+    return: dict with 'M_o2c', 'M_c2o' (3x3), 'img_crop' and 'pt_crop'
+    """
+    dsize = kwargs.get('dsize', 224)
+
+    M_INV, _ = _estimate_similar_transform_from_pts(
+        pts,
+        dsize=dsize,
+        scale=kwargs.get('scale', 1.5),  # 1.5 | 1.6
+        # vx_ratio used to be dropped here although callers pass it; the default 0 keeps
+        # the previous result for callers that do not set it
+        vx_ratio=kwargs.get('vx_ratio', 0),
+        vy_ratio=kwargs.get('vy_ratio', -0.1),  # -0.0625 | -0.1
+        flag_do_rot=kwargs.get('flag_do_rot', True),
+    )
+
+    img_crop = _transform_img(img, M_INV, dsize)  # origin to crop
+    pt_crop = _transform_pts(pts, M_INV)
+
+    # promote to 3x3 so that the inverse mapping can be computed
+    M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
+    M_c2o = np.linalg.inv(M_o2c)
+
+    return {
+        'M_o2c': M_o2c,  # from the original image to the cropped image 3x3
+        'M_c2o': M_c2o,  # from the cropped image to the original image 3x3
+        'img_crop': img_crop,  # the cropped image
+        'pt_crop': pt_crop,  # the landmarks of the cropped image
+    }
+
+def average_bbox_lst(bbox_lst):
+    """ element-wise mean over a list of boxes, returned as a plain list; None for an empty list """
+    if len(bbox_lst) < 1:
+        return None
+    stacked = np.array(bbox_lst)
+    return stacked.mean(axis=0).tolist()
+
+def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
+    """warp the crop-space mask with crop_M_c2o to size dsize and return it as float32 divided by 255
+    """
+    warped = _transform_img(mask_crop, crop_M_c2o, dsize)
+    return warped.astype(np.float32) / 255.
+
+################################################## Image Fusion ##################################################
+def to_npf32(image):
+    """ convert a PIL image, a torch tensor or an ndarray-like object to a float32 numpy array """
+    if isinstance(image, torch.Tensor):
+        # drop every singleton dimension, then append one trailing axis
+        image = image.squeeze().unsqueeze(-1).cpu().numpy()
+    elif isinstance(image, Image.Image):
+        image = np.array(image)
+    return image.astype(np.float32)
+
+def to_npui8(image):
+    """ convert a PIL image, a torch tensor or an ndarray-like object to a uint8 numpy array """
+    if isinstance(image, torch.Tensor):
+        # drop every singleton dimension, then append one trailing axis
+        image = image.squeeze().unsqueeze(-1).cpu().numpy()
+    elif isinstance(image, Image.Image):
+        image = np.array(image)
+    return image.astype(np.uint8)
+
+def to_pil(image):
+    """ turn an ndarray into an RGB PIL image (values cast to uint8); anything else is returned unchanged """
+    if not isinstance(image, np.ndarray):
+        return image
+    as_u8 = image.astype(np.uint8)
+    return Image.fromarray(as_u8).convert('RGB')
+        
+# Alpha Image Blending
+def alpha_blending(src, dst, msk, radiance=32):
+    """ blend src over dst with msk as the per-pixel weight: dst * (1 - msk) + src * msk
+    src, dst: images (PIL image, torch tensor or ndarray), converted with to_npf32
+    msk: mask; only its first channel is used when it has 3 dimensions, and it is divided by its maximum
+    radiance: side length of the elliptic erosion kernel applied to the mask, no erosion when <= 0
+    return: uint8 ndarray, clipped to [0, 255]
+    """
+    src, dst, msk = to_npf32(src), to_npf32(dst), to_npf32(msk)
+    if msk.ndim == 3:
+        msk = msk[:, :, 0]
+    # normalize to a peak of 1; an all-zero mask is left as it is instead of producing NaN
+    peak = np.max(msk)
+    if peak != 0:
+        msk = msk / peak
+    if radiance > 0:
+        msk = cv2.erode(msk, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, ksize=(radiance, radiance)), iterations=1)
+    # the channel axis is needed on every path; it used to be added only when radiance > 0,
+    # which made radiance <= 0 fail to broadcast against HxWxC images
+    msk = msk[:, :, np.newaxis]
+    result = dst * (1 - msk) + src * msk
+    result = np.clip(result, 0, 255)
+    return result.astype(np.uint8)
+
+# Laplacian Pyramid Image Blending
+def fix_size(img, target):
+    """ return img unchanged when its height and width already match target, otherwise resize it (bilinear) to target's height and width """
+    tgt_h, tgt_w = target.shape[0], target.shape[1]
+    if (img.shape[0], img.shape[1]) == (tgt_h, tgt_w):
+        return img
+    return cv2.resize(img, (tgt_w, tgt_h), interpolation=cv2.INTER_LINEAR)
+
+def laplacian_pyramid_blending(A, B, m, num_levels=3, radiance=32):
+    """ blend A over B with mask m through Laplacian pyramids, optionally followed by an alpha blend
+    A, B: images (PIL image, torch tensor or ndarray), converted with to_npf32
+    m: mask weighting A against B; only its first channel is used when it has 3 dimensions
+       NOTE(review): m is used as a weight without normalization in step 3, so it is presumably expected in [0, 1] - confirm
+    num_levels: number of pyrDown steps
+    radiance: kernel size of the erosion and blur used for the final alpha blend, skipped when <= 0
+    return: uint8 ndarray
+    """
+    A, B, m = to_npf32(A), to_npf32(B), to_npf32(m)
+    if m.ndim == 3:
+        m = m[:, :, 0]
+    gpA = [A.copy()]
+    gpB = [B.copy()]
+    gpM = [m.copy()]
+    # 1. Downsampling, get Gaussian Pyramid (index 0 is the full resolution)
+    for i in range(num_levels):
+        gpA.append(cv2.pyrDown(gpA[i]))
+        gpB.append(cv2.pyrDown(gpB[i]))
+        gpM.append(cv2.pyrDown(gpM[i]))
+    # 2. Get Laplacian Pyramid via the upsampled level and the corresponding Gaussian level
+    #    (lpA / lpB are ordered from the coarsest level to the finest one)
+    lpA = [gpA[num_levels]]
+    lpB = [gpB[num_levels]]
+    for i in range(num_levels, 0, -1):
+        LA = cv2.subtract(gpA[i-1], fix_size(cv2.pyrUp(gpA[i]), gpA[i-1]))
+        LB = cv2.subtract(gpB[i-1], fix_size(cv2.pyrUp(gpB[i]), gpB[i-1]))
+        lpA.append(LA)
+        lpB.append(LB)
+    # 3. Fusion according to the mask
+    # NOTE(review): gpM runs fine->coarse while lpA / lpB run coarse->fine, so each level is paired with
+    # the mask of the opposite level, resized by fix_size - confirm that this is intended
+    LS = []
+    for la, lb, gm in zip(lpA, lpB, gpM):
+        gm = fix_size(gm, la)
+        gm = gm[..., np.newaxis] if gm.ndim == 2 else gm
+        la = fix_size(la, gm)
+        lb = fix_size(lb, gm)
+        ls = la * gm + lb * (1.0 - gm)
+        LS.append(ls)
+    # 4. Rebuilding the image via the Laplacian Pyramid
+    ls_ = LS[0]
+    for i in range(1, num_levels + 1):
+        ls_ = fix_size(cv2.pyrUp(ls_), LS[i])
+        ls_ = cv2.add(ls_, LS[i])
+    ls_ = fix_size(ls_, A)
+    result = np.clip(ls_, 0, 255).astype('uint8')
+    # 5. alpha blending if radiance > 0
+    if radiance > 0:
+        src, dst, msk = to_npf32(result), to_npf32(B), to_npf32(m)
+        # NOTE(review): an all-zero mask makes this a division by zero
+        msk = msk / np.max(msk)
+        msk = cv2.erode(msk, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, ksize=(radiance, radiance)), iterations=1)
+        msk = cv2.blur(msk, (radiance, radiance))
+        msk = msk[:,:,np.newaxis]
+        result = dst * (1 - msk) + src * msk
+        result = np.clip(result, 0, 255)
+
+    return result.astype(np.uint8)
+
+# Poisson Image Fusion
+def poisson_image_blending(src, dst, msk, radiance=16):
+    """ blend src into dst with cv2.seamlessClone (NORMAL_CLONE), optionally followed by an alpha blend
+    src, dst: 3-channel images of the same size, converted with to_npui8
+    msk: mask, multiplied by 255 before the uint8 conversion; only its first channel is used when it has 3 dimensions
+    radiance: kernel size of the erosion and blur used for the final alpha blend, skipped when it is not > 0
+    return: uint8 ndarray
+    """
+    src_u8, dst_u8, msk_u8 = to_npui8(src), to_npui8(dst), to_npui8(msk*255)
+    if msk_u8.ndim == 3:
+        msk_u8 = msk_u8[:, :, 0]
+
+    # the clone is placed at the center of the source image
+    height, width, _ = src_u8.shape
+    center = (width // 2, height // 2)
+    cloned = cv2.seamlessClone(src_u8, dst_u8, msk_u8, center, cv2.NORMAL_CLONE)
+    if not radiance > 0:
+        return cloned.astype(np.uint8)
+
+    # with alpha fusion: soften the mask border, then mix the cloned result with dst
+    fg, bg, alpha = to_npf32(cloned), to_npf32(dst_u8), to_npf32(msk_u8)
+    alpha = alpha / np.max(alpha)
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, ksize=(radiance, radiance))
+    alpha = cv2.erode(alpha, kernel, iterations=1)
+    alpha = cv2.blur(alpha, (radiance, radiance))
+    alpha = alpha[:, :, np.newaxis]
+    mixed = np.clip(bg * (1 - alpha) + fg * alpha, 0, 255)
+    return mixed.astype(np.uint8)
+
+
+def paste_back(img_crop, M_c2o, img_ori, mask_ori, use_laplacian=False):
+    """warp the crop back to the size of img_ori with M_c2o and blend it into img_ori with mask_ori
+    use_laplacian: use laplacian_pyramid_blending (radiance=32) instead of the plain weighted sum
+    """
+    ori_h, ori_w = img_ori.shape[0], img_ori.shape[1]
+    warped = _transform_img(img_crop, M_c2o, dsize=(ori_w, ori_h))
+
+    if use_laplacian:
+        return laplacian_pyramid_blending(warped, img_ori, mask_ori, radiance=32)
+
+    blended = mask_ori * warped + (1 - mask_ori) * img_ori
+    return np.clip(blended, 0, 255).astype(np.uint8)
+
+
+def paste_back_with_face_mask(img_crop, M_c2o, img_ori, face_msk, use_laplacian=False):
+    """warp both the crop and its face mask back to the size of img_ori with M_c2o, then blend into img_ori
+    use_laplacian: use laplacian_pyramid_blending (radiance=32) instead of the plain weighted sum
+    """
+    out_size = (img_ori.shape[1], img_ori.shape[0])
+    warped = _transform_img(img_crop, M_c2o, dsize=out_size)
+    # the warped mask gets a trailing channel axis before it is used as a weight
+    weight = _transform_img(face_msk, M_c2o, dsize=out_size)[:, :, np.newaxis]
+
+    if use_laplacian:
+        return laplacian_pyramid_blending(warped, img_ori, weight, radiance=32)
+
+    blended = weight * warped + (1 - weight) * img_ori
+    return np.clip(blended, 0, 255).astype(np.uint8)
diff --git a/src/thirdparty/liveportrait/src/utils/cropper.py b/src/thirdparty/liveportrait/src/utils/cropper.py
new file mode 100644
index 0000000000000000000000000000000000000000..e40682200ce8bb093a058084ef02f11894a12599
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/cropper.py
@@ -0,0 +1,319 @@
+# coding: utf-8
+
+import os.path as osp
+
+import torch
+import numpy as np
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+
+from PIL import Image
+from typing import List, Tuple, Union
+from dataclasses import dataclass, field
+
+from ..config.crop_config import CropConfig
+from .crop import (
+    average_bbox_lst,
+    crop_image,
+    crop_image_by_bbox,
+    parse_bbox_from_landmark,
+)
+from .io import contiguous
+from .rprint import rlog as log
+from .face_analysis_diy import FaceAnalysisDIY
+from .human_landmark_runner import LandmarkRunner as HumanLandmark
+
+
+import sys
+sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__))))))
+
+
+def make_abs_path(fn):
+    """Join `fn` onto the directory holding this source file (real path, symlinks resolved)."""
+    module_dir = osp.dirname(osp.realpath(__file__))
+    return osp.join(module_dir, fn)
+
+
+@dataclass
+class Trajectory:
+    """Per-video accumulator filled frame by frame by the Cropper video methods."""
+    start: int = -1  # start frame, -1 until a face has been detected
+    end: int = -1  # end frame
+    lmk_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # lmk list
+    bbox_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # bbox list
+    M_c2o_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # M_c2o list
+
+    frame_rgb_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame list
+    lmk_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # lmk list of the cropped frames
+    frame_rgb_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame crop list
+
+
+class Cropper(object):
+    def __init__(self, **kwargs) -> None:
+        """Create the face detector and the landmark runners used for cropping.
+
+        Recognized kwargs:
+            crop_cfg: CropConfig; defaults to None, but its attributes are read below
+            image_type: "human_face" (default) or "animal_face"
+            device_id: device index handed to the detector and the landmark runner (default 0)
+            flag_force_cpu: run on CPU regardless of the available accelerators (default False)
+            flag_use_half_precision: forwarded to the animal landmark runner (default True)
+        """
+        self.crop_cfg: CropConfig = kwargs.get("crop_cfg", None)
+        self.image_type = kwargs.get("image_type", 'human_face')
+        device_id = kwargs.get("device_id", 0)
+        flag_force_cpu = kwargs.get("flag_force_cpu", False)
+
+        # CUDA is the default for both the device name and the face-analysis provider
+        device = "cuda"
+        face_analysis_wrapper_provider = ["CUDAExecutionProvider"]
+        if flag_force_cpu:
+            device = "cpu"
+            face_analysis_wrapper_provider = ["CPUExecutionProvider"]
+        else:
+            try:
+                mps_available = torch.backends.mps.is_available()
+            except Exception:
+                # the MPS query itself failed: keep the CUDA defaults as before.
+                # `except Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
+                mps_available = False
+            if mps_available:
+                # Shape inference currently fails with CoreMLExecutionProvider
+                # for the retinaface model
+                device = "mps"
+                face_analysis_wrapper_provider = ["CPUExecutionProvider"]
+
+        self.face_analysis_wrapper = FaceAnalysisDIY(
+            name="buffalo_l",
+            root=self.crop_cfg.insightface_root,
+            providers=face_analysis_wrapper_provider,
+        )
+        self.face_analysis_wrapper.prepare(ctx_id=device_id, det_size=(512, 512), det_thresh=self.crop_cfg.det_thresh)
+        self.face_analysis_wrapper.warmup()
+
+        self.human_landmark_runner = HumanLandmark(
+            ckpt_path=self.crop_cfg.landmark_ckpt_path,
+            onnx_provider=device,
+            device_id=device_id,
+        )
+        self.human_landmark_runner.warmup()
+
+        if self.image_type == "animal_face":
+            # imported lazily so that the animal dependencies are only needed for animal mode
+            from .animal_landmark_runner import XPoseRunner as AnimalLandmarkRunner
+            self.animal_landmark_runner = AnimalLandmarkRunner(
+                model_config_path=self.crop_cfg.xpose_config_file_path,
+                model_checkpoint_path=self.crop_cfg.xpose_ckpt_path,
+                embeddings_cache_path=self.crop_cfg.xpose_embedding_cache_path,
+                flag_use_half_precision=kwargs.get("flag_use_half_precision", True),
+            )
+            self.animal_landmark_runner.warmup()
+
+    def update_config(self, user_args):
+        """Copy every entry of `user_args` whose key is an existing attribute of self.crop_cfg onto it."""
+        cfg = self.crop_cfg
+        for key, value in user_args.items():
+            if not hasattr(cfg, key):
+                continue  # keys the config does not have are ignored
+            setattr(cfg, key, value)
+
+    def crop_source_image(self, img_rgb_: np.ndarray, crop_cfg: CropConfig):
+        """Find the landmarks of one RGB image, crop it and collect the crop information.
+
+        Returns None when image_type is "human_face" and no face is detected. Otherwise returns the dict
+        of crop_image extended with "img_crop_256x256", "lmk_crop" and, for human faces only,
+        "lmk_crop_256x256".
+        """
+        is_human = self.image_type == "human_face"
+        img_rgb = img_rgb_.copy()  # work on a copy of the input
+        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+
+        if is_human:
+            faces = self.face_analysis_wrapper.get(
+                img_bgr,
+                flag_do_landmark_2d_106=True,
+                direction=crop_cfg.direction,
+                max_face_num=crop_cfg.max_face_num,
+            )
+
+            if len(faces) == 0:
+                log("No face detected in the source image.")
+                return None
+            if len(faces) > 1:
+                log(f"More than one face detected in the image, only pick one face by rule {crop_cfg.direction}.")
+
+            # NOTE: temporarily only pick the first face, to support multiple face in the future
+            lmk = faces[0].landmark_2d_106  # this is the 106 landmarks from insightface
+        else:
+            # maps the configured animal face type to the name expected by the animal landmark runner
+            animal_type_map = {
+                'animal_face_9': 'animal_face',
+                'animal_face_68': 'face'
+            }
+            lmk = self.animal_landmark_runner.run(
+                Image.fromarray(img_rgb),
+                'face',
+                animal_type_map[crop_cfg.animal_face_type],
+                0,
+                0
+            )
+
+        # crop the face
+        crop_info = crop_image(
+            img_rgb,  # ndarray
+            lmk,  # 106x2 or Nx2
+            dsize=crop_cfg.dsize,
+            scale=crop_cfg.scale,
+            vx_ratio=crop_cfg.vx_ratio,
+            vy_ratio=crop_cfg.vy_ratio,
+            flag_do_rot=crop_cfg.flag_do_rot,
+        )
+
+        # update a 256x256 version for network input
+        crop_info["img_crop_256x256"] = cv2.resize(crop_info["img_crop"], (256, 256), interpolation=cv2.INTER_AREA)
+        if is_human:
+            # the 106 landmarks are passed through the human landmark runner on the full image
+            lmk_human = self.human_landmark_runner.run(img_rgb, lmk)
+            crop_info["lmk_crop"] = lmk_human
+            crop_info["lmk_crop_256x256"] = lmk_human * 256 / crop_cfg.dsize
+        else:
+            # 68x2 or 9x2
+            crop_info["lmk_crop"] = lmk
+
+        return crop_info
+
+    def calc_lmk_from_cropped_image(self, img_rgb_, **kwargs):
+        """Detect a face in an RGB image and return its landmarks from the human landmark runner.
+
+        kwargs: "direction" (default "large-small") is forwarded to the face detector.
+        Returns None when no face is detected.
+        """
+        direction = kwargs.get("direction", "large-small")
+        img_bgr = contiguous(img_rgb_[..., ::-1])  # convert to BGR
+        faces = self.face_analysis_wrapper.get(
+            img_bgr,
+            flag_do_landmark_2d_106=True,
+            direction=direction,
+        )
+
+        if len(faces) == 0:
+            log("No face detected in the source image.")
+            return None
+        if len(faces) > 1:
+            log(f"More than one face detected in the image, only pick one face by rule {direction}.")
+
+        # only the first face is used
+        return self.human_landmark_runner.run(img_rgb_, faces[0].landmark_2d_106)
+
+    # TODO: support skipping frame with NO FACE
+    def crop_source_video(self, source_rgb_lst, crop_cfg: CropConfig, **kwargs):
+        """Tracking based landmarks/alignment and cropping
+
+        source_rgb_lst: iterable of RGB frames
+        crop_cfg: crop settings; direction and max_face_num drive the detection, the others the crop
+        kwargs: accepted for interface compatibility, not used
+        return: dict with "frame_crop_lst" (256x256 crops), "lmk_crop_lst" and "M_c2o_lst";
+            frames seen before the first detected face are skipped
+        """
+        trajectory = Trajectory()
+        for idx, frame_rgb in enumerate(source_rgb_lst):
+            if idx == 0 or trajectory.start == -1:
+                # no face found yet: run the detector on this frame
+                src_face = self.face_analysis_wrapper.get(
+                    contiguous(frame_rgb[..., ::-1]),
+                    flag_do_landmark_2d_106=True,
+                    direction=crop_cfg.direction,
+                    max_face_num=crop_cfg.max_face_num,
+                )
+                if len(src_face) == 0:
+                    log(f"No face detected in the frame #{idx}")
+                    continue
+                elif len(src_face) > 1:
+                    # report the rule that was actually used for the detection (crop_cfg.direction);
+                    # the message used to print an unrelated kwargs default instead
+                    log(f"More than one face detected in the source frame_{idx}, only pick one face by rule {crop_cfg.direction}.")
+                src_face = src_face[0]
+                lmk = src_face.landmark_2d_106
+                lmk = self.human_landmark_runner.run(frame_rgb, lmk)
+                trajectory.start, trajectory.end = idx, idx
+            else:
+                # TODO: add IOU check for tracking
+                # track: the landmarks of the previous frame seed the current one
+                lmk = self.human_landmark_runner.run(frame_rgb, trajectory.lmk_lst[-1])
+                trajectory.end = idx
+
+            trajectory.lmk_lst.append(lmk)
+
+            # crop the face
+            ret_dct = crop_image(
+                frame_rgb,  # ndarray
+                lmk,  # 106x2 or Nx2
+                dsize=crop_cfg.dsize,
+                scale=crop_cfg.scale,
+                vx_ratio=crop_cfg.vx_ratio,
+                vy_ratio=crop_cfg.vy_ratio,
+                flag_do_rot=crop_cfg.flag_do_rot,
+            )
+            lmk = self.human_landmark_runner.run(frame_rgb, lmk)
+            ret_dct["lmk_crop"] = lmk
+
+            # update a 256x256 version for network input
+            ret_dct["img_crop_256x256"] = cv2.resize(ret_dct["img_crop"], (256, 256), interpolation=cv2.INTER_AREA)
+            ret_dct["lmk_crop_256x256"] = ret_dct["lmk_crop"] * 256 / crop_cfg.dsize
+
+            trajectory.frame_rgb_crop_lst.append(ret_dct["img_crop_256x256"])
+            trajectory.lmk_crop_lst.append(ret_dct["lmk_crop_256x256"])
+            trajectory.M_c2o_lst.append(ret_dct['M_c2o'])
+
+        return {
+            "frame_crop_lst": trajectory.frame_rgb_crop_lst,
+            "lmk_crop_lst": trajectory.lmk_crop_lst,
+            "M_c2o_lst": trajectory.M_c2o_lst,
+        }
+
+    def crop_driving_video(self, driving_rgb_lst, **kwargs):
+        """Tracking based landmarks/alignment and cropping
+
+        Every kept frame is cropped with the same box: the mean of the per-frame landmark boxes.
+
+        driving_rgb_lst: iterable of RGB frames
+        kwargs: "direction" (default "large-small") for the face detector,
+            "dsize" (default 512) for the side length of the crops
+        return: dict with "frame_crop_lst" and "lmk_crop_lst";
+            frames seen before the first detected face are skipped
+        """
+        trajectory = Trajectory()
+        direction = kwargs.get("direction", "large-small")
+        dsize = kwargs.get("dsize", 512)
+        for idx, frame_rgb in enumerate(driving_rgb_lst):
+            if idx == 0 or trajectory.start == -1:
+                # no face found yet: run the detector on this frame
+                src_face = self.face_analysis_wrapper.get(
+                    contiguous(frame_rgb[..., ::-1]),
+                    flag_do_landmark_2d_106=True,
+                    direction=direction,
+                )
+                if len(src_face) == 0:
+                    log(f"No face detected in the frame #{idx}")
+                    continue
+                elif len(src_face) > 1:
+                    log(f"More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.")
+                src_face = src_face[0]
+                lmk = src_face.landmark_2d_106
+                lmk = self.human_landmark_runner.run(frame_rgb, lmk)
+                trajectory.start, trajectory.end = idx, idx
+            else:
+                # track: the landmarks of the previous frame seed the current one
+                lmk = self.human_landmark_runner.run(frame_rgb, trajectory.lmk_lst[-1])
+                trajectory.end = idx
+
+            trajectory.lmk_lst.append(lmk)
+            ret_bbox = parse_bbox_from_landmark(
+                lmk,
+                scale=self.crop_cfg.scale_crop_driving_video,
+                # the keyword has to be `vx_ratio`: the previous `vx_ratio_crop_driving_video=...`
+                # was swallowed by **kwargs, so the configured horizontal offset was never applied
+                vx_ratio=self.crop_cfg.vx_ratio_crop_driving_video,
+                vy_ratio=self.crop_cfg.vy_ratio_crop_driving_video,
+            )["bbox"]
+            # left, top, right, bottom of the unrotated box
+            bbox = [
+                ret_bbox[0, 0],
+                ret_bbox[0, 1],
+                ret_bbox[2, 0],
+                ret_bbox[2, 1],
+            ]  # 4,
+            trajectory.bbox_lst.append(bbox)  # bbox
+            trajectory.frame_rgb_lst.append(frame_rgb)
+
+        # None when no frame contained a face; the loop below is then empty as well
+        global_bbox = average_bbox_lst(trajectory.bbox_lst)
+
+        for frame_rgb, lmk in zip(trajectory.frame_rgb_lst, trajectory.lmk_lst):
+            ret_dct = crop_image_by_bbox(
+                frame_rgb,
+                global_bbox,
+                lmk=lmk,
+                dsize=dsize,
+                flag_rot=False,
+                borderValue=(0, 0, 0),
+            )
+            trajectory.frame_rgb_crop_lst.append(ret_dct["img_crop"])
+            trajectory.lmk_crop_lst.append(ret_dct["lmk_crop"])
+
+        return {
+            "frame_crop_lst": trajectory.frame_rgb_crop_lst,
+            "lmk_crop_lst": trajectory.lmk_crop_lst,
+        }
+
+
+    def calc_lmks_from_cropped_video(self, driving_rgb_crop_lst, **kwargs):
+        """Tracking based landmarks/alignment
+
+        driving_rgb_crop_lst: iterable of already cropped RGB frames
+        kwargs: "direction" (default "large-small") is forwarded to the face detector
+        return: list with one landmark array per frame
+        raises: Exception when the detector finds no face in a frame it is run on
+        """
+        direction = kwargs.get("direction", "large-small")
+        trajectory = Trajectory()
+
+        for idx, frame_rgb_crop in enumerate(driving_rgb_crop_lst):
+            needs_detection = idx == 0 or trajectory.start == -1
+            if not needs_detection:
+                # track: the landmarks of the previous frame seed the current one
+                lmk = self.human_landmark_runner.run(frame_rgb_crop, trajectory.lmk_lst[-1])
+                trajectory.end = idx
+            else:
+                faces = self.face_analysis_wrapper.get(
+                    contiguous(frame_rgb_crop[..., ::-1]),  # convert to BGR
+                    flag_do_landmark_2d_106=True,
+                    direction=direction,
+                )
+                if len(faces) == 0:
+                    log(f"No face detected in the frame #{idx}")
+                    raise Exception(f"No face detected in the frame #{idx}")
+                if len(faces) > 1:
+                    log(f"More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.")
+                # only the first face is used
+                lmk = self.human_landmark_runner.run(frame_rgb_crop, faces[0].landmark_2d_106)
+                trajectory.start, trajectory.end = idx, idx
+
+            trajectory.lmk_lst.append(lmk)
+        return trajectory.lmk_lst
\ No newline at end of file
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/config_model/UniPose_SwinT.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/config_model/UniPose_SwinT.py
new file mode 100644
index 0000000000000000000000000000000000000000..707b359fc414b525db5a11a9bc505105f6f66741
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/config_model/UniPose_SwinT.py
@@ -0,0 +1,125 @@
+_base_ = ['coco_transformer.py']
+
+use_label_enc = True
+
+num_classes = 2
+# Optimisation and learning-rate schedule settings.
+lr = 0.0001
+param_dict_type = 'default'
+lr_backbone = 1e-05
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 12
+lr_drop = 11
+save_checkpoint_interval = 100
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = False
+lr_drop_list = [33, 45]
+
+# Model identity and backbone selection.
+modelname = 'UniPose'
+frozen_weights = None
+backbone = 'swin_T_224_1k'
+
+# Position encoding, transformer and query settings.
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+unic_layers = 0
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+pdetr3_bbox_embed_diff_each_layer = False
+pdetr3_refHW = -1
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dabdetr_yolo_like_anchor_update = False
+dabdetr_deformable_encoder = False
+dabdetr_deformable_decoder = False
+use_deformable_box_attn = False
+box_attn_type = 'roi_align'
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+decoder_layer_noise = False
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+add_channel_attention = False
+add_pos_value = False
+two_stage_type = 'standard'
+two_stage_pat_embed = 0
+two_stage_add_query_num = 0
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+masks = False
+
+decoder_sa_type = 'sa'  # options listed upstream: 'sa', 'ca_label', 'ca_content'
+matcher_type = 'HungarianMatcher'  # alternative listed upstream: 'SimpleMinsumMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = True
+dec_pred_class_embed_share = True
+
+
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 1.0
+dn_label_noise_ratio = 0.5
+dn_label_coef = 1.0
+dn_bbox_coef = 1.0
+embed_init_tgt = True
+dn_labelbook_size = 2000
+
+match_unstable_error = True
+
+# EMA settings
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+use_detached_boxes_dec_out = False
+
+max_text_len = 256
+shuffle_type = None
+
+use_text_enhancer = True
+use_fusion_layer = True
+
+use_checkpoint = False  # alternative value noted upstream: True
+use_transformer_ckpt = True
+text_encoder_type = 'bert-base-uncased'
+
+use_text_cross_attention = True
+text_dropout = 0.0
+fusion_dropout = 0.0
+fusion_droppath = 0.1
+
+num_body_points = 68
+binary_query_selection = False
+use_cdn = True
+ffn_extra_layernorm = False
+
+fix_size = False
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/config_model/coco_transformer.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/config_model/coco_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7b3feeaef9cc890891d3e1733e4fec91ccba426
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/config_model/coco_transformer.py
@@ -0,0 +1,8 @@
+data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+data_aug_max_size = 1333
+data_aug_scales2_resize = [400, 500, 600]
+data_aug_scales2_crop = [384, 600]
+
+# NOTE(review): the data_aug_* values are presumably read by the training transforms — confirm against the data pipeline.
+data_aug_scale_overlap = None
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..07659639dd12c66e36689df0a0456a6af3d4f96d
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/__init__.py
@@ -0,0 +1,10 @@
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+from .unipose import build_unipose
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/attention.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..103cf175204e05a74c4d4dd20d0a9ed485a783a7
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/attention.py
@@ -0,0 +1,373 @@
+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from codes in torch.nn
+# ------------------------------------------------------------------------
+
+"""
+MultiheadAttention that supports query, key, and value tensors with different feature dimensions.
+Query, key, and value projections are removed.
+
+Mostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873
+and https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837
+"""
+
+import warnings
+import torch
+from torch.nn.modules.linear import Linear
+from torch.nn.init import constant_
+from torch.nn.modules.module import Module
+from torch._jit_internal import Optional, Tuple
+try:
+    from torch.overrides import has_torch_function, handle_torch_function
+except:
+    from torch._overrides import has_torch_function, handle_torch_function
+from torch.nn.functional import linear, pad, softmax, dropout
+Tensor = torch.Tensor
+
+class MultiheadAttention(Module):
+    r"""Multi-head attention without query/key/value input projections.
+
+    Unlike ``torch.nn.MultiheadAttention``, the only learned layer here is the
+    output projection ``out_proj``; query, key and value are handed to
+    ``multi_head_attention_forward`` as given.
+
+    Args:
+        embed_dim: total feature dimension of the query.
+        num_heads: number of attention heads; must divide ``embed_dim``.
+        dropout: dropout probability on the attention weights. Default: 0.0.
+        bias: accepted in the signature but never read by ``__init__``.
+        add_bias_kv: accepted in the signature but never read by ``__init__``.
+        add_zero_attn: forwarded to ``multi_head_attention_forward``.
+        kdim: only used to compute ``_qkv_same_embed_dim``. Default: ``embed_dim``.
+        vdim: feature dimension of the value and of the output projection.
+            Default: ``embed_dim``.
+
+    Examples::
+
+        >>> attn = MultiheadAttention(embed_dim, num_heads)
+        >>> attn_output, attn_output_weights = attn(query, key, value)
+    """
+    bias_k: Optional[torch.Tensor]
+    bias_v: Optional[torch.Tensor]
+
+    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
+        super().__init__()
+        # Feature sizes; the key/value sizes fall back to the query size.
+        self.embed_dim = embed_dim
+        self.kdim = embed_dim if kdim is None else kdim
+        self.vdim = embed_dim if vdim is None else vdim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.add_zero_attn = add_zero_attn
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        # Only the output projection is learned; it is sized by the value dimension.
+        self.out_proj = Linear(self.vdim, self.vdim)
+
+        # No input projections or k/v biases exist in this variant; the attributes
+        # stay None because forward() still passes them to the attention function.
+        self.bias_k = self.bias_v = None
+        self.in_proj_weight = None
+        self.in_proj_bias = None
+        self.q_proj_weight = self.k_proj_weight = self.v_proj_weight = None
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.out_proj.bias, 0.)
+
+    def __setstate__(self, state):
+        # Support loading old MultiheadAttention checkpoints generated by v1.1.0:
+        # when the flag is absent from the pickled state, default it to True.
+        state.setdefault('_qkv_same_embed_dim', True)
+
+        super().__setstate__(state)
+
+    def forward(self, query, key, value, key_padding_mask=None,
+                need_weights=True, attn_mask=None):
+        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
+        r"""Attend ``query`` over ``key``/``value`` and apply the output projection.
+
+        Args:
+            query, key, value: attention inputs; they are passed on as given, without
+                input projections.
+            key_padding_mask: if provided, the marked key positions are ignored by
+                the attention. In a bool mask ``True`` positions are ignored; in a
+                byte mask non-zero positions are ignored.
+            need_weights: output attn_output_weights.
+            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D
+                mask is broadcast over the batch; a 3D mask can differ per batch entry.
+
+        Shape:
+            Inputs:
+            - query: (L, N, E) where L is the target sequence length, N is the batch
+              size and E is ``embed_dim``.
+            - key: (S, N, E) where S is the source sequence length.
+            - value: (S, N, V) where V is ``vdim``.
+            - key_padding_mask: (N, S). In a ByteTensor non-zero positions are ignored;
+              in a BoolTensor ``True`` positions are ignored.
+            - attn_mask: 2D mask (L, S) or 3D mask (N*num_heads, L, S).
+              In a ByteTensor non-zero positions may not be attended to; in a
+              BoolTensor ``True`` positions may not be attended to; a FloatTensor
+              is added to the attention scores.
+
+            Outputs:
+            - attn_output: (L, N, V).
+            - attn_output_weights: (N, L, S), averaged over heads, or ``None`` when
+              ``need_weights`` is False.
+
+        Returns:
+            A tuple ``(attn_output, attn_output_weights)``.
+
+        Note:
+            When ``kdim`` or ``vdim`` differs from ``embed_dim`` the call also passes
+            ``use_separate_proj_weight=True`` and the (``None``) per-input projection weights.
+        """
+        if self._qkv_same_embed_dim:
+            return multi_head_attention_forward(
+                query, key, value, self.embed_dim, self.num_heads,
+                self.in_proj_weight, self.in_proj_bias,
+                self.bias_k, self.bias_v, self.add_zero_attn,
+                self.dropout, self.out_proj.weight, self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask, out_dim=self.vdim)
+        # kdim/vdim differ from embed_dim: also pass the separate-projection arguments.
+        return multi_head_attention_forward(
+            query, key, value, self.embed_dim, self.num_heads,
+            self.in_proj_weight, self.in_proj_bias,
+            self.bias_k, self.bias_v, self.add_zero_attn,
+            self.dropout, self.out_proj.weight, self.out_proj.bias,
+            training=self.training,
+            key_padding_mask=key_padding_mask, need_weights=need_weights,
+            attn_mask=attn_mask, use_separate_proj_weight=True,
+            q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+            v_proj_weight=self.v_proj_weight, out_dim=self.vdim)
+
+
+def multi_head_attention_forward(query: Tensor,
+                                 key: Tensor,
+                                 value: Tensor,
+                                 embed_dim_to_check: int,
+                                 num_heads: int,
+                                 in_proj_weight: Tensor,
+                                 in_proj_bias: Tensor,
+                                 bias_k: Optional[Tensor],
+                                 bias_v: Optional[Tensor],
+                                 add_zero_attn: bool,
+                                 dropout_p: float,
+                                 out_proj_weight: Tensor,
+                                 out_proj_bias: Tensor,
+                                 training: bool = True,
+                                 key_padding_mask: Optional[Tensor] = None,
+                                 need_weights: bool = True,
+                                 attn_mask: Optional[Tensor] = None,
+                                 use_separate_proj_weight: bool = False,
+                                 q_proj_weight: Optional[Tensor] = None,
+                                 k_proj_weight: Optional[Tensor] = None,
+                                 v_proj_weight: Optional[Tensor] = None,
+                                 static_k: Optional[Tensor] = None,
+                                 static_v: Optional[Tensor] = None,
+                                 out_dim: Optional[int] = None
+                                 ) -> Tuple[Tensor, Optional[Tensor]]:
+    r"""
+    Args:
+        query, key, value: map a query and a set of key-value pairs to an output.
+            See "Attention Is All You Need" for more details.
+        embed_dim_to_check: total dimension of the model.
+        num_heads: parallel attention heads.
+        in_proj_weight, in_proj_bias: not used by this implementation (no input projection is applied).
+        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        dropout_p: probability of an element to be zeroed.
+        out_proj_weight, out_proj_bias: the output projection weight and bias.
+        training: apply dropout if is ``True``.
+        key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. This is an binary mask. When the value is True,
+            the corresponding value on the attention layer will be filled with -inf.
+        need_weights: output attn_output_weights.
+        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+        use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight: not used by
+            this implementation; kept for signature compatibility and only passed
+            along when the call is dispatched through ``__torch_function__``.
+        static_k, static_v: static key and value used for attention operators.
+        out_dim: total feature dimension of ``value`` and of the attention output
+            before the output projection. Defaults to the query embedding dimension.
+    Shape:
+        Inputs:
+        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - value: :math:`(S, N, E_v)` where S is the source sequence length, N is the batch size, E_v is
+          ``out_dim`` (the value feature dimension).
+        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
+          will be unchanged. If a BoolTensor is provided, the positions with the
+          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
+          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+          is provided, it will be added to the attention weight.
+        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+        - static_v: :math:`(N*num_heads, S, E_v/num_heads)`, where S is the source sequence length,
+          N is the batch size, E_v is ``out_dim``. E_v/num_heads is the value head dimension.
+        Outputs:
+        - attn_output: :math:`(L, N, E_v)` where L is the target sequence length, N is the batch size,
+          E_v is ``out_dim``.
+        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+          L is the target sequence length, S is the source sequence length.
+    """
+    if not torch.jit.is_scripting():
+        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
+                    out_proj_weight, out_proj_bias)
+        if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
+            return handle_torch_function(
+                multi_head_attention_forward, tens_ops, query, key, value,
+                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
+                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
+                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
+                need_weights=need_weights, attn_mask=attn_mask,
+                use_separate_proj_weight=use_separate_proj_weight,
+                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
+                v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v, out_dim=out_dim)
+    tgt_len, bsz, embed_dim = query.size()
+    assert embed_dim == embed_dim_to_check
+    out_dim = embed_dim if out_dim is None else out_dim  # omitted out_dim: value width equals query width
+    # allow MHA to have different sizes for the feature dimension
+    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
+
+    head_dim = embed_dim // num_heads
+    v_head_dim = out_dim // num_heads
+    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+    scaling = float(head_dim) ** -0.5
+
+    q = query * scaling
+    k = key
+    v = value
+
+    if attn_mask is not None:
+        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
+            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
+            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
+        if attn_mask.dtype == torch.uint8:
+            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+            attn_mask = attn_mask.to(torch.bool)
+
+        if attn_mask.dim() == 2:
+            attn_mask = attn_mask.unsqueeze(0)
+            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+                raise RuntimeError('The size of the 2D attn_mask is not correct.')
+        elif attn_mask.dim() == 3:
+            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
+                raise RuntimeError('The size of the 3D attn_mask is not correct.')
+        else:
+            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
+        # attn_mask's dim is 3 now.
+
+    # convert ByteTensor key_padding_mask to bool
+    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
+        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+        key_padding_mask = key_padding_mask.to(torch.bool)
+
+    if bias_k is not None and bias_v is not None:
+        if static_k is None and static_v is None:
+            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+            if attn_mask is not None:
+                attn_mask = pad(attn_mask, (0, 1))
+            if key_padding_mask is not None:
+                key_padding_mask = pad(key_padding_mask, (0, 1))
+        else:
+            assert static_k is None, "bias cannot be added to static key."
+            assert static_v is None, "bias cannot be added to static value."
+    else:
+        assert bias_k is None
+        assert bias_v is None
+
+    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+    if k is not None:
+        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+    if v is not None:
+        v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)
+
+    if static_k is not None:
+        assert static_k.size(0) == bsz * num_heads
+        assert static_k.size(2) == head_dim
+        k = static_k
+
+    if static_v is not None:
+        assert static_v.size(0) == bsz * num_heads
+        assert static_v.size(2) == v_head_dim
+        v = static_v
+
+    src_len = k.size(1)
+
+    if key_padding_mask is not None:
+        assert key_padding_mask.size(0) == bsz
+        assert key_padding_mask.size(1) == src_len
+
+    if add_zero_attn:
+        src_len += 1
+        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
+        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
+        if attn_mask is not None:
+            attn_mask = pad(attn_mask, (0, 1))
+        if key_padding_mask is not None:
+            key_padding_mask = pad(key_padding_mask, (0, 1))
+
+    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
+        else:
+            attn_output_weights += attn_mask
+
+    if key_padding_mask is not None:
+        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+        attn_output_weights = attn_output_weights.masked_fill(
+            key_padding_mask.unsqueeze(1).unsqueeze(2),
+            float('-inf'),
+        )
+        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
+
+    # Subtract the per-row maximum before the softmax (presumably for numerical stability).
+    attn_output_weights = softmax(
+            attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)
+    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
+
+    attn_output = torch.bmm(attn_output_weights, v)
+    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
+    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
+    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+    if need_weights:
+        # average attention weights over heads
+        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+        return attn_output, attn_output_weights.sum(dim=1) / num_heads
+    else:
+        return attn_output, None
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/backbone.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f4c04693ae6ed96eccf98a9b98aef82daab089
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/backbone.py
@@ -0,0 +1,211 @@
+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+"""
+Backbone modules.
+"""
+
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+from typing import Dict, List
+
+from util.misc import NestedTensor, is_main_process
+
+from .position_encoding import build_position_encoding
+from .swin_transformer import build_swin_transformer
+
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d whose batch statistics and affine parameters are fixed buffers.
+
+    Copy-paste from torchvision.misc.ops with an eps added before rsqrt;
+    per the upstream note, without it any model other than
+    torchvision.models.resnet[18,34,50,101] produces NaNs.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        # Registered as buffers (not parameters): identity affine, zero mean, unit variance.
+        for buffer_name, init in (("weight", torch.ones), ("bias", torch.zeros),
+                                  ("running_mean", torch.zeros), ("running_var", torch.ones)):
+            self.register_buffer(buffer_name, init(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        # This module registers no ``num_batches_tracked`` buffer, so drop that
+        # key from the incoming state dict before delegating to the parent loader.
+        state_dict.pop(prefix + "num_batches_tracked", None)
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # Reshape the per-channel buffers first (the upstream note calls this fuser-friendly).
+        eps = 1e-5
+        gamma = self.weight.reshape(1, -1, 1, 1)
+        beta = self.bias.reshape(1, -1, 1, 1)
+        var = self.running_var.reshape(1, -1, 1, 1)
+        mean = self.running_mean.reshape(1, -1, 1, 1)
+        # Fold normalisation and the affine transform into one multiply-add.
+        scale = gamma * (var + eps).rsqrt()
+        shift = beta - mean * scale
+        return x * scale + shift
+
+
+class BackboneBase(nn.Module):
+    def __init__(
+        self,
+        backbone: nn.Module,
+        train_backbone: bool,
+        num_channels: int,
+        return_interm_indices: list,
+    ):
+        super().__init__()
+        # Freeze every parameter unless training is enabled AND it belongs to layer2/3/4.
+        trainable_stages = ("layer2", "layer3", "layer4")
+        for param_name, param in backbone.named_parameters():
+            in_trainable_stage = any(stage in param_name for stage in trainable_stages)
+            if not (train_backbone and in_trainable_stage):
+                param.requires_grad_(False)
+
+        # Expose the last len(return_interm_indices) "layerK" modules (K ends at 4),
+        # each renamed to the string form of the matching requested index.
+        first_stage = 5 - len(return_interm_indices)
+        return_layers = {
+            "layer{}".format(first_stage + offset): "{}".format(layer_index)
+            for offset, layer_index in enumerate(return_interm_indices)
+        }
+
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+
+    def forward(self, tensor_list: NestedTensor):
+        """Run the backbone and pair every feature map with the input mask resized to its spatial size."""
+        features = self.body(tensor_list.tensors)
+        out: Dict[str, NestedTensor] = {}
+        for name, feat in features.items():
+            input_mask = tensor_list.mask
+            assert input_mask is not None
+            resized = F.interpolate(input_mask[None].float(), size=feat.shape[-2:])
+            out[name] = NestedTensor(feat, resized.to(torch.bool)[0])
+        return out
+
+
+class Backbone(BackboneBase):
+    """ResNet-50/101 backbone from torchvision, with frozen BatchNorm by default."""
+
+    def __init__(
+        self,
+        name: str,
+        train_backbone: bool,
+        dilation: bool,
+        return_interm_indices: list,
+        batch_norm=FrozenBatchNorm2d,
+    ):
+        if name not in ["resnet18", "resnet34", "resnet50", "resnet101"]:
+            raise NotImplementedError("Why you can get here with name {}".format(name))
+        resnet_factory = getattr(torchvision.models, name)
+        backbone = resnet_factory(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=is_main_process(),
+            norm_layer=batch_norm,
+        )
+        # These checks run only after the torchvision model has been constructed.
+        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
+        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
+        # Keep the channel widths of the last len(return_interm_indices) stages.
+        stage_channels = [256, 512, 1024, 2048][4 - len(return_interm_indices) :]
+        super().__init__(backbone, train_backbone, stage_channels, return_interm_indices)
+
+
+class Joiner(nn.Sequential):
+    def __init__(self, backbone, position_embedding):
+        # Index 0 holds the backbone, index 1 the position-embedding module.
+        super().__init__(backbone, position_embedding)
+
+    def forward(self, tensor_list: NestedTensor):
+        """Return the backbone features and, for each, its position encoding cast to the feature dtype."""
+        backbone, position_embedding = self[0], self[1]
+        features: List[NestedTensor] = list(backbone(tensor_list).values())
+        positions = [
+            position_embedding(feature).to(feature.tensors.dtype)
+            for feature in features
+        ]
+        return features, positions
+
+
+def build_backbone(args):
+    """
+    Build the backbone and join it with its position encoding.
+
+    Attributes read directly from ``args`` here:
+        - backbone: "resnet50"/"resnet101" or one of the swin names listed below
+        - dilation: forwarded to the ResNet backbone only
+        - return_interm_indices: one of [0,1,2,3], [1,2,3], [3]
+        - use_checkpoint: optional (default False); forwarded to swin only
+    ``lr_backbone`` and ``backbone_freeze_keywords`` are not used by this function.
+
+    Returns:
+        A ``Joiner`` whose ``num_channels`` is the list of per-level channel counts.
+    """
+    position_embedding = build_position_encoding(args)
+    train_backbone = True  # fixed; only the ResNet path receives it
+    return_interm_indices = args.return_interm_indices
+    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
+    use_checkpoint = getattr(args, "use_checkpoint", False)
+
+    if args.backbone in ["resnet50", "resnet101"]:
+        backbone = Backbone(
+            args.backbone,
+            train_backbone,
+            args.dilation,
+            return_interm_indices,
+            batch_norm=FrozenBatchNorm2d,
+        )
+        bb_num_channels = backbone.num_channels
+    elif args.backbone in [
+        "swin_T_224_1k",
+        "swin_B_224_22k",
+        "swin_B_384_22k",
+        "swin_L_224_22k",
+        "swin_L_384_22k",
+    ]:
+        pretrain_img_size = int(args.backbone.split("_")[-2])  # e.g. "swin_T_224_1k" -> 224
+        backbone = build_swin_transformer(
+            args.backbone,
+            pretrain_img_size=pretrain_img_size,
+            out_indices=tuple(return_interm_indices),
+            dilation=False,  # args.dilation is not forwarded on the swin path
+            use_checkpoint=use_checkpoint,
+        )
+        # Keep only the feature widths of the returned levels.
+        bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]
+    else:
+        raise NotImplementedError("Unknown backbone {}".format(args.backbone))
+
+    assert len(bb_num_channels) == len(
+        return_interm_indices
+    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"
+
+    model = Joiner(backbone, position_embedding)
+    model.num_channels = bb_num_channels
+    assert isinstance(
+        bb_num_channels, list
+    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
+    return model
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/deformable_transformer.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/deformable_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f99779d335ac6211fbc8e9168b017e2892875ac
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/deformable_transformer.py
@@ -0,0 +1,1230 @@
+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# DINO
+# Copyright (c) 2022 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+import math
+import copy
+import torch
+import torch.utils.checkpoint as checkpoint
+from torch import nn, Tensor
+from typing import Optional
+from util.misc import inverse_sigmoid
+
+from .transformer_vanilla import TransformerEncoderLayer
+from .fuse_modules import BiAttentionBlock
+from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed
+from .ops.modules import MSDeformAttn
+
+
+class DeformableTransformer(nn.Module):
+
+    def __init__(self, d_model=256, nhead=8,
+                 num_queries=300,
+                 num_encoder_layers=6,
+                 num_unicoder_layers=0,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048, dropout=0.0,
+                 activation="relu", normalize_before=False,
+                 return_intermediate_dec=False, query_dim=4,
+                 num_patterns=0,
+                 modulate_hw_attn=False,
+                 # for deformable encoder
+                 deformable_encoder=False,
+                 deformable_decoder=False,
+                 num_feature_levels=1,
+                 enc_n_points=4,
+                 dec_n_points=4,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 # init query
+                 learnable_tgt_init=False,
+                 decoder_query_perturber=None,
+                 add_channel_attention=False,
+                 add_pos_value=False,
+                 random_refpoints_xy=False,
+                 # two stage
+                 two_stage_type='no',
+                 two_stage_pat_embed=0,
+                 two_stage_add_query_num=0,
+                 two_stage_learn_wh=False,
+                 two_stage_keep_all_tokens=False,
+                 # evo of #anchors
+                 dec_layer_number=None,
+                 rm_enc_query_scale=True,
+                 rm_dec_query_scale=True,
+                 rm_self_attn_layers=None,
+                 key_aware_type=None,
+                 # layer share
+                 layer_share_type=None,
+                 # for detach
+                 rm_detach=None,
+                 decoder_sa_type='ca',
+                 module_seq=['sa', 'ca', 'ffn'],
+                 # for dn
+                 embed_init_tgt=False,
+
+                 use_detached_boxes_dec_out=False,
+                 use_text_enhancer=False,
+                 use_fusion_layer=False,
+                 use_checkpoint=False,
+                 use_transformer_ckpt=False,
+                 use_text_cross_attention=False,
+                 text_dropout=0.1,
+                 fusion_dropout=0.1,
+                 fusion_droppath=0.0,
+
+                 binary_query_selection=False,
+                 ffn_extra_layernorm=False,
+                 ):
+        super().__init__()
+        self.num_feature_levels = num_feature_levels
+        self.num_encoder_layers = num_encoder_layers
+        self.num_unicoder_layers = num_unicoder_layers
+        self.num_decoder_layers = num_decoder_layers
+        self.deformable_encoder = deformable_encoder
+        self.deformable_decoder = deformable_decoder
+        self.two_stage_keep_all_tokens = two_stage_keep_all_tokens
+        self.num_queries = num_queries
+        self.random_refpoints_xy = random_refpoints_xy
+        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
+        self.ffn_extra_layernorm = ffn_extra_layernorm
+        assert query_dim == 4
+
+        self.binary_query_selection = binary_query_selection
+        if self.binary_query_selection:
+            self.binary_query_selection_layer = nn.Linear(d_model, 1)
+        # assert not binary_query_selection, 'binary_query_selection not implemented yet'
+
+        if num_feature_levels > 1:
+            assert deformable_encoder, "only support deformable_encoder for num_feature_levels > 1"
+        if use_deformable_box_attn:
+            assert deformable_encoder or deformable_encoder
+
+        assert layer_share_type in [None, 'encoder', 'decoder', 'both']
+        if layer_share_type in ['encoder', 'both']:
+            enc_layer_share = True
+        else:
+            enc_layer_share = False
+        if layer_share_type in ['decoder', 'both']:
+            dec_layer_share = True
+        else:
+            dec_layer_share = False
+        assert layer_share_type is None
+
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+
+        # choose encoder layer type
+        if deformable_encoder:
+            encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                              dropout, activation,
+                                                              num_feature_levels, nhead, enc_n_points,
+                                                              add_channel_attention=add_channel_attention,
+                                                              use_deformable_box_attn=use_deformable_box_attn,
+                                                              box_attn_type=box_attn_type)
+        else:
+            raise NotImplementedError
+
+        if use_text_enhancer:
+            text_enhance_layer = TransformerEncoderLayer(
+                d_model=d_model,
+                nhead=nhead // 2,
+                dim_feedforward=dim_feedforward // 2,
+                dropout=text_dropout
+            )
+        else:
+            text_enhance_layer = None
+
+        if use_fusion_layer:
+            feature_fusion_layer = BiAttentionBlock(
+                v_dim=d_model,
+                l_dim=d_model,
+                embed_dim=dim_feedforward // 2,
+                num_heads=nhead // 2,
+                dropout=fusion_dropout,
+                drop_path=fusion_droppath
+            )
+        else:
+            feature_fusion_layer = None
+
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        assert encoder_norm is None
+        self.encoder = TransformerEncoder(
+            encoder_layer, num_encoder_layers, d_model=d_model,
+            num_queries=num_queries,
+            enc_layer_share=enc_layer_share,
+            text_enhance_layer=text_enhance_layer,
+            feature_fusion_layer=feature_fusion_layer,
+            use_checkpoint=use_checkpoint,
+            use_transformer_ckpt=use_transformer_ckpt,
+        )
+
+        # choose decoder layer type
+        if deformable_decoder:
+            decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                              dropout, activation,
+                                                              num_feature_levels, nhead, dec_n_points,
+                                                              use_text_cross_attention=use_text_cross_attention,
+                                                              ffn_extra_layernorm=ffn_extra_layernorm, )
+
+        else:
+            raise NotImplementedError
+
+        decoder_norm = nn.LayerNorm(d_model)
+        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
+                                          return_intermediate=return_intermediate_dec,
+                                          d_model=d_model, query_dim=query_dim,
+                                          modulate_hw_attn=modulate_hw_attn,
+                                          num_feature_levels=num_feature_levels,
+                                          deformable_decoder=deformable_decoder,
+                                          decoder_query_perturber=decoder_query_perturber,
+                                          dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,
+                                          dec_layer_share=dec_layer_share,
+                                          use_detached_boxes_dec_out=use_detached_boxes_dec_out
+                                          )
+
+        self.d_model = d_model
+        self.nhead = nhead
+        self.dec_layers = num_decoder_layers
+        self.num_queries = num_queries  # useful for single stage model only
+        self.num_patterns = num_patterns
+        if not isinstance(num_patterns, int):
+            Warning("num_patterns should be int but {}".format(type(num_patterns)))
+            self.num_patterns = 0
+
+        if num_feature_levels > 1:
+            if self.num_encoder_layers > 0:
+                self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+            else:
+                self.level_embed = None
+
+        self.learnable_tgt_init = learnable_tgt_init
+        assert learnable_tgt_init, "why not learnable_tgt_init"
+        self.embed_init_tgt = embed_init_tgt
+        if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):
+            self.tgt_embed = nn.Embedding(self.num_queries, d_model)
+            nn.init.normal_(self.tgt_embed.weight.data)
+        else:
+            self.tgt_embed = None
+
+        # for two stage
+        self.two_stage_type = two_stage_type
+        self.two_stage_pat_embed = two_stage_pat_embed
+        self.two_stage_add_query_num = two_stage_add_query_num
+        self.two_stage_learn_wh = two_stage_learn_wh
+        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
+        if two_stage_type == 'standard':
+            # anchor selection at the output of encoder
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+
+            if two_stage_pat_embed > 0:
+                self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))
+                nn.init.normal_(self.pat_embed_for_2stage)
+
+            if two_stage_add_query_num > 0:
+                self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)
+
+            if two_stage_learn_wh:
+                # import ipdb; ipdb.set_trace()
+                self.two_stage_wh_embedding = nn.Embedding(1, 2)
+            else:
+                self.two_stage_wh_embedding = None
+
+        if two_stage_type == 'no':
+            self.init_ref_points(num_queries)  # init self.refpoint_embed
+
+        self.enc_out_class_embed = None
+        self.enc_out_bbox_embed = None
+
+        # evolution of anchors
+        self.dec_layer_number = dec_layer_number
+        if dec_layer_number is not None:
+            if self.two_stage_type != 'no' or num_patterns == 0:
+                assert dec_layer_number[
+                           0] == num_queries, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})"
+            else:
+                assert dec_layer_number[
+                           0] == num_queries * num_patterns, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})"
+
+        self._reset_parameters()
+
+        self.rm_self_attn_layers = rm_self_attn_layers
+        if rm_self_attn_layers is not None:
+            # assert len(rm_self_attn_layers) == num_decoder_layers
+            print("Removing the self-attn in {} decoder layers".format(rm_self_attn_layers))
+            for lid, dec_layer in enumerate(self.decoder.layers):
+                if lid in rm_self_attn_layers:
+                    dec_layer.rm_self_attn_modules()
+
+        self.rm_detach = rm_detach
+        if self.rm_detach:
+            assert isinstance(rm_detach, list)
+            assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])
+        self.decoder.rm_detach = rm_detach
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        if self.num_feature_levels > 1 and self.level_embed is not None:
+            nn.init.normal_(self.level_embed)
+
+        if self.two_stage_learn_wh:
+            nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))
+
+    def get_valid_ratio(self, mask):
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+
+    def init_ref_points(self, use_num_queries):
+        self.refpoint_embed = nn.Embedding(use_num_queries, 4)
+
+        if self.random_refpoints_xy:
+            # import ipdb; ipdb.set_trace()
+            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
+            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
+            self.refpoint_embed.weight.data[:, :2].requires_grad = False
+
+    def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,
+                dn_meta=None,targets=None,kpt_embed=None):
+        """Encode the multi-level features, build the decoder queries and run the decoder.
+        Input:
+            - srcs: List of multi features [bs, ci, hi, wi]
+            - masks: List of multi masks [bs, hi, wi]
+            - refpoint_embed: [bs, num_dn, 4]. None in infer
+            - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]
+            - tgt: [bs, num_dn, d_model]. None in infer
+            - text_dict: read for 'encoded_text', 'text_token_mask', 'position_ids', 'text_self_attention_masks'
+        Returns: (hs, references, hs_enc, ref_enc, init_box_proposal)
+        """
+        # NOTE: text_dict is subscripted unconditionally below, so its default of None is not usable.
+
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+
+            src = src.flatten(2).transpose(1, 2)  # bs, hw, c
+            mask = mask.flatten(1)  # bs, hw
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # bs, hw, c
+            if self.num_feature_levels > 1 and self.level_embed is not None:
+                lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            else:
+                lvl_pos_embed = pos_embed
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        src_flatten = torch.cat(src_flatten, 1)  # bs, \sum{hxw}, c
+        mask_flatten = torch.cat(mask_flatten, 1)  # bs, \sum{hxw}
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)  # bs, \sum{hxw}, c
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+
+        # two stage
+        enc_topk_proposals = enc_refpoint_embed = None
+
+        #########################################################
+        # Begin Encoder
+        #########################################################
+        memory, memory_text = self.encoder(
+            src_flatten,
+            pos=lvl_pos_embed_flatten,
+            level_start_index=level_start_index,
+            spatial_shapes=spatial_shapes,
+            valid_ratios=valid_ratios,
+            key_padding_mask=mask_flatten,
+            memory_text=text_dict['encoded_text'],
+            text_attention_mask=~text_dict['text_token_mask'],
+            # we ~ the mask . False means use the token; True means pad the token
+            position_ids=text_dict['position_ids'],
+            text_self_attention_masks=text_dict['text_self_attention_masks'],
+        )
+        #########################################################
+        # End Encoder
+        # - memory: bs, \sum{hw}, c
+        # - mask_flatten: bs, \sum{hw}
+        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
+        # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        #########################################################
+        text_dict['encoded_text'] = memory_text
+
+        if self.two_stage_type == 'standard':
+            if self.two_stage_learn_wh:
+                input_hw = self.two_stage_wh_embedding.weight[0]
+            else:
+                input_hw = None
+            output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,
+                                                                           input_hw)
+            output_memory = self.enc_output_norm(self.enc_output(output_memory))
+
+            if self.two_stage_pat_embed > 0:
+                bs, nhw, _ = output_memory.shape
+                # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256
+                output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)
+                _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)
+                output_memory = output_memory + _pats
+                output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)
+
+            if self.two_stage_add_query_num > 0:
+                assert refpoint_embed is not None
+                output_memory = torch.cat((output_memory, tgt), dim=1)
+                output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)
+
+            if self.binary_query_selection:
+                topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)
+            else:
+                if text_dict is not None:
+                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
+                else:
+                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
+
+                topk_logits = enc_outputs_class_unselected.max(-1)[0]
+            enc_outputs_coord_unselected = self.enc_out_bbox_embed(
+                output_memory) + output_proposals  # (bs, \sum{hw}, 4) unsigmoid
+            topk = self.num_queries
+
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq
+
+            # gather boxes
+            refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,
+                                                   topk_proposals.unsqueeze(-1).repeat(1, 1, 4))  # unsigmoid
+            refpoint_embed_ = refpoint_embed_undetach.detach()
+            init_box_proposal = torch.gather(output_proposals, 1,
+                                             topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()  # sigmoid
+
+            # gather tgt
+            tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
+            if self.embed_init_tgt:
+                tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, d_model (after transpose)
+            else:
+                tgt_ = tgt_undetach.detach()
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+        elif self.two_stage_type == 'no':
+            tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, d_model (after transpose)
+            refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, 4 (after transpose)
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+            if self.num_patterns > 0:
+                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
+                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
+                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,
+                                                                             1)  # 1, n_q*n_pat, d_model
+                tgt = tgt_embed + tgt_pat
+
+            init_box_proposal = refpoint_embed_.sigmoid()
+
+        else:
+            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
+        #########################################################
+        # End preparing tgt
+        # - tgt: bs, NQ, d_model
+        # - refpoint_embed(unsigmoid): bs, NQ, 4
+        #########################################################
+        # NOTE(review): the 'no' branch reads `self.patterns` when num_patterns > 0, but
+        #   __init__ of this class never assigns that attribute - confirm where it is set.
+        # NOTE(review): `bs` used when repeating the embeddings above is not re-derived
+        #   there; it is whatever the feature-level loop (or the two_stage_pat_embed
+        #   branch) last bound.
+
+        #########################################################
+        # Begin Decoder
+        #########################################################
+        hs, references = self.decoder(
+            tgt=tgt.transpose(0, 1),
+            memory=memory.transpose(0, 1),
+            memory_key_padding_mask=mask_flatten,
+            pos=lvl_pos_embed_flatten.transpose(0, 1),
+            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
+            level_start_index=level_start_index,
+            spatial_shapes=spatial_shapes,
+            valid_ratios=valid_ratios, tgt_mask=attn_mask,
+            tgt_mask2=attn_mask2,
+            memory_text=text_dict['encoded_text'],
+            text_attention_mask=~text_dict['text_token_mask'],
+            text_dict=text_dict,
+            dn_meta=dn_meta,
+            targets=targets,
+            kpt_embed=kpt_embed
+            # we ~ the mask . False means use the token; True means pad the token
+        )
+        #########################################################
+        # End Decoder
+        # hs: n_dec, bs, nq, d_model
+        # references: n_dec+1, bs, nq, query_dim
+        #########################################################
+
+        #########################################################
+        # Begin postprocess
+        #########################################################
+        if self.two_stage_type == 'standard':
+            if self.two_stage_keep_all_tokens:
+                hs_enc = output_memory.unsqueeze(0)
+                ref_enc = enc_outputs_coord_unselected.unsqueeze(0)
+                init_box_proposal = output_proposals
+                # here every encoder token is reported, not only the top-k selected ones
+            else:
+                hs_enc = tgt_undetach.unsqueeze(0)
+                ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
+        else:
+            hs_enc = ref_enc = None
+        #########################################################
+        # End postprocess
+        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None
+        # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None
+        #########################################################
+
+        return hs, references, hs_enc, ref_enc, init_box_proposal
+        # hs: (n_dec, bs, nq, d_model)
+        # references: sigmoid coordinates. (n_dec+1, bs, bq, 4)
+        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None
+        # ref_enc: sigmoid coordinates. \
+        #           (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None
+
+
+class TransformerEncoder(nn.Module):
+
+    def __init__(self,
+                 encoder_layer, num_layers, d_model=256,
+                 num_queries=300,
+                 enc_layer_share=False,
+                 text_enhance_layer=None,
+                 feature_fusion_layer=None,
+                 use_checkpoint=False,
+                 use_transformer_ckpt=False,
+                 ):
+        """_summary_
+
+        Args:
+            encoder_layer (_type_): _description_
+            num_layers (_type_): _description_
+            norm (_type_, optional): _description_. Defaults to None.
+            d_model (int, optional): _description_. Defaults to 256.
+            num_queries (int, optional): _description_. Defaults to 300.
+            enc_layer_share (bool, optional): _description_. Defaults to False.
+
+        """
+        super().__init__()
+        # prepare layers
+        self.layers = []
+        self.text_layers = []
+        self.fusion_layers = []
+        if num_layers > 0:
+            self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)
+
+            if text_enhance_layer is not None:
+                self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share)
+            if feature_fusion_layer is not None:
+                self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share)
+        else:
+            self.layers = []
+            del encoder_layer
+
+            if text_enhance_layer is not None:
+                self.text_layers = []
+                del text_enhance_layer
+            if feature_fusion_layer is not None:
+                self.fusion_layers = []
+                del feature_fusion_layer
+
+        self.query_scale = None
+        self.num_queries = num_queries
+        self.num_layers = num_layers
+        self.d_model = d_model
+
+        self.use_checkpoint = use_checkpoint
+        self.use_transformer_ckpt = use_transformer_ckpt
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        reference_points_list = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),)
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+
+    def forward(self,
+                # for images
+                src: Tensor,
+                pos: Tensor,
+                spatial_shapes: Tensor,
+                level_start_index: Tensor,
+                valid_ratios: Tensor,
+                key_padding_mask: Tensor,
+                # for texts
+                memory_text: Tensor = None,
+                text_attention_mask: Tensor = None,
+                pos_text: Tensor = None,
+                text_self_attention_masks: Tensor = None,
+                position_ids: Tensor = None,
+                ):
+        """
+        Input:
+            - src: [bs, sum(hi*wi), 256]
+            - pos: pos embed for src. [bs, sum(hi*wi), 256]
+            - spatial_shapes: h,w of each level [num_level, 2]
+            - level_start_index: [num_level] start point of level in sum(hi*wi).
+            - valid_ratios: [bs, num_level, 2]
+            - key_padding_mask: [bs, sum(hi*wi)]
+
+            - memory_text: bs, n_text, 256
+            - text_attention_mask: bs, n_text
+                False for no padding; True for padding
+            - pos_text: bs, n_text, 256
+            - text_self_attention_masks: inverted with ~ and passed to the text layers as src_mask
+            - position_ids: bs, n_text
+        Intermediate:
+            - reference_points: [bs, sum(hi*wi), num_level, 2]
+        Outputs:
+            - output: [bs, sum(hi*wi), 256]
+            - memory_text: the text features after the text/fusion layers (returned as given when there are none)
+        """
+        output = src
+
+        # preparation and reshape
+        if self.num_layers > 0:
+            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+
+        if self.text_layers:
+            # generate pos_text
+            bs, n_text, text_dim = memory_text.shape
+            if pos_text is None and position_ids is None:
+                pos_text = torch.arange(n_text, device=memory_text.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs,
+                                                                                                                     1,
+                                                                                                                     1)
+                pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False)  # NOTE(review): 256 is hard-coded, not read from self.d_model
+            if position_ids is not None:
+                pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)
+
+        # main process
+        for layer_id, layer in enumerate(self.layers):
+            # Per layer: optional image/text fusion, then optional text self-attention,
+            # then the image encoder layer. `use_checkpoint` only wraps the fusion layer;
+            # `use_transformer_ckpt` only wraps the image layer.
+            if self.fusion_layers:
+                if self.use_checkpoint:
+                    output, memory_text = checkpoint.checkpoint(
+                        self.fusion_layers[layer_id],
+                        output,
+                        memory_text,
+                        key_padding_mask,
+                        text_attention_mask
+                    )
+                else:
+                    output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,
+                                                                       attention_mask_v=key_padding_mask,
+                                                                       attention_mask_l=text_attention_mask)
+
+            if self.text_layers:
+                memory_text = self.text_layers[layer_id](
+                    src=memory_text.transpose(0, 1),
+                    src_mask=~text_self_attention_masks,  # note we use ~ for mask here
+                    src_key_padding_mask=text_attention_mask,
+                    pos=(pos_text.transpose(0, 1) if pos_text is not None else None)
+                ).transpose(0, 1)
+
+            # main process
+            if self.use_transformer_ckpt:
+                output = checkpoint.checkpoint(
+                    layer,
+                    output,
+                    pos,
+                    reference_points,
+                    spatial_shapes,
+                    level_start_index,
+                    key_padding_mask
+                )
+            else:
+                output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,
+                               level_start_index=level_start_index, key_padding_mask=key_padding_mask)
+
+        return output, memory_text
+
+
+class TransformerDecoder(nn.Module):
+
+    def __init__(self, decoder_layer, num_layers, norm=None,
+                 return_intermediate=False,
+                 d_model=256, query_dim=4,
+                 modulate_hw_attn=False,
+                 num_feature_levels=1,
+                 deformable_decoder=False,
+                 decoder_query_perturber=None,
+                 dec_layer_number=None,  # number of queries for each decoder layer (a list of length num_layers)
+                 rm_dec_query_scale=False,
+                 dec_layer_share=False,
+                 dec_layer_dropout_prob=None,
+                 use_detached_boxes_dec_out=False,
+                 num_box_decoder_layers=2,
+                 num_body_points=68,
+                 ):
+        super().__init__()
+        if num_layers > 0:
+            self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
+        else:
+            self.layers = []
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+        assert return_intermediate, "support return_intermediate only"
+        self.query_dim = query_dim
+        assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
+        self.num_feature_levels = num_feature_levels
+        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
+
+        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
+        if not deformable_decoder:
+            self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
+        else:
+            self.query_pos_sine_scale = None
+
+        if rm_dec_query_scale:
+            self.query_scale = None
+        else:
+            raise NotImplementedError
+            self.query_scale = MLP(d_model, d_model, d_model, 2)
+        self.bbox_embed = None
+        self.class_embed = None
+        self.pose_embed = None
+        self.pose_hw_embed = None
+        self.d_model = d_model
+        self.modulate_hw_attn = modulate_hw_attn
+        self.deformable_decoder = deformable_decoder
+
+        if not deformable_decoder and modulate_hw_attn:
+            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
+        else:
+            self.ref_anchor_head = None
+
+        self.decoder_query_perturber = decoder_query_perturber
+        self.box_pred_damping = None
+
+        self.dec_layer_number = dec_layer_number
+        if dec_layer_number is not None:
+            assert isinstance(dec_layer_number, list)
+            assert len(dec_layer_number) == num_layers
+            # assert dec_layer_number[0] ==
+
+        self.dec_layer_dropout_prob = dec_layer_dropout_prob
+        if dec_layer_dropout_prob is not None:
+            assert isinstance(dec_layer_dropout_prob, list)
+            assert len(dec_layer_dropout_prob) == num_layers
+            for i in dec_layer_dropout_prob:
+                assert 0.0 <= i <= 1.0
+
+        self.rm_detach = None
+        self.num_body_points = num_body_points
+
+        self.hw = nn.Embedding(17, 2)
+        self.num_box_decoder_layers = num_box_decoder_layers
+        self.kpt_index = [x for x in range(50 * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
+        self.hw_append = nn.Embedding(self.num_body_points-17, 2)
+
+    def forward(self, tgt, memory,
+                tgt_mask: Optional[Tensor] = None,
+                tgt_mask2: Optional[Tensor] = None,
+                memory_mask: Optional[Tensor] = None,
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                memory_key_padding_mask: Optional[Tensor] = None,
+                pos: Optional[Tensor] = None,
+                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2/4
+                # for memory
+                level_start_index: Optional[Tensor] = None,  # num_levels
+                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
+                valid_ratios: Optional[Tensor] = None,
+                # for text
+                memory_text: Optional[Tensor] = None,
+                text_attention_mask: Optional[Tensor] = None,
+                text_dict: Optional[Tensor] = None,
+                dn_meta: Optional[Tensor] = None,
+                targets: Optional[Tensor] = None,
+                kpt_embed: Optional[Tensor] = None
+                ):
+        """
+        Input:
+            - tgt: nq, bs, d_model
+            - memory: hw, bs, d_model
+            - pos: hw, bs, d_model
+            - refpoints_unsigmoid: nq, bs, 2/4
+            - valid_ratios/spatial_shapes: bs, nlevel, 2
+        """
+
+        output = tgt
+        output += self.hw.weight[0, 0] * 0.0
+
+
+        intermediate = []
+        reference_points = refpoints_unsigmoid.sigmoid()
+        ref_points = [reference_points]
+        effect_num_dn = dn_meta['pad_size'] if self.training else 0
+        inter_select_number = 50
+        for layer_id, layer in enumerate(self.layers):
+
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] \
+                                         * torch.cat([valid_ratios, valid_ratios], -1)[None, :]  # nq, bs, nlevel, 4
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
+            query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # nq, bs, 256*2
+
+            # conditional query
+            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256
+            pos_scale = self.query_scale(output) if self.query_scale is not None else 1
+            query_pos = pos_scale * raw_query_pos
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if query_pos.isnan().any() | query_pos.isinf().any():
+            #         import ipdb; ipdb.set_trace()
+
+            # main process
+            output = layer(
+                tgt=output,
+                tgt_query_pos=query_pos,
+                tgt_query_sine_embed=query_sine_embed,
+                tgt_key_padding_mask=tgt_key_padding_mask,
+                tgt_reference_points=reference_points_input,
+
+                memory_text=memory_text,
+                text_attention_mask=text_attention_mask,
+
+                memory=memory,
+                memory_key_padding_mask=memory_key_padding_mask,
+                memory_level_start_index=level_start_index,
+                memory_spatial_shapes=spatial_shapes,
+                memory_pos=pos,
+
+                self_attn_mask=tgt_mask,
+                cross_attn_mask=memory_mask
+            )
+            if output.isnan().any() | output.isinf().any():
+                print(f"output layer_id {layer_id} is nan")
+                try:
+                    num_nan = output.isnan().sum().item()
+                    num_inf = output.isinf().sum().item()
+                    print(f"num_nan {num_nan}, num_inf {num_inf}")
+                except Exception as e:
+                    print(e)
+
+
+
+
+            intermediate.append(self.norm(output))
+            # iter update
+            if layer_id < self.num_box_decoder_layers:
+                reference_before_sigmoid = inverse_sigmoid(reference_points)
+                delta_unsig = self.bbox_embed[layer_id](output)
+                outputs_unsig = delta_unsig + reference_before_sigmoid
+                new_reference_points = outputs_unsig.sigmoid()
+
+            # select the top-k (inter_select_number) ref points by class score as anchors
+            if layer_id == self.num_box_decoder_layers - 1:
+                dn_output = output[:effect_num_dn]
+                dn_new_reference_points = new_reference_points[:effect_num_dn]
+                class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,
+                                   effect_num_dn:].transpose(0, 1)
+                topk_proposals = torch.topk(class_unselected.max(-1)[0], inter_select_number, dim=0)[1]
+                new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,
+                                                            topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+                new_output_for_box = torch.gather(output[effect_num_dn:], 0,
+                                                  topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
+                keypoint_embed=kpt_embed.transpose(0, 1)
+
+                new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0],1,1,1)
+                delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]
+                keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()
+                num_queries, _, bs, _ = keypoint_xy.shape
+                aa = torch.cat((self.hw.weight,self.hw_append.weight),dim=0)
+                keypoint_wh_weight = aa.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()
+                keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]
+                new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)
+                new_reference_points = torch.cat(
+                    (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)
+                output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)
+                new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)
+                output = torch.cat((dn_output, output), dim=0)
+                tgt_mask = tgt_mask2
+
+            if layer_id >= self.num_box_decoder_layers:
+                reference_before_sigmoid = inverse_sigmoid(reference_points)
+                output_bbox_dn = output[:effect_num_dn]
+                output_bbox_norm = output[effect_num_dn:][0::(self.num_body_points + 1)]
+                reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]
+                reference_before_sigmoid_bbox_norm = reference_before_sigmoid[effect_num_dn:][
+                                                     0::(self.num_body_points + 1)]
+                delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)
+                delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)
+                outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn
+                outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm
+                new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()
+                new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()
+                output_kpt = output[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index, device=output.device))
+                delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)
+                outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index,
+                                                                                                      device=output.device)).clone()  ##
+                delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)
+                outputs_unsig[..., :2] += delta_xy_unsig[..., :2]
+                outputs_unsig[..., 2:] += delta_hw_unsig
+                new_reference_points_for_keypoint = outputs_unsig.sigmoid()
+                bs = new_reference_points_for_box_norm.shape[1]
+                new_reference_points_norm = torch.cat((new_reference_points_for_box_norm.unsqueeze(1),
+                                                       new_reference_points_for_keypoint.view(-1, self.num_body_points,
+                                                                                              bs, 4)), dim=1).flatten(0,
+                                                                                                                      1)
+                new_reference_points = torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)
+
+            if self.rm_detach and 'dec' in self.rm_detach:
+                reference_points = new_reference_points
+            else:
+                reference_points = new_reference_points.detach()
+
+            # if layer_id != self.num_layers - 1:
+            if self.use_detached_boxes_dec_out:
+                ref_points.append(reference_points)
+            else:
+                ref_points.append(new_reference_points)
+
+        return [
+            [itm_out.transpose(0, 1) for itm_out in intermediate],
+            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]
+        ]
+
+
+class DeformableTransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 add_channel_attention=False,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 ):
+        super().__init__()
+
+        # self attention
+        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # channel attention
+        self.add_channel_attention = add_channel_attention
+        if add_channel_attention:
+            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
+            self.norm_channel = nn.LayerNorm(d_model)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)
+        src = self.norm2(src)
+        return src
+
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
+        # self attention
+        # import ipdb; ipdb.set_trace()
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index,
+                              key_padding_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+
+        # ffn
+        src = self.forward_ffn(src)
+
+        # channel attn
+        if self.add_channel_attention:
+            src = self.norm_channel(src + self.activ_channel(src))
+
+        return src
+
+
+class DeformableTransformerDecoderLayer(nn.Module):
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 use_text_feat_guide=False,
+                 use_text_cross_attention=False,
+                 ffn_extra_layernorm=False
+                 ):
+        super().__init__()
+
+        # cross attention
+        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # cross attention text
+        if use_text_cross_attention:
+            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+            self.catext_norm = nn.LayerNorm(d_model)
+
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
+        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm3 = nn.LayerNorm(d_model)
+        if ffn_extra_layernorm:
+            raise NotImplementedError('ffn_extra_layernorm not implemented')
+            self.norm_ext = nn.LayerNorm(d_ffn)
+        else:
+            self.norm_ext = None
+
+        self.key_aware_proj = None
+        self.use_text_feat_guide = use_text_feat_guide
+        assert not use_text_feat_guide
+        self.use_text_cross_attention = use_text_cross_attention
+
+    def rm_self_attn_modules(self):
+        self.self_attn = None
+        self.dropout2 = None
+        self.norm2 = None
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt, ipdb_flag=False):
+
+        with torch.cuda.amp.autocast(enabled=False):
+            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+
+    def forward(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4
+
+                memory_text: Optional[Tensor] = None,  # bs, num_token, d_model
+                text_attention_mask: Optional[Tensor] = None,  # bs, num_token
+
+                # for memory
+                memory: Optional[Tensor] = None,  # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None,  # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None,  # pos for memory
+
+                # attention masks
+                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
+                ):
+        """
+        Input:
+            - tgt/tgt_query_pos: nq, bs, d_model
+            - memory: hw, bs, d_model
+        """
+        assert cross_attn_mask is None
+
+        # self attention
+        if self.self_attn is not None:
+            # import ipdb; ipdb.set_trace()
+            q = k = self.with_pos_embed(tgt, tgt_query_pos)
+            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
+            tgt = tgt + self.dropout2(tgt2)
+            tgt = self.norm2(tgt)
+
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if tgt.isnan().any() | tgt.isinf().any() :
+            #         import ipdb; ipdb.set_trace()
+
+        if self.use_text_cross_attention:
+            tgt2 = self.ca_text(self.with_pos_embed(tgt, tgt_query_pos), memory_text.transpose(0, 1),
+                                memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]
+            tgt = tgt + self.catext_dropout(tgt2)
+            tgt = self.catext_norm(tgt)
+
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+            #         import ipdb; ipdb.set_trace()
+
+            # if tgt.isnan().any() | tgt.isinf().any() :
+            #     import ipdb; ipdb.set_trace()
+
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                               tgt_reference_points.transpose(0, 1).contiguous(),
+                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
+                               memory_key_padding_mask).transpose(0, 1)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     tgtk = tgt.clone()
+        #     if tgt.isnan().any() | tgt.isinf().any() :
+        #         import ipdb; ipdb.set_trace()
+
+        # ffn
+        tgt = self.forward_ffn(tgt)
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     if tgt.isnan().any() | tgt.isinf().any() :
+        #         tgtk = self.forward_ffn(tgtk, ipdb_flag=True)
+        #         import ipdb; ipdb.set_trace()
+
+        return tgt
+
+
+def _get_clones(module, N, layer_share=False):
+    # import ipdb; ipdb.set_trace()
+    if layer_share:
+        return nn.ModuleList([module for i in range(N)])
+    else:
+        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def build_deformable_transformer(args):
+    decoder_query_perturber = None
+    if args.decoder_layer_noise:
+        from .utils import RandomBoxPerturber
+        decoder_query_perturber = RandomBoxPerturber(
+            x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,
+            w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)
+
+    use_detached_boxes_dec_out = False
+    try:
+        use_detached_boxes_dec_out = args.use_detached_boxes_dec_out
+    except:
+        use_detached_boxes_dec_out = False
+
+    binary_query_selection = False
+    try:
+        binary_query_selection = args.binary_query_selection
+    except:
+        binary_query_selection = False
+
+    ffn_extra_layernorm = False
+    try:
+        ffn_extra_layernorm = args.ffn_extra_layernorm
+    except:
+        print('ffn_extra_layernorm not found, set to False')
+        ffn_extra_layernorm = False
+
+    return DeformableTransformer(
+        d_model=args.hidden_dim,
+        dropout=args.dropout,
+        nhead=args.nheads,
+        num_queries=args.num_queries,
+        dim_feedforward=args.dim_feedforward,
+        num_encoder_layers=args.enc_layers,
+        num_unicoder_layers=args.unic_layers,
+        num_decoder_layers=args.dec_layers,
+        normalize_before=args.pre_norm,
+        return_intermediate_dec=True,
+        query_dim=args.query_dim,
+        activation=args.transformer_activation,
+        num_patterns=args.num_patterns,
+        modulate_hw_attn=True,
+
+        deformable_encoder=True,
+        deformable_decoder=True,
+        num_feature_levels=args.num_feature_levels,
+        enc_n_points=args.enc_n_points,
+        dec_n_points=args.dec_n_points,
+        use_deformable_box_attn=args.use_deformable_box_attn,
+        box_attn_type=args.box_attn_type,
+
+        learnable_tgt_init=True,
+        decoder_query_perturber=decoder_query_perturber,
+
+        add_channel_attention=args.add_channel_attention,
+        add_pos_value=args.add_pos_value,
+        random_refpoints_xy=args.random_refpoints_xy,
+
+        # two stage
+        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']
+        two_stage_pat_embed=args.two_stage_pat_embed,
+        two_stage_add_query_num=args.two_stage_add_query_num,
+        two_stage_learn_wh=args.two_stage_learn_wh,
+        two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,
+        dec_layer_number=args.dec_layer_number,
+        rm_self_attn_layers=None,
+        key_aware_type=None,
+        layer_share_type=None,
+
+        rm_detach=None,
+        decoder_sa_type=args.decoder_sa_type,
+        module_seq=args.decoder_module_seq,
+
+        embed_init_tgt=args.embed_init_tgt,
+        use_detached_boxes_dec_out=use_detached_boxes_dec_out,
+        use_text_enhancer=args.use_text_enhancer,
+        use_fusion_layer=args.use_fusion_layer,
+        use_checkpoint=args.use_checkpoint,
+        use_transformer_ckpt=args.use_transformer_ckpt,
+        use_text_cross_attention=args.use_text_cross_attention,
+
+        text_dropout=args.text_dropout,
+        fusion_dropout=args.fusion_dropout,
+        fusion_droppath=args.fusion_droppath,
+
+        binary_query_selection=binary_query_selection,
+        ffn_extra_layernorm=ffn_extra_layernorm,
+    )
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/fuse_modules.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/fuse_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8edb6150d538f3b27d043d403b178e5e407a59
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/fuse_modules.py
@@ -0,0 +1,274 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# from timm.models.layers import DropPath
+from src.modules.util import DropPath
+
+class FeatureResizer(nn.Module):
+    """
+    This class takes as input a set of embeddings of dimension C1 and outputs a set of
+    embeddings of dimension C2, after a linear transformation, optional layer normalization (LN) and dropout.
+    """
+
+    def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
+        super().__init__()
+        self.do_ln = do_ln
+        # Object feature encoding
+        self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)
+        self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, encoder_features):
+        x = self.fc(encoder_features)
+        if self.do_ln:
+            x = self.layer_norm(x)
+        output = self.dropout(x)
+        return output
+
+
+
+
+def l1norm(X, dim, eps=1e-8):
+    """L1-normalize X along dimension ``dim`` (eps is added to the norm before dividing)
+    """
+    norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps
+    X = torch.div(X, norm)
+    return X
+
+
+def l2norm(X, dim, eps=1e-8):
+    """L2-normalize X along dimension ``dim`` (eps is added to the norm before dividing)
+    """
+    norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps
+    X = torch.div(X, norm)
+    return X
+
+
+def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
+    """
+    query: (n_context, queryL, d)
+    context: (n_context, sourceL, d)
+    """
+    batch_size_q, queryL = query.size(0), query.size(1)
+    batch_size, sourceL = context.size(0), context.size(1)
+
+    # Get attention
+    # --> (batch, d, queryL)
+    queryT = torch.transpose(query, 1, 2)
+
+    # (batch, sourceL, d)(batch, d, queryL)
+    # --> (batch, sourceL, queryL)
+    attn = torch.bmm(context, queryT)
+    if raw_feature_norm == "softmax":
+        # --> (batch*sourceL, queryL)
+        attn = attn.view(batch_size * sourceL, queryL)
+        attn = nn.Softmax()(attn)
+        # --> (batch, sourceL, queryL)
+        attn = attn.view(batch_size, sourceL, queryL)
+    elif raw_feature_norm == "l2norm":
+        attn = l2norm(attn, 2)
+    elif raw_feature_norm == "clipped_l2norm":
+        attn = nn.LeakyReLU(0.1)(attn)
+        attn = l2norm(attn, 2)
+    else:
+        raise ValueError("unknown first norm type:", raw_feature_norm)
+    # --> (batch, queryL, sourceL)
+    attn = torch.transpose(attn, 1, 2).contiguous()
+    # --> (batch*queryL, sourceL)
+    attn = attn.view(batch_size * queryL, sourceL)
+    attn = nn.Softmax()(attn * smooth)
+    # --> (batch, queryL, sourceL)
+    attn = attn.view(batch_size, queryL, sourceL)
+    # --> (batch, sourceL, queryL)
+    attnT = torch.transpose(attn, 1, 2).contiguous()
+
+    # --> (batch, d, sourceL)
+    contextT = torch.transpose(context, 1, 2)
+    # (batch x d x sourceL)(batch x sourceL x queryL)
+    # --> (batch, d, queryL)
+    weightedContext = torch.bmm(contextT, attnT)
+    # --> (batch, queryL, d)
+    weightedContext = torch.transpose(weightedContext, 1, 2)
+
+    return weightedContext, attnT
+
+
+class BiMultiHeadAttention(nn.Module):
+    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
+        super(BiMultiHeadAttention, self).__init__()
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.v_dim = v_dim
+        self.l_dim = l_dim
+
+        assert (
+                self.head_dim * self.num_heads == self.embed_dim
+        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+        self.scale = self.head_dim ** (-0.5)
+        self.dropout = dropout
+
+        self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
+        self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
+
+        self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
+        self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
+
+        self.stable_softmax_2d = True
+        self.clamp_min_for_underflow = True
+        self.clamp_max_for_overflow = True
+
+        self._reset_parameters()
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def _reset_parameters(self):
+        nn.init.xavier_uniform_(self.v_proj.weight)
+        self.v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.l_proj.weight)
+        self.l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_v_proj.weight)
+        self.values_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_l_proj.weight)
+        self.values_l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_v_proj.weight)
+        self.out_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_l_proj.weight)
+        self.out_l_proj.bias.data.fill_(0)
+
+    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
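+        """Bidirectional cross-attention between vision (v) and language (l) features.
+
+        Args:
+            v (torch.Tensor): bs, n_img, dim
+            l (torch.Tensor): bs, n_text, dim
+            attention_mask_v (torch.Tensor, optional): mask over image tokens; True entries are filled with -inf before softmax. bs, n_img
+            attention_mask_l (torch.Tensor, optional): mask over text tokens; True entries are filled with -inf before softmax. bs, n_text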
+
+        Returns:
+            _type_: _description_
+        """
+        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+        #     import ipdb; ipdb.set_trace()
+        bsz, tgt_len, _ = v.size()
+
+        query_states = self.v_proj(v) * self.scale
+        key_states = self._shape(self.l_proj(l), -1, bsz)
+        value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
+        value_l_states = self._shape(self.values_l_proj(l), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_v_states = value_v_states.view(*proj_shape)
+        value_l_states = value_l_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        if self.stable_softmax_2d:
+            attn_weights = attn_weights - attn_weights.max()
+
+        if self.clamp_min_for_underflow:
+            attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range
+
+        attn_weights_T = attn_weights.transpose(1, 2)
+        attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[
+            0])
+        if self.clamp_min_for_underflow:
+            attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range
+
+        # mask vison for language
+        if attention_mask_v is not None:
+            attention_mask_v = attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))
+
+        attn_weights_l = attn_weights_l.softmax(dim=-1)
+
+        # mask language for vision
+        if attention_mask_l is not None:
+            attention_mask_l = attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights.masked_fill_(attention_mask_l, float('-inf'))
+        attn_weights_v = attn_weights.softmax(dim=-1)
+
+        attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
+        attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)
+
+        attn_output_v = torch.bmm(attn_probs_v, value_l_states)
+        attn_output_l = torch.bmm(attn_probs_l, value_v_states)
+
+
+        if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}"
+            )
+
+        if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}"
+            )
+
+        attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output_v = attn_output_v.transpose(1, 2)
+        attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
+        attn_output_l = attn_output_l.transpose(1, 2)
+        attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
+
+        attn_output_v = self.out_v_proj(attn_output_v)
+        attn_output_l = self.out_l_proj(attn_output_l)
+
+        return attn_output_v, attn_output_l
+
+
+# Bi-Direction MHA (text->image, image->text)
+class BiAttentionBlock(nn.Module):
+    """Pre-LayerNorm residual block around BiMultiHeadAttention with per-channel layer scale."""
+
+    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
+                 drop_path=.0, init_values=1e-4, cfg=None):
+        """
+        Inputs:
+            v_dim - Channel size of the vision tokens
+            l_dim - Channel size of the language tokens
+            embed_dim - Embedding size handed to BiMultiHeadAttention
+            num_heads - Number of heads to use in the Multi-Head Attention block
+            dropout - Dropout rate handed to BiMultiHeadAttention
+            drop_path - DropPath rate on each residual branch (0 disables it)
+            init_values - Initial value of the fixed layer-scale vectors
+            cfg - Accepted but not used by this block
+        """
+        super().__init__()
+        # pre layer norm
+        self.layer_norm_v, self.layer_norm_l = nn.LayerNorm(v_dim), nn.LayerNorm(l_dim)
+        self.attn = BiMultiHeadAttention(
+            v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
+
+        # layer scale: constant vectors (requires_grad=False) that scale the attention update
+        self.drop_path = nn.Identity() if not drop_path > 0. else DropPath(drop_path)
+        scale_v = init_values * torch.ones((v_dim))
+        scale_l = init_values * torch.ones((l_dim))
+        self.gamma_v = nn.Parameter(scale_v, requires_grad=False)
+        self.gamma_l = nn.Parameter(scale_l, requires_grad=False)
+
+    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
+        """Normalize both streams, cross-attend, and add the scaled updates to the normalized inputs."""
+        v, l = self.layer_norm_v(v), self.layer_norm_l(l)
+        delta_v, delta_l = self.attn(v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l)
+        v_out = v + self.drop_path(self.gamma_v * delta_v)
+        return v_out, l + self.drop_path(self.gamma_l * delta_l)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/mask_generate.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/mask_generate.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed79e74d724b11b761e9a762099017e105d87df1
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/mask_generate.py
@@ -0,0 +1,56 @@
+import torch
+
+
+def prepare_for_mask(kpt_mask):
+    """Build the boolean attention mask for 50 query groups of 69 box/keypoint slots.
+
+    True is written for every pair of queries from different groups and, inside a
+    group, for every pair whose ``kpt_mask`` values differ (presumably True means
+    "may not attend" for the consumer -- verify against the decoder).
+
+    Args:
+        kpt_mask: (bs, length) tensor. The per-group block update below only lines up
+            with the 69-slot groups when length == 69 -- TODO confirm with callers.
+
+    Returns:
+        (None, None, None, attn_mask2, None), where attn_mask2 is a bool tensor of
+        shape (bs * 8, 50 * 69, 50 * 69); the None slots are unused placeholders.
+    """
+    num_heads, num_group, group_bbox_kpt = 8, 50, 69
+    tgt_size2 = num_group * group_bbox_kpt
+    bs, length = kpt_mask.shape
+
+    # Allocate on kpt_mask's device (was hard-coded to 'cuda', which broke CPU runs).
+    # Queries in different groups are masked; this replaces the per-row Python loop.
+    group_id = torch.arange(tgt_size2, device=kpt_mask.device) // group_bbox_kpt
+    attn_mask2 = (group_id[:, None] != group_id[None, :]).repeat(bs, num_heads, 1, 1)
+
+    # Inside each group, a pair of slots is left unmasked only if its kpt_mask values match.
+    equal_mask = kpt_mask[:, :, None] == kpt_mask[:, None, :]
+    equal_mask = equal_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
+    for idx in range(num_group):
+        start_idx, end_idx = idx * length, (idx + 1) * length
+        attn_mask2[:, :, start_idx:end_idx, start_idx:end_idx][equal_mask] = False
+        attn_mask2[:, :, start_idx:end_idx, start_idx:end_idx][~equal_mask] = True
+
+    input_query_label = input_query_bbox = attn_mask = dn_meta = None
+    return input_query_label, input_query_bbox, attn_mask, attn_mask2.flatten(0, 1), dn_meta
+
+
+def post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
+    """Strip the leading ``dn_meta['pad_size']`` queries from every per-layer output.
+
+    The stripped part is stored in ``dn_meta['output_known_lbs_bboxes']``; the inputs are
+    returned unchanged when ``dn_meta`` is falsy or ``pad_size`` is not positive.
+    """
+    if not dn_meta or not dn_meta['pad_size'] > 0:
+        return outputs_class, outputs_coord
+    pad = dn_meta['pad_size']
+    known_class, known_coord = ([out[:, :pad, :] for out in layers] for layers in (outputs_class, outputs_coord))
+    known = {'pred_logits': known_class[-1], 'pred_boxes': known_coord[-1]}
+    if aux_loss:
+        known['aux_outputs'] = _set_aux_loss(known_class, known_coord)
+    dn_meta['output_known_lbs_bboxes'] = known
+    return [out[:, pad:, :] for out in outputs_class], [out[:, pad:, :] for out in outputs_coord]
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a2197bda3199aa32cafc5b9d396479609853dd2
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py
@@ -0,0 +1,10 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from .ms_deform_attn_func import MSDeformAttnFunction
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c5df8cf5d23aca963eec6c1133c180b37289607
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py
@@ -0,0 +1,61 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+import MultiScaleDeformableAttention as MSDA
+
+
+class MSDeformAttnFunction(Function):
+    """Autograd bridge to the compiled MultiScaleDeformableAttention (MSDA) forward/backward ops."""
+
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        """Call the MSDA forward op and stash its tensor inputs for backward."""
+        ctx.im2col_step = im2col_step
+        tensors = (value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        output = MSDA.ms_deform_attn_forward(*tensors, ctx.im2col_step)
+        ctx.save_for_backward(*tensors)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        """Return grads for value, sampling_locations and attention_weights; the other inputs get None."""
+        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(*ctx.saved_tensors, grad_output, ctx.im2col_step)
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    """Pure-PyTorch multi-scale deformable attention; for debug and test only (use the CUDA op otherwise).
+
+    Shapes: value (N, S, M, D), sampling_locations (N, Lq, M, L, P, 2), presumably normalized to
+    [0, 1]; attention_weights (N, Lq, M, L, P); value_spatial_shapes yields one (H, W) per level.
+    Returns a contiguous (N, Lq, M * D) tensor.
+    """
+    batch, _, heads, head_dim = value.shape
+    _, num_query, heads, levels, points, _ = sampling_locations.shape
+    level_values = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    grids = 2 * sampling_locations - 1  # shift/scale the coordinates before handing them to grid_sample
+    sampled = []
+    for level, (height, width) in enumerate(value_spatial_shapes):
+        # (N, H*W, M, D) -> (N*M, D, H, W) feature map, and (N, Lq, M, P, 2) -> (N*M, Lq, P, 2) grid
+        feature = level_values[level].flatten(2).transpose(1, 2).reshape(batch * heads, head_dim, height, width)
+        grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
+        sampled.append(F.grid_sample(feature, grid, mode='bilinear', padding_mode='zeros', align_corners=False))
+    # (N, Lq, M, L, P) -> (N*M, 1, Lq, L*P) so the weights broadcast over the D channels
+    weights = attention_weights.transpose(1, 2).reshape(batch * heads, 1, num_query, levels * points)
+    mixed = (torch.stack(sampled, dim=-2).flatten(-2) * weights).sum(-1)
+    return mixed.view(batch, heads * head_dim, num_query).transpose(1, 2).contiguous()
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f82cb1ad9d634a87b54ba6a71b58a230bcade5fe
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py
@@ -0,0 +1,9 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from .ms_deform_attn import MSDeformAttn
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd48d27e0654fbc3000241820dbb4f0e28f61f5e
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py
@@ -0,0 +1,142 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+import math, os
+import sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+
+from src.utils.dependencies.XPose.models.UniPose.ops.functions.ms_deform_attn_func import MSDeformAttnFunction
+
+
+def _is_power_of_2(n):  # True iff n is a positive power of two; ValueError for non-int or negative n
+    if not (isinstance(n, int) and n >= 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return n != 0 and (n & (n - 1)) == 0
+
+
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
+        """Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        :param use_4D_normalizer  with 4-d reference boxes, divide offsets by each level's (W, H) instead of n_points
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+
+        self.im2col_step = 64
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+
+        self.use_4D_normalizer = use_4D_normalizer
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)  # zero weight: initial offsets come from the bias only
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)  # one angle per head
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1  # point i starts (i + 1) steps out along its head's direction
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        r"""
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # sampling_offsets: N, Len_q, n_heads, n_levels, n_points, 2
+
+        # Turn the offsets into absolute sampling locations around the reference points:
+        # 2-d references are points and offsets are divided by each level's (W, H);
+        # 4-d references are boxes and offsets are scaled by half the box (w, h).
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            if self.use_4D_normalizer:
+                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
+            else:
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+
+
+        # The MSDA op is invoked in float32: fp16 inputs (AMP, or a model cast with .half())
+        # are upcast first and the result is cast back to fp16 before the output projection.
+        # attention_weights is upcast as well so every floating-point input shares one dtype.
+        is_half = value.dtype == torch.float16
+        if is_half:
+            value = value.to(torch.float32)
+            sampling_locations = sampling_locations.to(torch.float32)
+            attention_weights = attention_weights.to(torch.float32)
+
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        if is_half:
+            output = output.to(torch.float16)
+
+        output = self.output_proj(output)
+        return output
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py
new file mode 100644
index 0000000000000000000000000000000000000000..b737ba62f5eaddfb4734468eaf952395f808a95f
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py
@@ -0,0 +1,130 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+import math, os
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+
+try:
+    from src.utils.dependencies.XPose.models.UniPose.ops.functions import MSDeformAttnFunction
+except Exception as err:  # best-effort import (needs the compiled MSDA extension); do not trap KeyboardInterrupt/SystemExit
+    warnings.warn('Failed to import MSDeformAttnFunction: {}'.format(err))
+
+
+def _is_power_of_2(n):  # True iff n is a positive power of two; ValueError for non-int or negative n
+    if not (isinstance(n, int) and n >= 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return n != 0 and (n & (n - 1)) == 0
+
+
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
+        """Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        :param use_4D_normalizer  with 4-d reference boxes, divide offsets by each level's (W, H) instead of n_points
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+
+        self.im2col_step = 64
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+
+        self.use_4D_normalizer = use_4D_normalizer
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)  # zero weight: initial offsets come from the bias only
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)  # one angle per head
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1  # point i starts (i + 1) steps out along its head's direction
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, query, key, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        r"""
+        :param query                       (N, Length_{query}, C)
+        :param key                         (N, 1, C); accepted for interface compatibility, never read in this method
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            if self.use_4D_normalizer:
+                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
+            else:
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        # run the MSDA op in float32 for fp16 inputs, as ops/modules/ms_deform_attn.py does, then cast back
+        is_half = value.dtype == torch.float16
+        if is_half:
+            value, sampling_locations, attention_weights = (
+                t.to(torch.float32) for t in (value, sampling_locations, attention_weights))
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output.to(torch.float16) if is_half else output)
+        return output
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/setup.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..049f9232ba48996518ccd76df4cf39c8ed14791e
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/setup.py
@@ -0,0 +1,73 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+import os
+import glob
+
+import torch
+
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+
+from setuptools import find_packages
+from setuptools import setup
+
+requirements = ["torch", "torchvision"]
+
+def get_extensions():
+    """Describe the CUDA build of the ``MultiScaleDeformableAttention`` extension.
+
+    Sources are every ``*.cpp`` in ``src/`` and ``src/cpu/`` plus every ``*.cu`` in
+    ``src/cuda/`` next to this file.
+
+    Returns:
+        A one-element list holding the ``CUDAExtension`` to build.
+
+    Raises:
+        NotImplementedError: if CUDA is unavailable or ``CUDA_HOME`` is None.
+    """
+    here = os.path.dirname(os.path.abspath(__file__))
+    src_root = os.path.join(here, "src")
+
+    def _collect(*parts):
+        return glob.glob(os.path.join(src_root, *parts))
+
+    found = _collect("*.cpp") + _collect("cpu", "*.cpp")
+
+    # There is no CPU-only build: without a usable CUDA toolchain the build is refused.
+    if not (torch.cuda.is_available() and CUDA_HOME is not None):
+        raise NotImplementedError('Cuda is not availabel')
+    found += _collect("cuda", "*.cu")
+
+    nvcc_flags = [
+        "-DCUDA_HAS_FP16=1",
+        "-D__CUDA_NO_HALF_OPERATORS__",
+        "-D__CUDA_NO_HALF_CONVERSIONS__",
+        "-D__CUDA_NO_HALF2_OPERATORS__",
+    ]
+    return [
+        CUDAExtension(
+            "MultiScaleDeformableAttention",
+            [os.path.join(src_root, path) for path in found],
+            include_dirs=[src_root],
+            define_macros=[("WITH_CUDA", None)],
+            extra_compile_args={"cxx": [], "nvcc": nvcc_flags},
+        )
+    ]
+
+setup(
+    name="MultiScaleDeformableAttention",  # same name ops/functions/ms_deform_attn_func.py imports as MSDA
+    version="1.0",
+    author="Weijie Su",
+    url="https://github.com/fundamentalvision/Deformable-DETR",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),  # evaluated at setup time; raises NotImplementedError when CUDA is unavailable
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1bf854de1f3860d20b6fef5c1a17817c268e70a
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp
@@ -0,0 +1,41 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+at::Tensor  // CPU stub of the forward op: performs no computation and always reports an error
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)  // none of the parameters are read by this stub
+{
+    AT_ERROR("Not implement on cpu");  // unconditional failure; no return statement follows it
+}
+
+std::vector<at::Tensor>  // CPU stub of the backward op: performs no computation and always reports an error
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)  // none of the parameters are read by this stub
+{
+    AT_ERROR("Not implement on cpu");  // unconditional failure; no gradients are ever produced
+}
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..81b7b58a3d9502bbb684dc84687a526dedf94cae
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,33 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor  // declaration of the CPU forward entry point (the definition lives in ms_deform_attn_cpu.cpp)
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);  // same parameter list as the CUDA forward declaration
+
+std::vector<at::Tensor>  // declaration of the CPU backward entry point (the definition lives in ms_deform_attn_cpu.cpp)
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,  // extra input compared with the forward declaration
+    const int im2col_step);
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6d583647cce987196d5ad1968a8a365a379e774
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,153 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+at::Tensor ms_deform_attn_cuda_forward(  // host-side forward launcher; returns a (batch, num_query, num_heads*channels) tensor
+    const at::Tensor &value,              // read as (batch, spatial_size, num_heads, channels)
+    const at::Tensor &spatial_shapes,     // int64 data; size(0) is taken as num_levels
+    const at::Tensor &level_start_index,  // int64 data, forwarded to the kernel launcher unchanged
+    const at::Tensor &sampling_loc,       // size(1) = num_query, size(4) = num_point; 2 values per sample
+    const at::Tensor &attn_weight,        // num_query*num_heads*num_levels*num_point values per batch item
+    const int im2col_step)                // upper bound on batch items per launch; must be positive
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    // Contiguity matters because raw data pointers are offset by hand below; all inputs must also be CUDA tensors.
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    // AT_ASSERTM concatenates its message arguments (no printf formatting), so values are passed as separate pieces.
+    const int im2col_step_ = std::min(batch, im2col_step);  // effective chunk size, never larger than the batch
+    // Reject an empty batch or a non-positive step first: the modulo below would otherwise divide by zero.
+    AT_ASSERTM(im2col_step_ > 0, "batch(", batch, ") and im2col_step(", im2col_step, ") must both be positive");
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());  // zero-initialised result buffer
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});  // chunked view of output
+    auto per_value_size = spatial_size * num_heads * channels;  // element counts per batch item, used as pointer strides
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)  // one launcher call per chunk of batch_n items
+    {
+        auto columns = output_n.select(0, n);  // slice of the output that this chunk fills
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+
+        }));
+    }
+
+    output = output.view({batch, num_query, num_heads*channels});  // merge the head and channel axes
+
+    return output;
+}
+
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(  // host-side backward launcher; returns {grad_value, grad_sampling_loc, grad_attn_weight}
+    const at::Tensor &value,              // read as (batch, spatial_size, num_heads, channels)
+    const at::Tensor &spatial_shapes,     // int64 data; size(0) is taken as num_levels
+    const at::Tensor &level_start_index,  // int64 data, forwarded to the kernel launcher unchanged
+    const at::Tensor &sampling_loc,       // size(1) = num_query, size(4) = num_point; 2 values per sample
+    const at::Tensor &attn_weight,        // num_query*num_heads*num_levels*num_point values per batch item
+    const at::Tensor &grad_output,        // viewed below as (chunks, batch_n, num_query, num_heads, channels)
+    const int im2col_step)                // upper bound on batch items per launch; must be positive
+{
+    // Every input must be contiguous because raw data pointers are offset by hand below.
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+    // All inputs must also be CUDA tensors.
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    // AT_ASSERTM concatenates its message arguments (no printf formatting), so values are passed as separate pieces.
+    const int im2col_step_ = std::min(batch, im2col_step);  // effective chunk size, never larger than the batch
+    // Reject an empty batch or a non-positive step first: the modulo below would otherwise divide by zero.
+    AT_ASSERTM(im2col_step_ > 0, "batch(", batch, ") and im2col_step(", im2col_step, ") must both be positive");
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+    auto grad_value = at::zeros_like(value);  // gradients start at zero and have the shapes of their inputs
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    const int batch_n = im2col_step_;
+    auto per_value_size = spatial_size * num_heads * channels;  // element counts per batch item, used as pointer strides
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < batch/im2col_step_; ++n)  // one launcher call per chunk of batch_n items
+    {
+        auto grad_output_g = grad_output_n.select(0, n);  // upstream gradient for this chunk
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data<scalar_t>(),
+                                    value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data<int64_t>(),
+                                    level_start_index.data<int64_t>(),
+                                    sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+
+        }));
+    }
+
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}
\ No newline at end of file
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7ae53f99c820ce6193b608ad344550348a0b42c
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h
@@ -0,0 +1,30 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor ms_deform_attn_cuda_forward(  // declaration of the CUDA forward entry point (defined in ms_deform_attn_cuda.cu)
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);  // the definition clamps this to the batch size before use
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(  // declaration of the CUDA backward entry point (defined in ms_deform_attn_cuda.cu)
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,  // extra input compared with the forward declaration
+    const int im2col_step);  // the definition returns three gradient tensors: value, sampling_loc, attn_weight
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6bc2acb7aea0eab2e9e91e769a16861e1652c284
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh
@@ -0,0 +1,1327 @@
+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THCAtomics.cuh>
+
+#define CUDA_KERNEL_LOOP(i, n)   /* grid-stride loop over [0, n) */ \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)
+
+const int CUDA_NUM_THREADS = 1024;  // presumably the default threads-per-block for the launchers (not visible here) - TODO confirm
+inline int GET_BLOCKS(const int N, const int num_threads)  // number of blocks of num_threads needed to cover N items
+{
+  return (N + num_threads - 1) / num_threads;  // integer round-up division; the sum can overflow int when N is close to INT_MAX
+}
+
+
+template <typename scalar_t>  // bilinear interpolation of one (head m, channel c) value at fractional position (h, w)
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  const int h_low = floor(h);  // integer corner at or below (h, w)
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;  // opposite corner, one step further on each axis
+  const int w_high = w_low + 1;
+  // lh/lw: fractional distance from the low corner; hh/hw: the complementary weights.
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  // The offsets below index bottom_data as [row][column][head][channel].
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+  // A corner that fails its test keeps the value 0. Low corners are only tested against 0 and high corners only against the upper edge.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+  }
+  // Weights of the four corners: (low,low), (low,high), (high,low), (high,high) in (h, w) order.
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+
+template <typename scalar_t>  // backward of the bilinear sample: scatters into grad_value and overwrites the two per-sample outputs
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value, 
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);  // integer corner at or below (h, w)
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  // lh/lw: fractional distance from the low corner; hh/hw: the complementary weights.
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  // The offsets below index bottom_data (and grad_value) as [row][column][head][channel].
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;  // gradient reaching the interpolated value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // accumulate d(val)/dh and d(val)/dw over the in-range corners
+  // Each in-range corner reads its value, updates both derivative sums and atomically adds its share to grad_value.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  // The three results below are written with plain stores (overwrite, not accumulate).
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;  // slot 0: gradient along w, scaled by width
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;  // slot 1: gradient along h, scaled by height
+}
+
+
+template <typename scalar_t>  // backward of the bilinear sample; every output, including the per-sample ones, is updated with atomicAdd
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value, 
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);  // integer corner at or below (h, w)
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  // lh/lw: fractional distance from the low corner; hh/hw: the complementary weights.
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  // The offsets below index bottom_data (and grad_value) as [row][column][head][channel].
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;  // gradient reaching the interpolated value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // accumulate d(val)/dh and d(val)/dw over the in-range corners
+  // Each in-range corner reads its value, updates both derivative sums and atomically adds its share to grad_value.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  // The three results below are accumulated atomically into the destination instead of being overwritten.
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  atomicAdd(grad_attn_weight, top_grad * val); 
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);  // slot 0: gradient along w, scaled by width
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);  // slot 1: gradient along h, scaled by height
+}
+
+
+template <typename scalar_t>  // forward kernel: each index yields one output element as a weighted sum of bilinear samples
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value, 
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;  // decomposed below into (b_col, q_col, m_col, c_col), channel fastest
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel stripped off
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    scalar_t *data_col_ptr = data_col + index;  // destination of this thread's result
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first attention weight for this (batch, query, head)
+    int data_loc_w_ptr = data_weight_ptr << 1;  // sampling locations hold two values per sample
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch item in data_value
+    scalar_t col = 0;  // running sum over all levels and points
+    
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;  // data_spatial_shapes stores two entries per level, read as h then w
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];  // w is stored first, h second
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the location by the level size and shift it by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Samples outside the open range (-1, size) on either axis are skipped and contribute nothing.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_col_ptr = col;
+  }
+}
+
+template <typename scalar_t, unsigned int blockSize>  // backward kernel: per-sample gradients are staged in shared memory and summed serially by thread 0
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // two slots (w, h) per thread
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // decomposed below into (b_col, q_col, m_col, c_col), channel fastest
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel stripped off
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];  // upstream gradient for this output element
+    // NOTE(review): the output pointers advanced below are kernel parameters; this looks correct only if each thread runs this loop body once - confirm against the launcher.
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch item in data_value / grad_value
+    // NOTE(review): __syncthreads() is called inside the loops below, so all threads of a block are presumably expected to reach it together - confirm.
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;  // data_spatial_shapes stores two entries per level, read as h then w
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];  // w is stored first, h second
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level size, shift by half a pixel
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so skipped samples add zero
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread's slots must be written before thread 0 sums them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockSize; ++tid)  // this inner tid shadows the outer one
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          // Plain (non-atomic) stores: presumably no other block writes these locations - TODO confirm.
+          
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // keeps the caches untouched until thread 0 has finished reading them
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();
+
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        { 
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>  // Gradient kernel: per (level, point), thread 0 serially sums the block's shared-memory partials.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,  // n: bound passed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // incoming gradient, read once at [index]
+                                                const scalar_t *data_value,  // forward values, offset per batch/level below
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs per sampling point
+                                                const scalar_t *data_attn_weight,  // one weight per sampling point
+                                                const int batch_size,  // not referenced in the body
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // output, handed to the bilinear helper
+                                                scalar_t *grad_sampling_loc,  // output, two values written per sampling point
+                                                scalar_t *grad_attn_weight)  // output, one value written per sampling point
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined elsewhere; presumably iterates index over [0, n) -- confirm
+  {
+    extern __shared__ int _s[];  // dynamically sized shared buffer, reinterpreted as scalar_t below
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // first 2 * blockDim.x entries: two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // next region: one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // peel index apart: channel varies fastest, then head, query, batch
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index / channels, i.e. the flattened (batch, query, head) id
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // computed but not used again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first (level, point) entry for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per point, so the offset doubles
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): parameter advanced inside the loop body; offsets would stack if the body ran twice per thread -- confirm launch
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch element in value / grad_value
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t narrowed to int
+      const int spatial_h_ptr = l_col << 1;  // two shape entries per level
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset into the gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level size, then shift by half a cell
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so skipped samples contribute zero
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip samples outside (-1, size) on either axis
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined elsewhere; presumably fills this thread's slots and updates grad_value -- confirm
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // every thread's slots must be written before thread 0 reads them
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];  // seed with thread 0's own partials
+          int sid=2;  // slot index into the two-per-thread location cache
+          for (unsigned int tid = 1; tid < blockDim.x; ++tid)  // this loop-local tid shadows the outer tid
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          
+          
+          *grad_sampling_loc = _grad_w;  // plain (non-atomic) stores of the block totals
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // keep other threads from clearing their slots while thread 0 is still summing
+
+        data_weight_ptr += 1;  // step all cursors to the next sampling point
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>  // Gradient kernel: per (level, point), a halving reduction over shared memory sums the block's partials.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,  // n: bound passed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // incoming gradient, read once at [index]
+                                                const scalar_t *data_value,  // forward values, offset per batch/level below
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs per sampling point
+                                                const scalar_t *data_attn_weight,  // one weight per sampling point
+                                                const int batch_size,  // not referenced in the body
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // output, handed to the bilinear helper
+                                                scalar_t *grad_sampling_loc,  // output, two values written per sampling point
+                                                scalar_t *grad_attn_weight)  // output, one value written per sampling point
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined elsewhere; presumably iterates index over [0, n) -- confirm
+  {
+    extern __shared__ int _s[];  // dynamically sized shared buffer, reinterpreted as scalar_t below
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // first 2 * blockDim.x entries: two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // next region: one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // peel index apart: channel varies fastest, then head, query, batch
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index / channels, i.e. the flattened (batch, query, head) id
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // computed but not used again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first (level, point) entry for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per point, so the offset doubles
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): parameter advanced inside the loop body; offsets would stack if the body ran twice per thread -- confirm launch
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch element in value / grad_value
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t narrowed to int
+      const int spatial_h_ptr = l_col << 1;  // two shape entries per level
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset into the gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level size, then shift by half a cell
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so skipped samples contribute zero
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip samples outside (-1, size) on either axis
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined elsewhere; presumably fills this thread's slots and updates grad_value -- confirm
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // every thread's slots must be written before the reduction reads them
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)  // s: active half-width; spre: width before this step
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;  // this thread's pair in the location cache
+            const unsigned int xid2 = (tid + s) << 1;  // partner's pair, s threads further along
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)  // only true when spre is odd: fold in the element left over by the integer halving
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];  // xid2 + 2s is the pair of element tid + 2s
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            } 
+          }
+          __syncthreads();  // finish this step's sums before the next, narrower step reads them
+        }
+
+        if (tid == 0)
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];  // plain (non-atomic) stores of the block totals
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();  // keep other threads from clearing slot 0's inputs before thread 0 has stored
+
+        data_weight_ptr += 1;  // step all cursors to the next sampling point
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>  // Gradient kernel: halving reduction in shared memory, then atomicAdd of the block total into the outputs.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,  // n: bound passed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // incoming gradient, read once at [index]
+                                                const scalar_t *data_value,  // forward values, offset per batch/level below
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs per sampling point
+                                                const scalar_t *data_attn_weight,  // one weight per sampling point
+                                                const int batch_size,  // not referenced in the body
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // output, handed to the bilinear helper
+                                                scalar_t *grad_sampling_loc,  // output, accumulated into with atomicAdd
+                                                scalar_t *grad_attn_weight)  // output, accumulated into with atomicAdd
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined elsewhere; presumably iterates index over [0, n) -- confirm
+  {
+    extern __shared__ int _s[];  // dynamically sized shared buffer, reinterpreted as scalar_t below
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // first 2 * blockDim.x entries: two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // next region: one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // peel index apart: channel varies fastest, then head, query, batch
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index / channels, i.e. the flattened (batch, query, head) id
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // computed but not used again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first (level, point) entry for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per point, so the offset doubles
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): parameter advanced inside the loop body; offsets would stack if the body ran twice per thread -- confirm launch
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch element in value / grad_value
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t narrowed to int
+      const int spatial_h_ptr = l_col << 1;  // two shape entries per level
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset into the gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level size, then shift by half a cell
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so skipped samples contribute zero
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip samples outside (-1, size) on either axis
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined elsewhere; presumably fills this thread's slots and updates grad_value -- confirm
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // every thread's slots must be written before the reduction reads them
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)  // s: active half-width; spre: width before this step
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;  // this thread's pair in the location cache
+            const unsigned int xid2 = (tid + s) << 1;  // partner's pair, s threads further along
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)  // only true when spre is odd: fold in the element left over by the integer halving
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];  // xid2 + 2s is the pair of element tid + 2s
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();  // finish this step's sums before the next, narrower step reads them
+        }
+
+        if (tid == 0)
+        {
+          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);  // adds to the existing value; presumably several blocks target the same entry -- confirm
+          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+        }
+        __syncthreads();  // keep other threads from clearing slot 0's inputs before thread 0 has read them
+
+        data_weight_ptr += 1;  // step all cursors to the next sampling point
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>  // Gradient kernel without shared memory: output pointers go straight to the _gm bilinear helper.
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,  // n: bound passed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // incoming gradient, read once at [index]
+                                                const scalar_t *data_value,  // forward values, offset per batch/level below
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs per sampling point
+                                                const scalar_t *data_attn_weight,  // one weight per sampling point
+                                                const int batch_size,  // not referenced in the body
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // output, handed to the bilinear helper
+                                                scalar_t *grad_sampling_loc,  // output, handed to the bilinear helper
+                                                scalar_t *grad_attn_weight)  // output, handed to the bilinear helper
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined elsewhere; presumably iterates index over [0, n) -- confirm
+  {
+    int _temp = index;  // peel index apart: channel varies fastest, then head, query, batch
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index / channels, i.e. the flattened (batch, query, head) id
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // computed but not used again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first (level, point) entry for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per point, so the offset doubles
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): parameter advanced inside the loop body; offsets would stack if the body ran twice per thread -- confirm launch
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch element in value / grad_value
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t narrowed to int
+      const int spatial_h_ptr = l_col << 1;  // two shape entries per level
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset into the gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level size, then shift by half a cell
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip samples outside (-1, size) on either axis
+        {
+          ms_deform_attn_col2im_bilinear_gm(  // helper defined elsewhere; presumably updates the outputs atomically since threads share entries -- confirm
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            grad_sampling_loc, grad_attn_weight);
+        }
+        data_weight_ptr += 1;  // step all cursors to the next sampling point
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>  // Host-side launcher for ms_deformable_im2col_gpu_kernel on the given stream.
+void ms_deformable_im2col_cuda(cudaStream_t stream,  // stream the kernel is enqueued on
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes, 
+                              const int64_t* data_level_start_index, 
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size, 
+                              const int num_heads, 
+                              const int channels, 
+                              const int num_levels, 
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)  // forwarded to the kernel as its last argument; presumably the output buffer -- confirm
+{
+  const int num_kernels = batch_size * num_query * num_heads * channels;  // passed to the kernel as its element count
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;  // same product again; used only to size the grid
+  const int num_threads = CUDA_NUM_THREADS;  // block size; constant defined elsewhere
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,  // GET_BLOCKS defined elsewhere; presumably a ceil-divide -- confirm
+          0, stream>>>(  // no dynamic shared memory requested
+      num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, 
+      batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
+  
+  cudaError_t err = cudaGetLastError();  // checked right after the launch, without synchronizing first
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));  // only reported; the function is void, so callers never see the failure
+  }
+
+}
+
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size, 
+                              const int spatial_size, 
+                              const int num_heads,
+                              const int channels, 
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point, 
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+    }
+    else
+    {
+      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+    }
+  }
+  else{
+    switch(channels)
+    {
+      case 1:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 2:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 4:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 8:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 16:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 32:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 64:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 128:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 256:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 512:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 1024:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      default:
+        if (channels < 64)
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+        else
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
\ No newline at end of file
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac0ef2ec25f7d0ee51ca2d807b159ddf85652017
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h
@@ -0,0 +1,62 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+
+#include "cpu/ms_deform_attn_cpu.h"
+
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+
+
+at::Tensor  // result is whatever ms_deform_attn_cuda_forward returns
+ms_deform_attn_forward(  // device dispatcher: hands CUDA tensors to the CUDA implementation, errors otherwise
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    if (value.is_cuda())  // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda() query
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");  // CUDA tensor, but built without WITH_CUDA
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");  // reached only when `value` is not a CUDA tensor
+}
+
+std::vector<at::Tensor>  // result is whatever ms_deform_attn_cuda_backward returns
+ms_deform_attn_backward(  // device dispatcher for the backward pass; mirrors ms_deform_attn_forward
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    if (value.is_cuda())  // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda() query
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");  // CUDA tensor, but built without WITH_CUDA
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");  // reached only when `value` is not a CUDA tensor
+}
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2201f63a51dca16d0b31148ed2c9e8e47ec15bdc
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp
@@ -0,0 +1,16 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include "ms_deform_attn.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings; TORCH_EXTENSION_NAME is expected to be defined by the extension build
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");  // forward dispatcher declared in ms_deform_attn.h
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");  // backward dispatcher declared in ms_deform_attn.h
+}
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/test.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dbf6d5547d131f01a8c5c28b76557bd27a9334b
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/ops/test.py
@@ -0,0 +1,89 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+
+
+N, M, D = 1, 2, 2  # sizes of `value` (N, S, M, D) — presumably batch, heads, channels per head; confirm
+Lq, L, P = 2, 2, 2  # sizes in sampling_locations (N, Lq, M, L, P, 2) — presumably queries, levels, points; confirm
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()  # one (H, W) row per level, created on the GPU at import time
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))  # [0, 24]: zero followed by cumulative H*W of all but the last level
+S = sum([(H*W).item() for H, W in shapes])  # total H*W over all levels: 24 + 6 = 30
+
+
+torch.manual_seed(3)  # fixed seed for the torch.rand inputs generated by the checks below
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+
+
+if __name__ == '__main__':  # needs a CUDA device: the module-level tensors and every input are created with .cuda()
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:  # NOTE(review): presumably chosen to hit different backward-kernel branches — confirm
+        check_gradient_numerical(channels, True, True, True)  # gradients requested for all three inputs
+
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/position_encoding.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/position_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0db9cc02fd4f4c15cec6f88ad888538dd8ba3d2
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/position_encoding.py
@@ -0,0 +1,157 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+"""
+Various positional encodings for the transformer.
+"""
+import math
+import torch
+from torch import nn
+
+from util.misc import NestedTensor
+
+
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            # if os.environ.get("SHILONG_AMP", None) == '1':
+            #     eps = 1e-4
+            # else:
+            #     eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+
+class PositionEmbeddingSineHW(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperatureH = temperatureH
+        self.temperatureW = temperatureW
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+
+        # import ipdb; ipdb.set_trace()
+
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_tx
+
+        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)
+        pos_y = y_embed[:, :, :, None] / dim_ty
+
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+
+        # import ipdb; ipdb.set_trace()
+
+        return pos
+
+class PositionEmbeddingLearned(nn.Module):
+    """
+    Absolute pos embedding, learned.
+    """
+    def __init__(self, num_pos_feats=256):
+        super().__init__()
+        self.row_embed = nn.Embedding(50, num_pos_feats)
+        self.col_embed = nn.Embedding(50, num_pos_feats)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.uniform_(self.row_embed.weight)
+        nn.init.uniform_(self.col_embed.weight)
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        h, w = x.shape[-2:]
+        i = torch.arange(w, device=x.device)
+        j = torch.arange(h, device=x.device)
+        x_emb = self.col_embed(i)
+        y_emb = self.row_embed(j)
+        pos = torch.cat([
+            x_emb.unsqueeze(0).repeat(h, 1, 1),
+            y_emb.unsqueeze(1).repeat(1, w, 1),
+        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+        return pos
+
+
+def build_position_encoding(args):
+    N_steps = args.hidden_dim // 2
+    if args.position_embedding in ('v2', 'sine'):
+        # TODO find a better way of exposing other arguments
+        position_embedding = PositionEmbeddingSineHW(
+            N_steps, 
+            temperatureH=args.pe_temperatureH,
+            temperatureW=args.pe_temperatureW,
+            normalize=True
+        )
+    elif args.position_embedding in ('v3', 'learned'):
+        position_embedding = PositionEmbeddingLearned(N_steps)
+    else:
+        raise ValueError(f"not supported {args.position_embedding}")
+
+    return position_embedding
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/swin_transformer.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f425087b287d6a0fe67ab888a081d71fb0c3026
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/swin_transformer.py
@@ -0,0 +1,701 @@
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+import numpy as np
+
+from util.misc import NestedTensor
+# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from src.modules.util import DropPath, to_2tuple, trunc_normal_
+
+
+
+class Mlp(nn.Module):
+    """ Multilayer perceptron."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowAttention(nn.Module):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Module):
+    """ Swin Transformer Block: windowed self-attention then an MLP, each added back to its input.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()  # no-op module when the rate is 0
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+        self.H = None  # spatial height; must be assigned by the caller before forward()
+        self.W = None  # spatial width; must be assigned by the caller before forward()
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Not arguments; the spatial resolution is read from self.H / self.W.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x  # kept for the first residual connection
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size  # 0 when W is already a multiple
+        pad_b = (self.window_size - H % self.window_size) % self.window_size  # 0 when H is already a multiple
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))  # F.pad pairs run from the last dim backwards: C, W, H
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None  # mask_matrix is ignored when the block does not shift
+
+        # partition windows
+        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()  # crop the padding; contiguous() so the view below is valid
+
+        x = x.view(B, H * W, C)
+
+        # FFN
+        x = shortcut + self.drop_path(x)  # residual around the attention branch
+        x = x + self.drop_path(self.mlp(self.norm2(x)))  # residual around the MLP branch
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        x = x.view(B, H, W, C)
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)
+
+        return x
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate, shared or one per block. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # build blocks: even-indexed blocks get shift_size 0, odd-indexed ones window_size // 2
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path,
+                norm_layer=norm_layer)
+            for i in range(depth)])
+
+        # patch merging layer (the downsample class is instantiated here with this stage's dim)
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """ Run every block of the stage, then the optional downsample.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns: (x, H, W, x_down, Wh, Ww); the last three repeat the first three when there is no downsample.
+        """
+        # calculate attention mask for SW-MSA (built once per call on the padded grid, shared by all blocks)
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt  # label each of the 3x3 regions with its own id
+                cnt += 1
+        # pairs of positions inside a window get -100 when their region ids differ and 0 when they match
+        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+
+        for blk in self.blocks:
+            blk.H, blk.W = H, W  # resolution is handed over via attributes (presumably read in the block's forward)
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2  # ceil(H / 2), ceil(W / 2)
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Module):
+    """Split an image into patches and embed each one with a strided convolution.
+
+    Args:
+        patch_size (int): Side length of a patch token. Default: 4.
+        in_chans (int): Channels of the input image. Default: 3.
+        embed_dim (int): Channels produced by the projection. Default: 96.
+        norm_layer (nn.Module, optional): Norm applied to the embedded tokens. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        # kernel_size equals stride, so the convolution visits each patch exactly once.
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
+        # Optional normalisation of the embedded tokens.
+        self.norm = None if norm_layer is None else norm_layer(embed_dim)
+
+    def forward(self, x):
+        """Embed x of shape (B, in_chans, H, W) into a (B, embed_dim, Wh, Ww) feature map."""
+        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
+        _, _, height, width = x.size()
+
+        # Zero-pad the right and bottom edges up to a multiple of the patch size.
+        if width % patch_w != 0:
+            x = F.pad(x, (0, patch_w - width % patch_w))
+        if height % patch_h != 0:
+            x = F.pad(x, (0, 0, 0, patch_h - height % patch_h))
+
+        feat = self.proj(x)  # B C Wh Ww
+        if self.norm is None:
+            return feat
+
+        # The norm runs on the channel-last token layout: flatten, normalise, restore the map.
+        grid_h, grid_w = feat.size(2), feat.size(3)
+        tokens = self.norm(feat.flatten(2).transpose(1, 2))
+        return tokens.transpose(1, 2).view(-1, self.embed_dim, grid_h, grid_w)
+
+
+class SwinTransformer(nn.Module):
+    """ Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute position embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate. Default: 0.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        dilation (bool): If True, the second-to-last stage is not downsampled and the last stage keeps half the width.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 dilation=False,
+                 use_checkpoint=False):
+        super().__init__()
+
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.dilation = dilation
+
+        # use_checkpoint is not stored on the backbone itself; it is only
+        # forwarded to every BasicLayer built below.
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
+
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        # prepare downsample list
+        downsamplelist = [PatchMerging for i in range(self.num_layers)]
+        downsamplelist[-1] = None
+        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        if self.dilation:
+            downsamplelist[-2] = None
+            num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                # per-stage width taken from num_features (halved for the last stage when dilation is on)
+                dim=num_features[i_layer],
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                # PatchMerging after every stage except the last (and the second-to-last when dilation is on)
+                downsample=downsamplelist[i_layer],
+                use_checkpoint=use_checkpoint)
+            self.layers.append(layer)
+
+        # per-stage channel counts; also used to reshape the outputs in forward_raw
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_module(layer_name, layer)
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        """Put the frozen part of the backbone in eval mode and stop its gradients.
+
+        frozen_stages >= 0 freezes the patch embedding, >= 1 also the absolute position
+        embedding (when ape is set), and >= 2 also pos_drop and the first
+        frozen_stages - 1 layers.
+        """
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def forward_raw(self, x):
+        """Run the backbone on a plain image batch.
+
+        Args:
+            x: Image tensor of shape (B, in_chans, H, W).
+
+        Returns:
+            tuple[Tensor]: One (B, num_features[i], H_i, W_i) map per stage i in out_indices.
+        """
+        x = self.patch_embed(x)
+
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            # x_out, H, W: this stage's output before downsampling; x, Wh, Ww: input for the next stage
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs.append(out)
+        # in:
+        #   torch.Size([2, 3, 1024, 1024])
+        # outs:
+        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
+        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
+        return tuple(outs)
+
+    def forward(self, tensor_list: NestedTensor):
+        """Run the backbone on a NestedTensor and pair every output with a resized mask.
+
+        Args:
+            tensor_list (NestedTensor): Its ``tensors`` attribute is fed to the backbone and
+                its ``mask`` attribute is resized for every output.
+
+        Returns:
+            dict[int, NestedTensor]: Keyed by position in the output list (0, 1, ...), each
+            holding a feature map and the mask interpolated to that map's spatial size.
+
+        Raises:
+            AssertionError: If tensor_list.mask is None (checked for every output).
+        """
+        # Same feature extraction as forward_raw; sharing it keeps the two entry points in sync.
+        outs = self.forward_raw(tensor_list.tensors)
+
+        # collect for nesttensors
+        outs_dict = {}
+        for idx, out_i in enumerate(outs):
+            m = tensor_list.mask
+            assert m is not None
+            # F.interpolate is called without a mode, i.e. with its default ('nearest')
+            mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
+            outs_dict[idx] = NestedTensor(out_i, mask)
+
+        return outs_dict
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the frozen stages frozen.
+
+        Returns:
+            SwinTransformer: self, matching nn.Module.train so train()/eval() can be chained.
+        """
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+        return self
+
+
+
+def build_swin_transformer(modelname, pretrain_img_size, **kw):
+    """Instantiate a SwinTransformer from one of the named presets.
+
+    Args:
+        modelname (str): Preset key, one of 'swin_T_224_1k', 'swin_B_224_22k',
+            'swin_B_384_22k', 'swin_L_224_22k' or 'swin_L_384_22k'.
+        pretrain_img_size (int): Forwarded to SwinTransformer as pretrain_img_size.
+        **kw: Extra SwinTransformer keyword arguments; they override preset values
+            that share the same name.
+
+    Returns:
+        SwinTransformer: The constructed backbone.
+
+    Raises:
+        AssertionError: If modelname is not a known preset (when assertions are enabled).
+    """
+
+    # Per-preset hyper-parameters: embedding width, blocks per stage, heads per
+    # stage and attention window size.
+    presets = {
+        'swin_T_224_1k': dict(
+            embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7),
+        'swin_B_224_22k': dict(
+            embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7),
+        'swin_B_384_22k': dict(
+            embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12),
+        'swin_L_224_22k': dict(
+            embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7),
+        'swin_L_384_22k': dict(
+            embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12),
+    }
+    assert modelname in presets
+
+    # Caller overrides win over the preset. The preset table is rebuilt on every
+    # call, so merging into a copy is indistinguishable from updating in place.
+    config = dict(presets[modelname])
+    config.update(kw)
+
+    return SwinTransformer(pretrain_img_size=pretrain_img_size, **config)
+
+if __name__ == "__main__":
+    # Smoke test: run the dilated swin_L_384_22k preset on two input sizes, with no debugger breakpoint.
+    model = build_swin_transformer('swin_L_384_22k', 384, dilation=True)
+    for size in (1024, 384):
+        y = model.forward_raw(torch.rand(2, 3, size, size))
+        # Show the shape of every returned feature map for this input size.
+        print(size, [tuple(out.shape) for out in y])
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/transformer_deformable.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/transformer_deformable.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d441e8df03414019d734c7dbe1118a5dd7e4435
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/transformer_deformable.py
@@ -0,0 +1,595 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+
+import copy
+import math
+import torch
+from torch import nn, Tensor
+from torch.nn.init import xavier_uniform_, constant_, normal_
+from typing import Optional
+
+from util.misc import inverse_sigmoid
+from .ops.modules import MSDeformAttn
+from .utils import MLP, _get_activation_fn, gen_sineembed_for_position
+
+class DeformableTransformer(nn.Module):
+    """Deformable-attention encoder/decoder transformer over multi-scale feature maps."""
+
+    def __init__(self, d_model=256, nhead=8,
+                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
+                 activation="relu", return_intermediate_dec=False,
+                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4,
+                 two_stage=False, two_stage_num_proposals=300,
+                 use_dab=False, high_dim_query_update=False, no_sine_embed=False):
+        super().__init__()
+
+        self.d_model = d_model
+        self.nhead = nhead
+        self.two_stage = two_stage
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.use_dab = use_dab
+
+        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, enc_n_points)
+        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)
+
+        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, dec_n_points)
+        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec,
+                                                            use_dab=use_dab, d_model=d_model, high_dim_query_update=high_dim_query_update, no_sine_embed=no_sine_embed)
+        # one learned vector per feature level; added to that level's positional embedding in forward()
+        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+
+        if two_stage:
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
+            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
+        else:
+            if not self.use_dab:
+                self.reference_points = nn.Linear(d_model, 2)
+
+        self.high_dim_query_update = high_dim_query_update
+        if high_dim_query_update:
+            assert not self.use_dab, "high_dim_query_update requires use_dab to be False"
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Xavier-init every parameter with more than one dim, let MSDeformAttn modules re-init, then init the rest."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        if not self.two_stage and not self.use_dab:
+            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
+            constant_(self.reference_points.bias.data, 0.)
+        normal_(self.level_embed)
+
+    def get_proposal_pos_embed(self, proposals):
+        """Sine/cosine-embed sigmoid(proposals); every coordinate becomes num_pos_feats (128) values."""
+        num_pos_feats = 128
+        temperature = 10000
+        scale = 2 * math.pi
+        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
+        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
+        # N, L, 4
+        proposals = proposals.sigmoid() * scale
+        # N, L, 4, 128
+        pos = proposals[:, :, :, None] / dim_t
+        # N, L, 4, 64, 2
+        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+        return pos
+
+    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
+        """Return (projected memory, per-position proposals as logits); padded/invalid positions get inf and zeroed memory."""
+        N_, S_, C_ = memory.shape
+        proposals = []
+        _cur = 0
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)  # unmasked rows, read from the first column
+            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)  # unmasked columns, read from the first row
+
+            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale  # cell centres relative to the valid extent
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)  # proposal size doubles with every level
+            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+            proposals.append(proposal)
+            _cur += (H_ * W_)
+        output_proposals = torch.cat(proposals, 1)
+        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))  # inverse sigmoid (logit)
+        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+
+        output_memory = memory
+        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+        output_memory = self.enc_output_norm(self.enc_output(output_memory))
+        return output_memory, output_proposals
+
+    def get_valid_ratio(self, mask):
+        """Return, per sample, the (width, height) fraction of mask that is False, read from its first row and column."""
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+
+    def forward(self, srcs, masks, pos_embeds, query_embed=None):
+        """
+        Input:
+            - srcs: List([bs, c, h, w])
+            - masks: List([bs, h, w])
+            - pos_embeds: List of per-level positional embeddings, flattened like srcs
+            - query_embed: required unless two_stage is set
+        Returns (hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact); the last two are None unless two_stage.
+        """
+        assert self.two_stage or query_embed is not None
+
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+            src = src.flatten(2).transpose(1, 2)                # bs, hw, c
+            mask = mask.flatten(1)                              # bs, hw
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)    # bs, hw, c
+            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        src_flatten = torch.cat(src_flatten, 1)     # bs, \sum{hxw}, c
+        mask_flatten = torch.cat(mask_flatten, 1)   # bs, \sum{hxw}
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)
+
+        # prepare input for decoder
+        bs, _, c = memory.shape
+        if self.two_stage:
+            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)
+            # hack implementation for two-stage Deformable DETR
+            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
+            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals
+            topk = self.two_stage_num_proposals
+            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
+            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+            topk_coords_unact = topk_coords_unact.detach()
+            reference_points = topk_coords_unact.sigmoid()
+            init_reference_out = reference_points
+            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
+            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
+        elif self.use_dab:
+            reference_points = query_embed[..., self.d_model:].sigmoid()
+            tgt = query_embed[..., :self.d_model]
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+            init_reference_out = reference_points
+        else:
+            query_embed, tgt = torch.split(query_embed, c, dim=1)
+            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+            reference_points = self.reference_points(query_embed).sigmoid()
+                # bs, num_queries, 2
+            init_reference_out = reference_points
+
+        # decoder (query_pos is passed as None when use_dab is set)
+        hs, inter_references = self.decoder(tgt, reference_points, memory,
+                                            spatial_shapes, level_start_index, valid_ratios,
+                                            query_pos=query_embed if not self.use_dab else None,
+                                            src_padding_mask=mask_flatten)
+        inter_references_out = inter_references
+        if self.two_stage:
+            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact
+        return hs, init_reference_out, inter_references_out, None, None
+
+
+class DeformableTransformerEncoderLayer(nn.Module):
+    """Encoder layer: multi-scale deformable self-attention, an FFN and optional channel attention."""
+
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 add_channel_attention=False,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 ):
+        super().__init__()
+        self.add_channel_attention = add_channel_attention
+
+        # Attention sub-layer. NOTE(review): MSDeformableBoxAttention is not among the imports visible
+        # at the top of this file - confirm it is defined before enabling use_deformable_box_attn.
+        if not use_deformable_box_attn:
+            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        else:
+            self.self_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # Feed-forward sub-layer: d_model -> d_ffn -> d_model.
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # Optional channel-attention sub-layer.
+        if add_channel_attention:
+            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
+            self.norm_channel = nn.LayerNorm(d_model)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        """Return tensor + pos, or tensor unchanged when pos is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward_ffn(self, src):
+        """Apply the FFN with a residual connection followed by LayerNorm."""
+        hidden = self.activation(self.linear1(src))
+        ffn_out = self.linear2(self.dropout2(hidden))
+        return self.norm2(src + self.dropout3(ffn_out))
+
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
+        """Run attention, the FFN and (if enabled) channel attention on src and return the result."""
+        # The positional embedding is added to the first argument only; src itself is passed as the third.
+        query = self.with_pos_embed(src, pos)
+        attn_out = self.self_attn(query, reference_points, src, spatial_shapes, level_start_index, key_padding_mask)
+        src = self.forward_ffn(self.norm1(src + self.dropout1(attn_out)))
+        if self.add_channel_attention:
+            src = self.norm_channel(src + self.activ_channel(src))
+        return src
+
+
+class DeformableTransformerEncoder(nn.Module):
+    """Stack of deformable encoder layers that all receive the same reference points."""
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        # With no layers requested, layers is a plain empty list and forward() only applies the optional norm.
+        self.layers = _get_clones(encoder_layer, num_layers) if num_layers > 0 else []
+        self.num_layers = num_layers
+        self.norm = norm
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """Build per-level grids of cell centres, scaled by the valid ratios.
+
+        Returns:
+            Tensor indexed as [batch, position, level, (x, y)].
+        """
+        per_level = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ys = torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device)
+            xs = torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)
+            ref_y, ref_x = torch.meshgrid(ys, xs)
+            # Divide each centre by valid_ratio * size for this level (index 1 is used for y, index 0 for x).
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            per_level.append(torch.stack((ref_x, ref_y), -1))
+        reference_points = torch.cat(per_level, 1)
+        # Broadcast every position against every level's valid ratio.
+        return reference_points[:, :, None] * valid_ratios[:, None]
+
+    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
+        """
+        Input:
+            - src: [bs, sum(hi*wi), 256]
+            - spatial_shapes: h,w of each level [num_level, 2]
+            - level_start_index: [num_level] start point of level in sum(hi*wi).
+            - valid_ratios: [bs, num_level, 2]
+            - pos: pos embed for src. [bs, sum(hi*wi), 256]
+            - padding_mask: [bs, sum(hi*wi)]
+        Intermediate:
+            - reference_points: [bs, sum(hi*wi), num_level, 2]
+        Output:
+            - output of the last layer (src itself when there are no layers), passed through self.norm if one was given
+        """
+        output = src
+        if self.num_layers > 0:
+            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        for layer in self.layers:
+            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
+        return output if self.norm is None else self.norm(output)
+
+
+class DeformableTransformerDecoderLayer(nn.Module):
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 key_aware_type=None,
+                 decoder_sa_type='ca',
+                 module_seq=['sa', 'ca', 'ffn'],
+                 ):
+        super().__init__()
+        self.module_seq = module_seq
+        assert sorted(module_seq) == ['ca', 'ffn', 'sa']
+
+        # cross attention
+        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        if use_deformable_box_attn:
+            self.cross_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
+        else:
+            self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+
+        self.key_aware_type = key_aware_type
+        self.key_aware_proj = None
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+
+        if decoder_sa_type == 'ca_content':
+            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+
+
+
+
+    def rm_self_attn_modules(self):
+        self.self_attn = None
+        self.dropout2 = None
+        self.norm2 = None
+
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt):
+        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+
+    def forward_sa(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        # self attention
+        if self.self_attn is not None:
+            # import ipdb; ipdb.set_trace()
+            if self.decoder_sa_type == 'sa':
+                q = k = self.with_pos_embed(tgt, tgt_query_pos)
+                tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            elif self.decoder_sa_type == 'ca_label':
+                # import ipdb; ipdb.set_trace()
+                # q = self.with_pos_embed(tgt, tgt_query_pos)
+                bs = tgt.shape[1]
+                k = v = self.label_embedding.weight[:, None, :].repeat(1, bs, 1)
+                tgt2 = self.self_attn(tgt, k, v, attn_mask=self_attn_mask)[0]
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            elif self.decoder_sa_type == 'ca_content':
+                tgt2 = self.self_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                            tgt_reference_points.transpose(0, 1).contiguous(),
+                            memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            else:
+                raise NotImplementedError("Unknown decoder_sa_type {}".format(self.decoder_sa_type))
+
+        return tgt
+
+    def forward_ca(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        # cross attention
+        # import ipdb; ipdb.set_trace()
+        if self.key_aware_type is not None:
+
+            if self.key_aware_type == 'mean':
+                tgt = tgt + memory.mean(0, keepdim=True)
+            elif self.key_aware_type == 'proj_mean':
+                tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)
+            else:
+                raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type))
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                               tgt_reference_points.transpose(0, 1).contiguous(),
+                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        return tgt
+
+    def forward(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+
+        for funcname in self.module_seq:
+            # if os.environ.get('IPDB_DEBUG_SHILONG') == 'INFO':
+            #     import ipdb; ipdb.set_trace()
+            if funcname == 'ffn':
+                tgt = self.forward_ffn(tgt)
+            elif funcname == 'ca':
+                tgt = self.forward_ca(tgt, tgt_query_pos, tgt_query_sine_embed, \
+                    tgt_key_padding_mask, tgt_reference_points, \
+                        memory, memory_key_padding_mask, memory_level_start_index, \
+                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)
+            elif funcname == 'sa':
+                tgt = self.forward_sa(tgt, tgt_query_pos, tgt_query_sine_embed, \
+                    tgt_key_padding_mask, tgt_reference_points, \
+                        memory, memory_key_padding_mask, memory_level_start_index, \
+                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)
+            else:
+                raise ValueError('unknown funcname {}'.format(funcname))
+
+        return tgt
+
+
+
+class DeformableTransformerDecoder(nn.Module):
+    """Stack of cloned decoder layers with optional per-layer reference-point refinement."""
+    def __init__(self, decoder_layer, num_layers, return_intermediate=False, use_dab=False, d_model=256, query_dim=4):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.return_intermediate = return_intermediate
+        assert return_intermediate
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+        self.use_dab = use_dab
+        self.d_model = d_model
+        self.query_dim = query_dim
+        if use_dab:
+            self.query_scale = MLP(d_model, d_model, d_model, 2)
+            self.ref_point_head = MLP(2 * d_model, d_model, d_model, 2)
+
+    def forward(self, tgt, reference_points, src, src_spatial_shapes,
+                src_level_start_index, src_valid_ratios,
+                query_pos=None, src_padding_mask=None):
+        """Run every layer in turn; returns (stacked layer outputs, stacked reference points)."""
+        output = tgt
+        if self.use_dab:
+            assert query_pos is None
+
+        intermediate = []
+        intermediate_reference_points = [reference_points]
+        for layer_id, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] \
+                                         * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None] # bs, nq, 4, 4
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
+
+            if self.use_dab:
+                query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # bs, nq, 256*2
+                raw_query_pos = self.ref_point_head(query_sine_embed) # bs, nq, 256
+                pos_scale = self.query_scale(output) if layer_id != 0 else 1
+                query_pos = pos_scale * raw_query_pos
+
+            # NOTE(review): seven positional args are passed; confirm they match forward() of the supplied decoder_layer.
+            output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)
+
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                box_holder = self.bbox_embed(output)
+                box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points)
+                new_reference_points = box_holder[..., :self.query_dim].sigmoid()
+                reference_points = new_reference_points.detach()
+                if layer_id != self.num_layers - 1:
+                    intermediate_reference_points.append(new_reference_points)
+
+            intermediate.append(output)
+
+        return torch.stack(intermediate), torch.stack(intermediate_reference_points)
+
+
+def _get_clones(module, N):  # N independent deep copies of ``module``, wrapped in an nn.ModuleList
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def build_deforamble_transformer(args):  # builds DeformableTransformer from ``args``; the misspelt name is the public one and is kept
+    return DeformableTransformer(
+        d_model=args.hidden_dim,
+        nhead=args.nheads,
+        num_encoder_layers=args.enc_layers,
+        num_decoder_layers=args.dec_layers,
+        dim_feedforward=args.dim_feedforward,
+        dropout=args.dropout,
+        activation="relu",
+        return_intermediate_dec=True,
+        num_feature_levels=args.ddetr_num_feature_levels,
+        dec_n_points=args.ddetr_dec_n_points,
+        enc_n_points=args.ddetr_enc_n_points,
+        two_stage=args.ddetr_two_stage,
+        two_stage_num_proposals=args.num_queries,
+        use_dab=args.ddetr_use_dab,
+        high_dim_query_update=args.ddetr_high_dim_query_update,
+        no_sine_embed=args.ddetr_no_sine_embed)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py
new file mode 100644
index 0000000000000000000000000000000000000000..450885a97323f6d68cfbed845a2a91c32e79b4ca
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py
@@ -0,0 +1,102 @@
+# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+DETR Transformer class.
+
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+import torch
+from torch import Tensor, nn
+from typing import List, Optional
+
+from .utils import  _get_activation_fn, _get_clones
+
+
+class TextTransformer(nn.Module):
+    """Stack of ``num_layers`` cloned TransformerEncoderLayer modules for text features."""
+    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
+        super().__init__()
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.nheads = nheads
+        self.dim_feedforward = dim_feedforward
+        self.norm = None  # never replaced in this class, so the norm branch in forward() is skipped
+
+        single_encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout)
+        self.layers = _get_clones(single_encoder_layer, num_layers)
+
+    def forward(self, memory_text:torch.Tensor, text_attention_mask:torch.Tensor):
+        """Apply the encoder layers to a batch-first text feature tensor.
+
+        Args:
+            text_attention_mask: bs, num_token
+            memory_text: bs, num_token, d_model
+
+        The mask is handed to each layer as ``src_key_padding_mask``; no
+        ``src_mask`` is supplied by this method.
+
+        Returns:
+            output: bs, num_token, d_model
+        """
+
+        output = memory_text.transpose(0, 1)
+
+        for layer in self.layers:
+            output = layer(output, src_key_padding_mask=text_attention_mask)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output.transpose(0, 1)
+
+
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+        self.nhead = nhead
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        # repeat attn mask
+        if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
+            # bs, num_q, num_k
+            src_mask = src_mask.repeat(self.nhead, 1, 1)
+
+        q = k = self.with_pos_embed(src, pos)
+
+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]
+
+        # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/unipose.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/unipose.py
new file mode 100644
index 0000000000000000000000000000000000000000..d05c94c8c784ba3d68262af11080d3f6f3dfed9a
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/unipose.py
@@ -0,0 +1,621 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# ------------------------------------------------------------------------
+import os
+import copy
+import torch
+import torch.nn.functional as F
+from torch import nn
+from typing import List
+
+from util.keypoint_ops import keypoint_xyzxyz_to_xyxyzz
+from util.misc import NestedTensor, nested_tensor_from_tensor_list,inverse_sigmoid
+
+from .utils import MLP
+from .backbone import build_backbone
+from ..registry import MODULE_BUILD_FUNCS
+from .mask_generate import prepare_for_mask, post_process
+from .deformable_transformer import build_deformable_transformer
+
+
+class UniPose(nn.Module):
+    """ This is the Cross-Attention Detector module that performs object detection """
+
+    def __init__(self, backbone, transformer, num_classes, num_queries,
+                 aux_loss=False, iter_update=False,
+                 query_dim=2,
+                 random_refpoints_xy=False,
+                 fix_refpoints_hw=-1,
+                 num_feature_levels=1,
+                 nheads=8,
+                 # two stage
+                 two_stage_type='no',  # ['no', 'standard']
+                 two_stage_add_query_num=0,
+                 dec_pred_class_embed_share=True,
+                 dec_pred_bbox_embed_share=True,
+                 two_stage_class_embed_share=True,
+                 two_stage_bbox_embed_share=True,
+                 decoder_sa_type='sa',
+                 num_patterns=0,
+                 dn_number=100,
+                 dn_box_noise_scale=0.4,
+                 dn_label_noise_ratio=0.5,
+                 dn_labelbook_size=100,
+                 use_label_enc=True,
+
+                 text_encoder_type='bert-base-uncased',
+
+                 binary_query_selection=False,
+                 use_cdn=True,
+                 sub_sentence_present=True,
+                 num_body_points=68,
+                 num_box_decoder_layers=2,
+                 ):
+        """ Initializes the model.
+        Parameters:
+            backbone: torch module of the backbone to be used. See backbone.py
+            transformer: torch module of the transformer architecture. See transformer.py
+            num_classes: number of object classes
+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
+                         Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.
+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+
+            fix_refpoints_hw: -1(default): learn w and h for each box separately
+                                >0 : given fixed number
+                                -2 : learn a shared w and h
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.transformer = transformer
+        self.num_classes = num_classes
+        self.hidden_dim = hidden_dim = transformer.d_model
+        self.num_feature_levels = num_feature_levels
+        self.nheads = nheads
+        self.use_label_enc = use_label_enc
+        if use_label_enc:
+            self.label_enc = nn.Embedding(dn_labelbook_size + 1, hidden_dim)
+        else:
+            raise NotImplementedError
+            self.label_enc = None  # unreachable: follows the raise above
+        self.max_text_len = 256
+        self.binary_query_selection = binary_query_selection
+        self.sub_sentence_present = sub_sentence_present
+
+        # setting query dim
+        self.query_dim = query_dim
+        assert query_dim == 4  # note: the signature default (2) fails this check, so callers must pass 4
+        self.random_refpoints_xy = random_refpoints_xy
+        self.fix_refpoints_hw = fix_refpoints_hw
+
+        # for dn training
+        self.num_patterns = num_patterns
+        self.dn_number = dn_number
+        self.dn_box_noise_scale = dn_box_noise_scale
+        self.dn_label_noise_ratio = dn_label_noise_ratio
+        self.dn_labelbook_size = dn_labelbook_size
+        self.use_cdn = use_cdn
+
+        # presumably maps 512-d text / keypoint embeddings to hidden_dim - confirm against utils.MLP
+        self.projection = MLP(512, hidden_dim, hidden_dim, 3)
+
+        self.projection_kpt = MLP(512, hidden_dim, hidden_dim, 3)
+
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"  # not used below in this method
+        # model, _ = clip.load("ViT-B/32", device=device)
+        # self.clip_model = model
+        # visual_parameters = list(self.clip_model.visual.parameters())
+        # #
+        # for param in visual_parameters:
+        #     param.requires_grad = False
+
+        self.pos_proj = nn.Linear(hidden_dim, 768)
+        self.padding = nn.Embedding(1, 768)
+
+        # prepare input projection layers
+        if num_feature_levels > 1:
+            num_backbone_outs = len(backbone.num_channels)
+            input_proj_list = []
+            for _ in range(num_backbone_outs):
+                in_channels = backbone.num_channels[_]
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim),
+                ))
+            for _ in range(num_feature_levels - num_backbone_outs):
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
+                    nn.GroupNorm(32, hidden_dim),
+                ))
+                in_channels = hidden_dim
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            assert two_stage_type == 'no', "two_stage_type should be no if num_feature_levels=1 !!!"
+            self.input_proj = nn.ModuleList([
+                nn.Sequential(
+                    nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim),
+                )])
+
+        self.backbone = backbone
+        self.aux_loss = aux_loss
+        self.box_pred_damping = box_pred_damping = None
+
+        self.iter_update = iter_update
+        assert iter_update, "Why not iter_update?"
+
+        # prepare pred layers
+        self.dec_pred_class_embed_share = dec_pred_class_embed_share
+        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
+        # prepare class & box embed
+        _class_embed = ContrastiveAssign()
+
+        # the last layer of the box head (and of the pose head below) is initialised to zero
+
+        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
+        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)
+
+        _pose_embed = MLP(hidden_dim, hidden_dim, 2, 3)
+        _pose_hw_embed = MLP(hidden_dim, hidden_dim, 2, 3)
+        nn.init.constant_(_pose_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(_pose_embed.layers[-1].bias.data, 0)
+
+        if dec_pred_bbox_embed_share:
+            box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)]
+        else:
+            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers)]
+        if dec_pred_class_embed_share:
+            class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)]
+        else:
+            class_embed_layerlist = [copy.deepcopy(_class_embed) for i in range(transformer.num_decoder_layers)]
+
+        # number of pose heads: num_decoder_layers - num_box_decoder_layers + 1
+        if dec_pred_bbox_embed_share:
+
+            pose_embed_layerlist = [_pose_embed for i in
+                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
+        else:
+            pose_embed_layerlist = [copy.deepcopy(_pose_embed) for i in
+                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
+
+        pose_hw_embed_layerlist = [_pose_hw_embed for i in
+                                   range(transformer.num_decoder_layers - num_box_decoder_layers)]
+        # NOTE: the pose_hw heads above reuse one module regardless of dec_pred_bbox_embed_share
+
+        self.num_box_decoder_layers = num_box_decoder_layers
+        self.bbox_embed = nn.ModuleList(box_embed_layerlist)
+        self.class_embed = nn.ModuleList(class_embed_layerlist)
+        self.num_body_points = num_body_points
+        self.pose_embed = nn.ModuleList(pose_embed_layerlist)
+        self.pose_hw_embed = nn.ModuleList(pose_hw_embed_layerlist)
+
+        self.transformer.decoder.bbox_embed = self.bbox_embed
+        self.transformer.decoder.class_embed = self.class_embed
+
+        self.transformer.decoder.pose_embed = self.pose_embed
+        self.transformer.decoder.pose_hw_embed = self.pose_hw_embed
+
+        self.transformer.decoder.num_body_points = num_body_points
+
+
+        # two stage
+        self.two_stage_type = two_stage_type
+        self.two_stage_add_query_num = two_stage_add_query_num
+        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
+        if two_stage_type != 'no':
+            if two_stage_bbox_embed_share:
+                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
+                self.transformer.enc_out_bbox_embed = _bbox_embed
+            else:
+                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)
+
+            if two_stage_class_embed_share:
+                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
+                self.transformer.enc_out_class_embed = _class_embed
+            else:
+                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)
+
+            self.refpoint_embed = None
+            if self.two_stage_add_query_num > 0:
+                self.init_ref_points(two_stage_add_query_num)
+
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+        # self.replace_sa_with_double_ca = replace_sa_with_double_ca
+        if decoder_sa_type == 'ca_label':
+            self.label_embedding = nn.Embedding(num_classes, hidden_dim)
+            for layer in self.transformer.decoder.layers:
+                layer.label_embedding = self.label_embedding
+        else:
+            for layer in self.transformer.decoder.layers:
+                layer.label_embedding = None
+            self.label_embedding = None
+
+        self._reset_parameters()
+
+    def open_set_transfer_init(self):
+        """Freeze every parameter whose name contains none of the substrings checked below."""
+        for name, param in self.named_parameters():
+            if 'fusion_layers' in name:
+                continue
+            if 'ca_text' in name:
+                continue
+            if 'catext_norm' in name:
+                continue
+            if 'catext_dropout' in name:
+                continue
+            if "text_layers" in name:
+                continue
+            if 'bert' in name:
+                continue
+            if 'bbox_embed' in name:
+                continue
+            if 'label_enc.weight' in name:
+                continue
+            if 'feat_map' in name:
+                continue
+            if 'enc_output' in name:
+                continue
+
+            # reached only when no substring matched: stop gradient updates for this parameter
+            param.requires_grad_(False)
+
+    def _reset_parameters(self):
+        """Xavier-initialise the weight of the first module in each input projection and zero its bias."""
+        for proj in self.input_proj:
+            nn.init.xavier_uniform_(proj[0].weight, gain=1)
+            nn.init.constant_(proj[0].bias, 0)
+
+    def init_ref_points(self, use_num_queries):
+        """Create ``refpoint_embed`` and initialise it per ``random_refpoints_xy`` / ``fix_refpoints_hw``."""
+        self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
+        if self.random_refpoints_xy:
+            # NOTE(review): requires_grad is set on a slice of .data below, not on the parameter itself - confirm effect
+            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
+            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
+            self.refpoint_embed.weight.data[:, :2].requires_grad = False
+
+        if self.fix_refpoints_hw > 0:
+            print("fix_refpoints_hw: {}".format(self.fix_refpoints_hw))
+            assert self.random_refpoints_xy
+            self.refpoint_embed.weight.data[:, 2:] = self.fix_refpoints_hw
+            self.refpoint_embed.weight.data[:, 2:] = inverse_sigmoid(self.refpoint_embed.weight.data[:, 2:])
+            self.refpoint_embed.weight.data[:, 2:].requires_grad = False
+        elif int(self.fix_refpoints_hw) == -1:
+            pass
+        elif int(self.fix_refpoints_hw) == -2:
+            print('learn a shared h and w')
+            assert self.random_refpoints_xy
+            self.refpoint_embed = nn.Embedding(use_num_queries, 2)
+            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
+            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
+            self.refpoint_embed.weight.data[:, :2].requires_grad = False
+            self.hw_embed = nn.Embedding(1, 1)
+        else:
+            raise NotImplementedError('Unknown fix_refpoints_hw {}'.format(self.fix_refpoints_hw))
+
+    def forward(self, samples: NestedTensor, targets: List = None, **kw):
+        """Predict boxes, text-matching logits and keypoints for a batch of images.
+
+        Args:
+            samples: batched images; a list or torch.Tensor is converted with
+                nested_tensor_from_tensor_list() before the backbone runs.
+            targets: one dict per image, read for the keys 'instance_text_prompt',
+                'object_embeddings_text', 'kpts_embeddings_text' and 'kpt_vis_text'.
+                NOTE(review): the default is None, but it is iterated unconditionally.
+
+        Returns a dict built from the last decoder layer's predictions:
+           - "pred_logits": class_embed scores of each query against the text tokens.
+           - "pred_boxes": boxes after sigmoid, i.e. in [0, 1] (presumably cx, cy, w, h).
+           - "pred_keypoints": keypoints after sigmoid and keypoint_xyzxyz_to_xyxyzz().
+        """
+
+        captions = [t['instance_text_prompt'] for t in targets]
+        bs=len(captions)
+        tensor_list = [tgt["object_embeddings_text"] for tgt in targets]
+        max_size = 350  # shorter per-image embeddings are zero-padded to this many rows
+        padded_tensors = [torch.cat([tensor, torch.zeros(max_size - tensor.size(0), tensor.size(1),device=tensor.device)]) if tensor.size(0) < max_size else tensor for tensor in tensor_list]
+        object_embeddings_text = torch.stack(padded_tensors)
+
+        kpts_embeddings_text = torch.stack([tgt["kpts_embeddings_text"] for tgt in targets])[:, :self.num_body_points]
+        encoded_text=self.projection(object_embeddings_text) # projected instance text embeddings
+        kpt_embeddings_specific=self.projection_kpt(kpts_embeddings_text) # projected keypoint text embeddings
+
+        # kpt_mask below is kpt_vis with a column of ones prepended on the last axis.
+        kpt_vis = torch.stack([tgt["kpt_vis_text"] for tgt in targets])[:, :self.num_body_points]
+        kpt_mask = torch.cat((torch.ones_like(kpt_vis, device=kpt_vis.device)[..., 0].unsqueeze(-1), kpt_vis), dim=-1)
+
+
+        num_classes = encoded_text.shape[1] # number of text tokens (rows) per image
+        text_self_attention_masks = torch.eye(num_classes).unsqueeze(0).expand(bs, -1, -1).bool().to(samples.device)
+        text_token_mask = torch.zeros(samples.shape[0],num_classes).to(samples.device)>0
+        for i in range(bs):
+            text_token_mask[i,:len(captions[i])]=True
+
+        position_ids = torch.zeros(samples.shape[0], num_classes).to(samples.device)
+
+        for i in range(bs):
+            position_ids[i,:len(captions[i])]= 1
+
+
+        text_dict = {
+            'encoded_text': encoded_text, # bs, num_classes, projected feature dim
+            'text_token_mask': text_token_mask, # bs, num_classes
+            'position_ids': position_ids, # bs, num_classes
+            'text_self_attention_masks': text_self_attention_masks # bs, num_classes, num_classes
+        }
+
+
+        # Wrap a raw tensor / list of tensors into a NestedTensor for the backbone.
+
+        if isinstance(samples, (list, torch.Tensor)):
+            samples = nested_tensor_from_tensor_list(samples)
+        features, poss = self.backbone(samples)
+        if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':  # opt-in debugger breakpoint
+            import ipdb;
+            ipdb.set_trace()
+
+
+        srcs = []
+        masks = []
+        for l, feat in enumerate(features):
+            src, mask = feat.decompose()
+            srcs.append(self.input_proj[l](src))
+            masks.append(mask)
+            assert mask is not None
+        # Feature levels beyond the backbone outputs are produced by the remaining input_proj layers.
+        if self.num_feature_levels > len(srcs):
+            _len_srcs = len(srcs)
+            for l in range(_len_srcs, self.num_feature_levels):
+                if l == _len_srcs:
+                    src = self.input_proj[l](features[-1].tensors)
+                else:
+                    src = self.input_proj[l](srcs[-1])
+                m = samples.mask
+                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
+                srcs.append(src)
+                masks.append(mask)
+                poss.append(pos_l)
+
+        if self.label_enc is not None:
+            label_enc = self.label_enc
+        else:
+            raise NotImplementedError
+            label_enc = encoded_text  # unreachable: follows the raise above
+        if self.dn_number > 0 or targets is not None:
+            input_query_label, input_query_bbox, attn_mask, attn_mask2, dn_meta = \
+                prepare_for_mask(kpt_mask=kpt_mask)
+        else:
+            assert targets is None
+            input_query_bbox = input_query_label = attn_mask = attn_mask2 = dn_meta = None
+
+
+        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(srcs, masks, input_query_bbox, poss,
+                                                                                 input_query_label, attn_mask, attn_mask2,
+                                                                                 text_dict, dn_meta,targets,kpt_embeddings_specific)
+
+        # In case num object=0
+        if self.label_enc is not None:
+            hs[0] += self.label_enc.weight[0, 0] * 0.0
+
+        hs[0] += self.pos_proj.weight[0, 0] * 0.0
+        hs[0] += self.pos_proj.bias[0] * 0.0
+        hs[0] += self.padding.weight[0, 0] * 0.0
+
+        num_group = 50  # NOTE(review): hard-coded group count used to index and reshape keypoint queries
+        effective_dn_number = dn_meta['pad_size'] if self.training else 0
+        outputs_coord_list = []
+        outputs_class = []
+
+
+        for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_cls_embed, layer_hs) in enumerate(
+                zip(reference[:-1], self.bbox_embed, self.class_embed, hs)):
+
+            # Layers below num_box_decoder_layers: the bbox/class heads run on every query.
+            if dec_lid < self.num_box_decoder_layers:
+                layer_delta_unsig = layer_bbox_embed(layer_hs)
+                layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)
+                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
+                layer_cls = layer_cls_embed(layer_hs, text_dict)
+                outputs_coord_list.append(layer_outputs_unsig)
+                outputs_class.append(layer_cls)
+
+
+            else:
+                # Later layers: past the first effective_dn_number queries, every (num_body_points + 1)-th query is a box query.
+                layer_hs_bbox_dn = layer_hs[:, :effective_dn_number, :]
+                layer_hs_bbox_norm = layer_hs[:, effective_dn_number:, :][:, 0::(self.num_body_points + 1), :]
+                bs = layer_ref_sig.shape[0]
+                reference_before_sigmoid_bbox_dn = layer_ref_sig[:, :effective_dn_number, :]
+                reference_before_sigmoid_bbox_norm = layer_ref_sig[:, effective_dn_number:, :][:,
+                                                     0::(self.num_body_points + 1), :]
+                layer_delta_unsig_dn = layer_bbox_embed(layer_hs_bbox_dn)
+                layer_delta_unsig_norm = layer_bbox_embed(layer_hs_bbox_norm)
+                layer_outputs_unsig_dn = layer_delta_unsig_dn + inverse_sigmoid(reference_before_sigmoid_bbox_dn)
+                layer_outputs_unsig_dn = layer_outputs_unsig_dn.sigmoid()
+                layer_outputs_unsig_norm = layer_delta_unsig_norm + inverse_sigmoid(reference_before_sigmoid_bbox_norm)
+                layer_outputs_unsig_norm = layer_outputs_unsig_norm.sigmoid()
+                layer_outputs_unsig = torch.cat((layer_outputs_unsig_dn, layer_outputs_unsig_norm), dim=1)
+                layer_cls_dn = layer_cls_embed(layer_hs_bbox_dn, text_dict)
+                layer_cls_norm = layer_cls_embed(layer_hs_bbox_norm, text_dict)
+                layer_cls = torch.cat((layer_cls_dn, layer_cls_norm), dim=1)
+                outputs_class.append(layer_cls)
+                outputs_coord_list.append(layer_outputs_unsig)
+
+        # update keypoints
+        outputs_keypoints_list = []
+        outputs_keypoints_hw = []
+        kpt_index = [x for x in range(num_group * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
+        for dec_lid, (layer_ref_sig, layer_hs) in enumerate(zip(reference[:-1], hs)):
+            if dec_lid < self.num_box_decoder_layers:
+                assert isinstance(layer_hs, torch.Tensor)
+                bs = layer_hs.shape[0]
+                layer_res = layer_hs.new_zeros((bs, self.num_queries, self.num_body_points * 3))
+                outputs_keypoints_list.append(layer_res)
+            else:
+                bs = layer_ref_sig.shape[0]
+                layer_hs_kpt = layer_hs[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
+                                                                                                 device=layer_hs.device))
+                delta_xy_unsig = self.pose_embed[dec_lid - self.num_box_decoder_layers](layer_hs_kpt)
+                layer_ref_sig_kpt = layer_ref_sig[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
+                                                                                                           device=layer_hs.device))
+                layer_outputs_unsig_keypoints = delta_xy_unsig + inverse_sigmoid(layer_ref_sig_kpt[..., :2])
+                vis_xy_unsig = torch.ones_like(layer_outputs_unsig_keypoints,
+                                               device=layer_outputs_unsig_keypoints.device)
+                xyv = torch.cat((layer_outputs_unsig_keypoints, vis_xy_unsig[:, :, 0].unsqueeze(-1)), dim=-1)
+                xyv = xyv.sigmoid()
+                layer_res = xyv.reshape((bs, num_group, self.num_body_points, 3)).flatten(2, 3)
+                layer_hw = layer_ref_sig_kpt[..., 2:].reshape(bs, num_group, self.num_body_points, 2).flatten(2, 3)
+                layer_res = keypoint_xyzxyz_to_xyxyzz(layer_res)
+                outputs_keypoints_list.append(layer_res)
+                outputs_keypoints_hw.append(layer_hw)
+
+
+        if self.dn_number > 0 and dn_meta is not None:
+            outputs_class, outputs_coord_list = \
+                post_process(outputs_class, outputs_coord_list,
+                                dn_meta, self.aux_loss, self._set_aux_loss)
+        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord_list[-1],
+               'pred_keypoints': outputs_keypoints_list[-1]}
+
+        return out
+
+
+@MODULE_BUILD_FUNCS.registe_with_name(module_name='UniPose')
+def build_unipose(args):
+    """Build a UniPose model from a configuration object.
+
+    Mandatory settings are read as plain attributes of ``args`` and raise if
+    absent. Optional settings fall back to a default only when ``args`` lacks
+    them; any other error raised while reading the config now propagates:
+
+        dn_labelbook_size            -> args.num_classes
+        dec_pred_class_embed_share   -> True
+        dec_pred_bbox_embed_share    -> True
+        binary_query_selection       -> False
+        use_cdn                      -> True
+        sub_sentence_present         -> True
+
+    Returns:
+        The constructed UniPose module.
+    """
+    missing = object()  # sentinel: tells "not configured" apart from None/False
+
+    def optional(name, default):
+        # Config containers may report an absent key as AttributeError or KeyError.
+        try:
+            return getattr(args, name)
+        except (AttributeError, KeyError):
+            return default
+
+    num_classes = args.num_classes
+    backbone = build_backbone(args)
+    transformer = build_deformable_transformer(args)
+
+    # dn_labelbook_size used to be read in the same try block as
+    # match_unstable_error, so it was only honoured when both were present.
+    # Keep that coupling: it decides the value handed to the model.
+    if optional('match_unstable_error', missing) is missing:
+        dn_labelbook_size = num_classes
+    else:
+        dn_labelbook_size = optional('dn_labelbook_size', num_classes)
+
+    dec_pred_class_embed_share = optional('dec_pred_class_embed_share', True)
+    dec_pred_bbox_embed_share = optional('dec_pred_bbox_embed_share', True)
+    binary_query_selection = optional('binary_query_selection', False)
+    use_cdn = optional('use_cdn', True)
+    sub_sentence_present = optional('sub_sentence_present', True)
+
+    model = UniPose(
+        backbone,
+        transformer,
+        num_classes=num_classes,
+        num_queries=args.num_queries,
+        aux_loss=True,
+        iter_update=True,
+        query_dim=4,
+        random_refpoints_xy=args.random_refpoints_xy,
+        fix_refpoints_hw=args.fix_refpoints_hw,
+        num_feature_levels=args.num_feature_levels,
+        nheads=args.nheads,
+        dec_pred_class_embed_share=dec_pred_class_embed_share,
+        dec_pred_bbox_embed_share=dec_pred_bbox_embed_share,
+        # two stage
+        two_stage_type=args.two_stage_type,
+        # box_share
+        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,
+        two_stage_class_embed_share=args.two_stage_class_embed_share,
+        decoder_sa_type=args.decoder_sa_type,
+        num_patterns=args.num_patterns,
+        dn_number=args.dn_number if args.use_dn else 0,
+        dn_box_noise_scale=args.dn_box_noise_scale,
+        dn_label_noise_ratio=args.dn_label_noise_ratio,
+        dn_labelbook_size=dn_labelbook_size,
+        use_label_enc=args.use_label_enc,
+
+        text_encoder_type=args.text_encoder_type,
+
+        binary_query_selection=binary_query_selection,
+        use_cdn=use_cdn,
+        sub_sentence_present=sub_sentence_present
+    )
+
+    return model
+
+
+class ContrastiveAssign(nn.Module):
+    def __init__(self, project=False, cal_bias=None, max_text_len=256):
+        """Store the configuration.
+
+        :param project: kept on the instance, not consulted by forward()
+        :param cal_bias: must stay None; forward() raises NotImplementedError otherwise
+        :param max_text_len: kept on the instance, not consulted by forward()
+        """
+        super().__init__()
+        self.project = project
+        self.cal_bias = cal_bias
+        self.max_text_len = max_text_len
+
+    def forward(self, x, text_dict):
+        """Score every query against every text token with a dot product.
+
+        Args:
+            x: query features; matrix-multiplied with the transposed text embeddings.
+            text_dict (dict): must provide
+                'encoded_text': text embeddings, tokens on dim 1
+                'text_token_mask': boolean, True for used tokens, False for padding
+        Returns:
+            A newly allocated tensor of similarity logits, one column per text
+            token on the last dim; columns of padding tokens hold -inf.
+        Raises:
+            NotImplementedError: if the layer was built with a cal_bias.
+        """
+        assert isinstance(text_dict, dict)
+
+        text_embed = text_dict['encoded_text']
+        # Output width: one column per text token.
+        n_tokens = text_embed.shape[1]
+        valid_tokens = text_dict['text_token_mask']
+
+        if self.cal_bias is not None:
+            # A calibration bias is not implemented.
+            raise NotImplementedError
+
+        # Dot product of every query with every text token.
+        logits = x @ text_embed.transpose(-1, -2)
+        # Padding tokens must never win: set their scores to -inf (in place).
+        logits.masked_fill_(~valid_tokens[:, None, :], float('-inf'))
+
+        # Copy the scores into a -inf buffer of width n_tokens; torch.full is
+        # called without dtype, so the buffer uses the default dtype.
+        padded = torch.full((*logits.shape[:-1], n_tokens), float('-inf'), device=logits.device)
+        padded[..., :logits.shape[-1]] = logits
+
+        return padded
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/utils.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..350d8316ae353434b6baca449d0ecd1d4dd9c813
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/UniPose/utils.py
@@ -0,0 +1,348 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+
+import copy
+import torch
+import random
+from torch import nn, Tensor
+import os
+import numpy as np
+import math
+import torch.nn.functional as F
+from torch import nn
+
+
+def _get_clones(module, N, layer_share=False):
+    """Return an nn.ModuleList of N entries: `module` itself N times if layer_share, else N deep copies."""
+    if not layer_share:
+        return nn.ModuleList(copy.deepcopy(module) for _ in range(N))
+    # Shared mode: every slot references the one original module object.
+    return nn.ModuleList([module] * N)
+
+
+def get_sine_pos_embed(
+        pos_tensor: torch.Tensor,
+        num_pos_feats: int = 128,
+        temperature: int = 10000,
+        exchange_xy: bool = True,
+):
+    """Generate a sine/cosine position embedding from a position tensor.
+    Args:
+        pos_tensor (torch.Tensor): shape: [..., n], with any number of leading dims.
+        num_pos_feats (int): projected shape for each float in the tensor (must be even).
+        temperature (int): temperature in the sine/cosine function.
+        exchange_xy (bool, optional): exchange pos x and pos y. \
+            For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True.
+    Returns:
+        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+
+    def sine_func(x: torch.Tensor):
+        # x is [..., 1]; broadcasting against dim_t gives [..., num_pos_feats].
+        sin_x = x * scale / dim_t
+        # Interleave sin (even channels) and cos (odd channels) on the last axis;
+        # negative dims keep this valid for any number of leading dimensions.
+        return torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=-1).flatten(-2)
+    pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)]
+    if exchange_xy:
+        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
+    return torch.cat(pos_res, dim=-1)
+
+
+def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None):
+    """Build one box proposal per encoder token, returned in inverse-sigmoid (logit) space.
+    Input:
+        - memory: bs, sum(hw), d_model
+        - memory_padding_mask: bs, sum(hw); True marks padded tokens
+        - spatial_shapes: nlevel, 2 (H, W of each level)
+        - learnedwh: optional; its sigmoid replaces the fixed 0.05 base box size
+    Output:
+        - output_memory: bs, sum(hw), d_model; padded / invalid tokens are zeroed
+        - output_proposals: bs, sum(hw), 4; padded / invalid tokens are set to +inf
+    """
+    N_, S_, C_ = memory.shape
+    base_scale = 4.0
+    proposals = []
+    _cur = 0
+    for lvl, (H_, W_) in enumerate(spatial_shapes):
+        mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+        # Cell-index grids for this level: grid_y varies along rows, grid_x along columns.
+
+        grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                        torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2
+
+        scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+
+        if learnedwh is not None:
+            # Learned box size, doubled at every subsequent level.
+            wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)
+        else:
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
+
+        # Each proposal is (x, y, w, h): the cell centre from `grid`, normalized by the
+        # unpadded extent counted in valid_W / valid_H, followed by the size from `wh`.
+        # NOTE(review): base_scale, S_ and C_ above are assigned but never read.
+        proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+        proposals.append(proposal)
+        _cur += (H_ * W_)
+    # Join all levels along the token axis.
+    output_proposals = torch.cat(proposals, 1)
+    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid
+    output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+    output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+
+    output_memory = memory
+    output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+    output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+
+    # Padded tokens and proposals with any coordinate outside (0.01, 0.99) end up
+    # as zeros in output_memory and +inf in output_proposals.
+
+    return output_memory, output_proposals
+
+
+class RandomBoxPerturber():
+    """Jitter reference anchors with per-coordinate multiplicative uniform noise, then clamp to [0, 1]."""
+    def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
+        # Per-coordinate noise amplitude, ordered (x, y, w, h).
+        self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale])
+
+    def __call__(self, refanchors: Tensor) -> Tensor:
+        _, _, coord_dim = refanchors.shape  # unpacking requires a 3-D input
+        amplitude = self.noise_scale.to(refanchors.device)[:coord_dim]
+        # Uniform noise shifted to [-0.5, 0.5), scaled per coordinate.
+        jitter = (torch.rand_like(refanchors) - 0.5) * amplitude
+        perturbed = refanchors * (1 + jitter)
+        return perturbed.clamp_(0, 1)
+
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False):
+    """Sigmoid focal loss from RetinaNet (https://arxiv.org/abs/1708.02002).
+
+    Args:
+        inputs: float tensor of logits, one per prediction.
+        targets: float tensor shaped like `inputs`; 1 marks the positive
+                 class and 0 the negative class.
+        num_boxes: divisor applied to the reduced loss.
+        alpha: balance between positives and negatives. Default 0.25;
+               a negative value disables the weighting.
+        gamma: exponent of the modulating factor (1 - p_t), balancing
+               easy vs hard examples.
+        no_reduction: if true, return the element-wise loss instead.
+    Returns:
+        ``loss.mean(1).sum() / num_boxes``, or the unreduced loss tensor.
+    """
+    p = inputs.sigmoid()
+    bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    # p_t: probability assigned to the labelled class of each element.
+    p_t = p * targets + (1 - p) * (1 - targets)
+    focal = bce * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        focal = (alpha * targets + (1 - alpha) * (1 - targets)) * focal
+
+    if no_reduction:
+        return focal
+
+    return focal.mean(1).sum() / num_boxes
+
+
+class MLP(nn.Module):
+    """Stack of Linear layers with a ReLU after every layer except the last one."""
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:]))
+
+    def forward(self, x):
+        for layer in self.layers[:-1]:
+            x = F.relu(layer(x))
+        return self.layers[-1](x)
+
+
+def _get_activation_fn(activation, d_model=256, batch_dim=0):
+    """Map an activation name to its callable ("prelu" builds a new nn.PReLU module).
+
+    `d_model` and `batch_dim` are accepted but unused. Raises RuntimeError for
+    names other than relu/gelu/glu/prelu/selu.
+    """
+    functional = (("relu", F.relu), ("gelu", F.gelu), ("glu", F.glu), ("selu", F.selu))
+    for name, fn in functional:
+        if activation == name:
+            return fn
+    if activation == "prelu":
+        return nn.PReLU()
+
+    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
+
+
+def gen_sineembed_for_position(pos_tensor):
+    """Sine/cosine embedding, 128 channels per coordinate, of 2-D points or 4-D boxes.
+
+    `pos_tensor` is indexed as [:, :, k]; its last axis is (x, y) or (x, y, w, h).
+    The per-coordinate embeddings are concatenated along axis 2 in
+    (y, x) or (y, x, w, h) order. Any other last-axis size raises ValueError.
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = 10000 ** (2 * (dim_t // 2) / 128)
+
+    def embed(k):
+        # Multiply coordinate k by 2*pi, divide by the frequency ladder, then
+        # interleave sin of the even channels with cos of the odd channels.
+        angles = (pos_tensor[:, :, k] * scale)[:, :, None] / dim_t
+        return torch.stack((angles[:, :, 0::2].sin(), angles[:, :, 1::2].cos()), dim=3).flatten(2)
+
+    # x and y are present in both supported layouts, so they are embedded
+    # before the last-axis size is inspected.
+    pos_x, pos_y = embed(0), embed(1)
+    n_coords = pos_tensor.size(-1)
+    if n_coords == 2:
+        return torch.cat((pos_y, pos_x), dim=2)
+    if n_coords == 4:
+        return torch.cat((pos_y, pos_x, embed(2), embed(3)), dim=2)
+
+    raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
+
+
+def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
+    """Object Keypoint Similarity of each predicted pose with its ground truth.
+
+    Poses are flattened (x1, y1, x2, y2, ...) rows; `kpt_valids` weights each
+    keypoint and `kpt_areas` holds one scale per pose. Returns one value per pose.
+    """
+    sigma_t = kpt_preds.new_tensor(sigmas)
+    variances = (sigma_t * 2) ** 2
+
+    assert kpt_preds.size(0) == kpt_gts.size(0)
+    n_kpts_pred, n_kpts_gt = kpt_preds.size(-1) // 2, kpt_gts.size(-1) // 2
+    delta = kpt_preds.reshape(-1, n_kpts_pred, 2) - kpt_gts.reshape(-1, n_kpts_gt, 2)
+    sq_dist = delta[:, :, 0] ** 2 + delta[:, :, 1] ** 2
+
+    # exp(-d^2 / (2 * area * variance)), with invalid keypoints weighted out.
+    exponent = sq_dist / (kpt_areas[:, None] * variances[None, :] * 2)
+    per_kpt = torch.exp(-exponent) * kpt_valids
+    # The 1e-6 keeps the division finite when a pose has no valid keypoint.
+    return per_kpt.sum(dim=1) / (kpt_valids.sum(dim=1) + 1e-6)
+
+
+def oks_loss(pred,
+             target,
+             valid=None,
+             area=None,
+             linear=False,
+             sigmas=None,
+             eps=1e-6):
+    """Per-pose OKS loss between predicted and ground-truth keypoints.
+
+    Args:
+        pred (torch.Tensor): predicted poses as (x1, y1, x2, y2, ...), shape (n, 2K).
+        target (torch.Tensor): ground-truth poses, shape (n, 2K).
+        valid, area, sigmas: forwarded unchanged to oks_overlaps().
+        linear (bool, optional): if True return ``1 - oks``, otherwise
+            ``-log(oks)``. Default: False.
+        eps (float): lower clamp applied to the OKS to avoid log(0).
+
+    Return:
+        torch.Tensor: the loss computed from the clamped OKS; no reduction is applied.
+    """
+    similarity = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)
+    if not linear:
+        # Log scale: a similarity of 1 costs 0.
+        return -similarity.log()
+    # Linear scale.
+    return 1 - similarity
+
+
+class OKSLoss(nn.Module):
+    """OKSLoss.
+    Computing the oks loss between a set of predicted poses and target poses.
+    Args:
+        linear (bool): If True, use linear scale of loss instead of log scale. Default: False.
+        num_keypoints (int): only 68 is supported; other values (including the default 17) raise ValueError.
+        eps (float): Eps to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum". NOTE(review): stored but never applied by forward().
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self,
+                 linear=False,
+                 num_keypoints=17,
+                 eps=1e-6,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(OKSLoss, self).__init__()
+        self.linear = linear
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        if num_keypoints == 68:
+            self.sigmas = np.array([
+                .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
+                1.07, .87, .87, .89, .89, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+            ], dtype=np.float32) / 10.0
+        else:
+            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
+
+    def forward(self,
+                pred,
+                target,
+                valid,
+                area,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Return ``loss_weight * oks_loss(...)``; the regular path applies no reduction.
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            valid (torch.Tensor): The visible flag of the target pose.
+            area (torch.Tensor): The area of the target pose.
+            weight (torch.Tensor, optional): Only consulted for the all-zero
+                shortcut below; it does not scale the returned loss. Defaults to None.
+            avg_factor (int, optional): Accepted but unused. Defaults to None.
+            reduction_override (str, optional): One of None, "none", "mean", "sum".
+                It only affects whether the all-zero-weight shortcut is taken.
+        NOTE(review): despite the weight/reduction arguments, the regular path
+        neither weights nor reduces the loss -- confirm that callers reduce it.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # Collapse a per-coordinate weight to one value per row.
+            # NOTE(review): the averaged weight is not used after this point.
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * oks_loss(
+            pred,
+            target,
+            valid=valid,
+            area=area,
+            linear=self.linear,
+            sigmas=self.sigmas,
+            eps=self.eps)
+        return loss
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab6c39b38edc376198b02e3d63c5bfc538703530
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/__init__.py
@@ -0,0 +1,16 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .UniPose.unipose import build_unipose
+
+def build_model(args):
+    """Build the model whose build function is registered under `args.modelname`."""
+    # The registry is imported at call time, not at package import.
+    from .registry import MODULE_BUILD_FUNCS
+
+    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
+    builder = MODULE_BUILD_FUNCS.get(args.modelname)
+    return builder(args)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/registry.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..f438c6e3918a84cc2004b5da9c1d79d18cfb3118
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/models/registry.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+# @Author: Yihao Chen
+# @Date:   2021-08-16 16:03:17
+# @Last Modified by:   Shilong Liu
+# @Last Modified time: 2022-01-23 15:26
+# modified from mmcv
+
+import inspect
+from functools import partial
+
+
+class Registry(object):
+    """Table mapping a name to a build function; entries are added via register()."""
+    def __init__(self, name):
+        self._name = name
+        self._module_dict = dict()
+
+    def __repr__(self):
+        # Rendered as <ClassName>(name=<name>, items=[<registered names>]).
+        registered = list(self._module_dict.keys())
+        return self.__class__.__name__ + '(name={}, items={})'.format(self._name, registered)
+
+    def __len__(self):
+        return len(self._module_dict)
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def module_dict(self):
+        return self._module_dict
+
+    def get(self, key):
+        """Return the function registered under key, or None."""
+        return self._module_dict.get(key)
+
+    def registe_with_name(self, module_name=None, force=False):
+        """Decorator factory: register the decorated function under module_name."""
+        return partial(self.register, module_name=module_name, force=force)
+
+    def register(self, module_build_function, module_name=None, force=False):
+        """Register a module build function and return it unchanged.
+        Raises TypeError for non-functions and KeyError for a duplicate name
+        unless force is true; module_name defaults to the function's __name__.
+        """
+        if not inspect.isfunction(module_build_function):
+            raise TypeError('module_build_function must be a function, but got {}'.format(
+                type(module_build_function)))
+        key = module_build_function.__name__ if module_name is None else module_name
+        if not force and key in self._module_dict:
+            raise KeyError('{} is already registered in {}'.format(key, self.name))
+        self._module_dict[key] = module_build_function
+
+        return module_build_function
+
+MODULE_BUILD_FUNCS = Registry('model build functions')
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/predefined_keypoints.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/predefined_keypoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..c32c5adb346783095b6dd192090cde30488f0194
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/predefined_keypoints.py
@@ -0,0 +1,56 @@
+# Each table in this file is a dict with two keys: "keypoints" (a list of
+# keypoint name strings) and "skeleton" (a list of index pairs, possibly empty).
+# person: 17 keypoint names; the skeleton indices span 1..17, so they look
+# 1-based here — TODO confirm against the code that consumes them.
+person = {"keypoints":['nose', 'left eye', 'right eye', 'left ear', 'right ear', 'left shoulder', 'right shoulder', 'left elbow', 'right elbow', 'left wrist', 'right wrist', 'left hip', 'right hip', 'left knee', 'right knee', 'left ankle', 'right ankle'],"skeleton": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]}
+
+face = {"keypoints": ['right cheekbone 1', 'right cheekbone 2', 'right cheek 1', 'right cheek 2', 'right cheek 3', 'right cheek 4', 'right cheek 5', 'right chin', 'chin center', 'left chin', 'left cheek 5', 'left cheek 4', 'left cheek 3', 'left cheek 2', 'left cheek 1', 'left cheekbone 2', 'left cheekbone 1', 'right eyebrow 1', 'right eyebrow 2', 'right eyebrow 3', 'right eyebrow 4', 'right eyebrow 5', 'left eyebrow 1', 'left eyebrow 2', 'left eyebrow 3', 'left eyebrow 4', 'left eyebrow 5', 'nasal bridge 1', 'nasal bridge 2', 'nasal bridge 3', 'nasal bridge 4', 'right nasal wing 1', 'right nasal wing 2', 'nasal wing center', 'left nasal wing 1', 'left nasal wing 2', 'right eye eye corner 1', 'right eye upper eyelid 1', 'right eye upper eyelid 2', 'right eye eye corner 2', 'right eye lower eyelid 2', 'right eye lower eyelid 1', 'left eye eye corner 1', 'left eye upper eyelid 1', 'left eye upper eyelid 2', 'left eye eye corner 2', 'left eye lower eyelid 2', 'left eye lower eyelid 1', 'right mouth corner', 'upper lip outer edge 1', 'upper lip outer edge 2', 'upper lip outer edge 3', 'upper lip outer edge 4', 'upper lip outer edge 5', 'left mouth corner', 'lower lip outer edge 5', 'lower lip outer edge 4', 'lower lip outer edge 3', 'lower lip outer edge 2', 'lower lip outer edge 1', 'upper lip inter edge 1', 'upper lip inter edge 2', 'upper lip inter edge 3', 'upper lip inter edge 4', 'upper lip inter edge 5', 'lower lip inter edge 3', 'lower lip inter edge 2', 'lower lip inter edge 1'], "skeleton": []}
+
+# hand: 21 keypoint names, no skeleton.
+# NOTE(review): 'thumb’s first knuckle' is written with a typographic apostrophe
+# (U+2019) while every other entry uses an ASCII "'" — confirm this is intended
+# before relying on exact string matches.
+hand = {"keypoints":['wrist', 'thumb root', "thumb's third knuckle", "thumb's second knuckle", 'thumb’s first knuckle', "forefinger's root", "forefinger's third knuckle", "forefinger's second knuckle", "forefinger's first knuckle", "middle finger's root", "middle finger's third knuckle", "middle finger's second knuckle", "middle finger's first knuckle", "ring finger's root", "ring finger's third knuckle", "ring finger's second knuckle", "ring finger's first knuckle", "pinky finger's root", "pinky finger's third knuckle", "pinky finger's second knuckle", "pinky finger's first knuckle"],"skeleton": []}
+
+# animal_in_AnimalKindom: 23 keypoint names; the skeleton indices span 0..22, so
+# they look 0-based here (unlike tables whose skeleton starts at 1) — TODO confirm.
+# NOTE(review): the name 'ankle left ' carries a trailing space inside the string.
+animal_in_AnimalKindom = {"keypoints":['head mid top', 'eye left', 'eye right', 'mouth front top', 'mouth back left', 'mouth back right', 'mouth front bottom', 'shoulder left', 'shoulder right', 'elbow left', 'elbow right', 'wrist left', 'wrist right', 'torso mid back', 'hip left', 'hip right', 'knee left', 'knee right', 'ankle left ', 'ankle right', 'tail top back', 'tail mid back', 'tail end back'],"skeleton": [[1, 0], [2, 0], [3, 4], [3, 5], [4, 6], [5, 6], [0, 7], [0, 8], [7, 9], [8, 10], [9, 11], [10, 12], [0, 13], [13, 20], [20, 14], [20, 15], [14, 16], [15, 17], [16, 18], [17, 19], [20, 21], [21, 22]]}
+
+animal_in_AP10K = {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
+
+# animal: 17 keypoint names with skeleton indices spanning 1..17. The contents
+# appear to duplicate the animal_in_AP10K table defined just before it.
+animal= {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
+
+animal_face = {"keypoints": ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip'], "skeleton": []}
+
+fly = {"keypoints": ['head', 'eye left', 'eye right', 'neck', 'thorax', 'abdomen', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'wing left', 'wing right'], "skeleton": [[2, 1], [3, 1], [4, 1], [5, 4], [6, 5], [8, 7], [9, 8], [10, 9], [12, 11], [13, 12], [14, 13], [16, 15], [17, 16], [18, 17], [20, 19], [21, 20], [22, 21], [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], [30, 29], [31, 4], [32, 4]]}
+
+locust = {"keypoints": ['head', 'neck', 'thorax', 'abdomen1', 'abdomen2', 'anttip left', 'antbase left', 'eye left', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'anttip right', 'antbase right', 'eye right', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip'],"skeleton": [[2, 1], [3, 2], [4, 3], [5, 4], [7, 6], [8, 7], [10, 9], [11, 10], [12, 11], [14, 13], [15, 14],[16, 15], [18, 17], [19, 18], [20, 19], [22, 21], [23, 22], [25, 24], [26, 25], [27, 26],[29, 28], [30, 29], [31, 30], [33, 32], [34, 33], [35, 34]]}
+
+# car: 14 entries, two of which are the literal placeholder name 'none'
+# (positions 8 and 13 counting from 0). The skeleton indices span 0..12 and
+# never reference position 8 or 13, so they look 0-based here — TODO confirm.
+car ={"keypoints": ['right front wheel center', 'left front wheel center', 'right rear wheel center', 'left rear wheel center', 'front right', 'front left', 'back right', 'back left', 'none', 'roof front right', 'roof front left', 'roof back right', 'roof back left', 'none'],"skeleton": [[0, 2], [1, 3], [0, 1], [2, 3], [9, 11], [10, 12], [9, 10], [11, 12], [4, 0], [4, 9], [4, 5], [5, 1], [5, 10], [6, 2], [6, 11], [7, 3], [7, 12], [6, 7]]}
+
+short_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+long_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'right sleeve inside 3', 'right sleeve inside 4', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 4', 'left sleeve inside 3', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
+
+short_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
+
+sling={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
+
+vest = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
+
+long_sleeved_dress={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'center hem', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+long_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+trousers = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right side outside 3', 'right cuff outside', 'right cuff inside', 'right side inside 1', 'crotch', 'left side inside 1', 'left cuff inside', 'left cuff outside', 'left side outside 3', 'left side outside 2'], 'skeleton': []}
+
+sling_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
+
+vest_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
+
+skirt = {'keypoints': ['right side 1', 'upper center', 'left side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2'], 'skeleton': []}
+
+short_sleeved_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'left side 1', 'left side 2', 'left side 3', 'left side 4', 'left side 5', 'center hem', 'right side 5', 'right side 4', 'right side 3', 'right side 2', 'right side 1', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+shorts = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right cuff outside', 'right cuff inside', 'crotch', 'left cuff inside', 'left cuff outside', 'left side outside 2'], 'skeleton': []}
+
+table = {'keypoints': ['desktop corner 1', 'desktop corner 2', 'desktop corner 3', 'desktop corner 4', 'table leg 1', 'table leg 2', 'table leg 3', 'table leg 4'], 'skeleton': []}
+
+chair = {'keypoints': ['legs righttopcorner', 'legs lefttopcorner', 'legs leftbottomcorner', 'legs rightbottomcorner', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'headboard righttop', 'headboard lefttop'], 'skeleton': []}
+
+bed = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'backrest lefttop'], 'skeleton': []}
+
+sofa = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'armrests rightbottomcorner', 'armrests righttopcorner', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'armrests leftbottomcorner', 'armrests lefttopcorner', 'backrest lefttop'], 'skeleton': []}
+
+swivelchair = {'keypoints': ['rotatingbase 1', 'rotatingbase 2', 'rotatingbase 3', 'rotatingbase 4', 'rotatingbase 5', 'rotatingbase center', 'base center', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'backrest righttop', 'backrest lefttop'], 'skeleton': []}
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/transforms.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..9155913bc34afe0cf9c23495a1dac3d8225d2a94
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/transforms.py
@@ -0,0 +1,394 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+import os
+import sys
+import random
+
+import PIL
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from util.box_ops import box_xyxy_to_cxcywh
+from util.misc import interpolate
+
+
+def crop(image, target, region):
+    """Crop ``image`` to ``region`` and update the annotations in ``target``.
+
+    Args:
+        image: Image handed to ``torchvision.transforms.functional.crop``.
+        target: Annotation dict, or ``None``.  When given it must contain
+            ``"id2catname"`` and ``"caption_list"``.  ``"boxes"`` (handled as
+            ``x0, y0, x1, y1`` rows), ``"masks"`` and ``"keypoints"`` (viewed
+            as ``(x, y, v)`` triples) are adjusted when present.
+        region: ``(top, left, height, width)`` of the crop window.
+
+    Returns:
+        ``(cropped_image, target)``.  ``target`` is a shallow copy with the
+        updated entries, or ``None`` when no target was supplied.
+
+    Raises:
+        KeyError: ``target`` lacks ``"id2catname"`` or ``"caption_list"``.
+    """
+    cropped_image = F.crop(image, *region)
+    if target is None:
+        return cropped_image, target
+
+    target = target.copy()
+    i, j, h, w = region
+    id2catname = target["id2catname"]
+    caption_list = target["caption_list"]
+    target["size"] = torch.tensor([h, w])
+
+    # Per-instance entries that are filtered together with the boxes/masks.
+    fields = ["labels", "area", "iscrowd", "positive_map", "keypoints"]
+
+    if "boxes" in target:
+        boxes = target["boxes"]
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        # Shift into crop coordinates, then clip both corners to the window.
+        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
+        target["boxes"] = cropped_boxes.reshape(-1, 4)
+        target["area"] = area
+        fields.append("boxes")
+
+    if "masks" in target:
+        # FIXME should we update the area here if there are no boxes?
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
+        fields.append("masks")
+
+    # Remove elements whose box (or mask, when there are no boxes) has zero
+    # area.  ``keep`` stays None when there is nothing to base the filter on.
+    keep = None
+    if "boxes" in target or "masks" in target:
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        if "boxes" in target:
+            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+        else:
+            keep = target['masks'].flatten(1).any(1)
+
+        for field in fields:
+            if field in target:
+                target[field] = target[field][keep]
+
+    if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+        # for debug and visualization only.
+        # Guard on ``keep``: without boxes or masks there is no mask to apply,
+        # and the unguarded code raised NameError here.
+        if keep is not None and 'strings_positive' in target:
+            target['strings_positive'] = [_i for _i, _j in zip(target['strings_positive'], keep) if _j]
+
+    if "keypoints" in target:
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        keypoints = target["keypoints"]
+        # Shift and clip only the (x, y) columns; the third column is carried
+        # over unchanged.
+        cropped_keypoints = keypoints.view(-1, 3)[:, :2] - torch.as_tensor([j, i])
+        cropped_keypoints = torch.min(cropped_keypoints, max_size)
+        cropped_keypoints = cropped_keypoints.clamp(min=0)
+        cropped_keypoints = torch.cat([cropped_keypoints, keypoints.view(-1, 3)[:, 2].unsqueeze(1)], dim=1)
+        target["keypoints"] = cropped_keypoints.view(target["keypoints"].shape[0], target["keypoints"].shape[1], 3)
+
+    target["id2catname"] = id2catname
+    target["caption_list"] = caption_list
+
+    return cropped_image, target
+
+
+# Left/right keypoint index pairs that trade places under a horizontal flip,
+# keyed by the ``dataset_name`` stored in the target.
+_COCO_STYLE_FLIP_PAIRS = [[1, 2], [3, 4], [5, 6], [7, 8],
+                          [9, 10], [11, 12], [13, 14], [15, 16]]
+_AP_ANIMAL_FLIP_PAIRS = [[0, 1], [5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]]
+_FLIP_PAIRS = {
+    "coco_person": _COCO_STYLE_FLIP_PAIRS,
+    "macaque": _COCO_STYLE_FLIP_PAIRS,
+    "animalkindom_ak_P1_animal": [[1, 2], [4, 5], [7, 8], [9, 10], [11, 12], [14, 15], [16, 17], [18, 19]],
+    "animalweb_animal": [[0, 3], [1, 2], [5, 6]],
+    "face": [
+        [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
+        [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
+        [31, 35], [32, 34],
+        [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
+        [48, 54], [49, 53], [50, 52],
+        [55, 59], [56, 58],
+        [60, 64], [61, 63],
+        [65, 67],
+    ],
+    "hand": [],
+    "foot": [],
+    "locust": [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]],
+    "fly": [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], [16, 28], [17, 29], [30, 31]],
+    "ap_36k_animal": _AP_ANIMAL_FLIP_PAIRS,
+    "ap_10k_animal": _AP_ANIMAL_FLIP_PAIRS,
+}
+
+
+def hflip(image, target):
+    """Mirror ``image`` horizontally and update the annotations in ``target``.
+
+    Args:
+        image: Image with a ``size`` attribute unpacking to ``(w, h)``.
+        target: Annotation dict, or ``None``.  ``"boxes"`` and ``"masks"`` are
+            flipped when present.  When ``"keypoints"`` is present the dict
+            must also carry ``"dataset_name"``, which selects the left/right
+            index pairs to swap.
+
+    Returns:
+        ``(flipped_image, target)``.  ``target`` is a shallow copy whose
+        flipped entries are new tensors; the caller's tensors are left as-is.
+
+    Raises:
+        KeyError: keypoints are present but ``"dataset_name"`` is missing.
+        ValueError: ``dataset_name`` has no flip-pair table.
+    """
+    flipped_image = F.hflip(image)
+
+    w, h = image.size
+
+    if target is None:
+        return flipped_image, target
+
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        # Swap the x0/x1 columns and reflect them: new_x = w - old_x.
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
+        target["boxes"] = boxes
+
+    if "masks" in target:
+        target['masks'] = target['masks'].flip(-1)
+
+    if "keypoints" in target:
+        dataset_name = target["dataset_name"]
+        if dataset_name not in _FLIP_PAIRS:
+            # The former if/elif chain left ``flip_pairs`` unbound here and
+            # failed later with an UnboundLocalError.
+            raise ValueError(
+                "hflip: no keypoint flip pairs defined for dataset_name {!r}".format(dataset_name))
+        flip_pairs = _FLIP_PAIRS[dataset_name]
+
+        # Clone first: ``target.copy()`` is shallow, so writing into the
+        # original tensor would also alter the caller's annotations.
+        keypoints = target["keypoints"].clone()
+        # Keypoint x is reflected as w - x - 1 (boxes above use w - x).
+        keypoints[:, :, 0] = w - keypoints[:, :, 0] - 1
+        for first, second in flip_pairs:
+            held = keypoints[:, first, :].clone()
+            keypoints[:, first, :] = keypoints[:, second, :]
+            keypoints[:, second, :] = held
+        target["keypoints"] = keypoints
+    return flipped_image, target
+
+
+def resize(image, target, size, max_size=None):
+    """Resize ``image`` and rescale the annotations in ``target`` to match.
+
+    Args:
+        image: Image with a ``size`` attribute unpacking to ``(w, h)``.
+        target: Annotation dict, or ``None``.
+        size: Either a scalar length for the shorter side (aspect ratio is
+            kept) or a ``(w, h)`` list/tuple used as the exact output size.
+        max_size: Optional cap on the longer side; only consulted when
+            ``size`` is a scalar.
+
+    Returns:
+        ``(rescaled_image, target)``; ``target`` is a shallow copy with scaled
+        ``boxes``/``area``/``keypoints``, resampled ``masks`` and a new
+        ``size``, or ``None`` when no target was supplied.
+    """
+
+    def _aspect_preserving_hw(image_size, short_side, long_side_cap=None):
+        # Output (h, w) whose shorter side is ``short_side``, reduced first if
+        # the longer side would otherwise exceed ``long_side_cap``.
+        w, h = image_size
+        if long_side_cap is not None:
+            shorter = float(min((w, h)))
+            longer = float(max((w, h)))
+            if longer / shorter * short_side > long_side_cap:
+                short_side = int(round(long_side_cap * shorter / longer))
+
+        # Already at the requested size: keep the image dimensions.
+        if (w <= h and w == short_side) or (h <= w and h == short_side):
+            return (h, w)
+
+        if w < h:
+            return (int(short_side * h / w), short_side)
+        return (short_side, int(short_side * w / h))
+
+    def _output_hw(image_size, requested, long_side_cap=None):
+        # An explicit (w, h) pair is simply reversed into (h, w).
+        if isinstance(requested, (list, tuple)):
+            return requested[::-1]
+        return _aspect_preserving_hw(image_size, requested, long_side_cap)
+
+    size = _output_hw(image.size, size, max_size)
+    rescaled_image = F.resize(image, size)
+
+    if target is None:
+        return rescaled_image, None
+
+    new_w, new_h = rescaled_image.size
+    old_w, old_h = image.size
+    ratio_width = float(new_w) / float(old_w)
+    ratio_height = float(new_h) / float(old_h)
+
+    target = target.copy()
+    if "boxes" in target:
+        box_scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+        target["boxes"] = target["boxes"] * box_scale
+
+    if "area" in target:
+        target["area"] = target["area"] * (ratio_width * ratio_height)
+
+    if "keypoints" in target:
+        # The third component is multiplied by 1, i.e. left unchanged.
+        kpt_scale = torch.as_tensor([ratio_width, ratio_height, 1])
+        target["keypoints"] = target["keypoints"] * kpt_scale
+
+    h, w = size
+    target["size"] = torch.tensor([h, w])
+
+    if "masks" in target:
+        resampled = interpolate(target['masks'][:, None].float(), size, mode="nearest")
+        target['masks'] = resampled[:, 0] > 0.5
+
+    return rescaled_image, target
+
+
+def pad(image, target, padding):
+    """Pad ``image`` on its right and bottom edges.
+
+    Args:
+        image: Image handed to ``torchvision.transforms.functional.pad``.
+        target: Annotation dict, or ``None``.
+        padding: Indexable whose item 0 is the right padding and item 1 the
+            bottom padding.
+
+    Returns:
+        ``(padded_image, target)``; ``target`` is a shallow copy with a new
+        ``"size"`` and padded ``"masks"`` (when present), or ``None``.
+    """
+    # assumes that we only pad on the bottom right corners
+    pad_right = padding[0]
+    pad_bottom = padding[1]
+    padded_image = F.pad(image, (0, 0, pad_right, pad_bottom))
+    if target is None:
+        return padded_image, None
+
+    updated = target.copy()
+    # should we do something wrt the original size?
+    updated["size"] = torch.tensor(padded_image.size[::-1])
+    if "masks" in updated:
+        updated['masks'] = torch.nn.functional.pad(
+            updated['masks'], (0, pad_right, 0, pad_bottom))
+    return padded_image, updated
+
+
+class ResizeDebug(object):
+    """Callable that applies ``resize`` with a fixed ``size`` and no ``max_size``."""
+
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target):
+        requested_size = self.size
+        return resize(img, target, requested_size)
+
+
+class RandomCrop(object):
+    """Crop a window of fixed ``size`` at a random position.
+
+    The window is drawn by ``torchvision.transforms.RandomCrop.get_params``
+    and applied to both image and target through ``crop``.
+    """
+
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target):
+        window = T.RandomCrop.get_params(img, self.size)
+        cropped_img, cropped_target = crop(img, target, window)
+        return cropped_img, cropped_target
+
+
+class RandomSizeCrop(object):
+    """Crop a randomly sized, randomly placed window out of the image.
+
+    Args:
+        min_size: Smallest allowed crop width/height.
+        max_size: Largest allowed crop width/height (also capped by the image).
+        respect_boxes: True to retry (up to 10 attempts) until a crop keeps
+            every box; False to tolerate boxes being filtered out by the crop.
+    """
+
+    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
+        self.min_size = min_size
+        self.max_size = max_size
+        self.respect_boxes = respect_boxes
+
+    @staticmethod
+    def _num_boxes(target):
+        # Number of boxes in ``target``; 0 when it is None or has no "boxes".
+        if target is None or "boxes" not in target:
+            return 0
+        return len(target["boxes"])
+
+    def __call__(self, img: PIL.Image.Image, target: dict):
+        """Return ``(cropped_img, cropped_target)`` for one random window.
+
+        With ``respect_boxes`` set, the last attempt is returned even if it
+        dropped boxes, so the call always yields a result.
+        """
+        init_boxes = self._num_boxes(target)
+        max_patience = 10
+        for _ in range(max_patience):
+            w = random.randint(self.min_size, min(img.width, self.max_size))
+            h = random.randint(self.min_size, min(img.height, self.max_size))
+            region = T.RandomCrop.get_params(img, [h, w])
+            result_img, result_target = crop(img, target, region)
+            # Nothing to validate: accept the first crop.  Previously a None
+            # target ran all ten attempts and threw nine of them away.
+            if target is None or not self.respect_boxes:
+                return result_img, result_target
+            # ``_num_boxes`` tolerates a target without "boxes", which used to
+            # raise KeyError on this comparison.
+            if self._num_boxes(result_target) == init_boxes:
+                return result_img, result_target
+        # Patience exhausted: fall back to the last attempt.
+        return result_img, result_target
+
+
+class CenterCrop(object):
+    """Crop a centred window of ``size`` given as ``(height, width)``."""
+
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target):
+        crop_height, crop_width = self.size
+        image_width, image_height = img.size
+        # Half of the leftover margin on each axis, rounded to an integer.
+        top = int(round((image_height - crop_height) / 2.))
+        left = int(round((image_width - crop_width) / 2.))
+        region = (top, left, crop_height, crop_width)
+        return crop(img, target, region)
+
+
+class RandomHorizontalFlip(object):
+    """Apply ``hflip`` with probability ``p``; otherwise pass inputs through."""
+
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, img, target):
+        should_flip = random.random() < self.p
+        if not should_flip:
+            return img, target
+        return hflip(img, target)
+
+
+class RandomResize(object):
+    """Resize to a size picked at random from ``sizes`` on every call.
+
+    Args:
+        sizes: List or tuple of candidate sizes, each passed to ``resize``.
+        max_size: Forwarded to ``resize`` unchanged.
+    """
+
+    def __init__(self, sizes, max_size=None):
+        assert isinstance(sizes, (list, tuple))
+        self.max_size = max_size
+        self.sizes = sizes
+
+    def __call__(self, img, target=None):
+        picked = random.choice(self.sizes)
+        return resize(img, target, picked, self.max_size)
+
+
+class RandomPad(object):
+    """Pad by independent random amounts in ``[0, max_pad]`` along x and y."""
+
+    def __init__(self, max_pad):
+        self.max_pad = max_pad
+
+    def __call__(self, img, target):
+        upper = self.max_pad
+        # x is drawn before y, so the random stream is consumed in that order.
+        pad_x = random.randint(0, upper)
+        pad_y = random.randint(0, upper)
+        amounts = (pad_x, pad_y)
+        return pad(img, target, amounts)
+
+
+class RandomSelect(object):
+    """
+    Apply exactly one of two transforms per call: ``transforms1`` with
+    probability ``p``, ``transforms2`` with probability ``1 - p``.
+    """
+    def __init__(self, transforms1, transforms2, p=0.5):
+        self.p = p
+        self.transforms1 = transforms1
+        self.transforms2 = transforms2
+
+    def __call__(self, img, target):
+        chosen = self.transforms1 if random.random() < self.p else self.transforms2
+        return chosen(img, target)
+
+
+class ToTensor(object):
+    """Convert the image with ``F.to_tensor``; the target passes through untouched."""
+
+    def __call__(self, img, target):
+        tensor_img = F.to_tensor(img)
+        return tensor_img, target
+
+
+class RandomErasing(object):
+    """Image-only wrapper around ``torchvision.transforms.RandomErasing``.
+
+    All constructor arguments are forwarded to the wrapped transform; the
+    target is returned unchanged.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self.eraser = T.RandomErasing(*args, **kwargs)
+
+    def __call__(self, img, target):
+        erased_img = self.eraser(img)
+        return erased_img, target
+
+
+class Normalize(object):
+    """Normalize the image tensor and convert annotations to relative units.
+
+    Args:
+        mean: Per-channel mean forwarded to ``F.normalize``.
+        std: Per-channel standard deviation forwarded to ``F.normalize``.
+    """
+
+    # Number of keypoint slots every target is zero-padded to.
+    MAX_KEYPOINTS = 68
+
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, image, target=None):
+        """Return ``(normalized_image, target)``.
+
+        When ``target`` is given, a shallow copy is returned in which:
+
+        * ``boxes`` go through ``box_xyxy_to_cxcywh`` and are divided by
+          ``[w, h, w, h]``;
+        * ``area`` is divided by ``w * h``;
+        * ``keypoints`` become one row per instance laid out as the flattened
+          ``(x, y)`` pairs divided by ``(w, h)`` and zero-padded to
+          ``2 * MAX_KEYPOINTS`` columns, followed by the third component
+          (with value 2 rewritten to 1) zero-padded to ``MAX_KEYPOINTS``
+          columns; ``valid_kpt_num`` records the unpadded keypoint count.
+
+        ``h`` and ``w`` are the last two dimensions of the normalized image.
+        """
+        image = F.normalize(image, mean=self.mean, std=self.std)
+        if target is None:
+            return image, None
+        target = target.copy()
+        h, w = image.shape[-2:]
+        if "boxes" in target:
+            boxes = target["boxes"]
+            boxes = box_xyxy_to_cxcywh(boxes)
+            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
+            target["boxes"] = boxes
+
+        if "area" in target:
+            area = target["area"]
+            area = area / (torch.tensor(w, dtype=torch.float32) * torch.tensor(h, dtype=torch.float32))
+            target["area"] = area
+
+        if "keypoints" in target:
+            keypoints = target["keypoints"]
+            # Clone before editing: the slice is a view, and ``target.copy()``
+            # is shallow, so an in-place write would change the caller's
+            # keypoint tensor.
+            V = keypoints[:, :, 2].clone()
+            V[V == 2] = 1
+            num_kpt = V.shape[-1]
+            Z = keypoints[:, :, :2].contiguous().view(-1, 2 * num_kpt)
+            Z = Z / torch.tensor([w, h] * num_kpt, dtype=torch.float32)
+            target["valid_kpt_num"] = num_kpt
+            Z_pad = torch.zeros(Z.shape[0], self.MAX_KEYPOINTS * 2 - Z.shape[1])
+            V_pad = torch.zeros(V.shape[0], self.MAX_KEYPOINTS - V.shape[1])
+            V = torch.cat([V, V_pad], dim=1)
+            Z = torch.cat([Z, Z_pad], dim=1)
+            all_keypoints = torch.cat([Z, V], dim=1)
+            target["keypoints"] = all_keypoints
+
+        return image, target
+
+
+class Compose(object):
+    """Chain ``(image, target)`` transforms, applying them in list order."""
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, image, target):
+        pair = (image, target)
+        for transform in self.transforms:
+            pair = transform(*pair)
+        image, target = pair
+        return image, target
+
+    def __repr__(self):
+        # One indented line per transform between "ClassName(" and ")".
+        body = "".join("\n    {0}".format(t) for t in self.transforms)
+        return self.__class__.__name__ + "(" + body + "\n)"
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/addict.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/addict.py
new file mode 100644
index 0000000000000000000000000000000000000000..55e02d1d17596c77a6f3642ba02eeb30971048bd
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/addict.py
@@ -0,0 +1,159 @@
+import copy
+
+
+class Dict(dict):
+    """``dict`` subclass whose items can also be read and written as attributes.
+
+    Nested plain dicts passed in are converted to this class (see ``_hook``).
+    Reading a missing key returns a detached child instance that attaches
+    itself to its parent on first assignment (see ``__missing__`` and
+    ``__setitem__``). ``freeze()`` turns that behaviour off and rejects new keys.
+    """
+
+    def __init__(__self, *args, **kwargs):
+        """Build the mapping from positional dicts / pairs and keyword items.
+
+        The keyword arguments ``__parent`` and ``__key`` are consumed as
+        bookkeeping (see ``__missing__``) and are not stored as items.
+        """
+        # object.__setattr__ bypasses this class's __setattr__, which would
+        # otherwise store these bookkeeping values as mapping items.
+        object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))
+        object.__setattr__(__self, '__key', kwargs.pop('__key', None))
+        object.__setattr__(__self, '__frozen', False)
+        for arg in args:
+            if not arg:
+                # Falsy positional arguments (None, {}, (), ...) are ignored.
+                continue
+            elif isinstance(arg, dict):
+                for key, val in arg.items():
+                    __self[key] = __self._hook(val)
+            elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)):
+                # A tuple whose first element is not a tuple is treated as a
+                # single (key, value) pair.
+                __self[arg[0]] = __self._hook(arg[1])
+            else:
+                # Anything else is iterated as a sequence of (key, value) pairs.
+                for key, val in iter(arg):
+                    __self[key] = __self._hook(val)
+
+        for key, val in kwargs.items():
+            __self[key] = __self._hook(val)
+
+    def __setattr__(self, name, value):
+        """Store ``value`` as item ``name``; names defined on the class are read-only."""
+        if hasattr(self.__class__, name):
+            raise AttributeError("'Dict' object attribute "
+                                 "'{0}' is read-only".format(name))
+        else:
+            self[name] = value
+
+    def __setitem__(self, name, value):
+        """Set an item, honouring the frozen flag and attaching to a pending parent."""
+        isFrozen = (hasattr(self, '__frozen') and
+                    object.__getattribute__(self, '__frozen'))
+        # A frozen instance may overwrite existing keys but not add new ones.
+        if isFrozen and name not in super(Dict, self).keys():
+                raise KeyError(name)
+        super(Dict, self).__setitem__(name, value)
+        try:
+            p = object.__getattribute__(self, '__parent')
+            key = object.__getattribute__(self, '__key')
+        except AttributeError:
+            # The markers were already removed by an earlier assignment.
+            p = None
+            key = None
+        if p is not None:
+            # First assignment on a child created by __missing__: insert it
+            # into its parent, then drop the markers so this runs only once.
+            p[key] = self
+            object.__delattr__(self, '__parent')
+            object.__delattr__(self, '__key')
+
+    def __add__(self, other):
+        """Return ``other`` when this mapping is empty; otherwise raise TypeError."""
+        if not self.keys():
+            return other
+        else:
+            self_type = type(self).__name__
+            other_type = type(other).__name__
+            msg = "unsupported operand type(s) for +: '{}' and '{}'"
+            raise TypeError(msg.format(self_type, other_type))
+
+    @classmethod
+    def _hook(cls, item):
+        """Recursively convert dicts (also inside lists/tuples) to ``cls``."""
+        if isinstance(item, dict):
+            return cls(item)
+        elif isinstance(item, (list, tuple)):
+            # Rebuild the container with its original type.
+            return type(item)(cls._hook(elem) for elem in item)
+        return item
+
+    def __getattr__(self, item):
+        """Attribute access falls back to item lookup."""
+        return self.__getitem__(item)
+
+    def __missing__(self, name):
+        """Return a detached child for a missing key, or raise KeyError when frozen."""
+        if object.__getattribute__(self, '__frozen'):
+            raise KeyError(name)
+        # The child is not inserted here; __setitem__ on the child does that.
+        return self.__class__(__parent=self, __key=name)
+
+    def __delattr__(self, name):
+        """Delete item ``name``."""
+        del self[name]
+
+    def to_dict(self):
+        """Return a plain ``dict`` copy, converting nested instances recursively."""
+        base = {}
+        for key, value in self.items():
+            if isinstance(value, type(self)):
+                base[key] = value.to_dict()
+            elif isinstance(value, (list, tuple)):
+                # Only direct elements of the list/tuple are converted.
+                base[key] = type(value)(
+                    item.to_dict() if isinstance(item, type(self)) else
+                    item for item in value)
+            else:
+                base[key] = value
+        return base
+
+    def copy(self):
+        """Return ``copy.copy(self)``."""
+        return copy.copy(self)
+
+    def deepcopy(self):
+        """Return ``copy.deepcopy(self)``."""
+        return copy.deepcopy(self)
+
+    def __deepcopy__(self, memo):
+        """Deep-copy keys and values into a new instance of the same class."""
+        other = self.__class__()
+        # Register the copy before recursing so self-references resolve to it.
+        memo[id(self)] = other
+        for key, value in self.items():
+            other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)
+        return other
+
+    def update(self, *args, **kwargs):
+        """Recursive update: nested dicts are merged instead of replaced.
+
+        Accepts at most one positional argument plus keyword items.
+        """
+        other = {}
+        if args:
+            if len(args) > 1:
+                raise TypeError()
+            other.update(args[0])
+        other.update(kwargs)
+        for k, v in other.items():
+            # Overwrite unless both the existing and the new value are dicts.
+            if ((k not in self) or
+                (not isinstance(self[k], dict)) or
+                (not isinstance(v, dict))):
+                self[k] = v
+            else:
+                self[k].update(v)
+
+    def __getnewargs__(self):
+        """Pickle support: the items, passed as positional constructor arguments."""
+        return tuple(self.items())
+
+    def __getstate__(self):
+        """Pickle support: the instance itself is used as the state."""
+        return self
+
+    def __setstate__(self, state):
+        """Pickle support: merge the saved state back in via ``update``."""
+        self.update(state)
+
+    def __or__(self, other):
+        """``self | other``: new ``Dict`` holding self's items updated with other's."""
+        if not isinstance(other, (Dict, dict)):
+            return NotImplemented
+        new = Dict(self)
+        new.update(other)
+        return new
+
+    def __ror__(self, other):
+        """``other | self``: new ``Dict`` holding other's items updated with self's."""
+        if not isinstance(other, (Dict, dict)):
+            return NotImplemented
+        new = Dict(other)
+        new.update(self)
+        return new
+
+    def __ior__(self, other):
+        """``self |= other``: in-place recursive update."""
+        self.update(other)
+        return self
+
+    def setdefault(self, key, default=None):
+        """Return ``self[key]`` if present; otherwise store and return ``default``."""
+        if key in self:
+            return self[key]
+        else:
+            self[key] = default
+            return default
+
+    def freeze(self, shouldFreeze=True):
+        """Set the frozen flag on this instance and on directly nested ``Dict`` values."""
+        object.__setattr__(self, '__frozen', shouldFreeze)
+        for key, val in self.items():
+            if isinstance(val, Dict):
+                val.freeze(shouldFreeze)
+
+    def unfreeze(self):
+        """Clear the frozen flag (``freeze(False)``)."""
+        self.freeze(False)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/box_ops.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/box_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff6624064ca10682f0da4c52073fd8006456a9b
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/box_ops.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Utilities for bounding box manipulation and GIoU.
+"""
+import torch, os
+from torchvision.ops.boxes import box_area
+
+
+def box_cxcywh_to_xyxy(x):
+    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) along the last dim."""
+    cx, cy, width, height = x.unbind(-1)
+    half_w = 0.5 * width
+    half_h = 0.5 * height
+    corners = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
+    return torch.stack(corners, dim=-1)
+
+
+def box_xyxy_to_cxcywh(x):
+    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) along the last dim."""
+    left, top, right, bottom = x.unbind(-1)
+    center_x = (left + right) / 2
+    center_y = (top + bottom) / 2
+    return torch.stack((center_x, center_y, right - left, bottom - top), dim=-1)
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1, boxes2):
+    """Pairwise IoU between two sets of xyxy boxes.
+
+    Returns a tuple ``(iou, union)``; both are [N, M] for N boxes in
+    ``boxes1`` and M boxes in ``boxes2``.
+    """
+    # Corners of the intersection rectangle for every (i, j) pair: [N, M, 2].
+    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
+    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    # Non-overlapping pairs get a zero-sized intersection.
+    overlap = (bottom_right - top_left).clamp(min=0)
+    inter = overlap[..., 0] * overlap[..., 1]
+
+    union = box_area(boxes1)[:, None] + box_area(boxes2) - inter
+
+    # The small constant keeps the division finite when the union is zero.
+    return inter / (union + 1e-6), union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/
+
+    Both inputs must hold boxes in [x0, y0, x1, y1] format.
+
+    The result is a pairwise [N, M] matrix with N = len(boxes1)
+    and M = len(boxes2).
+    """
+    # Boxes with x1 < x0 or y1 < y0 would produce inf / nan, so reject them early.
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    iou, union = box_iou(boxes1, boxes2)
+
+    # Smallest box enclosing each (i, j) pair.
+    hull_lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    hull_rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    hull_wh = (hull_rb - hull_lt).clamp(min=0)  # [N,M,2]
+    hull_area = hull_wh[..., 0] * hull_wh[..., 1]
+
+    return iou - (hull_area - union) / (hull_area + 1e-6)
+
+
+
+# modified from torchvision to also return the union
+def box_iou_pairwise(boxes1, boxes2):
+    """Row-wise IoU between two equally sized sets of xyxy boxes.
+
+    Args:
+        boxes1, boxes2: [N, 4] tensors; row i of one is compared with row i
+            of the other.
+
+    Returns:
+        Tuple ``(iou, union)``, each of shape [N].
+    """
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
+    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]
+
+    wh = (rb - lt).clamp(min=0)  # [N,2]
+    inter = wh[:, 0] * wh[:, 1]  # [N]
+
+    union = area1 + area2 - inter
+
+    # Same 1e-6 guard as box_iou: two zero-area boxes give union == 0, and
+    # 0 / 0 would otherwise produce NaN.
+    iou = inter / (union + 1e-6)
+    return iou, union
+
+
+def generalized_box_iou_pairwise(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/, computed row by row.
+
+    Input:
+        - boxes1, boxes2: [N, 4] boxes in [x0, y0, x1, y1] format
+    Output:
+        - giou: [N], GIoU between boxes1[i] and boxes2[i]
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    assert boxes1.shape == boxes2.shape
+    iou, union = box_iou_pairwise(boxes1, boxes2)  # each [N]
+
+    # Smallest box enclosing each pair.
+    lt = torch.min(boxes1[:, :2], boxes2[:, :2])
+    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])
+
+    wh = (rb - lt).clamp(min=0)  # [N,2]
+    area = wh[:, 0] * wh[:, 1]
+
+    # Same 1e-6 guard as generalized_box_iou: coincident zero-area boxes pass
+    # the asserts above but have area == 0, which would yield NaN.
+    return iou - (area - union) / (area + 1e-6)
+
+def masks_to_boxes(masks):
+    """Compute the bounding boxes around the provided masks
+
+    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
+
+    Returns a [N, 4] tensors, with the boxes in xyxy format
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    h, w = masks.shape[-2:]
+
+    # Build the coordinate grids on the masks' device; CPU grids would make
+    # the multiplications below fail for CUDA masks.
+    y = torch.arange(0, h, dtype=torch.float, device=masks.device)
+    x = torch.arange(0, w, dtype=torch.float, device=masks.device)
+    y, x = torch.meshgrid(y, x)
+
+    # Max over (mask * coordinate) gives the far edge. For the near edge,
+    # background pixels are pushed to a large value so they never win the min.
+    x_mask = (masks * x.unsqueeze(0))
+    x_max = x_mask.flatten(1).max(-1)[0]
+    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    y_mask = (masks * y.unsqueeze(0))
+    y_max = y_mask.flatten(1).max(-1)[0]
+    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+if __name__ == '__main__':
+    # Smoke test: pairwise IoU between two random box sets. Results are
+    # printed rather than inspected in a debugger, so running this module
+    # needs no extra debugging package.
+    x = torch.rand(5, 4)
+    y = torch.rand(3, 4)
+    iou, union = box_iou(x, y)
+    print(iou.shape, union.shape)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/config.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..22553079479228cbc26cac4ae70d5a757b0d8e52
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/config.py
@@ -0,0 +1,428 @@
+# ==========================================================
+# Modified from mmcv
+# ==========================================================
+import sys
+import os.path as osp
+import ast
+import tempfile
+import shutil
+from importlib import import_module
+from argparse import Action
+
+from .addict import Dict
+
+# Key under which a config file names the base config file(s) it inherits from.
+BASE_KEY = '_base_'
+# Key that, when truthy inside a child dict, makes the child value replace the
+# base value instead of being merged into it.
+DELETE_KEY = '_delete_'
+# Top-level names a config dict may not define (Config raises KeyError for them).
+RESERVED_KEYS = ['filename', 'text', 'pretty_text', 'get', 'dump', 'merge_from_dict']
+
+
+def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
+    """Raise FileNotFoundError unless ``filename`` is an existing regular file.
+
+    The error message is ``msg_tmpl`` formatted with ``filename``.
+    """
+    if osp.isfile(filename):
+        return
+    raise FileNotFoundError(msg_tmpl.format(filename))
+
+class ConfigDict(Dict):
+    """``Dict`` variant in which a missing key is an error."""
+
+    def __missing__(self, name):
+        # Missing keys always raise; no value is created for them.
+        raise KeyError(name)
+
+    def __getattr__(self, name):
+        """Attribute-style lookup; a missing key surfaces as AttributeError."""
+        try:
+            return super(ConfigDict, self).__getattr__(name)
+        except KeyError:
+            error = AttributeError(f"'{self.__class__.__name__}' object has no "
+                                   f"attribute '{name}'")
+        except Exception as exc:
+            error = exc
+        # Raised after the handlers have finished, not from inside them.
+        raise error
+
+
+class Config(object):
+    """
+    Attribute-style wrapper around a config dictionary, optionally loaded
+    from a file.
+
+    ``.py``, ``.yml``/``.yaml`` and ``.json`` files are accepted. A file may
+    inherit from other files through the ``_base_`` key.
+
+    ref: mmcv.utils.config
+
+    Example:
+        >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1])))
+        >>> cfg.a
+        1
+        >>> cfg.b
+        {'b1': [0, 1]}
+        >>> cfg.b.b1
+        [0, 1]
+        >>> cfg = Config.fromfile('tests/data/config/a.py')
+        >>> cfg.filename
+        "/home/kchen/projects/mmcv/tests/data/config/a.py"
+        >>> cfg.item4
+        'test'
+        >>> cfg
+        "Config (path: /home/kchen/projects/mmcv/tests/data/config/a.py): "
+        "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}"
+    """
+    @staticmethod
+    def _validate_py_syntax(filename):
+        """Raise SyntaxError if the Python file ``filename`` cannot be parsed."""
+        with open(filename) as f:
+            content = f.read()
+        try:
+            ast.parse(content)
+        except SyntaxError as err:
+            raise SyntaxError('There are syntax errors in config '
+                              f'file {filename}') from err
+
+    @staticmethod
+    def _file2dict(filename):
+        """Load ``filename`` (and, recursively, its bases) into a plain dict.
+
+        Args:
+            filename (str): path to a .py/.yml/.yaml/.json config file.
+
+        Returns:
+            tuple: ``(cfg_dict, cfg_text)``; ``cfg_text`` is the file path
+            followed by the file content, preceded by the text of any bases.
+
+        Raises:
+            FileNotFoundError: if ``filename`` does not exist.
+            IOError: if the file extension is not supported.
+            KeyError: if two base files define the same top-level key.
+        """
+        filename = osp.abspath(osp.expanduser(filename))
+        check_file_exist(filename)
+        if filename.lower().endswith('.py'):
+            # The config is copied under a unique temporary module name and
+            # imported from there.
+            with tempfile.TemporaryDirectory() as temp_config_dir:
+                temp_config_file = tempfile.NamedTemporaryFile(
+                    dir=temp_config_dir, suffix='.py')
+                temp_config_name = osp.basename(temp_config_file.name)
+                # close temp file before copy
+                temp_config_file.close()
+                shutil.copyfile(filename,
+                                osp.join(temp_config_dir, temp_config_name))
+                temp_module_name = osp.splitext(temp_config_name)[0]
+                sys.path.insert(0, temp_config_dir)
+                try:
+                    Config._validate_py_syntax(filename)
+                    mod = import_module(temp_module_name)
+                finally:
+                    # Always restore sys.path, even when validation or the
+                    # import raises; remove our own entry rather than
+                    # whatever happens to sit at index 0 by now.
+                    if temp_config_dir in sys.path:
+                        sys.path.remove(temp_config_dir)
+                cfg_dict = {
+                    name: value
+                    for name, value in mod.__dict__.items()
+                    if not name.startswith('__')
+                }
+                # delete imported module
+                sys.modules.pop(temp_module_name, None)
+
+        elif filename.lower().endswith(('.yml', '.yaml', '.json')):
+            from .slio import slload
+            cfg_dict = slload(filename)
+        else:
+            raise IOError('Only py/yml/yaml/json type are supported now!')
+
+        cfg_text = filename + '\n'
+        with open(filename, 'r') as f:
+            cfg_text += f.read()
+
+        # parse the base file
+        if BASE_KEY in cfg_dict:
+            cfg_dir = osp.dirname(filename)
+            base_filename = cfg_dict.pop(BASE_KEY)
+            base_filename = base_filename if isinstance(
+                base_filename, list) else [base_filename]
+
+            cfg_dict_list = list()
+            cfg_text_list = list()
+            for f in base_filename:
+                # Base paths are resolved relative to the including file.
+                _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f))
+                cfg_dict_list.append(_cfg_dict)
+                cfg_text_list.append(_cfg_text)
+
+            base_cfg_dict = dict()
+            for c in cfg_dict_list:
+                if len(base_cfg_dict.keys() & c.keys()) > 0:
+                    raise KeyError('Duplicate key is not allowed among bases')
+                    # TODO Allow the duplicate key while warning the user
+                base_cfg_dict.update(c)
+
+            # Values from this file override the merged bases.
+            base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict)
+            cfg_dict = base_cfg_dict
+
+            # merge cfg_text
+            cfg_text_list.append(cfg_text)
+            cfg_text = '\n'.join(cfg_text_list)
+
+        return cfg_dict, cfg_text
+
+    @staticmethod
+    def _merge_a_into_b(a, b):
+        """Merge dict ``a`` into ``b`` and return the result.
+
+        ``b`` is shallow-copied first, so ``b`` itself is not modified;
+        values in ``a`` overwrite those in ``b``. Note that the
+        ``_delete_`` marker is popped from the dicts inside ``a``.
+
+        Args:
+            a (dict): overriding values. A non-dict ``a`` is returned as is.
+            b (dict | list): base values. When ``b`` is a list, the keys of
+                ``a`` are used as integer indices into it.
+
+        Returns:
+            dict | list: the merged copy of ``b``.
+
+        Raises:
+            TypeError: if a dict in ``a`` would be merged into a value of
+                ``b`` that is neither a dict nor a list, or if ``b`` is a
+                list and a key of ``a`` cannot be converted to ``int``.
+        """
+        if not isinstance(a, dict):
+            return a
+
+        b = b.copy()
+        for k, v in a.items():
+            if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False):
+                if not isinstance(b[k], dict) and not isinstance(b[k], list):
+                    raise TypeError(
+                        f'{k}={v} in child config cannot inherit from base '
+                        f'because {k} is a dict in the child config but is of '
+                        f'type {type(b[k])} in base config. You may set '
+                        f'`{DELETE_KEY}=True` to ignore the base config')
+                b[k] = Config._merge_a_into_b(v, b[k])
+            elif isinstance(b, list):
+                try:
+                    _ = int(k)
+                except (TypeError, ValueError):
+                    # Only conversion failures are translated; anything else
+                    # (e.g. KeyboardInterrupt) propagates untouched.
+                    raise TypeError(
+                        f'b is a list, '
+                        f'index {k} should be an int when input but {type(k)}'
+                    )
+                b[int(k)] = Config._merge_a_into_b(v, b[int(k)])
+            else:
+                b[k] = v
+
+        return b
+
+    @staticmethod
+    def fromfile(filename):
+        """Build a ``Config`` from a config file (see ``_file2dict``)."""
+        cfg_dict, cfg_text = Config._file2dict(filename)
+        return Config(cfg_dict, cfg_text=cfg_text, filename=filename)
+
+
+    def __init__(self, cfg_dict=None, cfg_text=None, filename=None):
+        """Wrap ``cfg_dict`` in a ``ConfigDict``.
+
+        Args:
+            cfg_dict (dict | None): config content; ``None`` means empty.
+            cfg_text (str | None): source text. When falsy, it is read from
+                ``filename`` if that is given, else set to ``''``.
+            filename (str | None): path the config was loaded from.
+
+        Raises:
+            TypeError: if ``cfg_dict`` is not a dict.
+            KeyError: if ``cfg_dict`` contains a key from ``RESERVED_KEYS``.
+        """
+        if cfg_dict is None:
+            cfg_dict = dict()
+        elif not isinstance(cfg_dict, dict):
+            raise TypeError('cfg_dict must be a dict, but '
+                            f'got {type(cfg_dict)}')
+        for key in cfg_dict:
+            if key in RESERVED_KEYS:
+                raise KeyError(f'{key} is reserved for config file')
+
+        # Go through object.__setattr__: this class's __setattr__ forwards
+        # every assignment into the wrapped ConfigDict.
+        super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict))
+        super(Config, self).__setattr__('_filename', filename)
+        if cfg_text:
+            text = cfg_text
+        elif filename:
+            with open(filename, 'r') as f:
+                text = f.read()
+        else:
+            text = ''
+        super(Config, self).__setattr__('_text', text)
+
+
+    @property
+    def filename(self):
+        """Path given at construction time (may be ``None``)."""
+        return self._filename
+
+    @property
+    def text(self):
+        """Source text recorded for this config."""
+        return self._text
+
+    @property
+    def pretty_text(self):
+        """Config content rendered as ``key=value`` / ``dict(...)`` text."""
+
+        indent = 4
+
+        def _indent(s_, num_spaces):
+            # Indent every line except the first by ``num_spaces`` spaces.
+            s = s_.split('\n')
+            if len(s) == 1:
+                return s_
+            first = s.pop(0)
+            s = [(num_spaces * ' ') + line for line in s]
+            s = '\n'.join(s)
+            s = first + '\n' + s
+            return s
+
+        def _format_basic_types(k, v, use_mapping=False):
+            if isinstance(v, str):
+                v_str = f"'{v}'"
+            else:
+                v_str = str(v)
+
+            if use_mapping:
+                k_str = f"'{k}'" if isinstance(k, str) else str(k)
+                attr_str = f'{k_str}: {v_str}'
+            else:
+                attr_str = f'{str(k)}={v_str}'
+            attr_str = _indent(attr_str, indent)
+
+            return attr_str
+
+        def _format_list(k, v, use_mapping=False):
+            # check if all items in the list are dict
+            if all(isinstance(_, dict) for _ in v):
+                v_str = '[\n'
+                v_str += '\n'.join(
+                    f'dict({_indent(_format_dict(v_), indent)}),'
+                    for v_ in v).rstrip(',')
+                if use_mapping:
+                    k_str = f"'{k}'" if isinstance(k, str) else str(k)
+                    attr_str = f'{k_str}: {v_str}'
+                else:
+                    attr_str = f'{str(k)}={v_str}'
+                attr_str = _indent(attr_str, indent) + ']'
+            else:
+                attr_str = _format_basic_types(k, v, use_mapping)
+            return attr_str
+
+        def _contain_invalid_identifier(dict_str):
+            # True when any key cannot be written as a keyword argument.
+            contain_invalid_identifier = False
+            for key_name in dict_str:
+                contain_invalid_identifier |= \
+                    (not str(key_name).isidentifier())
+            return contain_invalid_identifier
+
+        def _format_dict(input_dict, outest_level=False):
+            r = ''
+            s = []
+
+            # Fall back to ``{'key': value}`` notation when a key is not a
+            # valid identifier.
+            use_mapping = _contain_invalid_identifier(input_dict)
+            if use_mapping:
+                r += '{'
+            for idx, (k, v) in enumerate(input_dict.items()):
+                is_last = idx >= len(input_dict) - 1
+                end = '' if outest_level or is_last else ','
+                if isinstance(v, dict):
+                    v_str = '\n' + _format_dict(v)
+                    if use_mapping:
+                        k_str = f"'{k}'" if isinstance(k, str) else str(k)
+                        attr_str = f'{k_str}: dict({v_str}'
+                    else:
+                        attr_str = f'{str(k)}=dict({v_str}'
+                    attr_str = _indent(attr_str, indent) + ')' + end
+                elif isinstance(v, list):
+                    attr_str = _format_list(k, v, use_mapping) + end
+                else:
+                    attr_str = _format_basic_types(k, v, use_mapping) + end
+
+                s.append(attr_str)
+            r += '\n'.join(s)
+            if use_mapping:
+                r += '}'
+            return r
+
+        cfg_dict = self._cfg_dict.to_dict()
+        text = _format_dict(cfg_dict, outest_level=True)
+        return text
+
+
+    def __repr__(self):
+        return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}'
+
+    def __len__(self):
+        return len(self._cfg_dict)
+
+    def __getattr__(self, name):
+        """Forward attributes not found on the instance to the wrapped dict."""
+        return getattr(self._cfg_dict, name)
+
+    def __getitem__(self, name):
+        return self._cfg_dict.__getitem__(name)
+
+    def __setattr__(self, name, value):
+        """Store ``value`` in the wrapped dict, converting dicts to ConfigDict."""
+        if isinstance(value, dict):
+            value = ConfigDict(value)
+        self._cfg_dict.__setattr__(name, value)
+
+    def __setitem__(self, name, value):
+        """Store ``value`` in the wrapped dict, converting dicts to ConfigDict."""
+        if isinstance(value, dict):
+            value = ConfigDict(value)
+        self._cfg_dict.__setitem__(name, value)
+
+    def __iter__(self):
+        return iter(self._cfg_dict)
+
+    def dump(self, file=None):
+        """Return ``pretty_text`` or, when ``file`` is given, write it there."""
+        if file is None:
+            return self.pretty_text
+        else:
+            with open(file, 'w') as f:
+                f.write(self.pretty_text)
+
+    def merge_from_dict(self, options):
+        """Merge a dict of dotted keys into the config.
+
+        Merge the dict parsed by MultipleKVAction into this cfg.
+
+        Examples:
+            >>> options = {'model.backbone.depth': 50,
+            ...            'model.backbone.with_cp':True}
+            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))
+            >>> cfg.merge_from_dict(options)
+            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+            >>> assert cfg_dict == dict(
+            ...     model=dict(backbone=dict(depth=50, with_cp=True)))
+
+        Args:
+            options (dict): dict of configs to merge from.
+        """
+        option_cfg_dict = {}
+        for full_key, v in options.items():
+            # Expand 'a.b.c' into nested dicts {'a': {'b': {'c': v}}}.
+            d = option_cfg_dict
+            key_list = full_key.split('.')
+            for subkey in key_list[:-1]:
+                d.setdefault(subkey, ConfigDict())
+                d = d[subkey]
+            subkey = key_list[-1]
+            d[subkey] = v
+
+        cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+        super(Config, self).__setattr__(
+            '_cfg_dict', Config._merge_a_into_b(option_cfg_dict, cfg_dict))
+
+    # for multiprocess
+    def __setstate__(self, state):
+        # Re-initialises the instance with ``state`` as the config dict.
+        # NOTE(review): presumably ``state`` is the wrapped ConfigDict - confirm
+        # what pickle passes here on the supported Python versions.
+        self.__init__(state)
+
+
+    def copy(self):
+        """Return a new ``Config`` built from ``self._cfg_dict.copy()``."""
+        return Config(self._cfg_dict.copy())
+
+    def deepcopy(self):
+        """Return a new ``Config`` built from ``self._cfg_dict.deepcopy()``."""
+        return Config(self._cfg_dict.deepcopy())
+
+
+class DictAction(Action):
+    """
+    argparse action that collects ``KEY=VALUE`` arguments into a dictionary.
+    Each argument is split on its first ``=``. A value containing commas,
+    i.e. KEY=V1,V2,V3, is stored as a list.
+    """
+
+    @staticmethod
+    def _parse_int_float_bool(val):
+        """Convert ``val`` to int, float, bool or None when it looks like one."""
+        for caster in (int, float):
+            try:
+                return caster(val)
+            except ValueError:
+                continue
+        lowered = val.lower()
+        if lowered in ['true', 'false']:
+            return lowered == 'true'
+        if lowered in ['none', 'null']:
+            return None
+        return val
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        """Parse every ``KEY=VALUE`` item and store the dict on the namespace."""
+        parsed = {}
+        for item in values:
+            name, raw = item.split('=', maxsplit=1)
+            converted = [self._parse_int_float_bool(tok) for tok in raw.split(',')]
+            # A single value is stored bare rather than as a one-item list.
+            parsed[name] = converted[0] if len(converted) == 1 else converted
+        setattr(namespace, self.dest, parsed)
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/keypoint_ops.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/keypoint_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..036d813d555f2f9beee252319c40b05c2f716168
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/keypoint_ops.py
@@ -0,0 +1,29 @@
+import torch, os
+
+def keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor):
+    """Re-pack the last dimension from ``x1 y1 x2 y2 ... v1 v2 ...`` into
+    ``x1 y1 v1 x2 y2 v2 ...``.
+
+    Args:
+        keypoints (torch.Tensor): ..., 3 * K (e.g. 51 for K = 17)
+    """
+    n_kpts = keypoints.shape[-1] // 3
+    split_at = 2 * n_kpts
+    # The leading 2K entries are interleaved x/y, the remainder the third channel.
+    coords, third = keypoints[..., :split_at], keypoints[..., split_at:]
+    out = torch.zeros_like(keypoints)
+    out[..., 0::3] = coords[..., 0::2]
+    out[..., 1::3] = coords[..., 1::2]
+    out[..., 2::3] = third
+    return out
+
+def keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor):
+    """Re-pack the last dimension from ``x1 y1 v1 x2 y2 v2 ...`` into
+    ``x1 y1 x2 y2 ... v1 v2 ...``.
+
+    Args:
+        keypoints (torch.Tensor): ..., 3 * K (e.g. 51 for K = 17)
+    """
+    n_kpts = keypoints.shape[-1] // 3
+    split_at = 2 * n_kpts
+    out = torch.zeros_like(keypoints)
+    # x and y go interleaved into the first 2K slots, the third channel after them.
+    out[..., 0:split_at:2] = keypoints[..., 0::3]
+    out[..., 1:split_at:2] = keypoints[..., 1::3]
+    out[..., split_at:] = keypoints[..., 2::3]
+    return out
\ No newline at end of file
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/misc.py b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fa90f3be6f389cd3ecf7323b55583f021616247
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/XPose/util/misc.py
@@ -0,0 +1,701 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Misc functions, including distributed helpers.
+
+Mostly copy-paste from torchvision references.
+"""
+import functools
+import io
+import os
+import random 
+import subprocess
+import time
+from collections import OrderedDict, defaultdict, deque
+import datetime
+import pickle
+from typing import Optional, List
+
+import json, time
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch import Tensor
+
+import colorsys
+
+# needed due to empty tensor bug in pytorch and torchvision 0.5
+import torchvision
+# Compare (major, minor) rather than the minor component alone, so that a
+# release with major >= 1 and minor < 7 is not mistaken for torchvision < 0.7.
+__torchvision_need_compat_flag = tuple(
+    int(part) for part in torchvision.__version__.split('.')[:2]) < (0, 7)
+if __torchvision_need_compat_flag:
+    from torchvision.ops import _new_empty_tensor
+    from torchvision.ops.misc import _output_size
+
+
+class SmoothedValue(object):
+    """Keep a sliding window of recent values plus a running total, and expose
+    smoothed statistics over the window or over the whole series.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
+
+    def update(self, value, n=1):
+        """Record ``value``; it counts ``n`` times towards the global average."""
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Sum ``count`` and ``total`` across processes.
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_avail_and_initialized():
+            return
+        stats = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        dist.barrier()
+        dist.all_reduce(stats)
+        summed_count, summed_total = stats.tolist()
+        self.count = int(summed_count)
+        self.total = summed_total
+
+    @property
+    def median(self):
+        """Median of the window, or 0 when the window is empty."""
+        window = torch.tensor(list(self.deque))
+        if window.shape[0] == 0:
+            return 0
+        return window.median().item()
+
+    @property
+    def avg(self):
+        """Mean of the window, computed in float32."""
+        window = torch.tensor(list(self.deque), dtype=torch.float32)
+        return window.mean().item()
+
+    @property
+    def global_avg(self):
+        """Total divided by count, with a small epsilon added to the count."""
+        # A larger epsilon is used when the SHILONG_AMP environment variable is '1'.
+        eps = 1e-4 if os.environ.get("SHILONG_AMP", None) == '1' else 1e-6
+        return self.total / (self.count + eps)
+
+    @property
+    def max(self):
+        """Largest value in the window."""
+        return max(self.deque)
+
+    @property
+    def value(self):
+        """Most recently recorded value."""
+        return self.deque[-1]
+
+    def __str__(self):
+        stats = dict(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value)
+        return self.fmt.format(**stats)
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """
+    Return a gloo-backed process group spanning all ranks.
+    The result is cached, so a new group is created at most once.
+    """
+    if dist.get_backend() != "nccl":
+        return dist.group.WORLD
+    # With the nccl backend a separate gloo group is created.
+    return dist.new_group(backend="gloo")
+
+def all_gather_cpu(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors),
+    using the process group returned by ``_get_global_gloo_group``.
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+
+    world_size = get_world_size()
+    if world_size == 1:
+        # Single process: nothing to exchange.
+        return [data]
+
+    cpu_group = _get_global_gloo_group()
+
+    # Serialize the object into a byte tensor.
+    buffer = io.BytesIO()
+    torch.save(data, buffer)
+    data_view = buffer.getbuffer()
+    device = "cuda" if cpu_group is None else "cpu"
+    tensor = torch.ByteTensor(data_view).to(device)
+
+    # obtain Tensor size of each rank
+    local_size = torch.tensor([tensor.numel()], device=device, dtype=torch.long)
+    size_list = [torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size)]
+    if cpu_group is None:
+        dist.all_gather(size_list, local_size)
+    else:
+        print("gathering on cpu")
+        dist.all_gather(size_list, local_size, group=cpu_group)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+    assert isinstance(local_size.item(), int)
+    local_size = int(local_size.item())
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device=device))
+    if local_size != max_size:
+        # The padding is uninitialized; it is cut off again before deserializing.
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device=device)
+        tensor = torch.cat((tensor, padding), dim=0)
+    if cpu_group is None:
+        dist.all_gather(tensor_list, tensor)
+    else:
+        dist.all_gather(tensor_list, tensor, group=cpu_group)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        # Keep only the first ``size`` bytes, i.e. drop the padding.
+        tensor = torch.split(tensor, [size, max_size - size], dim=0)[0]
+        buffer = io.BytesIO(tensor.cpu().numpy())
+        # NOTE(review): torch.load unpickles bytes received from other ranks;
+        # this is only safe if every rank is trusted.
+        obj = torch.load(buffer)
+        data_list.append(obj)
+
+    return data_list
+
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors).
+    Delegates to ``all_gather_cpu`` when the environment variable
+    ``CPU_REDUCE`` is "1"; otherwise the exchange uses CUDA tensors.
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+
+    if os.getenv("CPU_REDUCE") == "1":
+        return all_gather_cpu(data)
+
+
+
+    world_size = get_world_size()
+    if world_size == 1:
+        # Single process: nothing to exchange.
+        return [data]
+
+    # serialized to a Tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    # obtain Tensor size of each rank
+    local_size = torch.tensor([tensor.numel()], device="cuda")
+    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
+    dist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
+    if local_size != max_size:
+        # The padding is uninitialized; it is cut off again before unpickling.
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
+        tensor = torch.cat((tensor, padding), dim=0)
+    dist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        # Keep only the first ``size`` bytes, i.e. drop the padding.
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        # NOTE(review): pickle.loads runs on bytes received from other ranks;
+        # this is only safe if every rank is trusted.
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Args:
+        input_dict (dict): all the values will be reduced
+        average (bool): whether to do average or sum
+    Reduce the values in the dictionary from all processes so that all processes
+    have the averaged results. Returns a dict with the same fields as
+    input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)
+        dist.all_reduce(values)
+        if average:
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values)}
+    return reduced_dict
+
+
+class MetricLogger(object):
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            # print(name, str(meter))
+            # import ipdb;ipdb.set_trace()
+            if meter.count > 0:
+                loss_str.append(
+                    "{}: {}".format(name, str(meter))
+                )
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None, logger=None):
+        if logger is None:
+            print_func = print
+        else:
+            print_func = logger.info
+
+        i = 0
+        if not header:
+            header = ''
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+        if torch.cuda.is_available():
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}',
+                'max mem: {memory:.0f}'
+            ])
+        else:
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}'
+            ])
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)
+            yield obj
+            # import ipdb; ipdb.set_trace()
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print_func(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time),
+                        memory=torch.cuda.max_memory_allocated() / MB))
+                else:
+                    print_func(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time)))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print_func('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / len(iterable)))
+
+
+def get_sha():
+    cwd = os.path.dirname(os.path.abspath(__file__))
+
+    def _run(command):
+        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
+    sha = 'N/A'
+    diff = "clean"
+    branch = 'N/A'
+    try:
+        sha = _run(['git', 'rev-parse', 'HEAD'])
+        subprocess.check_output(['git', 'diff'], cwd=cwd)
+        diff = _run(['git', 'diff-index', 'HEAD'])
+        diff = "has uncommited changes" if diff else "clean"
+        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
+    except Exception:
+        pass
+    message = f"sha: {sha}, status: {diff}, branch: {branch}"
+    return message
+
+
+def collate_fn(batch):
+    # import ipdb; ipdb.set_trace()
+    batch = list(zip(*batch))
+    batch[0] = nested_tensor_from_tensor_list(batch[0])
+    return tuple(batch)
+
+
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+
+class NestedTensor(object):
+    def __init__(self, tensors, mask: Optional[Tensor]):
+        self.tensors = tensors
+        self.mask = mask
+        if mask == 'auto':
+            self.mask = torch.zeros_like(tensors).to(tensors.device)
+            if self.mask.dim() == 3:
+                self.mask = self.mask.sum(0).to(bool)
+            elif self.mask.dim() == 4:
+                self.mask = self.mask.sum(1).to(bool)
+            else:
+                raise ValueError("tensors dim must be 3 or 4 but {}({})".format(self.tensors.dim(), self.tensors.shape))
+
+    def imgsize(self):
+        res = []
+        for i in range(self.tensors.shape[0]):
+            mask = self.mask[i]
+            maxH = (~mask).sum(0).max()
+            maxW = (~mask).sum(1).max()
+            res.append(torch.Tensor([maxH, maxW]))
+        return res
+
+    def to(self, device):
+        # type: (Device) -> NestedTensor # noqa
+        cast_tensor = self.tensors.to(device)
+        mask = self.mask
+        if mask is not None:
+            assert mask is not None
+            cast_mask = mask.to(device)
+        else:
+            cast_mask = None
+        return NestedTensor(cast_tensor, cast_mask)
+
+    def to_img_list_single(self, tensor, mask):
+        assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format(tensor.dim())
+        maxH = (~mask).sum(0).max()
+        maxW = (~mask).sum(1).max()
+        img = tensor[:, :maxH, :maxW]
+        return img
+
+    def to_img_list(self):
+        """remove the padding and convert to img list
+
+        Returns:
+            [type]: [description]
+        """
+        if self.tensors.dim() == 3:
+            return self.to_img_list_single(self.tensors, self.mask)
+        else:
+            res = []
+            for i in range(self.tensors.shape[0]):
+                tensor_i = self.tensors[i]
+                mask_i = self.mask[i]
+                res.append(self.to_img_list_single(tensor_i, mask_i))
+            return res
+
+    @property
+    def device(self):
+        return self.tensors.device
+
+    def decompose(self):
+        return self.tensors, self.mask
+
+    def __repr__(self):
+        return str(self.tensors)
+
+    @property
+    def shape(self):
+        return {
+            'tensors.shape': self.tensors.shape,
+            'mask.shape': self.mask.shape
+        }
+
+
+def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    # TODO make this more general
+    if tensor_list[0].ndim == 3:
+        if torchvision._is_tracing():
+            # nested_tensor_from_tensor_list() does not export well to ONNX
+            # call _onnx_nested_tensor_from_tensor_list() instead
+            return _onnx_nested_tensor_from_tensor_list(tensor_list)
+
+        # TODO make it support different-sized images
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
+        batch_shape = [len(tensor_list)] + max_size
+        b, c, h, w = batch_shape
+        dtype = tensor_list[0].dtype
+        device = tensor_list[0].device
+        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
+        for img, pad_img, m in zip(tensor_list, tensor, mask):
+            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+            m[: img.shape[1], :img.shape[2]] = False
+    else:
+        raise ValueError('not supported')
+    return NestedTensor(tensor, mask)
+
+
+# _onnx_nested_tensor_from_tensor_list() is an implementation of
+# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
+@torch.jit.unused
+def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
+    """Build a NestedTensor from ``tensor_list`` using only pad/stack ops.
+
+    Each tensor is padded at the end of every dimension up to the per-dimension
+    maximum over the list; the mask is 0 over the original extent and 1 over
+    the padding (converted to bool).
+    """
+    # per-dimension maximum size over all tensors in the list
+    # NOTE(review): torch.stack over img.shape[i] presumably relies on shape
+    # entries being tensors while tracing - confirm before calling eagerly
+    max_size = []
+    for i in range(tensor_list[0].dim()):
+        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
+        max_size.append(max_size_i)
+    max_size = tuple(max_size)
+
+    # work around for
+    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+    # m[: img.shape[1], :img.shape[2]] = False
+    # which is not yet supported in onnx
+    padded_imgs = []
+    padded_masks = []
+    for img in tensor_list:
+        # how much each dimension of this tensor falls short of the maximum
+        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
+        # pad amounts are listed starting from the last dimension
+        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
+        padded_imgs.append(padded_img)
+
+        # zeros over the real extent of img[0], padded with the constant 1
+        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
+        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
+        padded_masks.append(padded_mask.to(torch.bool))
+
+    tensor = torch.stack(padded_imgs)
+    mask = torch.stack(padded_masks)
+
+    return NestedTensor(tensor, mask=mask)
+
+
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+    builtin_print = __builtin__.print
+
+    def print(*args, **kwargs):
+        force = kwargs.pop('force', False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+
+    __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+
+
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main_process():
+    return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+    if is_main_process():
+        torch.save(*args, **kwargs)
+
+def init_distributed_mode(args):
+    """Fill the distributed fields of ``args`` and start the process group.
+
+    Three cases, chosen from the environment:
+      * WORLD_SIZE is set and non-empty: rank, world size and local rank are
+        read from RANK / WORLD_SIZE / LOCAL_RANK.
+      * SLURM_PROCID is set: they are read from the SLURM_* variables and,
+        unless HAND_DEFINE_DIST_URL is '1', ``args.dist_url`` is derived from
+        the SLURM node list.
+      * otherwise: ``args`` is marked as non-distributed and the function
+        returns without touching torch.distributed.
+
+    In the first case (and with HAND_DEFINE_DIST_URL == '1') ``args.dist_url``
+    is not assigned here but is read below, so the caller must provide it.
+
+    Side effects in the distributed cases: selects the CUDA device, creates
+    the 'nccl' process group, waits on a barrier and installs the
+    rank-filtered print via setup_for_distributed.
+    """
+    if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
+
+        # launch by torch.distributed.launch
+        # Single node
+        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 1 --rank 0 ...
+        # Multi nodes
+        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 0 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' ...
+        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 1 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' ...
+        # (disabled alternative, kept for reference)
+        # args.rank = int(os.environ.get('OMPI_COMM_WORLD_RANK'))
+        # local_world_size = int(os.environ['GPU_PER_NODE_COUNT'])
+        # args.world_size = args.world_size * local_world_size
+        # args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
+        # args.rank = args.rank * local_world_size + args.local_rank
+        print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))
+        # NOTE(review): this dumps the complete process environment to the
+        # log, which may include credentials - confirm this is intended
+        print(json.dumps(dict(os.environ), indent=2))
+    elif 'SLURM_PROCID' in os.environ:
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID'])
+        args.world_size = int(os.environ['SLURM_NPROCS'])
+
+        if os.environ.get('HAND_DEFINE_DIST_URL', 0) == '1':
+            # the caller supplies args.dist_url
+            pass
+        else:
+            import util.hostlist as uh
+            nodenames = uh.parse_nodelist(os.environ['SLURM_JOB_NODELIST'])
+            # assumes node names are a 3-character prefix followed by an
+            # integer id - TODO confirm against the cluster naming scheme
+            gpu_ids = [int(node[3:]) for node in nodenames]
+            fixid = int(os.environ.get('FIX_DISTRIBUTED_PORT_NUMBER', 0))
+            # fixid += random.randint(0, 300)
+            # port = base 3137 + smallest node id + optional manual offset
+            port = str(3137 + int(min(gpu_ids)) + fixid)
+            args.dist_url = "tcp://{ip}:{port}".format(ip=uh.nodename_to_ip(nodenames[0]), port=port)
+
+        print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))
+
+
+    else:
+        print('Not using distributed mode')
+        args.distributed = False
+        args.world_size = 1
+        args.rank = 0
+        args.local_rank = 0
+        return
+
+    print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank))
+    args.distributed = True
+    torch.cuda.set_device(args.local_rank)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
+
+    torch.distributed.init_process_group(
+        backend=args.dist_backend, 
+        world_size=args.world_size, 
+        rank=args.rank,
+        init_method=args.dist_url,
+    )
+
+    print("Before torch.distributed.barrier()")
+    torch.distributed.barrier()
+    print("End torch.distributed.barrier()")
+    # from here on print() is silent on every rank except rank 0
+    setup_for_distributed(args.rank == 0)
+
+
+@torch.no_grad()
+def accuracy(output, target, topk=(1,)):
+    """Computes the precision@k for the specified values of k"""
+    if target.numel() == 0:
+        return [torch.zeros([], device=output.device)]
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        correct_k = correct[:k].view(-1).float().sum(0)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+@torch.no_grad()
+def accuracy_onehot(pred, gt):
+    """_summary_
+
+    Args:
+        pred (_type_): n, c
+        gt (_type_): n, c
+    """
+    tp = ((pred - gt).abs().sum(-1) < 1e-4).float().sum()
+    acc = tp / gt.shape[0] * 100
+    return acc
+
+
+
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
+    """
+    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
+    This will eventually be supported natively by PyTorch, and this
+    function can go away.
+
+    Which implementation is used depends on the module-level value
+    ``__torchvision_need_compat_flag``: below 0.7 the local fallback runs,
+    otherwise the call is forwarded to torchvision.ops.misc.interpolate.
+    """
+    # NOTE(review): the flag presumably encodes the installed torchvision
+    # version - confirm where it is defined
+    if __torchvision_need_compat_flag < 0.7:
+        if input.numel() > 0:
+            return torch.nn.functional.interpolate(
+                input, size, scale_factor, mode, align_corners
+            )
+
+        # empty input: compute the output shape and return an empty tensor
+        output_shape = _output_size(2, input, size, scale_factor)
+        output_shape = list(input.shape[:-2]) + list(output_shape)
+        return _new_empty_tensor(input, output_shape)
+    else:
+        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+
+class color_sys():
+    def __init__(self, num_colors) -> None:
+        self.num_colors = num_colors
+        colors=[]
+        for i in np.arange(0., 360., 360. / num_colors):
+            hue = i/360.
+            lightness = (50 + np.random.rand() * 10)/100.
+            saturation = (90 + np.random.rand() * 10)/100.
+            colors.append(tuple([int(j*255) for j in colorsys.hls_to_rgb(hue, lightness, saturation)]))
+        self.colors = colors
+
+    def __call__(self, idx):
+        return self.colors[idx]
+
+def inverse_sigmoid(x, eps=1e-3):
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1/x2)
+
+def clean_state_dict(state_dict):
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        if k[:7] == 'module.':
+            k = k[7:]  # remove `module.`
+        new_state_dict[k] = v
+    return new_state_dict
\ No newline at end of file
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fcc3cf5bd57edd623aeffe3d798dfa6714629a9
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/__init__.py
@@ -0,0 +1,19 @@
+# coding: utf-8
+"""InsightFace: A Face Analysis Toolkit."""
+from __future__ import absolute_import
+
+try:
+    #import mxnet as mx
+    import onnxruntime
+except ImportError:
+    raise ImportError(
+        "Unable to import dependency onnxruntime. "
+    )
+
+__version__ = '0.7.3'
+
+from . import model_zoo
+from . import utils
+from . import app
+from . import data
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc574616885290489798bac5c682e7aaa65a5dad
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/__init__.py
@@ -0,0 +1 @@
+from .face_analysis import *
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/common.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ca987aeede35510b3aef72b4edf2390ad84e65
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/common.py
@@ -0,0 +1,49 @@
+import numpy as np
+from numpy.linalg import norm as l2norm
+#from easydict import EasyDict
+
+class Face(dict):
+
+    def __init__(self, d=None, **kwargs):
+        if d is None:
+            d = {}
+        if kwargs:
+            d.update(**kwargs)
+        for k, v in d.items():
+            setattr(self, k, v)
+        # Class attributes
+        #for k in self.__class__.__dict__.keys():
+        #    if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'):
+        #        setattr(self, k, getattr(self, k))
+
+    def __setattr__(self, name, value):
+        if isinstance(value, (list, tuple)):
+            value = [self.__class__(x)
+                    if isinstance(x, dict) else x for x in value]
+        elif isinstance(value, dict) and not isinstance(value, self.__class__):
+            value = self.__class__(value)
+        super(Face, self).__setattr__(name, value)
+        super(Face, self).__setitem__(name, value)
+
+    __setitem__ = __setattr__
+
+    def __getattr__(self, name):
+        return None
+
+    @property
+    def embedding_norm(self):
+        if self.embedding is None:
+            return None
+        return l2norm(self.embedding)
+
+    @property 
+    def normed_embedding(self):
+        if self.embedding is None:
+            return None
+        return self.embedding / self.embedding_norm
+
+    @property 
+    def sex(self):
+        if self.gender is None:
+            return None
+        return 'M' if self.gender==1 else 'F'
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/face_analysis.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/face_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d612cf6f64b3d73281433ed6ad3a0abfd6eef41
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/app/face_analysis.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-05-04
+# @Function      :
+
+
+from __future__ import division
+
+import glob
+import os.path as osp
+
+import numpy as np
+import onnxruntime
+from numpy.linalg import norm
+
+from ..model_zoo import model_zoo
+from ..utils import ensure_available
+from .common import Face
+
+
+DEFAULT_MP_NAME = 'buffalo_l'
+__all__ = ['FaceAnalysis'] 
+
+class FaceAnalysis:
+    def __init__(self, name=DEFAULT_MP_NAME, root='~/.insightface', allowed_modules=None, **kwargs):
+        onnxruntime.set_default_logger_severity(3)
+        self.models = {}
+        self.model_dir = ensure_available('models', name, root=root)
+        onnx_files = glob.glob(osp.join(self.model_dir, '*.onnx'))
+        onnx_files = sorted(onnx_files)
+        for onnx_file in onnx_files:
+            model = model_zoo.get_model(onnx_file, **kwargs)
+            if model is None:
+                print('model not recognized:', onnx_file)
+            elif allowed_modules is not None and model.taskname not in allowed_modules:
+                print('model ignore:', onnx_file, model.taskname)
+                del model
+            elif model.taskname not in self.models and (allowed_modules is None or model.taskname in allowed_modules):
+                # print('find model:', onnx_file, model.taskname, model.input_shape, model.input_mean, model.input_std)
+                self.models[model.taskname] = model
+            else:
+                print('duplicated model task type, ignore:', onnx_file, model.taskname)
+                del model
+        assert 'detection' in self.models
+        self.det_model = self.models['detection']
+
+
+    def prepare(self, ctx_id, det_thresh=0.5, det_size=(640, 640)):
+        self.det_thresh = det_thresh
+        assert det_size is not None
+        # print('set det-size:', det_size)
+        self.det_size = det_size
+        for taskname, model in self.models.items():
+            if taskname=='detection':
+                model.prepare(ctx_id, input_size=det_size, det_thresh=det_thresh)
+            else:
+                model.prepare(ctx_id)
+
+    def get(self, img, max_num=0):
+        bboxes, kpss = self.det_model.detect(img,
+                                             max_num=max_num,
+                                             metric='default')
+        if bboxes.shape[0] == 0:
+            return []
+        ret = []
+        for i in range(bboxes.shape[0]):
+            bbox = bboxes[i, 0:4]
+            det_score = bboxes[i, 4]
+            kps = None
+            if kpss is not None:
+                kps = kpss[i]
+            face = Face(bbox=bbox, kps=kps, det_score=det_score)
+            for taskname, model in self.models.items():
+                if taskname=='detection':
+                    continue
+                model.get(img, face)
+            ret.append(face)
+        return ret
+
+    def draw_on(self, img, faces):
+        import cv2
+        dimg = img.copy()
+        for i in range(len(faces)):
+            face = faces[i]
+            box = face.bbox.astype(np.int)
+            color = (0, 0, 255)
+            cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), color, 2)
+            if face.kps is not None:
+                kps = face.kps.astype(np.int)
+                #print(landmark.shape)
+                for l in range(kps.shape[0]):
+                    color = (0, 0, 255)
+                    if l == 0 or l == 3:
+                        color = (0, 255, 0)
+                    cv2.circle(dimg, (kps[l][0], kps[l][1]), 1, color,
+                               2)
+            if face.gender is not None and face.age is not None:
+                cv2.putText(dimg,'%s,%d'%(face.sex,face.age), (box[0]-1, box[1]-4),cv2.FONT_HERSHEY_COMPLEX,0.7,(0,255,0),1)
+
+            #for key, value in face.items():
+            #    if key.startswith('landmark_3d'):
+            #        print(key, value.shape)
+            #        print(value[0:10,:])
+            #        lmk = np.round(value).astype(np.int)
+            #        for l in range(lmk.shape[0]):
+            #            color = (255, 0, 0)
+            #            cv2.circle(dimg, (lmk[l][0], lmk[l][1]), 1, color,
+            #                       2)
+        return dimg
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..665c59ec99b6ebf12822015e0350969c7903e243
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/__init__.py
@@ -0,0 +1,2 @@
+from .image import get_image
+from .pickle_object import get_object
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/image.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d32c4bcb1b13d33bcb0d840cf7b8c08d183b3ea
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/image.py
@@ -0,0 +1,27 @@
+import cv2
+import os
+import os.path as osp
+from pathlib import Path
+
+class ImageCache:
+    # Class-level dict shared by all users: get_image stores each loaded
+    # image here under the key (name, to_rgb).
+    data = {}
+
+def get_image(name, to_rgb=False):
+    key = (name, to_rgb)
+    if key in ImageCache.data:
+        return ImageCache.data[key]
+    images_dir = osp.join(Path(__file__).parent.absolute(), 'images')
+    ext_names = ['.jpg', '.png', '.jpeg']
+    image_file = None
+    for ext_name in ext_names:
+        _image_file = osp.join(images_dir, "%s%s"%(name, ext_name))
+        if osp.exists(_image_file):
+            image_file = _image_file
+            break
+    assert image_file is not None, '%s not found'%name
+    img = cv2.imread(image_file)
+    if to_rgb:
+        img = img[:,:,::-1]
+    ImageCache.data[key] = img
+    return img
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/Tom_Hanks_54745.png b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/Tom_Hanks_54745.png
new file mode 100644
index 0000000000000000000000000000000000000000..906315d13fa29bb3a5ded3e162592f2c7f041b23
Binary files /dev/null and b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/Tom_Hanks_54745.png differ
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_black.jpg b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_black.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0eab0df555c23f1e033537fe39f3c0c8303dd369
Binary files /dev/null and b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_black.jpg differ
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_blue.jpg b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_blue.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f71336b9a0d3038ebd84e6995ebfbe54946fcbb4
Binary files /dev/null and b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_blue.jpg differ
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_green.jpg b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_green.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ac2ad55f4fc580c915dfa4c157ca3bfc84e453f4
Binary files /dev/null and b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_green.jpg differ
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_white.jpg b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_white.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2148ab2d09fdee6e3f59315470e98ecfc54339e4
Binary files /dev/null and b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/mask_white.jpg differ
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/t1.jpg b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/t1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8fd6427a177bd01650c0150e9d02457c3a5dcddd
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/images/t1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47f682e945b659f93a9e490b9c9c4a2a864abe64dace9e1a2893845ddfd69489
+size 128824
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/objects/meanshape_68.pkl b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/objects/meanshape_68.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..d5297e9e8ea5574298ddd287b058252e03aa18c1
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/objects/meanshape_68.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39ffecf84ba73f0d0d7e49380833ba88713c9fcdec51df4f7ac45a48b8f4cc51
+size 974
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/pickle_object.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/pickle_object.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbd87030ea15e1d01af1cd4cff1be2bc54cc82dd
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/pickle_object.py
@@ -0,0 +1,17 @@
+import cv2
+import os
+import os.path as osp
+from pathlib import Path
+import pickle
+
+def get_object(name):
+    objects_dir = osp.join(Path(__file__).parent.absolute(), 'objects')
+    if not name.endswith('.pkl'):
+        name = name+".pkl"
+    filepath = osp.join(objects_dir, name)
+    if not osp.exists(filepath):
+        return None
+    with open(filepath, 'rb') as f:
+        obj = pickle.load(f)
+    return obj
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/rec_builder.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/rec_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e02abc969da2f882639326f5bad3c7e8d08c1fde
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/data/rec_builder.py
@@ -0,0 +1,71 @@
+import pickle
+import numpy as np
+import os
+import os.path as osp
+import sys
+import mxnet as mx
+
+
+class RecBuilder():
+    """Writer for an MXNet indexed RecordIO dataset (``train.idx`` / ``train.rec``).
+
+    Records are appended with :meth:`add` (one new label per call) or
+    :meth:`add_image` (caller-supplied label).  :meth:`close` must be called
+    at the end: it closes the record writer and writes the ``train.meta`` and
+    ``property`` side files.
+    """
+
+    def __init__(self, path, image_size=(112, 112)):
+        """Create the output directory ``path`` and open the record writer.
+
+        Args:
+            path: Directory to create; it must not exist yet.
+            image_size: Pair of ints written verbatim to the ``property`` file.
+
+        Raises:
+            AssertionError: If ``path`` already exists.
+        """
+        self.path = path
+        self.image_size = image_size
+        self.widx = 0        # index of the next record to write
+        self.wlabel = 0      # label that the next add() call will use
+        self.max_label = -1  # largest class label written so far
+        # Raised explicitly (same exception type as the former ``assert``) so
+        # the check is still performed under ``python -O``.
+        if osp.exists(path):
+            raise AssertionError('%s exists' % path)
+        os.makedirs(path)
+        self.writer = mx.recordio.MXIndexedRecordIO(os.path.join(path, 'train.idx'),
+                                                    os.path.join(path, 'train.rec'),
+                                                    'w')
+        self.meta = []
+
+    def _write_record(self, img, header_label, class_label):
+        """Pack ``img`` with ``header_label`` and append it as the next record."""
+        idx = self.widx
+        header = mx.recordio.IRHeader(0, header_label, idx, 0)
+        if isinstance(img, np.ndarray):
+            # Pixel arrays are JPEG-encoded here; the image should be BGR.
+            s = mx.recordio.pack_img(header, img, quality=95, img_fmt='.jpg')
+        else:
+            # Anything else is packed unchanged (presumably already-encoded bytes).
+            s = mx.recordio.pack(header, img)
+        self.writer.write_idx(idx, s)
+        self.meta.append({'image_index': idx, 'image_classes': [class_label]})
+        self.widx += 1
+        # Never let the maximum go backwards, even when add() and add_image()
+        # are mixed; ``property`` derives the class count from it.
+        self.max_label = max(self.max_label, class_label)
+
+    def add(self, imgs):
+        """Append all of ``imgs`` under one freshly allocated label.
+
+        Args:
+            imgs: Non-empty sequence of images (BGR ``np.ndarray`` or packed data).
+
+        Raises:
+            AssertionError: If ``imgs`` is empty.
+        """
+        if not len(imgs) > 0:
+            raise AssertionError('imgs must not be empty')
+        label = self.wlabel
+        for img in imgs:
+            self._write_record(img, label, label)
+        self.wlabel += 1
+
+    def add_image(self, img, label):
+        """Append a single image with an explicit label.
+
+        Args:
+            img: BGR ``np.ndarray`` or packed data.
+            label: Label stored in the record header.  When it is a list, its
+                first element is used as the class label in the metadata.
+        """
+        idlabel = label[0] if isinstance(label, list) else label
+        self._write_record(img, label, idlabel)
+
+    def close(self):
+        """Close the record writer and write the ``train.meta`` / ``property`` files."""
+        # Close explicitly instead of relying on garbage collection, so the
+        # .idx/.rec files are complete once this method returns.
+        self.writer.close()
+        with open(osp.join(self.path, 'train.meta'), 'wb') as pfile:
+            pickle.dump(self.meta, pfile, protocol=pickle.HIGHEST_PROTOCOL)
+        print('stat:', self.widx, self.wlabel)
+        with open(os.path.join(self.path, 'property'), 'w') as f:
+            f.write("%d,%d,%d\n" % (self.max_label+1, self.image_size[0], self.image_size[1]))
+            f.write("%d\n" % (self.widx))
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..225623d6142c968b4040f391039bfab88bdd1b2a
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/__init__.py
@@ -0,0 +1,6 @@
+from .model_zoo import get_model
+from .arcface_onnx import ArcFaceONNX
+from .retinaface import RetinaFace
+from .scrfd import SCRFD
+from .landmark import Landmark
+from .attribute import Attribute
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/arcface_onnx.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/arcface_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..b537ce2ee15d4a1834d54e185f34e336aab30a77
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/arcface_onnx.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-05-04
+# @Function      : 
+
+from __future__ import division
+import numpy as np
+import cv2
+import onnx
+import onnxruntime
+from ..utils import face_align
+
+__all__ = [
+    'ArcFaceONNX',
+]
+
+
+class ArcFaceONNX:
+    """ONNX Runtime wrapper around a face recognition (embedding) model.
+
+    The constructor inspects the first nodes of the ONNX graph to choose the
+    input normalisation constants and reads the input/output metadata from
+    the inference session.
+    """
+
+    def __init__(self, model_file=None, session=None):
+        """Collect model metadata and create the session when none is given.
+
+        Args:
+            model_file: Path to the ``.onnx`` file; must not be ``None``.
+            session: Optional existing inference session.  When ``None`` a
+                new ``onnxruntime.InferenceSession`` is created.
+        """
+        assert model_file is not None
+        self.model_file = model_file
+        self.session = session
+        self.taskname = 'recognition'
+        # Look at the names of the first eight graph nodes only.
+        head_names = [node.name for node in onnx.load(self.model_file).graph.node[:8]]
+        has_sub = any(n.startswith(('Sub', '_minus')) for n in head_names)
+        has_mul = any(n.startswith(('Mul', '_mul')) for n in head_names)
+        if has_sub and has_mul:
+            # mxnet arcface model: feed raw pixel values (mean 0, std 1).
+            self.input_mean, self.input_std = 0.0, 1.0
+        else:
+            self.input_mean, self.input_std = 127.5, 127.5
+        if self.session is None:
+            self.session = onnxruntime.InferenceSession(self.model_file, None)
+        first_input = self.session.get_inputs()[0]
+        self.input_name = first_input.name
+        self.input_shape = first_input.shape
+        # Dims 2 and 3 of the input shape, reversed.
+        self.input_size = tuple(self.input_shape[2:4][::-1])
+        outputs = self.session.get_outputs()
+        self.output_names = [out.name for out in outputs]
+        assert len(self.output_names)==1
+        self.output_shape = outputs[0].shape
+
+    def prepare(self, ctx_id, **kwargs):
+        """Switch the session to the CPU provider when ``ctx_id`` is negative."""
+        if ctx_id < 0:
+            self.session.set_providers(['CPUExecutionProvider'])
+
+    def get(self, img, face):
+        """Compute the embedding of ``face`` in ``img`` and store it on ``face``.
+
+        The face is cropped with ``face_align.norm_crop`` using ``face.kps``;
+        the flattened network output is written to ``face.embedding`` and
+        returned.
+        """
+        aligned = face_align.norm_crop(img, landmark=face.kps, image_size=self.input_size[0])
+        face.embedding = self.get_feat(aligned).flatten()
+        return face.embedding
+
+    def compute_sim(self, feat1, feat2):
+        """Return the cosine similarity of two feature arrays (flattened first)."""
+        vec_a = feat1.ravel()
+        vec_b = feat2.ravel()
+        return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
+
+    def get_feat(self, imgs):
+        """Run the network on one image or a list of images and return its output."""
+        batch = imgs if isinstance(imgs, list) else [imgs]
+        mean = self.input_mean
+        blob = cv2.dnn.blobFromImages(batch, 1.0 / self.input_std, self.input_size,
+                                      (mean, mean, mean), swapRB=True)
+        return self.session.run(self.output_names, {self.input_name: blob})[0]
+
+    def forward(self, batch_data):
+        """Normalise an already-batched input array and return the network output."""
+        normalised = (batch_data - self.input_mean) / self.input_std
+        return self.session.run(self.output_names, {self.input_name: normalised})[0]
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/attribute.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/attribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..40c34de3f0995499448cf5779004cc1e5f3564fb
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/attribute.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-06-19
+# @Function      : 
+
+from __future__ import division
+import numpy as np
+import cv2
+import onnx
+import onnxruntime
+from ..utils import face_align
+
+__all__ = [
+    'Attribute',
+]
+
+
+class Attribute:
+    """ONNX Runtime wrapper for a face attribute model.
+
+    A model whose single output has three values per face is treated as the
+    ``genderage`` task; any other width becomes ``attribute_<width>``.
+    """
+
+    def __init__(self, model_file=None, session=None):
+        """Collect model metadata and create the session when none is given.
+
+        Args:
+            model_file: Path to the ``.onnx`` file; must not be ``None``.
+            session: Optional existing inference session.  When ``None`` a
+                new ``onnxruntime.InferenceSession`` is created.
+        """
+        assert model_file is not None
+        self.model_file = model_file
+        self.session = session
+        # Look at the names of the first eight graph nodes only.
+        head_names = [node.name for node in onnx.load(self.model_file).graph.node[:8]]
+        # A node called 'bn_data' among the first three counts as both markers.
+        bn_first = 'bn_data' in head_names[:3]
+        has_sub = bn_first or any(n.startswith(('Sub', '_minus')) for n in head_names)
+        has_mul = bn_first or any(n.startswith(('Mul', '_mul')) for n in head_names)
+        if has_sub and has_mul:
+            # mxnet arcface model: feed raw pixel values (mean 0, std 1).
+            self.input_mean, self.input_std = 0.0, 1.0
+        else:
+            self.input_mean, self.input_std = 127.5, 128.0
+        if self.session is None:
+            self.session = onnxruntime.InferenceSession(self.model_file, None)
+        first_input = self.session.get_inputs()[0]
+        self.input_name = first_input.name
+        self.input_shape = first_input.shape
+        # Dims 2 and 3 of the input shape, reversed.
+        self.input_size = tuple(self.input_shape[2:4][::-1])
+        outputs = self.session.get_outputs()
+        self.output_names = [out.name for out in outputs]
+        assert len(self.output_names)==1
+        attr_count = outputs[0].shape[1]
+        self.taskname = 'genderage' if attr_count==3 else 'attribute_%d'%attr_count
+
+    def prepare(self, ctx_id, **kwargs):
+        """Switch the session to the CPU provider when ``ctx_id`` is negative."""
+        if ctx_id < 0:
+            self.session.set_providers(['CPUExecutionProvider'])
+
+    def get(self, img, face):
+        """Predict attributes for ``face`` in ``img``.
+
+        The crop is centred on ``face.bbox`` and scaled so that 1.5x the
+        longer box side fits the network input.
+
+        Returns:
+            ``(gender, age)`` for the ``genderage`` task (also stored in
+            ``face['gender']`` / ``face['age']``); otherwise the raw
+            prediction vector.
+        """
+        box = face.bbox
+        box_w = box[2] - box[0]
+        box_h = box[3] - box[1]
+        center = (box[2] + box[0]) / 2, (box[3] + box[1]) / 2
+        side = self.input_size[0]
+        _scale = side / (max(box_w, box_h)*1.5)
+        aimg, M = face_align.transform(img, center, side, _scale, 0)
+        # Use the crop's own (width, height) as the blob size.
+        crop_size = tuple(aimg.shape[0:2][::-1])
+        mean = self.input_mean
+        blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, crop_size, (mean, mean, mean), swapRB=True)
+        pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]
+        if self.taskname != 'genderage':
+            return pred
+        assert len(pred)==3
+        # First two values are the gender scores; the third is age / 100.
+        gender = np.argmax(pred[:2])
+        age = int(np.round(pred[2]*100))
+        face['gender'] = gender
+        face['age'] = age
+        return gender, age
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/inswapper.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/inswapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f321c627ee66cceddcab98b561b997441dd4f768
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/inswapper.py
@@ -0,0 +1,114 @@
+import time
+import numpy as np
+import onnxruntime
+import cv2
+import onnx
+from onnx import numpy_helper
+from ..utils import face_align
+
+
+
+
+class INSwapper():
+    """Face-swapping wrapper around a two-input ONNX model run with ONNX Runtime.
+
+    The model receives an aligned face crop and a latent vector derived from
+    a source face embedding, and produces a single output image (see
+    :meth:`get`).
+    """
+
+    def __init__(self, model_file=None, session=None):
+        """Read the embedding matrix from the ONNX file and set up the session.
+
+        Args:
+            model_file: Path to the ``.onnx`` model; passed to ``onnx.load``.
+            session: Optional existing inference session.  When ``None`` a
+                new ``onnxruntime.InferenceSession`` is created.
+        """
+        self.model_file = model_file
+        self.session = session
+        model = onnx.load(self.model_file)
+        graph = model.graph
+        # The graph's last initializer is kept as ``emap``; get() multiplies
+        # the source embedding by it to obtain the latent input.
+        self.emap = numpy_helper.to_array(graph.initializer[-1])
+        # Input normalisation constants: (x - 0.0) / 255.0.
+        self.input_mean = 0.0
+        self.input_std = 255.0
+        #print('input mean and std:', model_file, self.input_mean, self.input_std)
+        if self.session is None:
+            self.session = onnxruntime.InferenceSession(self.model_file, None)
+        inputs = self.session.get_inputs()
+        self.input_names = []
+        for inp in inputs:
+            self.input_names.append(inp.name)
+        outputs = self.session.get_outputs()
+        output_names = []
+        for out in outputs:
+            output_names.append(out.name)
+        self.output_names = output_names
+        assert len(self.output_names)==1
+        output_shape = outputs[0].shape  # NOTE(review): assigned but never read
+        input_cfg = inputs[0]
+        input_shape = input_cfg.shape
+        self.input_shape = input_shape
+        # print('inswapper-shape:', self.input_shape)
+        # Dims 2 and 3 of the first input's shape, reversed.
+        self.input_size = tuple(input_shape[2:4][::-1])
+
+    def forward(self, img, latent):
+        """Normalise ``img`` and run the model on ``(img, latent)``; return its output."""
+        img = (img - self.input_mean) / self.input_std
+        pred = self.session.run(self.output_names, {self.input_names[0]: img, self.input_names[1]: latent})[0]
+        return pred
+
+    def get(self, img, target_face, source_face, paste_back=True):
+        """Swap ``source_face``'s identity onto ``target_face`` in ``img``.
+
+        Args:
+            img: Target image array; its first two dims are used as height
+                and width.
+            target_face: Face providing ``kps`` (for alignment) and
+                ``landmark_2d_106`` (for the face polygon mask).
+            source_face: Face providing ``normed_embedding``.
+            paste_back: When false, return the swapped crop and the
+                alignment matrix instead of a blended full image.
+
+        Returns:
+            ``(bgr_fake, M)`` when ``paste_back`` is false; otherwise the
+            blended image as a ``uint8`` array.
+        """
+        # Binary (0/1) mask of the polygon through the selected 106-point landmarks.
+        face_mask = np.zeros((img.shape[0], img.shape[1]), np.uint8)
+        cv2.fillPoly(face_mask, np.array([target_face.landmark_2d_106[[1,9,10,11,12,13,14,15,16,2,3,4,5,6,7,8,0,24,23,22,21,20,19,18,32,31,30,29,28,27,26,25,17,101,105,104,103,51,49,48,43]].astype('int64')]), 1)
+        aimg, M = face_align.norm_crop2(img, target_face.kps, self.input_size[0])
+        blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size,
+                                      (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
+        # Latent input: source embedding times ``emap``, scaled to unit L2 norm.
+        latent = source_face.normed_embedding.reshape((1,-1))
+        latent = np.dot(latent, self.emap)
+        latent /= np.linalg.norm(latent)
+        pred = self.session.run(self.output_names, {self.input_names[0]: blob, self.input_names[1]: latent})[0]
+        #print(latent.shape, latent.dtype, pred.shape)
+        # Channels-last, scale by 255, clip to uint8 and reverse the channel order.
+        img_fake = pred.transpose((0,2,3,1))[0]
+        bgr_fake = np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1]
+        if not paste_back:
+            return bgr_fake, M
+        else:
+            target_img = img
+            # Per-pixel mean absolute difference between swapped and aligned
+            # crop, with a 2-pixel border forced to zero.
+            fake_diff = bgr_fake.astype(np.float32) - aimg.astype(np.float32)
+            fake_diff = np.abs(fake_diff).mean(axis=2)
+            fake_diff[:2,:] = 0
+            fake_diff[-2:,:] = 0
+            fake_diff[:,:2] = 0
+            fake_diff[:,-2:] = 0
+            # Warp the swapped crop, an all-white crop and the difference map
+            # back into the target image frame with the inverse alignment.
+            IM = cv2.invertAffineTransform(M)
+            img_white = np.full((aimg.shape[0],aimg.shape[1]), 255, dtype=np.float32)
+            bgr_fake = cv2.warpAffine(bgr_fake, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
+            img_white = cv2.warpAffine(img_white, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
+            fake_diff = cv2.warpAffine(fake_diff, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
+            img_white[img_white>20] = 255
+            # Binarise the difference map at ``fthresh``.
+            fthresh = 10
+            fake_diff[fake_diff<fthresh] = 0
+            fake_diff[fake_diff>=fthresh] = 255
+            img_mask = img_white
+            # Size of the warped crop region, taken from the extent of the 255 pixels.
+            # NOTE(review): np.max/np.min raise on empty input if no pixel equals 255 — confirm callers.
+            mask_h_inds, mask_w_inds = np.where(img_mask==255)
+            mask_h = np.max(mask_h_inds) - np.min(mask_h_inds)
+            mask_w = np.max(mask_w_inds) - np.min(mask_w_inds)
+            mask_size = int(np.sqrt(mask_h*mask_w))
+            # Erode the crop mask with a kernel proportional to its size (at least 10).
+            k = max(mask_size//10, 10)
+            #k = max(mask_size//20, 6)
+            #k = 6
+            kernel = np.ones((k,k),np.uint8)
+            img_mask = cv2.erode(img_mask,kernel,iterations = 1)
+            kernel = np.ones((2,2),np.uint8)
+            fake_diff = cv2.dilate(fake_diff,kernel,iterations = 1)
+
+            # Inside the eroded landmark polygon the difference mask is forced to 255.
+            face_mask = cv2.erode(face_mask,np.ones((11,11),np.uint8),iterations = 1)
+            fake_diff[face_mask==1] = 255
+
+            # Gaussian-blur the crop mask with an odd kernel of size 2*k+1.
+            k = max(mask_size//20, 5)
+            #k = 3
+            #k = 3
+            kernel_size = (k, k)
+            blur_size = tuple(2*i+1 for i in kernel_size)
+            img_mask = cv2.GaussianBlur(img_mask, blur_size, 0)
+            # NOTE(review): the k / kernel_size / blur_size values set in the next
+            # three lines are not read again; the blur below uses a fixed (11,11).
+            k = 5
+            kernel_size = (k, k)
+            blur_size = tuple(2*i+1 for i in kernel_size)
+            fake_diff = cv2.blur(fake_diff, (11,11), 0)
+            ##fake_diff = cv2.GaussianBlur(fake_diff, blur_size, 0)
+            # print('blur_size: ', blur_size)
+            # fake_diff = cv2.blur(fake_diff, (21, 21), 0) # blur_size
+            # Divide both masks by 255, multiply them and use the product as
+            # the per-pixel blend weight.
+            img_mask /= 255
+            fake_diff /= 255
+            # img_mask = fake_diff
+            img_mask = img_mask*fake_diff
+            img_mask = np.reshape(img_mask, [img_mask.shape[0],img_mask.shape[1],1])
+            fake_merged = img_mask * bgr_fake + (1-img_mask) * target_img.astype(np.float32)
+            fake_merged = fake_merged.astype(np.uint8)
+            return fake_merged
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/landmark.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/landmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..598b4b29a2d0674d8bb25b681f921c61460d101c
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/landmark.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-05-04
+# @Function      : 
+
+from __future__ import division
+import numpy as np
+import cv2
+import onnx
+import onnxruntime
+from ..utils import face_align
+from ..utils import transform
+from ..data import get_object
+
+__all__ = [
+    'Landmark',
+]
+
+
+class Landmark:
+    """ONNX Runtime wrapper for a facial landmark model.
+
+    A model whose output width is 3309 is handled as the 68-point 3D variant
+    (which also yields a head pose); every other model is treated as 2D with
+    ``output_width // 2`` points.
+    """
+
+    def __init__(self, model_file=None, session=None):
+        """Collect model metadata and create the session when none is given.
+
+        Args:
+            model_file: Path to the ``.onnx`` file; must not be ``None``.
+            session: Optional existing inference session.  When ``None`` a
+                new ``onnxruntime.InferenceSession`` is created.
+        """
+        assert model_file is not None
+        self.model_file = model_file
+        self.session = session
+        # Look at the names of the first eight graph nodes only.
+        head_names = [node.name for node in onnx.load(self.model_file).graph.node[:8]]
+        # A node called 'bn_data' among the first three counts as both markers.
+        bn_first = 'bn_data' in head_names[:3]
+        has_sub = bn_first or any(n.startswith(('Sub', '_minus')) for n in head_names)
+        has_mul = bn_first or any(n.startswith(('Mul', '_mul')) for n in head_names)
+        if has_sub and has_mul:
+            # mxnet arcface model: feed raw pixel values (mean 0, std 1).
+            self.input_mean, self.input_std = 0.0, 1.0
+        else:
+            self.input_mean, self.input_std = 127.5, 128.0
+        if self.session is None:
+            self.session = onnxruntime.InferenceSession(self.model_file, None)
+        first_input = self.session.get_inputs()[0]
+        self.input_name = first_input.name
+        self.input_shape = first_input.shape
+        # Dims 2 and 3 of the input shape, reversed.
+        self.input_size = tuple(self.input_shape[2:4][::-1])
+        outputs = self.session.get_outputs()
+        self.output_names = [out.name for out in outputs]
+        assert len(self.output_names)==1
+        flat_len = outputs[0].shape[1]
+        self.require_pose = False
+        if flat_len==3309:
+            self.lmk_dim = 3
+            self.lmk_num = 68
+            self.mean_lmk = get_object('meanshape_68.pkl')
+            self.require_pose = True
+        else:
+            self.lmk_dim = 2
+            self.lmk_num = flat_len//self.lmk_dim
+        self.taskname = 'landmark_%dd_%d'%(self.lmk_dim, self.lmk_num)
+
+    def prepare(self, ctx_id, **kwargs):
+        """Switch the session to the CPU provider when ``ctx_id`` is negative."""
+        if ctx_id < 0:
+            self.session.set_providers(['CPUExecutionProvider'])
+
+    def get(self, img, face):
+        """Predict landmarks for ``face`` in ``img`` in image coordinates.
+
+        The result is stored in ``face[self.taskname]``; for the pose-capable
+        model ``face['pose']`` (pitch, yaw, roll) is filled in as well.
+
+        Returns:
+            The landmark array mapped back into ``img`` coordinates.
+        """
+        box = face.bbox
+        box_w = box[2] - box[0]
+        box_h = box[3] - box[1]
+        center = (box[2] + box[0]) / 2, (box[3] + box[1]) / 2
+        side = self.input_size[0]
+        _scale = side / (max(box_w, box_h)*1.5)
+        aimg, M = face_align.transform(img, center, side, _scale, 0)
+        # Use the crop's own (width, height) as the blob size.
+        crop_size = tuple(aimg.shape[0:2][::-1])
+        mean = self.input_mean
+        blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, crop_size, (mean, mean, mean), swapRB=True)
+        pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]
+        # Long outputs hold (x, y, z) triples, short ones (x, y) pairs.
+        coord_dim = 3 if pred.shape[0] >= 3000 else 2
+        pred = pred.reshape((-1, coord_dim))
+        if self.lmk_num < pred.shape[0]:
+            # Only the trailing ``lmk_num`` rows are landmarks.
+            pred = pred[-self.lmk_num:,:]
+        # Shift x/y by one and scale by half the input size; z is only scaled.
+        half = self.input_size[0] // 2
+        pred[:, 0:2] += 1
+        pred[:, 0:2] *= half
+        if pred.shape[1] == 3:
+            pred[:, 2] *= half
+
+        # Undo the crop transform to get image coordinates.
+        pred = face_align.trans_points(pred, cv2.invertAffineTransform(M))
+        face[self.taskname] = pred
+        if self.require_pose:
+            P = transform.estimate_affine_matrix_3d23d(self.mean_lmk, pred)
+            s, R, t = transform.P2sRt(P)
+            rx, ry, rz = transform.matrix2angle(R)
+            face['pose'] = np.array( [rx, ry, rz], dtype=np.float32 ) #pitch, yaw, roll
+        return pred
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/model_store.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/model_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..50bb85d314f5b7a0ea8211d2cd21186e32791592
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/model_store.py
@@ -0,0 +1,103 @@
+"""
+This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/model_store.py
+"""
+from __future__ import print_function
+
+__all__ = ['get_model_file']
+import os
+import zipfile
+import glob
+
+from ..utils import download, check_sha1
+
+# Model name -> SHA-1 checksum that get_model_file() passes to check_sha1()
+# for the model's ``.params`` file.
+_model_sha1 = {
+    'arcface_r100_v1': '95be21b58e29e9c1237f229dae534bd854009ce0',
+    # NOTE(review): empty checksum -- presumably check_sha1 can never match
+    # it, so this model would always fail verification; confirm.
+    'arcface_mfn_v1': '',
+    'retinaface_r50_v1': '39fd1e087a2a2ed70a154ac01fecaa86c315d01b',
+    'retinaface_mnet025_v1': '2c9de8116d1f448fd1d4661f90308faae34c990a',
+    'retinaface_mnet025_v2': '0db1d07921d005e6c9a5b38e059452fc5645e5a4',
+    'genderage_v1': '7dd8111652b7aac2490c5dcddeb268e53ac643e6',
+}
+
+# Download location: ``<base_repo_url>models/<name>.zip``.
+base_repo_url = 'https://insightface.ai/files/'
+_url_format = '{repo_url}models/{file_name}.zip'
+
+
+def short_hash(name):
+    """Return the first eight characters of the checksum registered for ``name``.
+
+    Raises:
+        ValueError: If ``name`` is not a key of ``_model_sha1``.
+    """
+    if name in _model_sha1:
+        return _model_sha1[name][:8]
+    raise ValueError(
+        'Pretrained model for {name} is not available.'.format(name=name))
+
+
+def find_params_file(dir_path):
+    """Return the last ``*.params`` file (in sorted order) directly inside ``dir_path``.
+
+    Args:
+        dir_path: Directory to search (not recursive).
+
+    Returns:
+        Path of the matching file that sorts last, or ``None`` when the
+        directory does not exist or holds no ``.params`` file.
+    """
+    if not os.path.exists(dir_path):
+        return None
+    # Escape the directory part so glob metacharacters in its name
+    # ('[', '*', '?') are matched literally instead of as a pattern.
+    paths = glob.glob("%s/*.params" % glob.escape(os.fspath(dir_path)))
+    if not paths:
+        return None
+    return max(paths)
+
+
+def get_model_file(name, root=os.path.join('~', '.insightface', 'models')):
+    r"""Return the local path of a pretrained model's ``.params`` file.
+
+    An existing file under ``<root>/<name>/`` is returned when its SHA-1
+    matches the registered checksum.  Otherwise ``<name>.zip`` is downloaded
+    from the model repository, extracted into ``<root>/<name>/`` and verified.
+    The directories are created if they don't exist.
+
+    Parameters
+    ----------
+    name : str
+        Name of the model; must be a key of ``_model_sha1``.
+    root : str, default '~/.insightface/models'
+        Location for keeping the model parameters.
+
+    Returns
+    -------
+    file_path
+        Path to the requested pretrained model file.
+
+    Raises
+    ------
+    KeyError
+        If ``name`` has no registered checksum.
+    ValueError
+        If the downloaded archive holds no ``.params`` file or the file's
+        checksum does not match.
+    """
+
+    file_name = name
+    root = os.path.expanduser(root)
+    dir_path = os.path.join(root, name)
+    file_path = find_params_file(dir_path)
+    sha1_hash = _model_sha1[name]
+    if file_path is not None:
+        if check_sha1(file_path, sha1_hash):
+            return file_path
+        print(
+            'Mismatch in the content of model file detected. Downloading again.'
+        )
+    else:
+        print('Model file is not found. Downloading.')
+
+    # exist_ok avoids the race between checking for and creating a directory.
+    os.makedirs(root, exist_ok=True)
+    os.makedirs(dir_path, exist_ok=True)
+
+    zip_file_path = os.path.join(root, file_name + '.zip')
+    repo_url = base_repo_url
+    if not repo_url.endswith('/'):
+        repo_url = repo_url + '/'
+    download(_url_format.format(repo_url=repo_url, file_name=file_name),
+             path=zip_file_path,
+             overwrite=True)
+    try:
+        with zipfile.ZipFile(zip_file_path) as zf:
+            zf.extractall(dir_path)
+    finally:
+        # Remove the archive even when extraction fails, so a corrupt
+        # download is not left behind.
+        os.remove(zip_file_path)
+    file_path = find_params_file(dir_path)
+
+    # Fail with a clear message instead of handing ``None`` to check_sha1.
+    if file_path is None:
+        raise ValueError(
+            'Downloaded archive for {name} contains no .params file.'.format(name=name))
+    if check_sha1(file_path, sha1_hash):
+        return file_path
+    raise ValueError(
+        'Downloaded file has different hash. Please try again.')
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/model_zoo.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/model_zoo.py
new file mode 100644
index 0000000000000000000000000000000000000000..c90b79142e41998552f6c6c1c199c02f3b34f06a
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/model_zoo.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-05-04
+# @Function      :
+
+import os
+import os.path as osp
+import glob
+import onnxruntime
+from .arcface_onnx import *
+from .retinaface import *
+#from .scrfd import *
+from .landmark import *
+from .attribute import Attribute
+from .inswapper import INSwapper
+from ..utils import download_onnx
+
+__all__ = ['get_model']
+
+
+class PickableInferenceSession(onnxruntime.InferenceSession):
+    """``onnxruntime.InferenceSession`` that can be pickled.
+
+    Only the model path and the provider settings are serialised; unpickling
+    re-runs ``__init__`` with them to build a fresh session.  Keeping the
+    provider settings means an unpickled session uses the same execution
+    providers as the original instead of falling back to the defaults.
+    """
+
+    # Constructor keywords that are preserved across pickling.
+    _PICKLED_KWARGS = ('providers', 'provider_options')
+
+    def __init__(self, model_path, **kwargs):
+        """Create the session; ``kwargs`` are forwarded to ``InferenceSession``."""
+        super().__init__(model_path, **kwargs)
+        self.model_path = model_path
+        self._session_kwargs = {k: kwargs[k] for k in self._PICKLED_KWARGS if k in kwargs}
+
+    def __getstate__(self):
+        return {'model_path': self.model_path, 'session_kwargs': self._session_kwargs}
+
+    def __setstate__(self, values):
+        # 'session_kwargs' is absent from pickles written by the previous
+        # implementation, hence the empty default.
+        self.__init__(values['model_path'], **values.get('session_kwargs', {}))
+
+class ModelRouter:
+    """Choose the wrapper class for an ONNX file from its input/output signature."""
+
+    def __init__(self, onnx_file):
+        """Remember the path of the ONNX file to route."""
+        self.onnx_file = onnx_file
+
+    def get_model(self, **kwargs):
+        """Open the file and wrap it in the matching model class.
+
+        ``kwargs`` are forwarded to ``PickableInferenceSession``.
+
+        Returns:
+            A ``RetinaFace``, ``Landmark``, ``Attribute``, ``INSwapper`` or
+            ``ArcFaceONNX`` instance, or ``None`` when no rule matches.
+        """
+        session = PickableInferenceSession(self.onnx_file, **kwargs)
+        inputs = session.get_inputs()
+        input_shape = inputs[0].shape
+        outputs = session.get_outputs()
+
+        # Five or more outputs: detector, regardless of the input size.
+        if len(outputs) >= 5:
+            return RetinaFace(model_file=self.onnx_file, session=session)
+
+        # Every remaining rule is keyed on dims 2 and 3 of the first input.
+        dim2, dim3 = input_shape[2], input_shape[3]
+        if dim2 == 192 and dim3 == 192:
+            wrapper = Landmark
+        elif dim2 == 96 and dim3 == 96:
+            wrapper = Attribute
+        elif len(inputs) == 2 and dim2 == 128 and dim3 == 128:
+            wrapper = INSwapper
+        elif dim2 == dim3 and dim2 >= 112 and dim2 % 16 == 0:
+            wrapper = ArcFaceONNX
+        else:
+            # Unknown signature: report "no model" rather than raising.
+            return None
+        return wrapper(model_file=self.onnx_file, session=session)
+
+def find_onnx_file(dir_path):
+    """Return the last ``*.onnx`` file (in sorted order) directly inside ``dir_path``.
+
+    Args:
+        dir_path: Directory to search (not recursive).
+
+    Returns:
+        Path of the matching file that sorts last, or ``None`` when the
+        directory does not exist or holds no ``.onnx`` file.
+    """
+    if not os.path.exists(dir_path):
+        return None
+    # Escape the directory part so glob metacharacters in its name
+    # ('[', '*', '?') are matched literally instead of as a pattern.
+    paths = glob.glob("%s/*.onnx" % glob.escape(os.fspath(dir_path)))
+    if not paths:
+        return None
+    return max(paths)
+
+def get_default_providers():
+    """Return a new list of the execution providers used when the caller names none."""
+    preferred = ('CUDAExecutionProvider', 'CoreMLExecutionProvider', 'CPUExecutionProvider')
+    return list(preferred)
+
+def get_default_provider_options():
+    """Return the provider options used when the caller gives none (currently ``None``)."""
+    options = None
+    return options
+
+def get_model(name, **kwargs):
+    """Locate an ONNX model and wrap it in the matching model class.
+
+    Args:
+        name: Either a path ending in ``.onnx`` or the name of a directory
+            under ``<root>/models`` that is searched for an ``.onnx`` file.
+        **kwargs: Optional settings:
+            ``root`` (default ``'~/.insightface'``),
+            ``download`` (default ``False``; fetch a missing file with ``download_onnx``),
+            ``download_zip`` (default ``False``; forwarded to ``download_onnx``),
+            ``providers`` / ``provider_options`` (forwarded to the session).
+
+    Returns:
+        The model wrapper chosen by ``ModelRouter``, or ``None`` when a model
+        directory holds no ``.onnx`` file or no routing rule matches.
+
+    Raises:
+        AssertionError: If the resolved model file does not exist or is not
+            a regular file.
+    """
+    root = os.path.expanduser(kwargs.get('root', '~/.insightface'))
+    model_root = osp.join(root, 'models')
+    allow_download = kwargs.get('download', False)
+    download_zip = kwargs.get('download_zip', False)
+    if not name.endswith('.onnx'):
+        model_file = find_onnx_file(os.path.join(model_root, name))
+        if model_file is None:
+            return None
+    else:
+        model_file = name
+    if not osp.exists(model_file) and allow_download:
+        model_file = download_onnx('models', model_file, root=root, download_zip=download_zip)
+    # Raised explicitly (same exception type and message as the former
+    # ``assert`` statements) so the checks are still performed under ``python -O``.
+    if not osp.exists(model_file):
+        raise AssertionError('model_file %s should exist'%model_file)
+    if not osp.isfile(model_file):
+        raise AssertionError('model_file %s should be a file'%model_file)
+    router = ModelRouter(model_file)
+    providers = kwargs.get('providers', get_default_providers())
+    provider_options = kwargs.get('provider_options', get_default_provider_options())
+    return router.get_model(providers=providers, provider_options=provider_options)
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/retinaface.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/retinaface.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc4ad91ed70688b38503127137e928dc7e5433e1
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/retinaface.py
@@ -0,0 +1,301 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-09-18
+# @Function      : 
+
+from __future__ import division
+import datetime
+import numpy as np
+import onnx
+import onnxruntime
+import os
+import os.path as osp
+import cv2
+import sys
+
+def softmax(z):
+    """Row-wise softmax of a 2-D array; each row's max is subtracted before exp."""
+    assert len(z.shape) == 2
+    # Keep the reductions as column vectors so they broadcast across each row.
+    row_max = np.max(z, axis=1)[:, np.newaxis]
+    exps = np.exp(z - row_max)
+    row_sum = np.sum(exps, axis=1)[:, np.newaxis]
+    return exps / row_sum
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+
+    Args:
+        points (ndarray): Shape (n, 2), [x, y].
+        distance (ndarray): Shape (n, 4), distance from the given point to 4
+            boundaries (left, top, right, bottom).
+        max_shape (tuple): Optional (height, width); coordinates are clipped to it.
+
+    Returns:
+        ndarray: Decoded bboxes, shape (n, 4) as [x1, y1, x2, y2].
+    """
+    x1 = points[:, 0] - distance[:, 0]
+    y1 = points[:, 1] - distance[:, 1]
+    x2 = points[:, 0] + distance[:, 2]
+    y2 = points[:, 1] + distance[:, 3]
+    if max_shape is not None:  # ndarrays have no .clamp(); np.clip is the equivalent
+        x1 = np.clip(x1, 0, max_shape[1])
+        y1 = np.clip(y1, 0, max_shape[0])
+        x2 = np.clip(x2, 0, max_shape[1])
+        y2 = np.clip(y2, 0, max_shape[0])
+    return np.stack([x1, y1, x2, y2], axis=-1)
+
+def distance2kps(points, distance, max_shape=None):
+    """Decode keypoint offset predictions to absolute keypoint coordinates.
+
+    Args:
+        points (ndarray): Shape (n, 2), [x, y].
+        distance (ndarray): Shape (n, 2k), interleaved (dx, dy) offsets of k
+            keypoints relative to the matching row of ``points``.
+        max_shape (tuple): Optional (height, width); keypoints are clipped to it.
+
+    Returns:
+        ndarray: Shape (n, 2k), interleaved absolute (x, y) per keypoint.
+    """
+    preds = []
+    for i in range(0, distance.shape[1], 2):
+        # The step is 2, so i is even: x always uses column 0 and y column 1.
+        px = points[:, 0] + distance[:, i]
+        py = points[:, 1] + distance[:, i+1]
+        if max_shape is not None:
+            px = np.clip(px, 0, max_shape[1])
+            py = np.clip(py, 0, max_shape[0])
+        preds.extend((px, py))
+    return np.stack(preds, axis=-1)
+
+class RetinaFace:
+    """ONNX Runtime face detector: resize+pad, run, decode per stride, then NMS."""
+
+    def __init__(self, model_file=None, session=None):
+        """Use ``session`` if given, otherwise open ``model_file`` with onnxruntime."""
+        import onnxruntime
+        self.model_file = model_file
+        self.session = session
+        self.taskname = 'detection'
+        if self.session is None:
+            assert self.model_file is not None
+            assert osp.exists(self.model_file)
+            self.session = onnxruntime.InferenceSession(self.model_file, None)
+        self.center_cache = {}
+        self.nms_thresh = 0.4
+        self.det_thresh = 0.5
+        self._init_vars()
+
+    def _init_vars(self):
+        """Read input/output metadata from the session and pick the head layout."""
+        input_cfg = self.session.get_inputs()[0]
+        input_shape = input_cfg.shape
+        # A string (symbolic) dim means no fixed input size can be read here.
+        if isinstance(input_shape[2], str):
+            self.input_size = None
+        else:
+            self.input_size = tuple(input_shape[2:4][::-1])
+        input_name = input_cfg.name
+        self.input_shape = input_shape
+        outputs = self.session.get_outputs()
+        output_names = []
+        for o in outputs:
+            output_names.append(o.name)
+        self.input_name = input_name
+        self.output_names = output_names
+        self.input_mean = 127.5
+        self.input_std = 128.0
+        # The output count selects strides, anchors per cell and keypoint use.
+        # NOTE(review): any other count leaves fmc/_feat_stride_fpn unset.
+        self.use_kps = False
+        self._anchor_ratio = 1.0
+        self._num_anchors = 1
+        if len(outputs)==6:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+        elif len(outputs)==9:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+            self.use_kps = True
+        elif len(outputs)==10:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+        elif len(outputs)==15:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+            self.use_kps = True
+
+    def prepare(self, ctx_id, **kwargs):
+        """Apply runtime options: CPU-only if ctx_id<0, thresholds, input_size."""
+        if ctx_id<0:
+            self.session.set_providers(['CPUExecutionProvider'])
+        nms_thresh = kwargs.get('nms_thresh', None)
+        if nms_thresh is not None:
+            self.nms_thresh = nms_thresh
+        det_thresh = kwargs.get('det_thresh', None)
+        if det_thresh is not None:
+            self.det_thresh = det_thresh
+        input_size = kwargs.get('input_size', None)
+        if input_size is not None:
+            if self.input_size is not None:
+                print('warning: det_size is already set in detection model, ignore')
+            else:
+                self.input_size = input_size
+
+    def forward(self, img, threshold):
+        """Run the network on ``img`` and decode detections for every stride.
+
+        Predictions scoring below ``threshold`` are dropped. Returns
+        (scores_list, bboxes_list, kpss_list) with one entry per stride;
+        kpss_list stays empty when the model has no keypoint outputs.
+        """
+        scores_list = []
+        bboxes_list = []
+        kpss_list = []
+        input_size = tuple(img.shape[0:2][::-1])
+        blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
+        net_outs = self.session.run(self.output_names, {self.input_name : blob})
+
+        input_height = blob.shape[2]
+        input_width = blob.shape[3]
+        fmc = self.fmc
+        for idx, stride in enumerate(self._feat_stride_fpn):
+            # Outputs are grouped by kind: scores, then boxes, then keypoints.
+            scores = net_outs[idx]
+            bbox_preds = net_outs[idx+fmc]
+            bbox_preds = bbox_preds * stride
+            if self.use_kps:
+                kps_preds = net_outs[idx+fmc*2] * stride
+            height = input_height // stride
+            width = input_width // stride
+            key = (height, width, stride)
+            if key in self.center_cache:
+                anchor_centers = self.center_cache[key]
+            else:
+                # (x, y) grid coordinates for every feature-map cell.
+                anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
+
+                anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )
+                if self._num_anchors>1:
+                    anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )
+                if len(self.center_cache)<100:
+                    self.center_cache[key] = anchor_centers
+
+            pos_inds = np.where(scores>=threshold)[0]
+            bboxes = distance2bbox(anchor_centers, bbox_preds)
+            pos_scores = scores[pos_inds]
+            pos_bboxes = bboxes[pos_inds]
+            scores_list.append(pos_scores)
+            bboxes_list.append(pos_bboxes)
+            if self.use_kps:
+                kpss = distance2kps(anchor_centers, kps_preds)
+                kpss = kpss.reshape( (kpss.shape[0], -1, 2) )
+                pos_kpss = kpss[pos_inds]
+                kpss_list.append(pos_kpss)
+        return scores_list, bboxes_list, kpss_list
+
+    def detect(self, img, input_size = None, max_num=0, metric='default'):
+        """Detect faces in ``img``; returns ``(det, kpss)`` scaled back to ``img``.
+
+        ``det`` rows are [x1, y1, x2, y2, score]; ``kpss`` is None without keypoints.
+        ``max_num`` > 0 keeps the top faces by area, center-weighted unless 'max'.
+        """
+        assert input_size is not None or self.input_size is not None
+        input_size = self.input_size if input_size is None else input_size
+        # Fit the image inside input_size while keeping its aspect ratio.
+        im_ratio = float(img.shape[0]) / img.shape[1]
+        model_ratio = float(input_size[1]) / input_size[0]
+        if im_ratio>model_ratio:
+            new_height = input_size[1]
+            new_width = int(new_height / im_ratio)
+        else:
+            new_width = input_size[0]
+            new_height = int(new_width * im_ratio)
+        det_scale = float(new_height) / img.shape[0]
+        resized_img = cv2.resize(img, (new_width, new_height))
+        # Zero-pad to input_size; the resized image sits in the top-left corner.
+        det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
+        det_img[:new_height, :new_width, :] = resized_img
+
+        scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)
+
+        scores = np.vstack(scores_list)
+        scores_ravel = scores.ravel()
+        order = scores_ravel.argsort()[::-1]
+        bboxes = np.vstack(bboxes_list) / det_scale
+        if self.use_kps:
+            kpss = np.vstack(kpss_list) / det_scale
+        pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
+        pre_det = pre_det[order, :]
+        keep = self.nms(pre_det)
+        det = pre_det[keep, :]
+        if self.use_kps:
+            kpss = kpss[order,:,:]
+            kpss = kpss[keep,:,:]
+        else:
+            kpss = None
+        if max_num > 0 and det.shape[0] > max_num:
+            area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
+            img_center = img.shape[0] // 2, img.shape[1] // 2
+            offsets = np.vstack([
+                (det[:, 0] + det[:, 2]) / 2 - img_center[1],
+                (det[:, 1] + det[:, 3]) / 2 - img_center[0]
+            ])
+            offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
+            if metric=='max':
+                values = area
+            else:
+                values = area - offset_dist_squared * 2.0  # some extra weight on the centering
+            bindex = np.argsort(values)[::-1]
+            bindex = bindex[0:max_num]
+            det = det[bindex, :]
+            if kpss is not None:
+                kpss = kpss[bindex, :]
+        return det, kpss
+
+    def nms(self, dets):
+        """Greedy NMS over [x1, y1, x2, y2, score] rows; returns kept row indices."""
+        thresh = self.nms_thresh
+        x1 = dets[:, 0]
+        y1 = dets[:, 1]
+        x2 = dets[:, 2]
+        y2 = dets[:, 3]
+        scores = dets[:, 4]
+
+        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+        order = scores.argsort()[::-1]
+
+        keep = []
+        while order.size > 0:
+            i = order[0]
+            keep.append(i)
+            xx1 = np.maximum(x1[i], x1[order[1:]])
+            yy1 = np.maximum(y1[i], y1[order[1:]])
+            xx2 = np.minimum(x2[i], x2[order[1:]])
+            yy2 = np.minimum(y2[i], y2[order[1:]])
+
+            w = np.maximum(0.0, xx2 - xx1 + 1)
+            h = np.maximum(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+            inds = np.where(ovr <= thresh)[0]
+            order = order[inds + 1]
+
+        return keep
+
+def get_retinaface(name, download=False, root='~/.insightface/models', **kwargs):
+    """Build a RetinaFace from a local path, or from the model store when download."""
+    if not download:
+        assert os.path.exists(name)
+        return RetinaFace(name)
+    from .model_store import get_model_file
+    _file = get_model_file("retinaface_%s" % name, root=root)
+    return RetinaFace(_file)  # the class is RetinaFace; lowercase `retinaface` is undefined
+
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/scrfd.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/scrfd.py
new file mode 100644
index 0000000000000000000000000000000000000000..674db4bba761157592dfb95c5d1638da1099f89c
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/model_zoo/scrfd.py
@@ -0,0 +1,348 @@
+# -*- coding: utf-8 -*-
+# @Organization  : insightface.ai
+# @Author        : Jia Guo
+# @Time          : 2021-05-04
+# @Function      : 
+
+from __future__ import division
+import datetime
+import numpy as np
+import onnx
+import onnxruntime
+import os
+import os.path as osp
+import cv2
+import sys
+
+def softmax(z):
+    """Softmax along axis 1 of a 2-D array, shifted by the row max before exp."""
+    assert len(z.shape) == 2
+    # [:, np.newaxis] turns each per-row reduction into a broadcastable column.
+    shift = np.max(z, axis=1)[:, np.newaxis]
+    numer = np.exp(z - shift)
+    denom = np.sum(numer, axis=1)[:, np.newaxis]
+    return numer / denom
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+
+    Args:
+        points (ndarray): Shape (n, 2), [x, y].
+        distance (ndarray): Shape (n, 4), distance from the given point to 4
+            boundaries (left, top, right, bottom).
+        max_shape (tuple): Optional (height, width); coordinates are clipped to it.
+
+    Returns:
+        ndarray: Decoded bboxes, shape (n, 4) as [x1, y1, x2, y2].
+    """
+    x1 = points[:, 0] - distance[:, 0]
+    y1 = points[:, 1] - distance[:, 1]
+    x2 = points[:, 0] + distance[:, 2]
+    y2 = points[:, 1] + distance[:, 3]
+    if max_shape is not None:  # ndarrays have no .clamp(); np.clip is the equivalent
+        x1 = np.clip(x1, 0, max_shape[1])
+        y1 = np.clip(y1, 0, max_shape[0])
+        x2 = np.clip(x2, 0, max_shape[1])
+        y2 = np.clip(y2, 0, max_shape[0])
+    return np.stack([x1, y1, x2, y2], axis=-1)
+
+def distance2kps(points, distance, max_shape=None):
+    """Decode keypoint offset predictions to absolute keypoint coordinates.
+
+    Args:
+        points (ndarray): Shape (n, 2), [x, y].
+        distance (ndarray): Shape (n, 2k), interleaved (dx, dy) offsets of k
+            keypoints relative to the matching row of ``points``.
+        max_shape (tuple): Optional (height, width); keypoints are clipped to it.
+
+    Returns:
+        ndarray: Shape (n, 2k), interleaved absolute (x, y) per keypoint.
+    """
+    preds = []
+    for i in range(0, distance.shape[1], 2):
+        # The step is 2, so i is even: x always uses column 0 and y column 1.
+        px = points[:, 0] + distance[:, i]
+        py = points[:, 1] + distance[:, i+1]
+        if max_shape is not None:
+            px = np.clip(px, 0, max_shape[1])
+            py = np.clip(py, 0, max_shape[0])
+        preds.extend((px, py))
+    return np.stack(preds, axis=-1)
+
+class SCRFD:
+    """ONNX Runtime face detector: resize+pad, run, decode per stride, then NMS."""
+
+    def __init__(self, model_file=None, session=None):
+        """Use ``session`` if given, otherwise open ``model_file`` with onnxruntime."""
+        import onnxruntime
+        self.model_file = model_file
+        self.session = session
+        self.taskname = 'detection'
+        self.batched = False
+        if self.session is None:
+            assert self.model_file is not None
+            assert osp.exists(self.model_file)
+            self.session = onnxruntime.InferenceSession(self.model_file, None)
+        self.center_cache = {}
+        self.nms_thresh = 0.4
+        self.det_thresh = 0.5
+        self._init_vars()
+
+    def _init_vars(self):
+        """Read input/output metadata from the session and pick the head layout."""
+        input_cfg = self.session.get_inputs()[0]
+        input_shape = input_cfg.shape
+        # A string (symbolic) dim means no fixed input size can be read here.
+        if isinstance(input_shape[2], str):
+            self.input_size = None
+        else:
+            self.input_size = tuple(input_shape[2:4][::-1])
+        input_name = input_cfg.name
+        self.input_shape = input_shape
+        outputs = self.session.get_outputs()
+        if len(outputs[0].shape) == 3:
+            self.batched = True
+        output_names = []
+        for o in outputs:
+            output_names.append(o.name)
+        self.input_name = input_name
+        self.output_names = output_names
+        self.input_mean = 127.5
+        self.input_std = 128.0
+        # The output count selects strides, anchors per cell and keypoint use.
+        # NOTE(review): any other count leaves fmc/_feat_stride_fpn unset.
+        self.use_kps = False
+        self._anchor_ratio = 1.0
+        self._num_anchors = 1
+        if len(outputs)==6:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+        elif len(outputs)==9:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+            self.use_kps = True
+        elif len(outputs)==10:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+        elif len(outputs)==15:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+            self.use_kps = True
+
+    def prepare(self, ctx_id, **kwargs):
+        """Apply runtime options: CPU-only if ctx_id<0, thresholds, input_size."""
+        if ctx_id<0:
+            self.session.set_providers(['CPUExecutionProvider'])
+        nms_thresh = kwargs.get('nms_thresh', None)
+        if nms_thresh is not None:
+            self.nms_thresh = nms_thresh
+        det_thresh = kwargs.get('det_thresh', None)
+        if det_thresh is not None:
+            self.det_thresh = det_thresh
+        input_size = kwargs.get('input_size', None)
+        if input_size is not None:
+            if self.input_size is not None:
+                print('warning: det_size is already set in scrfd model, ignore')
+            else:
+                self.input_size = input_size
+
+    def forward(self, img, threshold):
+        """Run the network on ``img`` and decode detections for every stride.
+
+        Predictions scoring below ``threshold`` are dropped. Returns
+        (scores_list, bboxes_list, kpss_list) with one entry per stride;
+        kpss_list stays empty when the model has no keypoint outputs.
+        """
+        scores_list = []
+        bboxes_list = []
+        kpss_list = []
+        input_size = tuple(img.shape[0:2][::-1])
+        blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
+        net_outs = self.session.run(self.output_names, {self.input_name : blob})
+
+        input_height = blob.shape[2]
+        input_width = blob.shape[3]
+        fmc = self.fmc
+        for idx, stride in enumerate(self._feat_stride_fpn):
+            # Outputs are grouped by kind: scores, then boxes, then keypoints.
+            # If model support batch dim, take first output
+            if self.batched:
+                scores = net_outs[idx][0]
+                bbox_preds = net_outs[idx + fmc][0]
+                bbox_preds = bbox_preds * stride
+                if self.use_kps:
+                    kps_preds = net_outs[idx + fmc * 2][0] * stride
+            # If model doesn't support batching take output as is
+            else:
+                scores = net_outs[idx]
+                bbox_preds = net_outs[idx + fmc]
+                bbox_preds = bbox_preds * stride
+                if self.use_kps:
+                    kps_preds = net_outs[idx + fmc * 2] * stride
+
+            height = input_height // stride
+            width = input_width // stride
+            key = (height, width, stride)
+            if key in self.center_cache:
+                anchor_centers = self.center_cache[key]
+            else:
+                # (x, y) grid coordinates for every feature-map cell.
+                anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
+
+                anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )
+                if self._num_anchors>1:
+                    anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )
+                if len(self.center_cache)<100:
+                    self.center_cache[key] = anchor_centers
+
+            pos_inds = np.where(scores>=threshold)[0]
+            bboxes = distance2bbox(anchor_centers, bbox_preds)
+            pos_scores = scores[pos_inds]
+            pos_bboxes = bboxes[pos_inds]
+            scores_list.append(pos_scores)
+            bboxes_list.append(pos_bboxes)
+            if self.use_kps:
+                kpss = distance2kps(anchor_centers, kps_preds)
+                kpss = kpss.reshape( (kpss.shape[0], -1, 2) )
+                pos_kpss = kpss[pos_inds]
+                kpss_list.append(pos_kpss)
+        return scores_list, bboxes_list, kpss_list
+
+    def detect(self, img, input_size = None, max_num=0, metric='default'):
+        """Detect faces in ``img``; returns ``(det, kpss)`` scaled back to ``img``.
+
+        ``det`` rows are [x1, y1, x2, y2, score]; ``kpss`` is None without keypoints.
+        ``max_num`` > 0 keeps the top faces by area, center-weighted unless 'max'.
+        """
+        assert input_size is not None or self.input_size is not None
+        input_size = self.input_size if input_size is None else input_size
+        # Fit the image inside input_size while keeping its aspect ratio.
+        im_ratio = float(img.shape[0]) / img.shape[1]
+        model_ratio = float(input_size[1]) / input_size[0]
+        if im_ratio>model_ratio:
+            new_height = input_size[1]
+            new_width = int(new_height / im_ratio)
+        else:
+            new_width = input_size[0]
+            new_height = int(new_width * im_ratio)
+        det_scale = float(new_height) / img.shape[0]
+        resized_img = cv2.resize(img, (new_width, new_height))
+        # Zero-pad to input_size; the resized image sits in the top-left corner.
+        det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
+        det_img[:new_height, :new_width, :] = resized_img
+
+        scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)
+
+        scores = np.vstack(scores_list)
+        scores_ravel = scores.ravel()
+        order = scores_ravel.argsort()[::-1]
+        bboxes = np.vstack(bboxes_list) / det_scale
+        if self.use_kps:
+            kpss = np.vstack(kpss_list) / det_scale
+        pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
+        pre_det = pre_det[order, :]
+        keep = self.nms(pre_det)
+        det = pre_det[keep, :]
+        if self.use_kps:
+            kpss = kpss[order,:,:]
+            kpss = kpss[keep,:,:]
+        else:
+            kpss = None
+        if max_num > 0 and det.shape[0] > max_num:
+            area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
+            img_center = img.shape[0] // 2, img.shape[1] // 2
+            offsets = np.vstack([
+                (det[:, 0] + det[:, 2]) / 2 - img_center[1],
+                (det[:, 1] + det[:, 3]) / 2 - img_center[0]
+            ])
+            offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
+            if metric=='max':
+                values = area
+            else:
+                values = area - offset_dist_squared * 2.0  # some extra weight on the centering
+            bindex = np.argsort(values)[::-1]
+            bindex = bindex[0:max_num]
+            det = det[bindex, :]
+            if kpss is not None:
+                kpss = kpss[bindex, :]
+        return det, kpss
+
+    def nms(self, dets):
+        """Greedy NMS over [x1, y1, x2, y2, score] rows; returns kept row indices."""
+        thresh = self.nms_thresh
+        x1 = dets[:, 0]
+        y1 = dets[:, 1]
+        x2 = dets[:, 2]
+        y2 = dets[:, 3]
+        scores = dets[:, 4]
+
+        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+        order = scores.argsort()[::-1]
+
+        keep = []
+        while order.size > 0:
+            i = order[0]
+            keep.append(i)
+            xx1 = np.maximum(x1[i], x1[order[1:]])
+            yy1 = np.maximum(y1[i], y1[order[1:]])
+            xx2 = np.minimum(x2[i], x2[order[1:]])
+            yy2 = np.minimum(y2[i], y2[order[1:]])
+
+            w = np.maximum(0.0, xx2 - xx1 + 1)
+            h = np.maximum(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+            inds = np.where(ovr <= thresh)[0]
+            order = order[inds + 1]
+
+        return keep
+
+def get_scrfd(name, download=False, root='~/.insightface/models', **kwargs):
+    """Build an SCRFD from a local path, or from the model store when download."""
+    if download:
+        from .model_store import get_model_file
+        _file = get_model_file("scrfd_%s" % name, root=root)
+        return SCRFD(_file)
+    assert os.path.exists(name)
+    return SCRFD(name)
+
+
+def scrfd_2p5gkps(**kwargs):  # shortcut for get_scrfd("2p5gkps") on its download (model store) path
+    return get_scrfd("2p5gkps", download=True, **kwargs)
+
+
+if __name__ == '__main__':
+    # Smoke test: detect faces in a sample image and save an annotated copy.
+    detector = SCRFD(model_file='./det.onnx')
+    detector.prepare(-1, input_size=(640, 640))  # ignored (with a warning) if the model fixes its size
+    img_paths = ['tests/data/t1.jpg']
+    for img_path in img_paths:
+        img = cv2.imread(img_path)
+
+        for _ in range(1):
+            ta = datetime.datetime.now()
+            # detect() takes no threshold; its second positional arg is input_size.
+            bboxes, kpss = detector.detect(img)
+            tb = datetime.datetime.now()
+            print('all cost:', (tb-ta).total_seconds()*1000)
+        print(img_path, bboxes.shape)
+        if kpss is not None:
+            print(kpss.shape)
+        for i in range(bboxes.shape[0]):
+            bbox = bboxes[i]
+            x1,y1,x2,y2,score = bbox.astype(int).tolist()  # np.int no longer exists in NumPy
+            cv2.rectangle(img, (x1,y1)  , (x2,y2) , (255,0,0) , 2)
+            if kpss is not None:
+                kps = kpss[i]
+                for kp in kps:
+                    kp = kp.astype(int).tolist()
+                    cv2.circle(img, tuple(kp) , 1, (0,0,255) , 2)
+        filename = img_path.split('/')[-1]
+        print('output:', filename)
+        cv2.imwrite('./outputs/%s'%filename, img)
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/__init__.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6960431b1bd6db38890e391c4c94dd2182f2e1fd
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/__init__.py
@@ -0,0 +1,6 @@
+from __future__ import absolute_import
+
+from .storage import download, ensure_available, download_onnx
+from .filesystem import get_model_dir
+from .filesystem import makedirs, try_import_dali
+from .constant import *
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/constant.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..8860ff077ae7227235591edfc84c0cdc227a6432
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/constant.py
@@ -0,0 +1,3 @@
+
+DEFAULT_MP_NAME = 'buffalo_l'
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/download.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0186a004c96195535485952e0eaf29d98dbcfc3
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/download.py
@@ -0,0 +1,92 @@
+"""
+This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/download.py
+"""
+import os
+import hashlib
+import requests
+
+
+def check_sha1(filename, sha1_hash):
+    """Compare a file's SHA-1 digest against an expected (possibly partial) hash.
+
+    Parameters
+    ----------
+    filename : str
+        Path of the file to hash.
+    sha1_hash : str
+        Expected hexadecimal SHA-1; only the common-length prefix is compared.
+
+    Returns
+    -------
+    bool
+        True when the compared prefixes are equal.
+    """
+    digest = hashlib.sha1()
+    with open(filename, 'rb') as stream:
+        # Hash in 1 MiB chunks; iter() stops at the empty read that marks EOF.
+        for block in iter(lambda: stream.read(1048576), b''):
+            digest.update(block)
+
+    actual = digest.hexdigest()
+    prefix_len = min(len(actual), len(sha1_hash))
+    return actual[:prefix_len] == sha1_hash[:prefix_len]
+
+
+def download_file(url, path=None, overwrite=False, sha1_hash=None):
+    """Download a given URL.
+    Parameters
+    ----------
+    url : str
+        URL to download
+    path : str, optional
+        Destination path to store downloaded file. By default stores to the
+        current directory with same name as in url.
+    overwrite : bool, optional
+        Whether to overwrite destination file if already exists.
+    sha1_hash : str, optional
+        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
+        but doesn't match.
+    Returns
+    -------
+    str
+        The file path of the downloaded file.
+    Raises
+    ------
+    RuntimeError
+        If the server does not answer with HTTP 200.
+    UserWarning
+        If ``sha1_hash`` is given and the downloaded content does not match it.
+    """
+    if path is None:
+        fname = url.split('/')[-1]
+    else:
+        path = os.path.expanduser(path)
+        if os.path.isdir(path):
+            fname = os.path.join(path, url.split('/')[-1])
+        else:
+            fname = path
+
+    if overwrite or not os.path.exists(fname) or (
+            sha1_hash and not check_sha1(fname, sha1_hash)):
+        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+
+        print('Downloading %s from %s...' % (fname, url))
+        # Close the streamed response when done so its connection is released.
+        with requests.get(url, stream=True) as r:
+            if r.status_code != 200:
+                raise RuntimeError("Failed downloading url %s" % url)
+            with open(fname, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        f.write(chunk)
+
+
+        if sha1_hash and not check_sha1(fname, sha1_hash):
+            raise UserWarning('File {} is downloaded but the content hash does not match. ' \
+                              'The repo may be outdated or download may be incomplete. ' \
+                              'If the "repo_url" is overridden, consider switching to ' \
+                              'the default repo.'.format(fname))
+
+    return fname
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/face_align.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/face_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..226628b39cf743947df230feffbb97bf5c585e1d
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/face_align.py
@@ -0,0 +1,103 @@
+import cv2
+import numpy as np
+from skimage import transform as trans
+
+
+arcface_dst = np.array(
+    [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
+     [41.5493, 92.3655], [70.7299, 92.2041]],
+    dtype=np.float32)
+
+def estimate_norm(lmk, image_size=112,mode='arcface'):
+    """Return the 2x3 similarity matrix mapping 5 landmarks to the ArcFace template."""
+    assert lmk.shape == (5, 2)
+    assert image_size%112==0 or image_size%128==0
+    # The template is defined for 112 px; 128-based sizes also shift x by 8*ratio.
+    if image_size%112==0:
+        ratio, diff_x = float(image_size)/112.0, 0
+    else:
+        ratio = float(image_size)/128.0
+        diff_x = 8.0*ratio
+    dst = arcface_dst * ratio
+    dst[:,0] += diff_x
+    tform = trans.SimilarityTransform()
+    tform.estimate(lmk, dst)
+    return tform.params[0:2, :]
+
+def norm_crop(img, landmark, image_size=112, mode='arcface'):
+    """Warp ``img`` with estimate_norm's matrix into an image_size square crop."""
+    M = estimate_norm(landmark, image_size, mode)
+    return cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
+
+def norm_crop2(img, landmark, image_size=112, mode='arcface'):
+    """Warp ``img`` to an image_size square; return ``(warped, M)`` with the matrix."""
+    M = estimate_norm(landmark, image_size, mode)
+    return cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0), M
+
+def square_crop(im, S):
+    """Resize ``im`` so its longer side is S, zero-pad to SxSx3; return (image, scale)."""
+    rows, cols = im.shape[0], im.shape[1]
+    if rows > cols:
+        height, width = S, int(float(cols) / rows * S)
+        scale = float(S) / rows
+    else:
+        width, height = S, int(float(rows) / cols * S)
+        scale = float(S) / cols
+    resized_im = cv2.resize(im, (width, height))
+    det_im = np.zeros((S, S, 3), dtype=np.uint8)
+    det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im
+    return det_im, scale
+
+
+def transform(data, center, output_size, scale, rotation):
+    """Crop an output_size square around ``center`` after scaling and rotating.
+
+    ``rotation`` is in degrees. Returns ``(cropped, M)`` where M is the 2-row
+    affine matrix passed to cv2.warpAffine.
+    """
+    rot = float(rotation) * np.pi / 180.0
+    half = output_size / 2
+    cx, cy = center[0] * scale, center[1] * scale
+    # Compose: scale, move the scaled center to the origin, rotate, recenter.
+    to_scale = trans.SimilarityTransform(scale=scale)
+    to_origin = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
+    to_rotated = trans.SimilarityTransform(rotation=rot)
+    to_middle = trans.SimilarityTransform(translation=(half, half))
+    M = (to_scale + to_origin + to_rotated + to_middle).params[0:2]
+    cropped = cv2.warpAffine(data, M, (output_size, output_size), borderValue=0.0)
+    return cropped, M
+
+
+def trans_points2d(pts, M):
+    """Apply affine matrix ``M`` to each (x, y) row of ``pts``; returns float32."""
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    for row, pt in enumerate(pts):
+        # Homogeneous coordinates let the matrix product include the translation.
+        homogeneous = np.array([pt[0], pt[1], 1.], dtype=np.float32)
+        mapped = np.dot(M, homogeneous)
+        new_pts[row] = mapped[0:2]
+
+    return new_pts
+
+
+def trans_points3d(pts, M):
+    """Apply ``M`` to the x, y of each row and scale z by M's first-row norm."""
+    # Norm of the first row's linear part; used as a uniform scale for z.
+    scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    for row, pt in enumerate(pts):
+        homogeneous = np.array([pt[0], pt[1], 1.], dtype=np.float32)
+        mapped = np.dot(M, homogeneous)
+        out_row = new_pts[row]  # view into new_pts, so writes land in the result
+        out_row[0:2] = mapped[0:2]
+        out_row[2] = pt[2] * scale
+
+    return new_pts
+
+
+def trans_points(pts, M):
+    """Dispatch on column count: 2 -> trans_points2d, otherwise trans_points3d."""
+    if pts.shape[1] == 2:
+        return trans_points2d(pts, M)
+    return trans_points3d(pts, M)
+
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/filesystem.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/filesystem.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e3851975bdcbbf7f5eeb7e68e70a36dc040535
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/filesystem.py
@@ -0,0 +1,157 @@
+"""
+This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/filesystem.py
+"""
+import os
+import os.path as osp
+import errno
+
+
+def get_model_dir(name, root='~/.insightface'):
+    """Return ``<root>/models/<name>`` with a leading ``~`` in ``root`` expanded."""
+    return osp.join(os.path.expanduser(root), 'models', name)
+
+def makedirs(path):
+    """Create ``path`` and any missing parents; do nothing if it already exists.
+
+    Similar to `mkdir -p`, so callers can skip checking existence first.
+
+    Parameters
+    ----------
+    path : str
+        Path of the desired dir
+    """
+    try:
+        os.makedirs(path)
+    except FileExistsError:
+        # An OSError with errno EEXIST surfaces as FileExistsError; every other
+        # OSError still propagates.
+        pass
+
+
+def try_import(package, message=None):
+    """Import ``package`` by name, optionally replacing the failure message.
+
+    Parameters
+    ----------
+    package : str
+        The name of the targeting package.
+    message : str, default is None
+        When truthy, a failed import raises ``ImportError(message)`` instead of
+        re-raising the original error.
+
+    Returns
+    -------
+    The object returned by ``__import__(package)``; ImportError is raised otherwise.
+
+    """
+    try:
+        module = __import__(package)
+    except ImportError as e:
+        if message:
+            raise ImportError(message)
+        raise e
+    return module
+
+
+def try_import_cv2():
+    """Try import cv2 at runtime.
+
+    Returns
+    -------
+    cv2 module if found. Raise ImportError otherwise
+
+    """
+    # Adjacent literals are concatenated by the parser, so the message holds
+    # no run of indentation spaces in the middle (a backslash continuation
+    # inside the string would embed them).
+    msg = ("cv2 is required, you can install by package manager, e.g. 'apt-get', "
+           "or `pip install opencv-python --user` (note that this is unofficial PYPI package).")
+
+    return try_import('cv2', msg)
+
+
+def try_import_mmcv():
+    """Try import mmcv at runtime.
+
+    Returns
+    -------
+    mmcv module if found. Raise ImportError otherwise
+
+    """
+    # Adjacent literals are concatenated by the parser, so the message holds
+    # no run of indentation spaces in the middle (a backslash continuation
+    # inside the string would embed them).
+    msg = ("mmcv is required, you can install by first `pip install Cython --user` "
+           "and then `pip install mmcv --user` (note that this is unofficial PYPI package).")
+
+    return try_import('mmcv', msg)
+
+
+def try_import_rarfile():
+    """Try import rarfile at runtime.
+
+    Returns
+    -------
+    rarfile module if found. Raise ImportError otherwise
+
+    """
+    # Adjacent literals are concatenated by the parser, so the message holds
+    # no run of indentation spaces in the middle (a backslash continuation
+    # inside the string would embed them).
+    msg = ("rarfile is required, you can install by first `sudo apt-get install unrar` "
+           "and then `pip install rarfile --user` (note that this is unofficial PYPI package).")
+
+    return try_import('rarfile', msg)
+
+
+def import_try_install(package, extern_url=None):
+    """Try import the specified package.
+    If the package not installed, try use pip to install and import if success.
+
+    Parameters
+    ----------
+    package : str
+        The name of the package trying to import.
+    extern_url : str or None, optional
+        The external url if package is not hosted on PyPI.
+        For example, you can install a package using:
+         "pip install git+http://github.com/user/repo/tarball/master/egginfo=xxx".
+        In this case, you can pass the url to the extern_url.
+
+    Returns
+    -------
+    <class 'Module'>
+        The imported python module.
+
+    Raises
+    ------
+    ImportError
+        If the package still cannot be imported after the install attempt.
+
+    """
+    try:
+        return __import__(package)
+    except ImportError:
+        try:
+            from pip import main as pipmain
+        except ImportError:
+            from pip._internal import main as pipmain
+
+        # trying to install package
+        url = package if extern_url is None else extern_url
+        # NOTE(review): the previous comment stated pip raises SystemExit on
+        # failure; confirm for the pip version in use.
+        pipmain(['install', '--user', url])
+
+        # The import system caches directory listings; refresh them so a
+        # package installed a moment ago is visible to the retry below.
+        import importlib
+        importlib.invalidate_caches()
+
+        # trying to load again
+        try:
+            return __import__(package)
+        except ImportError:
+            # The user site directory may not be on sys.path yet if it did not
+            # exist when the interpreter started.
+            import sys
+            import site
+            user_site = site.getusersitepackages()
+            if user_site not in sys.path:
+                sys.path.append(user_site)
+            return __import__(package)
+
+
+def try_import_dali():
+    """Try import NVIDIA DALI at runtime.
+
+    Returns
+    -------
+    The imported ``nvidia.dali`` module, with ``Pipeline`` additionally exposed
+    as a top-level attribute, when the import succeeds. Otherwise a stand-in
+    class named ``dali`` whose nested ``Pipeline`` raises NotImplementedError
+    when instantiated.
+    """
+    try:
+        # A non-empty fromlist makes __import__ return the nvidia.dali
+        # submodule itself rather than the top-level nvidia package.
+        dali = __import__('nvidia.dali', fromlist=['pipeline', 'ops', 'types'])
+        dali.Pipeline = dali.pipeline.Pipeline
+    except ImportError:
+
+        # Stand-in: the failure is reported only when a Pipeline is created.
+        class dali:
+            class Pipeline:
+                def __init__(self):
+                    raise NotImplementedError(
+                        "DALI not found, please check if you installed it correctly."
+                    )
+
+    return dali
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/storage.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bf37e2d17b28dee2a8839484778815f87fc4a9c
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/storage.py
@@ -0,0 +1,52 @@
+
+import os
+import os.path as osp
+import zipfile
+from .download import download_file
+
+BASE_REPO_URL = 'https://github.com/deepinsight/insightface/releases/download/v0.7'
+
+def download(sub_dir, name, force=False, root='~/.insightface'):
+    """Download ``<name>.zip`` from the release URL and unpack it under ``root``.
+
+    The archive is extracted into ``<root>/<sub_dir>/<name>``. If that path
+    already exists and ``force`` is false, nothing is downloaded. The zip file
+    is stored next to the target directory and is left on disk afterwards.
+
+    Returns the path of the target directory.
+    """
+    base_dir = os.path.join(os.path.expanduser(root), sub_dir)
+    dir_path = os.path.join(base_dir, name)
+    if not force and osp.exists(dir_path):
+        return dir_path
+    print('download_path:', dir_path)
+    zip_file_path = os.path.join(base_dir, name + '.zip')
+    model_url = "%s/%s.zip"%(BASE_REPO_URL, name)
+    download_file(model_url, path=zip_file_path, overwrite=True)
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
+    with zipfile.ZipFile(zip_file_path) as archive:
+        archive.extractall(dir_path)
+    return dir_path
+
+def ensure_available(sub_dir, name, root='~/.insightface'):
+    """Return the directory for ``name``, calling ``download`` with ``force=False``."""
+    return download(sub_dir, name, root=root, force=False)
+
+def download_onnx(sub_dir, model_file, force=False, root='~/.insightface', download_zip=False):
+    """Fetch ``model_file`` from the release URL unless it is already on disk.
+
+    The file lives at ``<root>/<sub_dir>/<model_file>``. With ``download_zip``
+    the archive ``<model_file>.zip`` is downloaded next to it and extracted
+    into ``<root>/<sub_dir>``; otherwise the file is downloaded directly.
+
+    Returns the path of the model file on every code path.
+    """
+    _root = os.path.expanduser(root)
+    model_root = osp.join(_root, sub_dir)
+    new_model_file = osp.join(model_root, model_file)
+    if osp.exists(new_model_file) and not force:
+        return new_model_file
+    if not osp.exists(model_root):
+        os.makedirs(model_root)
+    print('download_path:', new_model_file)
+    if not download_zip:
+        model_url = "%s/%s"%(BASE_REPO_URL, model_file)
+        download_file(model_url,
+                 path=new_model_file,
+                 overwrite=True)
+    else:
+        model_url = "%s/%s.zip"%(BASE_REPO_URL, model_file)
+        zip_file_path = new_model_file+".zip"
+        download_file(model_url,
+                 path=zip_file_path,
+                 overwrite=True)
+        with zipfile.ZipFile(zip_file_path) as zf:
+            zf.extractall(model_root)
+    # Kept outside the if/else: the direct-download branch would otherwise
+    # fall off the end and hand the caller None.
+    return new_model_file
diff --git a/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/transform.py b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..06531d257b694211a0b9a09c9d741b9b2ff53bfe
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/dependencies/insightface/utils/transform.py
@@ -0,0 +1,116 @@
+import cv2
+import math
+import numpy as np
+from skimage import transform as trans
+
+
+def transform(data, center, output_size, scale, rotation):
+    """Warp ``data`` into a square crop built around ``center``.
+
+    The affine map scales by ``scale``, moves the scaled ``center`` to the
+    origin, rotates by ``rotation`` (given in degrees) and finally shifts the
+    origin to the middle of the ``output_size`` x ``output_size`` output.
+
+    Returns the warped image and the 2x3 matrix ``M`` passed to ``cv2.warpAffine``.
+    """
+    angle = float(rotation) * np.pi / 180.0
+    half = output_size / 2
+    to_scale = trans.SimilarityTransform(scale=scale)
+    to_origin = trans.SimilarityTransform(
+        translation=(-1 * (center[0] * scale), -1 * (center[1] * scale)))
+    to_rotated = trans.SimilarityTransform(rotation=angle)
+    to_output_center = trans.SimilarityTransform(translation=(half, half))
+    combined = to_scale + to_origin + to_rotated + to_output_center
+    M = combined.params[0:2]
+    cropped = cv2.warpAffine(data, M, (output_size, output_size), borderValue=0.0)
+    return cropped, M
+
+
+def trans_points2d(pts, M):
+    """Apply the affine matrix ``M`` to every 2D point in ``pts``.
+
+    Each point is lifted to homogeneous ``[x, y, 1]`` form, multiplied by
+    ``M``, and the first two components of the product are kept.
+
+    Parameters
+    ----------
+    pts : np.ndarray
+        Points of shape (n, 2).
+    M : array-like
+        Transform with three columns; only its first two output rows are used.
+
+    Returns
+    -------
+    np.ndarray
+        float32 array with the same shape as ``pts``.
+    """
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    # Homogeneous coordinates, held in float32 before the multiply.
+    homo = np.ones((pts.shape[0], 3), dtype=np.float32)
+    homo[:, 0:2] = pts[:, 0:2]
+    # One matrix product handles all points; no Python-level loop is needed.
+    new_pts[:] = np.dot(homo, np.asarray(M).T)[:, 0:2]
+    return new_pts
+
+
+def trans_points3d(pts, M):
+    """Apply the affine matrix ``M`` to the x/y part of 3D points and rescale z.
+
+    x and y are transformed as homogeneous ``[x, y, 1]`` coordinates. z is
+    multiplied by ``sqrt(M[0][0]**2 + M[0][1]**2)``, the length of the first
+    row of the linear part of ``M``.
+
+    Parameters
+    ----------
+    pts : np.ndarray
+        Points of shape (n, 3).
+    M : array-like
+        Transform with three columns; only its first two output rows are used.
+
+    Returns
+    -------
+    np.ndarray
+        float32 array with the same shape as ``pts``.
+    """
+    scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    # Homogeneous x/y coordinates, held in float32 before the multiply.
+    homo = np.ones((pts.shape[0], 3), dtype=np.float32)
+    homo[:, 0:2] = pts[:, 0:2]
+    # One matrix product handles all points; no Python-level loop is needed.
+    new_pts[:, 0:2] = np.dot(homo, np.asarray(M).T)[:, 0:2]
+    new_pts[:, 2] = pts[:, 2] * scale
+    return new_pts
+
+
+def trans_points(pts, M):
+    """Transform ``pts`` with ``M``, picking the routine from the column count.
+
+    Arrays with exactly two columns go to ``trans_points2d``; anything else is
+    handled by ``trans_points3d``.
+    """
+    handler = trans_points2d if pts.shape[1] == 2 else trans_points3d
+    return handler(pts, M)
+
+def estimate_affine_matrix_3d23d(X, Y):
+    ''' Fit a 3D affine map from X to Y with a least-squares solve.
+    Args:
+        X: [n, 3]. 3d points(fixed)
+        Y: [n, 3]. corresponding 3d points(moving). Y = PX
+    Returns:
+        P_Affine: (3, 4). Affine matrix such that Y ~= [X, 1] @ P_Affine.T;
+            the homogeneous row [0, 0, 0, 1] is not part of the result.
+    '''
+    X_homo = np.hstack((X, np.ones([X.shape[0],1]))) #n x 4
+    # rcond=None selects NumPy's machine-precision cutoff explicitly; leaving
+    # it out triggers a FutureWarning on NumPy 1.x releases.
+    P = np.linalg.lstsq(X_homo, Y, rcond=None)[0].T # Affine matrix. 3 x 4
+    return P
+
+def P2sRt(P):
+    ''' Decompose an affine camera matrix into scale, rotation and translation.
+    Args:
+        P: (3, 4). Affine Camera Matrix.
+    Returns:
+        s: scale factor, the mean length of the first two rows of P[:, :3].
+        R: (3, 3). Those two rows normalized, followed by their cross product.
+        t: (3,). translation, the last column of P.
+    '''
+    row1, row2 = P[0:1, :3], P[1:2, :3]
+    len1, len2 = np.linalg.norm(row1), np.linalg.norm(row2)
+    unit1 = row1 / len1
+    unit2 = row2 / len2
+    R = np.concatenate((unit1, unit2, np.cross(unit1, unit2)), 0)
+    return (len1 + len2) / 2.0, R, P[:, 3]
+
+def matrix2angle(R):
+    ''' Extract three Euler angles, in degrees, from a rotation matrix.
+    Args:
+        R: (3,3). rotation matrix
+    Returns:
+        x: pitch
+        y: yaw
+        z: roll
+    '''
+    sy = math.sqrt(R[0,0] * R[0,0] +  R[1,0] * R[1,0])
+    # y uses the same formula whether or not the matrix is near-singular.
+    y = math.atan2(-R[2,0], sy)
+
+    if sy < 1e-6:
+        # Near-singular case: z is pinned to 0 and x is read from the middle row.
+        x = math.atan2(-R[1,2], R[1,1])
+        z = 0
+    else:
+        x = math.atan2(R[2,1] , R[2,2])
+        z = math.atan2(R[1,0], R[0,0])
+
+    # radians -> degrees
+    return x*180/np.pi, y*180/np.pi, z*180/np.pi
+
diff --git a/src/thirdparty/liveportrait/src/utils/face_analysis_diy.py b/src/thirdparty/liveportrait/src/utils/face_analysis_diy.py
new file mode 100644
index 0000000000000000000000000000000000000000..f13a659134216958da3c7273aabf3b0f96fb320d
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/face_analysis_diy.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+
+"""
+face detection and alignment using InsightFace
+"""
+
+import numpy as np
+from .rprint import rlog as log
+from .dependencies.insightface.app import FaceAnalysis
+from .dependencies.insightface.app.common import Face
+from .timer import Timer
+
+
+def sort_by_direction(faces, direction: str = 'large-small', face_center=None):
+    """Return ``faces`` ordered according to ``direction``.
+
+    Each face is indexed as ``face['bbox']`` and the first four entries are
+    read as ``x1, y1, x2, y2``. Supported directions order by the left edge,
+    the top edge, the box area, or the distance between the box center and
+    ``face_center``. An empty input or an unrecognized direction returns
+    ``faces`` unchanged.
+    """
+    if len(faces) <= 0:
+        return faces
+
+    def left_edge(face):
+        return face['bbox'][0]
+
+    def top_edge(face):
+        return face['bbox'][1]
+
+    def area(face):
+        return (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1])
+
+    def distance_to_center(face):
+        return (((face['bbox'][2]+face['bbox'][0])/2-face_center[0])**2+((face['bbox'][3]+face['bbox'][1])/2-face_center[1])**2)**0.5
+
+    # direction -> (sort key, descending?)
+    orderings = {
+        'left-right': (left_edge, False),
+        'right-left': (left_edge, True),
+        'top-bottom': (top_edge, False),
+        'bottom-top': (top_edge, True),
+        'small-large': (area, False),
+        'large-small': (area, True),
+        'distance-from-retarget-face': (distance_to_center, False),
+    }
+    if direction not in orderings:
+        return faces
+    key_fn, descending = orderings[direction]
+    return sorted(faces, key=key_fn, reverse=descending)
+
+
+class FaceAnalysisDIY(FaceAnalysis):
+    """FaceAnalysis variant with optional 106-point landmarks and sorted results."""
+
+    def __init__(self, name='buffalo_l', root='~/.insightface', allowed_modules=None, **kwargs):
+        """Forward all arguments to ``FaceAnalysis`` and attach a ``Timer``."""
+        super().__init__(name=name, root=root, allowed_modules=allowed_modules, **kwargs)
+
+        self.timer = Timer()
+
+    def get(self, img_bgr, **kwargs):
+        """Detect faces in ``img_bgr`` and run the remaining models on each one.
+
+        Keyword Args:
+            max_face_num: passed to the detector as ``max_num``; 0 means no limit.
+            flag_do_landmark_2d_106: when false, the ``landmark_2d_106`` model is skipped.
+            direction: ordering forwarded to ``sort_by_direction``.
+            face_center: reference point forwarded to ``sort_by_direction``; it is
+                needed for ``direction='distance-from-retarget-face'``.
+
+        Returns:
+            A list of ``Face`` objects, empty when nothing is detected.
+        """
+        max_num = kwargs.get('max_face_num', 0)  # the number of the detected faces, 0 means no limit
+        flag_do_landmark_2d_106 = kwargs.get('flag_do_landmark_2d_106', True)  # whether to do 106-point detection
+        direction = kwargs.get('direction', 'large-small')  # sorting direction
+        # Taken from the caller; a fixed None made the distance-based ordering
+        # fail with a TypeError as soon as a face was detected.
+        face_center = kwargs.get('face_center', None)
+
+        bboxes, kpss = self.det_model.detect(img_bgr, max_num=max_num, metric='default')
+        if bboxes.shape[0] == 0:
+            return []
+        ret = []
+        for i in range(bboxes.shape[0]):
+            bbox = bboxes[i, 0:4]
+            det_score = bboxes[i, 4]
+            kps = None
+            if kpss is not None:
+                kps = kpss[i]
+            face = Face(bbox=bbox, kps=kps, det_score=det_score)
+            for taskname, model in self.models.items():
+                # Detection already ran above.
+                if taskname == 'detection':
+                    continue
+
+                if (not flag_do_landmark_2d_106) and taskname == 'landmark_2d_106':
+                    continue
+
+                model.get(img_bgr, face)
+            ret.append(face)
+
+        ret = sort_by_direction(ret, direction, face_center)
+        return ret
+
+    def warmup(self):
+        """Run ``get`` once on a black 512x512 image and log the elapsed time."""
+        self.timer.tic()
+
+        img_bgr = np.zeros((512, 512, 3), dtype=np.uint8)
+        self.get(img_bgr)
+
+        elapse = self.timer.toc()
+        log(f'FaceAnalysisDIY warmup time: {elapse:.3f}s')
diff --git a/src/thirdparty/liveportrait/src/utils/filter.py b/src/thirdparty/liveportrait/src/utils/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8e27ca2a3c2aa58ac73171cacdcf583bdc42778
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/filter.py
@@ -0,0 +1,19 @@
+# coding: utf-8
+
+import torch
+import numpy as np
+from pykalman import KalmanFilter
+
+
+def smooth(x_d_lst, shape, device, observation_variance=3e-7, process_variance=1e-5):
+    """Smooth a sequence of arrays with a Kalman smoother.
+
+    Every element of ``x_d_lst`` is flattened, the rows are stacked into one
+    observation matrix and passed to ``KalmanFilter.smooth``. The transition
+    and observation covariances are identity matrices scaled by
+    ``process_variance`` and ``observation_variance``.
+
+    Returns a list with one float32 tensor per element, each reshaped to
+    ``shape[-2:]`` and created on ``device``.
+    """
+    observations = np.vstack([x.reshape(-1) for x in x_d_lst])
+    n_dim = observations.shape[1]
+    kf = KalmanFilter(
+        initial_state_mean=observations[0],
+        n_dim_obs=n_dim,
+        transition_covariance=process_variance * np.eye(n_dim),
+        observation_covariance=observation_variance * np.eye(n_dim)
+    )
+    smoothed_state_means, _ = kf.smooth(observations)
+    target_shape = shape[-2:]
+    return [
+        torch.tensor(state_mean.reshape(target_shape), dtype=torch.float32, device=device)
+        for state_mean in smoothed_state_means
+    ]
diff --git a/src/thirdparty/liveportrait/src/utils/helper.py b/src/thirdparty/liveportrait/src/utils/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bc4912ad7831ec9ec1b346a94240f01640db15e
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/helper.py
@@ -0,0 +1,199 @@
+# coding: utf-8
+
+"""
+utility functions and classes to handle feature extraction and model loading
+"""
+
+import os
+import os.path as osp
+import torch
+from collections import OrderedDict
+import numpy as np
+from scipy.spatial import ConvexHull 
+from typing import Union
+import cv2
+
+from ..modules.spade_generator import SPADEDecoder
+from ..modules.warping_network import WarpingNetwork
+from ..modules.motion_extractor import MotionExtractor
+from ..modules.appearance_feature_extractor import AppearanceFeatureExtractor
+from ..modules.stitching_retargeting_network import StitchingRetargetingNetwork
+
+
+def tensor_to_numpy(data: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
+    """Return ``data`` as a numpy array; non-tensor input is returned untouched."""
+    if not isinstance(data, torch.Tensor):
+        return data
+    return data.data.cpu().numpy()
+
+def calc_motion_multiplier(
+    kp_source: Union[np.ndarray, torch.Tensor],
+    kp_driving_initial: Union[np.ndarray, torch.Tensor]
+) -> float:
+    """Compare the convex hulls of the source and first-driving-frame keypoints.
+
+    Both inputs are converted to numpy, their leading axis is squeezed away,
+    and the result is ``sqrt(source hull .volume) / sqrt(driving hull .volume)``.
+    """
+    hull_sizes = []
+    for keypoints in (kp_source, kp_driving_initial):
+        points = tensor_to_numpy(keypoints).squeeze(0)
+        hull_sizes.append(ConvexHull(points).volume)
+    source_area, driving_area = hull_sizes
+    return np.sqrt(source_area) / np.sqrt(driving_area)
+
+def suffix(filename):
+    """a.jpg -> jpg; returns an empty string when there is no dot"""
+    _, dot, extension = filename.rpartition(".")
+    return extension if dot else ""
+
+
+def prefix(filename):
+    """a.jpg -> a; the name is returned unchanged when there is no dot"""
+    stem, dot, _ = filename.rpartition(".")
+    return stem if dot else filename
+
+
+def basename(filename):
+    """a/b/c.jpg -> c (last path component without its final extension)"""
+    leaf = osp.basename(filename)
+    return prefix(leaf)
+
+
+def remove_suffix(filepath):
+    """a/b/c.jpg -> a/b/c (directory kept, final extension dropped)"""
+    directory = osp.dirname(filepath)
+    stem = basename(filepath)
+    return osp.join(directory, stem)
+
+
+def is_image(file_path):
+    """Tell whether ``file_path`` ends with a known image extension (case-insensitive)."""
+    lowered = file_path.lower()
+    return lowered.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'))
+
+
+def is_video(file_path):
+    """Tell whether ``file_path`` has a video extension (case-insensitive) or is a directory."""
+    has_video_extension = file_path.lower().endswith((".mp4", ".mov", ".avi", ".webm"))
+    return has_video_extension or osp.isdir(file_path)
+
+
+def is_template(file_path):
+    """Tell whether ``file_path`` ends with ``.pkl`` (case-sensitive)."""
+    return file_path.endswith(".pkl")
+
+
+def mkdir(d, log=False):
+    """Create directory ``d`` (with parents) if it is missing and return ``d``.
+
+    Returning the path lets callers create and use a directory in one
+    expression. With ``log`` set, a message is printed only when the directory
+    had to be created.
+    """
+    if osp.exists(d):
+        return d
+    os.makedirs(d, exist_ok=True)
+    if log:
+        print(f"Make dir: {d}")
+    return d
+
+
+def squeeze_tensor_to_numpy(tensor):
+    """Squeeze dimension 0 of ``tensor`` and return the result as a CPU numpy array."""
+    return tensor.data.squeeze(0).cpu().numpy()
+
+
+def dct2device(dct: dict, device):
+    """Move every value of ``dct`` to ``device`` in place and return ``dct``.
+
+    Values that are not tensors yet are first wrapped with ``torch.tensor``.
+    """
+    for key, value in dct.items():
+        as_tensor = value if isinstance(value, torch.Tensor) else torch.tensor(value)
+        dct[key] = as_tensor.to(device)
+    return dct
+
+
+def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """
+    Flatten both keypoint tensors per batch item and join them along dim 1.
+
+    kp_source: (bs, k, 3)
+    kp_driving: (bs, k, 3)
+    Return: (bs, 2k*3)
+    """
+    bs_src, bs_dri = kp_source.shape[0], kp_driving.shape[0]
+    assert bs_src == bs_dri, 'batch size must be equal'
+
+    flat_source = kp_source.view(bs_src, -1)
+    flat_driving = kp_driving.view(bs_dri, -1)
+    return torch.cat([flat_source, flat_driving], dim=1)
+
+
+def remove_ddp_dumplicate_key(state_dict):
+    """Return a new OrderedDict whose keys have every ``'module.'`` substring removed.
+
+    The replacement applies to all occurrences in a key, not only a leading prefix.
+    """
+    return OrderedDict(
+        (key.replace('module.', ''), value) for key, value in state_dict.items()
+    )
+
+
+def load_model(ckpt_path, model_config, device, model_type):
+    """Build the network selected by ``model_type`` and load weights from ``ckpt_path``.
+
+    Constructor arguments come from
+    ``model_config['model_params'][f'{model_type}_params']``. Checkpoints are
+    loaded onto CPU storage and every returned network is put in eval mode.
+
+    Returns:
+        A single module for the four plain model types. For
+        ``'stitching_retargeting_module'`` a dict with the keys ``'stitching'``,
+        ``'lip'`` and ``'eye'``, each holding a ``StitchingRetargetingNetwork``.
+
+    Raises:
+        ValueError: if ``model_type`` is not one of the supported names.
+    """
+    model_params = model_config['model_params'][f'{model_type}_params']
+
+    if model_type == 'stitching_retargeting_module':
+        # One checkpoint file carries the weights of all three sub-networks.
+        checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
+        # (config / result key, checkpoint key)
+        parts = (
+            ('stitching', 'retarget_shoulder'),
+            ('lip', 'retarget_mouth'),
+            ('eye', 'retarget_eye'),
+        )
+        networks = {}
+        for part, ckpt_key in parts:
+            net = StitchingRetargetingNetwork(**model_params.get(part))
+            net.load_state_dict(remove_ddp_dumplicate_key(checkpoint[ckpt_key]))
+            net = net.to(device)
+            net.eval()
+            networks[part] = net
+        return networks
+
+    constructors = {
+        'appearance_feature_extractor': AppearanceFeatureExtractor,
+        'motion_extractor': MotionExtractor,
+        'warping_module': WarpingNetwork,
+        'spade_generator': SPADEDecoder,
+    }
+    if model_type not in constructors:
+        raise ValueError(f"Unknown model type: {model_type}")
+
+    model = constructors[model_type](**model_params).to(device)
+    model.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))
+    model.eval()
+    return model
+
+
+def load_description(fp):
+    """Read the file at ``fp`` as UTF-8 text and return its full content."""
+    with open(fp, 'r', encoding='utf-8') as handle:
+        return handle.read()
+
+
+def is_square_video(video_path):
+    """Tell whether the frame width and height reported by OpenCV are equal."""
+    capture = cv2.VideoCapture(video_path)
+    frame_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
+    frame_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    capture.release()
+
+    return frame_w == frame_h
+
+def clean_state_dict(state_dict):
+    """Return a new OrderedDict with a leading ``'module.'`` stripped from each key."""
+    cleaned = OrderedDict()
+    for name, value in state_dict.items():
+        # Only a prefix is removed; 'module.' elsewhere in the key is kept.
+        new_name = name[7:] if name.startswith('module.') else name
+        cleaned[new_name] = value
+    return cleaned
diff --git a/src/thirdparty/liveportrait/src/utils/human_landmark_runner.py b/src/thirdparty/liveportrait/src/utils/human_landmark_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..62c3b96a2f63f1bde901230cb07f42c7a0bfe33b
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/human_landmark_runner.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+
+import os.path as osp
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+import torch
+import numpy as np
+import onnxruntime
+from .timer import Timer
+from .rprint import rlog
+from .crop import crop_image, _transform_pts
+
+
+def make_abs_path(fn):
+    """Join ``fn`` onto the directory that contains this module (symlinks resolved)."""
+    module_dir = osp.dirname(osp.realpath(__file__))
+    return osp.join(module_dir, fn)
+
+
+def to_ndarray(obj):
+    """Convert ``obj`` to a numpy array.
+
+    Arrays are returned as-is, tensors are moved to CPU and converted, and any
+    other object is passed to ``np.array``.
+    """
+    if isinstance(obj, np.ndarray):
+        return obj
+    if isinstance(obj, torch.Tensor):
+        return obj.cpu().numpy()
+    return np.array(obj)
+
+
+class LandmarkRunner(object):
+    """Run an ONNX landmark model on an image and map the points back to image coordinates."""
+
+    def __init__(self, **kwargs):
+        """Create the ONNX Runtime session.
+
+        Keyword Args:
+            ckpt_path: path of the ONNX model file.
+            onnx_provider: ``'cuda'`` (default), ``'mps'``, or anything else for CPU.
+            device_id: device index used with the CUDA provider, default 0.
+            dsize: side length of the square model input, default 224.
+        """
+        ckpt_path = kwargs.get('ckpt_path')
+        onnx_provider = kwargs.get('onnx_provider', 'cuda')  # CUDA is the default
+        device_id = kwargs.get('device_id', 0)
+        self.dsize = kwargs.get('dsize', 224)
+        self.timer = Timer()
+
+        if onnx_provider.lower() == 'cuda':
+            self.session = onnxruntime.InferenceSession(
+                ckpt_path, providers=[
+                    ('CUDAExecutionProvider', {'device_id': device_id})
+                ]
+            )
+        elif onnx_provider.lower() == 'mps':
+            self.session = onnxruntime.InferenceSession(
+                ckpt_path, providers=[
+                    'CoreMLExecutionProvider'
+                ]
+            )
+        else:
+            opts = onnxruntime.SessionOptions()
+            opts.intra_op_num_threads = 4  # CPU sessions use 4 intra-op threads
+            self.session = onnxruntime.InferenceSession(
+                ckpt_path, providers=['CPUExecutionProvider'],
+                sess_options=opts
+            )
+
+    def _run(self, inp):
+        """Feed ``inp`` to the model input named ``'input'`` and return all outputs."""
+        out = self.session.run(None, {'input': inp})
+        return out
+
+    def run(self, img_rgb: np.ndarray, lmk=None):
+        """Predict landmarks for ``img_rgb`` in the coordinates of that image.
+
+        Args:
+            img_rgb: HxWx3 image.
+            lmk: optional landmarks used to crop the face before inference.
+                Without them the whole image is resized to the model input.
+
+        Returns:
+            Array of shape (n, 2) with the predicted points.
+        """
+        if lmk is not None:
+            crop_dct = crop_image(img_rgb, lmk, dsize=self.dsize, scale=1.5, vy_ratio=-0.1)
+            img_crop_rgb = crop_dct['img_crop']
+        else:
+            # NOTE: force resize to dsize x dsize, NOT RECOMMEND!
+            img_crop_rgb = cv2.resize(img_rgb, (self.dsize, self.dsize))
+            # The resize stretches each axis independently, so the crop->original
+            # map needs one scale per axis; a single max(h, w) factor misplaces
+            # points along the shorter side of a non-square image.
+            h, w = img_rgb.shape[:2]
+            crop_dct = {
+                'M_c2o': np.array([
+                    [w / self.dsize, 0., 0.],
+                    [0., h / self.dsize, 0.],
+                    [0., 0., 1.],
+                ], dtype=np.float32),
+            }
+
+        # HxWx3 -> 1x3xHxW, scaled by 1/255; the channel order is left as given.
+        inp = (img_crop_rgb.astype(np.float32) / 255.).transpose(2, 0, 1)[None, ...]
+
+        out_lst = self._run(inp)
+        # The landmark tensor is read from the third model output.
+        out_pts = out_lst[2]
+
+        # 2d landmarks (203 points per the original note), scaled to 0-dsize
+        lmk = to_ndarray(out_pts[0]).reshape(-1, 2) * self.dsize
+        lmk = _transform_pts(lmk, M=crop_dct['M_c2o'])
+
+        return lmk
+
+    def warmup(self):
+        """Run the model once on an all-zero input and log the elapsed time."""
+        self.timer.tic()
+
+        dummy_image = np.zeros((1, 3, self.dsize, self.dsize), dtype=np.float32)
+
+        _ = self._run(dummy_image)
+
+        elapse = self.timer.toc()
+        rlog(f'LandmarkRunner warmup time: {elapse:.3f}s')
diff --git a/src/thirdparty/liveportrait/src/utils/io.py b/src/thirdparty/liveportrait/src/utils/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e4bc69536fdc7c668821eec5ec069c8e3154527
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/io.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+
+import os.path as osp
+import imageio
+import numpy as np
+import pickle
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+
+from .helper import mkdir, suffix
+
+
+def load_image_rgb(image_path: str):
+    """Read the image at ``image_path`` with OpenCV and return it in RGB channel order.
+
+    Raises FileNotFoundError when the path does not exist.
+    """
+    if osp.exists(image_path):
+        img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
+        return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+    raise FileNotFoundError(f"Image not found: {image_path}")
+
+
+def load_video(video_info, n_frames=-1):
+    """Read frames through imageio's ffmpeg reader.
+
+    Args:
+        video_info: source handed to ``imageio.get_reader``.
+        n_frames: maximum number of frames to read; values <= 0 read every frame.
+
+    Returns:
+        A list with the frames in reading order.
+    """
+    reader = imageio.get_reader(video_info, "ffmpeg")
+
+    ret = []
+    try:
+        for idx, frame_rgb in enumerate(reader):
+            if n_frames > 0 and idx >= n_frames:
+                break
+            ret.append(frame_rgb)
+    finally:
+        # Release the reader even when decoding a frame raises.
+        reader.close()
+    return ret
+
+
+def contiguous(obj):
+    """Return ``obj`` itself if it is C-contiguous, otherwise a C-ordered copy."""
+    return obj if obj.flags.c_contiguous else obj.copy(order="C")
+
+
+def resize_to_limit(img: np.ndarray, max_dim=1920, division=2):
+    """
+    Adjust the size of the image so that its largest dimension does not exceed max_dim and its width and height are multiples of division.
+    :param img: the image to be processed.
+    :param max_dim: the maximum dimension constraint; values <= 0 disable the resize step.
+    :param division: the number the width and height must be multiples of; values below 1 are treated as 1.
+    :return: the adjusted image.
+    """
+    height, width = img.shape[:2]
+
+    # Step 1: shrink so the longer side equals max_dim, keeping the aspect ratio.
+    if max_dim > 0 and max(height, width) > max_dim:
+        if height > width:
+            target_h = max_dim
+            target_w = int(width * (max_dim / height))
+        else:
+            target_w = max_dim
+            target_h = int(height * (max_dim / width))
+        img = cv2.resize(img, (target_w, target_h))
+
+    # Step 2: crop the bottom/right remainder so both sides divide evenly.
+    division = max(division, 1)
+    cur_h, cur_w = img.shape[0], img.shape[1]
+    crop_h = cur_h - (cur_h % division)
+    crop_w = cur_w - (cur_w % division)
+
+    if crop_h == 0 or crop_w == 0:
+        # A side shorter than division would be cropped to nothing; leave the image alone.
+        return img
+
+    if crop_h != cur_h or crop_w != cur_w:
+        img = img[:crop_h, :crop_w]
+
+    return img
+
+
+def load_img_online(obj, mode="bgr", **kwargs):
+    """Load an image, limit its size and return it as a C-contiguous array.
+
+    Args:
+        obj: a file path read with OpenCV, or an already-loaded image array.
+        mode: ``"bgr"``, ``"rgb"`` or ``"gray"`` (case-insensitive). ``"gray"``
+            makes a path be read as grayscale; ``"rgb"`` reverses the last axis
+            of the loaded data; ``"bgr"`` and ``"gray"`` return it unchanged.
+        max_dim: keyword, forwarded to ``resize_to_limit`` (default 1920).
+        n: keyword, forwarded to ``resize_to_limit`` as ``division`` (default 2).
+
+    Raises:
+        Exception: if ``mode`` is none of the supported values.
+    """
+    max_dim = kwargs.get("max_dim", 1920)
+    n = kwargs.get("n", 2)
+    mode_lower = mode.lower()
+    if isinstance(obj, str):
+        if mode_lower == "gray":
+            img = cv2.imread(obj, cv2.IMREAD_GRAYSCALE)
+        else:
+            img = cv2.imread(obj, cv2.IMREAD_COLOR)
+    else:
+        img = obj
+
+    # Resize image to satisfy constraints
+    img = resize_to_limit(img, max_dim=max_dim, division=n)
+
+    # "gray" is accepted by the reader above, so it must be a valid result
+    # mode too instead of ending in the unknown-mode error.
+    if mode_lower in ("bgr", "gray"):
+        return contiguous(img)
+    elif mode_lower == "rgb":
+        return contiguous(img[..., ::-1])
+    else:
+        raise Exception(f"Unknown mode {mode}")
+
+
+def load(fp):
+    """Load an object from a ``.npy`` or ``.pkl`` file, chosen by the extension of ``fp``.
+
+    Raises:
+        Exception: if the extension is neither ``npy`` nor ``pkl``.
+    """
+    suffix_ = suffix(fp)
+
+    if suffix_ == "npy":
+        return np.load(fp)
+    elif suffix_ == "pkl":
+        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
+        with open(fp, "rb") as f:
+            return pickle.load(f)
+    else:
+        # Report the extension that was found, not the `suffix` helper itself.
+        raise Exception(f"Unknown type: {suffix_}")
+
+
+def dump(wfp, obj):
+    """Save ``obj`` to ``wfp`` as ``.npy`` or ``.pkl``, chosen by the extension.
+
+    A missing parent directory is created first, before the extension is checked.
+
+    Raises:
+        Exception: if the extension is neither ``npy`` nor ``pkl``.
+    """
+    wd = osp.split(wfp)[0]
+    if wd != "" and not osp.exists(wd):
+        mkdir(wd)
+
+    _suffix = suffix(wfp)
+    if _suffix == "npy":
+        np.save(wfp, obj)
+    elif _suffix == "pkl":
+        # The with-block closes the file, so the data is flushed to disk
+        # before this function returns.
+        with open(wfp, "wb") as f:
+            pickle.dump(obj, f)
+    else:
+        raise Exception("Unknown type: {}".format(_suffix))
diff --git a/src/thirdparty/liveportrait/src/utils/resources/clip_embedding_68.pkl b/src/thirdparty/liveportrait/src/utils/resources/clip_embedding_68.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..d630c9fcc4742ece7e820746d9aae5753afd2085
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/resources/clip_embedding_68.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7face6b75009a4d29290a7a2ba8ce888057036e658e91cfe8482a67abcb06f1
+size 71352
diff --git a/src/thirdparty/liveportrait/src/utils/resources/clip_embedding_9.pkl b/src/thirdparty/liveportrait/src/utils/resources/clip_embedding_9.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..86607144a297beba26c2392e022ab82c3fbf57fe
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/resources/clip_embedding_9.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4982ea59b74a72fa210f544d20540cb94cd3912f3824d3df3a6feb30b1d2633b
+size 10923
diff --git a/src/thirdparty/liveportrait/src/utils/resources/lip_array.pkl b/src/thirdparty/liveportrait/src/utils/resources/lip_array.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..70b1d611224fffefc19f3cbfa10b659f9a4a2ae5
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/resources/lip_array.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d61aab1cc37a2741f4774fe9963b2db2566e063f798b1d28d791dd119352738
+size 658
diff --git a/src/thirdparty/liveportrait/src/utils/resources/mask_template.png b/src/thirdparty/liveportrait/src/utils/resources/mask_template.png
new file mode 100644
index 0000000000000000000000000000000000000000..bca6ca5977ba820d0d2c05b3793c6231cc82e715
Binary files /dev/null and b/src/thirdparty/liveportrait/src/utils/resources/mask_template.png differ
diff --git a/src/thirdparty/liveportrait/src/utils/retargeting_utils.py b/src/thirdparty/liveportrait/src/utils/retargeting_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae2e5f52effe8107503586c9f5a24f39dfdbbbcf
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/retargeting_utils.py
@@ -0,0 +1,24 @@
+
+"""
+Functions to compute distance ratios between specific pairs of facial landmarks
+"""
+
+import numpy as np
+
+
+def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray:
+    """Return ``|lmk[:, idx1] - lmk[:, idx2]| / (|lmk[:, idx3] - lmk[:, idx4]| + eps)``.
+
+    Norms are taken along axis 1 with ``keepdims=True``; ``eps`` keeps the
+    denominator away from zero.
+    """
+    numerator = np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True)
+    denominator = np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps
+    return numerator / denominator
+
+
+def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:
+    """Concatenate two eye distance ratios along axis 1.
+
+    The first ratio uses landmark indices (6, 18) over (0, 12), the second
+    (30, 42) over (24, 36). ``target_eye_ratio`` is appended as extra
+    column(s) when given.
+    """
+    columns = [
+        calculate_distance_ratio(lmk, 6, 18, 0, 12),
+        calculate_distance_ratio(lmk, 30, 42, 24, 36),
+    ]
+    if target_eye_ratio is not None:
+        columns.append(target_eye_ratio)
+    return np.concatenate(columns, axis=1)
+
+
+def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:
+    """Return the distance ratio of landmark pair (90, 102) over pair (48, 66)."""
+    upper_lower, left_right = (90, 102), (48, 66)
+    return calculate_distance_ratio(lmk, *upper_lower, *left_right)
diff --git a/src/thirdparty/liveportrait/src/utils/rprint.py b/src/thirdparty/liveportrait/src/utils/rprint.py
new file mode 100644
index 0000000000000000000000000000000000000000..c43a42f9855bbb019725e6c2b6c6c50e6fa4d0c5
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/rprint.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+
+"""
+custom print and log functions 
+"""
+
+__all__ = ['rprint', 'rlog']
+
+try:
+    # Prefer rich's console so callers get styled output and timestamps.
+    from rich.console import Console
+    console = Console()
+    rprint = console.print
+    rlog = console.log
+except Exception:
+    # Best-effort fallback to the builtin print when rich is missing or its
+    # console cannot be created. Catching ``Exception`` instead of using a
+    # bare ``except`` lets KeyboardInterrupt / SystemExit propagate.
+    rprint = print
+    rlog = print
diff --git a/src/thirdparty/liveportrait/src/utils/timer.py b/src/thirdparty/liveportrait/src/utils/timer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3bbb278c51447a6cb35a3bb5d3cb23e45e0f66c
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/timer.py
@@ -0,0 +1,65 @@
+# coding: utf-8
+
+"""
+tools to measure elapsed time
+"""
+
+import time
+
+class Timer(object):
+    """A simple tic/toc timer with an optional sliding-window average.
+
+    Call :meth:`tic` to start an interval and :meth:`toc` to end it. Every
+    measured interval is appended to ``self.times``; once more than
+    ``self.window`` intervals are stored, the oldest one is dropped.
+    """
+
+    def __init__(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+
+        self.window = 100
+        self.pre_len = 0
+        self.times = []
+        self.average = 0.
+        self.first_time = 0.
+
+    def tic(self):
+        """Record the current time as the start of an interval."""
+        # time.time is used here; the original author preferred it over
+        # time.clock, which does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=False):
+        """End the current interval and return a duration in seconds.
+
+        Args:
+            average: when True, return the mean of the intervals currently
+                stored in ``self.times`` (including the one just measured);
+                when False, return only the interval just measured.
+
+        Returns:
+            The value also stored in ``self.average``.
+        """
+        self.diff = time.time() - self.start_time
+        self.times.append(self.diff)
+
+        if average:
+            # Recompute from the stored window instead of updating the
+            # previous mean incrementally: the incremental form silently
+            # breaks when a toc(average=False) call overwrote self.average
+            # with a single interval in between.
+            self.average = sum(self.times) / len(self.times)
+        else:
+            self.average = self.diff
+
+        self.pre_len = len(self.times)
+
+        # trim after averaging, so the mean above covers up to window + 1 samples
+        if len(self.times) > self.window:
+            self.first_time = self.times.pop(0)
+
+        return self.average
+
+    def clear(self):
+        """Reset the interval history (``total_time`` and ``calls`` are left untouched)."""
+        self.start_time = 0.
+        self.diff = 0.
+        self.times = []
+        self.pre_len = 0
+        self.average = 0.
+        self.first_time = 0.
+
+    def fout(self, secs):
+        """Format a duration in seconds as ``"<days>d,HH-MM-SS"`` (fractions are truncated)."""
+        secs = int(secs)
+        days, remaining_secs = divmod(secs, 24 * 3600)
+        hours, remaining_secs = divmod(remaining_secs, 3600)
+        minutes, seconds = divmod(remaining_secs, 60)
+
+        return f"{days}d,{hours:02d}-{minutes:02d}-{seconds:02d}"
+        
+        
diff --git a/src/thirdparty/liveportrait/src/utils/video.py b/src/thirdparty/liveportrait/src/utils/video.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e34e6d896d443ba5c88f89f1a38abca1c5030f9
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/video.py
@@ -0,0 +1,218 @@
+# coding: utf-8
+
+"""
+Functions for processing video
+
+ATTENTION: you need to install ffmpeg and ffprobe in your env!
+"""
+
+import os.path as osp
+import numpy as np
+import subprocess
+import imageio
+import cv2
+from rich.progress import track
+
+from .rprint import rlog as log
+from .rprint import rprint as print
+from .helper import prefix
+
+
+def exec_cmd(cmd):
+    """Run ``cmd`` through the shell and return the ``CompletedProcess``.
+
+    stdout is captured and stderr is merged into it. Because ``check=True``
+    is set, a non-zero exit status raises ``subprocess.CalledProcessError``.
+    """
+    run_options = dict(shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    return subprocess.run(cmd, **run_options)
+
+
+def images2video(images, wfp, **kwargs):
+    """Encode a sequence of frames into a video file via ``imageio``.
+
+    Args:
+        images: indexable sequence of frames (``len`` and ``images[i]`` are used).
+        wfp: output file path handed to ``imageio.get_writer``.
+        **kwargs: optional settings, all forwarded to the imageio writer:
+            ``fps`` (30), ``format`` ('mp4'), ``codec`` ('libx264'),
+            ``quality`` (None), ``pixelformat`` ('yuv420p'),
+            ``macro_block_size`` (2) and ``crf`` (18, passed as ``-crf``).
+            ``image_mode`` ('rgb'): when it is 'bgr' (case-insensitive) the
+            last axis of each frame is reversed before writing.
+    """
+    fps = kwargs.get('fps', 30)
+    video_format = kwargs.get('format', 'mp4')  # default is mp4 format
+    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding
+    quality = kwargs.get('quality')  # video quality
+    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format
+    image_mode = kwargs.get('image_mode', 'rgb')
+    macro_block_size = kwargs.get('macro_block_size', 2)
+    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]
+
+    writer = imageio.get_writer(
+        wfp, fps=fps, format=video_format,
+        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size
+    )
+
+    # the channel order is the same for every frame, so decide once
+    flip_channels = image_mode.lower() == 'bgr'
+
+    try:
+        n = len(images)
+        for i in track(range(n), description='Writing', transient=True):
+            frame = images[i]
+            writer.append_data(frame[..., ::-1] if flip_channels else frame)
+    finally:
+        # always finalize the output, even when a frame fails to encode
+        writer.close()
+
+
+def video2gif(video_fp, fps=30, size=256):
+    """Convert a video to a GIF next to it using ffmpeg's two-pass palette method.
+
+    A ``palette.png`` is first generated in the video's directory, then used
+    to render ``<video name>.gif`` in the same directory.
+
+    Returns:
+        Path of the written GIF.
+
+    Raises:
+        FileNotFoundError: if ``video_fp`` does not exist.
+    """
+    if not osp.exists(video_fp):
+        raise FileNotFoundError(f"video_fp: {video_fp} not exists!")
+
+    d, video_name = osp.split(video_fp)
+    fn = prefix(video_name)
+    palette_wfp = osp.join(d, 'palette.png')
+    gif_wfp = osp.join(d, f'{fn}.gif')
+
+    # pass 1: generate the palette
+    palette_cmd = f'ffmpeg -i "{video_fp}" -vf "fps={fps},scale={size}:-1:flags=lanczos,palettegen" "{palette_wfp}" -y'
+    exec_cmd(palette_cmd)
+
+    # pass 2: use the palette to generate the gif
+    gif_cmd = f'ffmpeg -i "{video_fp}" -i "{palette_wfp}" -filter_complex "fps={fps},scale={size}:-1:flags=lanczos[x];[x][1:v]paletteuse" "{gif_wfp}" -y'
+    exec_cmd(gif_cmd)
+    return gif_wfp
+
+
+def merge_audio_video(video_fp, audio_fp, wfp):
+    """Mux ``audio_fp`` into ``video_fp`` with ffmpeg and write the result to ``wfp``.
+
+    The video stream is copied and the audio is encoded as AAC. If either
+    input file is missing, a message is printed and nothing is written.
+    """
+    if not (osp.exists(video_fp) and osp.exists(audio_fp)):
+        print(f'video_fp: {video_fp} or audio_fp: {audio_fp} not exists!')
+        return
+
+    cmd = f'ffmpeg -i "{video_fp}" -i "{audio_fp}" -c:v copy -c:a aac "{wfp}" -y'
+    exec_cmd(cmd)
+    print(f'merge {video_fp} and {audio_fp} to {wfp}')
+
+
+def blend(img: np.ndarray, mask: np.ndarray, background_color=(255, 255, 255)):
+    """Alpha-composite ``img`` over a solid background using ``mask``.
+
+    ``mask`` is divided by 255 to form the blend weight (255 keeps ``img``,
+    0 shows ``background_color``). The result is clipped to [0, 255] and
+    returned as ``uint8``.
+    """
+    alpha = mask.astype(np.float32) / 255.
+    color = np.array(background_color).reshape([1, 1, 3])
+    backdrop = np.ones_like(img) * color
+    foreground_part = alpha * img
+    background_part = (1 - alpha) * backdrop
+    return np.clip(foreground_part + background_part, 0, 255).astype(np.uint8)
+
+
+def concat_frames(driving_image_lst, source_image_lst, I_p_lst):
+    """Build side-by-side comparison frames.
+
+    Each output frame is ``[driving | source | I_p]`` stacked horizontally, or
+    ``[source | I_p]`` when ``driving_image_lst`` is None. Driving and source
+    images are resized to the size of ``I_p_lst[0]``. When
+    ``source_image_lst`` holds a single image it is reused for every frame.
+    """
+    # TODO: add more concat style, e.g., left-down corner driving
+    h, w, _ = I_p_lst[0].shape
+    source_image_resized_lst = [cv2.resize(img, (w, h)) for img in source_image_lst]
+    per_frame_source = len(source_image_lst) > 1
+
+    out_lst = []
+    for idx, I_p in track(enumerate(I_p_lst), total=len(I_p_lst), description='Concatenating result...'):
+        source_image_resized = source_image_resized_lst[idx if per_frame_source else 0]
+        tiles = [source_image_resized, I_p]
+        if driving_image_lst is not None:
+            # the driving frame goes first (leftmost)
+            tiles.insert(0, cv2.resize(driving_image_lst[idx], (w, h)))
+        out_lst.append(np.hstack(tiles))
+    return out_lst
+
+
+class VideoWriter:
+    """Frame-by-frame video writer backed by ``imageio.get_writer``.
+
+    Keyword arguments (with defaults): ``fps`` (30), ``wfp`` ('video.mp4'),
+    ``format`` ('mp4'), ``codec`` ('libx264'), ``quality`` (None),
+    ``pixelformat`` ('yuv420p'), ``image_mode`` ('rgb') and
+    ``ffmpeg_params`` (None).
+
+    The instance can be used as a context manager so the output is finalized
+    even when writing raises::
+
+        with VideoWriter(wfp='out.mp4') as vw:
+            vw.write(frame)
+    """
+
+    def __init__(self, **kwargs):
+        self.fps = kwargs.get('fps', 30)
+        self.wfp = kwargs.get('wfp', 'video.mp4')
+        self.video_format = kwargs.get('format', 'mp4')
+        self.codec = kwargs.get('codec', 'libx264')
+        self.quality = kwargs.get('quality')
+        self.pixelformat = kwargs.get('pixelformat', 'yuv420p')
+        self.image_mode = kwargs.get('image_mode', 'rgb')
+        self.ffmpeg_params = kwargs.get('ffmpeg_params')
+
+        self.writer = imageio.get_writer(
+            self.wfp, fps=self.fps, format=self.video_format,
+            codec=self.codec, quality=self.quality,
+            ffmpeg_params=self.ffmpeg_params, pixelformat=self.pixelformat
+        )
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def write(self, image):
+        """Append one frame; the last axis is reversed first when ``image_mode`` is 'bgr'."""
+        if self.image_mode.lower() == 'bgr':
+            self.writer.append_data(image[..., ::-1])
+        else:
+            self.writer.append_data(image)
+
+    def close(self):
+        """Finalize the output file. Safe to call more than once."""
+        if self.writer is not None:
+            self.writer.close()
+            # drop the handle so a repeated close() is a no-op
+            self.writer = None
+
+
+def change_video_fps(input_file, output_file, fps=20, codec='libx264', crf=12):
+    """Re-encode ``input_file`` to ``output_file`` at ``fps`` frames per second with ffmpeg.
+
+    ``codec`` and ``crf`` are passed to ffmpeg as ``-c:v`` and ``-crf``; an
+    existing output file is overwritten (``-y``).
+    """
+    ffmpeg_cmd = f'ffmpeg -i "{input_file}" -c:v {codec} -crf {crf} -r {fps} "{output_file}" -y'
+    exec_cmd(ffmpeg_cmd)
+
+
+def get_fps(filepath, default_fps=25):
+    """Return the frame rate OpenCV reports for ``filepath``.
+
+    Falls back to ``default_fps`` when OpenCV reports no usable value
+    (``None``, zero, negative or NaN) or when probing raises. The capture
+    handle is always released.
+    """
+    capture = None
+    try:
+        capture = cv2.VideoCapture(filepath)
+        fps = capture.get(cv2.CAP_PROP_FPS)
+
+        # ``fps != fps`` is the NaN check
+        if fps is None or fps <= 0 or fps != fps:
+            fps = default_fps
+    except Exception as e:
+        log(e)
+        fps = default_fps
+    finally:
+        # release the underlying file/decoder handle instead of leaking it
+        if capture is not None:
+            capture.release()
+
+    return fps
+
+
+def has_audio_stream(video_path: str) -> bool:
+    """
+    Check if the video file contains an audio stream.
+
+    ffprobe is invoked directly with an argument list (no shell), so paths
+    containing quotes or other shell metacharacters are handled correctly.
+
+    :param video_path: Path to the video file
+    :return: True if the video contains an audio stream, False otherwise
+        (also False for directories, when ffprobe cannot be started, or when
+        ffprobe exits with an error)
+    """
+    if osp.isdir(video_path):
+        return False
+
+    cmd = [
+        'ffprobe',
+        '-v', 'error',
+        '-select_streams', 'a',
+        '-show_entries', 'stream=codec_type',
+        '-of', 'default=noprint_wrappers=1:nokey=1',
+        video_path,
+    ]
+
+    try:
+        # stderr is captured separately so it can be reported on failure and
+        # never mistaken for stream output
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except OSError:
+        # raised when the ffprobe executable cannot be launched
+        log(
+            f"Error occurred while probing video: {video_path}, "
+            "you may need to install ffprobe! (https://ffmpeg.org/download.html) "
+            "Now set audio to false!",
+            style="bold red"
+        )
+        return False
+
+    if result.returncode != 0:
+        log(f"Error occurred while probing video: {result.stderr.decode(errors='replace')}")
+        return False
+
+    # Check if there is any output from ffprobe command
+    return bool(result.stdout.strip())
+
+
+def add_audio_to_video(silent_video_path: str, audio_video_path: str, output_video_path: str):
+    """Combine the video of one file with the audio of another using ffmpeg.
+
+    The video stream of ``silent_video_path`` is copied, the audio stream of
+    ``audio_video_path`` is mapped in, and the output stops at the shorter
+    input (``-shortest``). A failing ffmpeg run is logged, not raised.
+    """
+    ffmpeg_cmd = (
+        f'ffmpeg -y -i "{silent_video_path}" -i "{audio_video_path}" '
+        f'-map 0:v -map 1:a -c:v copy -shortest "{output_video_path}"'
+    )
+
+    try:
+        exec_cmd(ffmpeg_cmd)
+    except subprocess.CalledProcessError as e:
+        log(f"Error occurred: {e}")
+    else:
+        log(f"Video with audio generated successfully: {output_video_path}")
+
+
+def bb_intersection_over_union(boxA, boxB):
+    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.
+
+    Widths and heights are computed as ``x2 - x1 + 1`` / ``y2 - y1 + 1``,
+    i.e. both corner coordinates are treated as inclusive.
+    """
+    # corners of the overlap rectangle
+    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
+    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])
+
+    overlap_w = max(0, right - left + 1)
+    overlap_h = max(0, bottom - top + 1)
+    interArea = overlap_w * overlap_h
+
+    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
+    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
+
+    union_area = float(boxAArea + boxBArea - interArea)
+    return interArea / union_area
diff --git a/src/thirdparty/liveportrait/src/utils/viz.py b/src/thirdparty/liveportrait/src/utils/viz.py
new file mode 100644
index 0000000000000000000000000000000000000000..59443cbf207f3395bee241f63c7acb95b9402530
--- /dev/null
+++ b/src/thirdparty/liveportrait/src/utils/viz.py
@@ -0,0 +1,19 @@
+# coding: utf-8
+
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
+
+
+def viz_lmk(img_, vps, **kwargs):
+    """Visualize points: draw each point of ``vps`` as a circle on a copy of ``img_``.
+
+    Optional keyword arguments: ``lineType`` (default ``cv2.LINE_8``),
+    ``radius`` (default 1) and ``thickness`` (default 1). The input image is
+    not modified; the annotated copy is returned.
+    """
+    lineType = kwargs.get("lineType", cv2.LINE_8)  # cv2.LINE_AA
+    radius = kwargs.get("radius", 1)
+    thickness = kwargs.get("thickness", 1)
+
+    img_for_viz = img_.copy()
+    for pt in vps:
+        center = (int(pt[0]), int(pt[1]))
+        cv2.circle(
+            img_for_viz,
+            center,
+            radius=radius,
+            color=(0, 255, 0),
+            thickness=thickness,
+            lineType=lineType,
+        )
+    return img_for_viz
diff --git a/src/utils/UniPose_SwinT.py b/src/utils/UniPose_SwinT.py
new file mode 100644
index 0000000000000000000000000000000000000000..707b359fc414b525db5a11a9bc505105f6f66741
--- /dev/null
+++ b/src/utils/UniPose_SwinT.py
@@ -0,0 +1,125 @@
+# Flat configuration for the UniPose model with a Swin-T backbone.
+# Every setting is a plain module-level assignment. The section comments below
+# are grouped by name only; confirm exact semantics against the code that
+# consumes this config.
+
+# presumably a base config this file extends -- confirm against the config loader
+_base_ = ['coco_transformer.py']
+
+use_label_enc = True
+
+num_classes=2
+
+# --- optimisation / schedule settings (grouped by name) ---
+lr = 0.0001
+param_dict_type = 'default'
+lr_backbone = 1e-05
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 12
+lr_drop = 11
+save_checkpoint_interval = 100
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = False
+lr_drop_list = [33, 45]
+
+
+# --- model identity / backbone ---
+modelname = 'UniPose'
+frozen_weights = None
+backbone = 'swin_T_224_1k'
+
+
+# --- architecture settings (grouped by name) ---
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+unic_layers = 0
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+pdetr3_bbox_embed_diff_each_layer = False
+pdetr3_refHW = -1
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dabdetr_yolo_like_anchor_update = False
+dabdetr_deformable_encoder = False
+dabdetr_deformable_decoder = False
+use_deformable_box_attn = False
+box_attn_type = 'roi_align'
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+decoder_layer_noise = False
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+add_channel_attention = False
+add_pos_value = False
+two_stage_type = 'standard'
+two_stage_pat_embed = 0
+two_stage_add_query_num = 0
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+masks = False
+
+decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
+matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = True
+dec_pred_class_embed_share = True
+
+
+# --- "dn_*" settings (presumably denoising-training options -- confirm) ---
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 1.0
+dn_label_noise_ratio = 0.5
+dn_label_coef=1.0
+dn_bbox_coef=1.0
+embed_init_tgt = True
+dn_labelbook_size = 2000
+
+match_unstable_error = True
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+use_detached_boxes_dec_out = False
+
+# --- text-branch settings (grouped by name) ---
+max_text_len = 256
+shuffle_type = None
+
+use_text_enhancer = True
+use_fusion_layer = True
+
+use_checkpoint = False # True
+use_transformer_ckpt = True
+text_encoder_type = 'bert-base-uncased'
+
+use_text_cross_attention = True
+text_dropout = 0.0
+fusion_dropout = 0.0
+fusion_droppath = 0.1
+
+# --- keypoint settings (grouped by name) ---
+num_body_points=68
+binary_query_selection = False
+use_cdn = True
+ffn_extra_layernorm = False
+
+fix_size=False
diff --git a/src/utils/__pycache__/filter.cpython-310.pyc b/src/utils/__pycache__/filter.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..578170cc84d57c2a6f6c8044b879d7b852c61538
Binary files /dev/null and b/src/utils/__pycache__/filter.cpython-310.pyc differ
diff --git a/src/utils/__pycache__/rprint.cpython-310.pyc b/src/utils/__pycache__/rprint.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51b56f41e64eeeb9cafa55a8884e60c914f99aee
Binary files /dev/null and b/src/utils/__pycache__/rprint.cpython-310.pyc differ
diff --git a/src/utils/__pycache__/util.cpython-310.pyc b/src/utils/__pycache__/util.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1758fcb11d71f6366b7b6d1a0558a48ed75a202c
Binary files /dev/null and b/src/utils/__pycache__/util.cpython-310.pyc differ
diff --git a/src/utils/filter.py b/src/utils/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dea9bb40cc175d2fd0f566b3202e997b1c89112
--- /dev/null
+++ b/src/utils/filter.py
@@ -0,0 +1,164 @@
+# coding: utf-8
+
+import torch
+import numpy as np
+from pykalman import KalmanFilter
+PI = np.pi
+
+# module-level default; get_rotation_matrix below uses the device of its inputs instead
+device = "cuda"
+
+
+def get_rotation_matrix(pitch_, yaw_, roll_):
+    """Build batched 3x3 rotation matrices from Euler angles given in degrees.
+
+    Each input is a tensor; 1-D inputs are expanded to a column of shape
+    ``(bs, 1)``. The per-axis rotations are composed as
+    ``rot_z(roll) @ rot_y(yaw) @ rot_x(pitch)`` and the transpose of that
+    product is returned, with shape ``(bs, 3, 3)``.
+    """
+    # degrees -> radians
+    x, y, z = (angle / 180 * PI for angle in (pitch_, yaw_, roll_))
+    # make every angle a (bs, 1) column
+    x, y, z = (angle.unsqueeze(1) if angle.ndim == 1 else angle for angle in (x, y, z))
+
+    device = x.device
+    bs = x.shape[0]
+    ones = torch.ones([bs, 1]).to(device)
+    zeros = torch.zeros([bs, 1]).to(device)
+
+    def as_matrix(*entries):
+        # nine (bs, 1) columns in row-major order -> (bs, 3, 3)
+        return torch.cat(entries, dim=1).reshape([bs, 3, 3])
+
+    cos_x, sin_x = torch.cos(x), torch.sin(x)
+    cos_y, sin_y = torch.cos(y), torch.sin(y)
+    cos_z, sin_z = torch.cos(z), torch.sin(z)
+
+    rot_x = as_matrix(
+        ones, zeros, zeros,
+        zeros, cos_x, -sin_x,
+        zeros, sin_x, cos_x,
+    )
+    rot_y = as_matrix(
+        cos_y, zeros, sin_y,
+        zeros, ones, zeros,
+        -sin_y, zeros, cos_y,
+    )
+    rot_z = as_matrix(
+        cos_z, -sin_z, zeros,
+        sin_z, cos_z, zeros,
+        zeros, zeros, ones,
+    )
+
+    rot = rot_z @ rot_y @ rot_x
+    return rot.permute(0, 2, 1)  # transpose
+
+def smooth(x_d_lst, shape, device, observation_variance=3e-7, process_variance=1e-5):
+    """Smooth a sequence of arrays with a Kalman smoother (pykalman).
+
+    Each element of ``x_d_lst`` is flattened into one observation row. The
+    filter starts from the first observation and uses scaled identity
+    matrices as transition / observation covariances. Every smoothed row is
+    reshaped to the last two dimensions of ``shape`` and returned as a
+    float32 tensor on ``device``.
+    """
+    observations = np.vstack([x.reshape(-1) for x in x_d_lst])
+    n_dim = observations.shape[1]
+
+    kf = KalmanFilter(
+        initial_state_mean=observations[0],
+        n_dim_obs=n_dim,
+        transition_covariance=process_variance * np.eye(n_dim),
+        observation_covariance=observation_variance * np.eye(n_dim)
+    )
+    smoothed_state_means, _ = kf.smooth(observations)
+
+    target_shape = shape[-2:]
+    return [
+        torch.tensor(state_mean.reshape(target_shape), dtype=torch.float32, device=device)
+        for state_mean in smoothed_state_means
+    ]
+
+class ExponentialMovingAverageFilter:
+    """Exponential moving average: ``s = alpha * x + (1 - alpha) * s_prev``."""
+
+    def __init__(self, alpha=0.6):
+        self.alpha = alpha
+        # None until the first sample has been seen
+        self.smoothed_value = None
+
+    def update(self, new_value):
+        """Feed one sample and return the updated smoothed value.
+
+        The first sample is returned unchanged and becomes the initial state.
+        """
+        previous = self.smoothed_value
+        if previous is None:
+            current = new_value
+        else:
+            current = self.alpha * new_value + (1 - self.alpha) * previous
+        self.smoothed_value = current
+        return current
+
+class MovingAverageFilter:
+    """Sliding-window mean over the most recent ``window_size`` samples.
+
+    Args:
+        window_size: number of samples kept in the ring buffer.
+        n_features: length of each sample vector. Defaults to 7, the width
+            that was previously hard-coded.
+    """
+
+    def __init__(self, window_size, n_features=7):
+        self.window_size = window_size
+        # ring buffer: one row per retained sample
+        self.buffer = np.zeros((window_size, n_features))
+        self.index = 0
+        self.full = False
+
+    def update(self, new_value):
+        """Store ``new_value`` and return the per-feature mean of the stored samples."""
+        # overwrite the oldest slot and advance the write position
+        self.buffer[self.index] = new_value
+        self.index = (self.index + 1) % self.window_size
+
+        # the write position wrapping back to 0 means every slot has been written
+        if not self.full and self.index == 0:
+            self.full = True
+
+        # until the buffer is full, only the rows written so far are averaged
+        return np.mean(self.buffer[:self.window_size if self.full else self.index], axis=0)
+
+class MedianFilter:
+    """Sliding-window median over the most recent ``window_size`` samples.
+
+    Args:
+        window_size: number of samples kept in the ring buffer.
+        n_features: length of each sample vector. Defaults to 7, the width
+            that was previously hard-coded.
+    """
+
+    def __init__(self, window_size, n_features=7):
+        self.window_size = window_size
+        # ring buffer: one row per retained sample
+        self.buffer = np.zeros((window_size, n_features))
+        self.index = 0
+        self.full = False
+
+    def update(self, new_value):
+        """Store ``new_value`` and return the per-feature median of the stored samples."""
+        # overwrite the oldest slot and advance the write position
+        self.buffer[self.index] = new_value
+        self.index = (self.index + 1) % self.window_size
+
+        # the write position wrapping back to 0 means every slot has been written
+        if not self.full and self.index == 0:
+            self.full = True
+
+        # until the buffer is full, only the rows written so far are considered
+        return np.median(self.buffer[:self.window_size if self.full else self.index], axis=0)
+
+def smooth_(ori_data, method="median"):
+    """Smooth per-frame head-pose parameters with a causal sliding filter.
+
+    For every frame, ``scale``, ``t``, ``pitch``, ``yaw`` and ``roll`` are
+    flattened into one row, filtered over time, and split back out; ``R`` is
+    recomputed from the smoothed angles while ``exp`` is passed through
+    unchanged. The slicing below assumes each row has 7 values
+    (scale: 1, t: 3, pitch/yaw/roll: 1 each).
+
+    Args:
+        ori_data: dict with ``"n_frames"`` and a per-frame ``"motion"`` list.
+        method: ``"median"`` (window 3), ``"ema"`` (alpha 0.01); any other
+            value selects a moving average with window 10.
+
+    Returns:
+        dict with ``n_frames``, ``output_fps`` (fixed to 25) and the
+        smoothed ``motion`` list.
+    """
+    # mean filtering & median filtering
+    rows = []
+    for frame_idx in range(ori_data["n_frames"]):
+        frame = ori_data['motion'][frame_idx]
+        rows.append(np.concatenate((
+            frame["scale"].flatten(),
+            frame["t"].flatten(),
+            frame["pitch"].flatten(),
+            frame["yaw"].flatten(),
+            frame["roll"].flatten(),
+        )))
+    data_array = np.array(rows).astype(np.float32)
+
+    # choose the filter (and its sliding-window size)
+    if method == "median":
+        ma_filter = MedianFilter(3)
+    elif method == "ema":
+        ma_filter = ExponentialMovingAverageFilter(alpha=0.01)
+    else:
+        ma_filter = MovingAverageFilter(10)
+
+    # feed frames in order; each call returns the filtered value for that frame
+    smoothed_data = np.array([ma_filter.update(value) for value in data_array]).astype(np.float32)
+
+    # assemble the result
+    motion_list = []
+    for idx, row in enumerate(smoothed_data):
+        exp = ori_data["motion"][idx]["exp"]
+        scale = row[0:1].reshape(1, 1)
+        t = row[1:4].reshape(1, 3).astype(np.float32)
+        pitch = row[4:5].reshape(1, 1).astype(np.float32)
+        yaw = row[5:6].reshape(1, 1).astype(np.float32)
+        roll = row[6:7].reshape(1, 1).astype(np.float32)
+
+        R = get_rotation_matrix(torch.FloatTensor(pitch), torch.FloatTensor(yaw), torch.FloatTensor(roll))
+        R = R.reshape(1, 3, 3).cpu().numpy().astype(np.float32)
+
+        motion_list.append({"exp": exp, "scale": scale, "t": t, "pitch": pitch, "yaw": yaw, "roll": roll, "R": R})
+
+    tgt_motion = {'n_frames': smoothed_data.shape[0], 'output_fps': 25, 'motion': motion_list}
+    return tgt_motion
diff --git a/src/utils/resources/lip_array.pkl b/src/utils/resources/lip_array.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..70b1d611224fffefc19f3cbfa10b659f9a4a2ae5
--- /dev/null
+++ b/src/utils/resources/lip_array.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d61aab1cc37a2741f4774fe9963b2db2566e063f798b1d28d791dd119352738
+size 658
diff --git a/src/utils/resources/mask_template.png b/src/utils/resources/mask_template.png
new file mode 100644
index 0000000000000000000000000000000000000000..bca6ca5977ba820d0d2c05b3793c6231cc82e715
Binary files /dev/null and b/src/utils/resources/mask_template.png differ
diff --git a/src/utils/rprint.py b/src/utils/rprint.py
new file mode 100644
index 0000000000000000000000000000000000000000..c43a42f9855bbb019725e6c2b6c6c50e6fa4d0c5
--- /dev/null
+++ b/src/utils/rprint.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+
+"""
+custom print and log functions 
+"""
+
+__all__ = ['rprint', 'rlog']
+
+try:
+    # Prefer rich's console so callers get styled output and timestamps.
+    from rich.console import Console
+    console = Console()
+    rprint = console.print
+    rlog = console.log
+except Exception:
+    # Best-effort fallback to the builtin print when rich is missing or its
+    # console cannot be created. Catching ``Exception`` instead of using a
+    # bare ``except`` lets KeyboardInterrupt / SystemExit propagate.
+    rprint = print
+    rlog = print
diff --git a/src/utils/util.py b/src/utils/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..54cd6227a7f4d9e713d2e2f0c320412267ef6494
--- /dev/null
+++ b/src/utils/util.py
@@ -0,0 +1,1030 @@
+
+"""
+util.py
+
+This module provides utility functions for various tasks such as setting random seeds,
+importing modules from files, managing checkpoint files, and saving video files from 
+sequences of PIL images.
+
+Functions:
+    seed_everything(seed)
+    import_filename(filename)
+    delete_additional_ckpt(base_path, num_keep)
+    save_videos_from_pil(pil_images, path, fps=8)
+
+Dependencies:
+    importlib
+    os
+    os.path as osp
+    random
+    shutil
+    sys
+    pathlib.Path
+    av
+    cv2
+    mediapipe as mp
+    numpy as np
+    torch
+    torchvision
+    einops.rearrange
+    moviepy.editor.AudioFileClip, VideoClip
+    PIL.Image
+
+Examples:
+    seed_everything(42)
+    imported_module = import_filename('path/to/your/module.py')
+    delete_additional_ckpt('path/to/checkpoints', 1)
+    save_videos_from_pil(pil_images, 'output/video.mp4', fps=12)
+
+The functions in this module ensure reproducibility of experiments by seeding random number 
+generators, allow dynamic importing of modules, manage checkpoint files by deleting extra ones, 
+and provide a way to save sequences of images as video files.
+
+Function Details:
+    seed_everything(seed)
+        Seeds all random number generators to ensure reproducibility.
+
+    import_filename(filename)
+        Imports a module from a given file location.
+
+    delete_additional_ckpt(base_path, num_keep)
+        Deletes additional checkpoint files in the given directory.
+
+    save_videos_from_pil(pil_images, path, fps=8)
+        Saves a sequence of PIL images as an .mp4 video (encoded with PyAV) or an animated .gif (written with Pillow).
+
+Attributes:
+    _ (str): Placeholder for static type checking
+"""
+
+import importlib
+import os
+import os.path as osp
+import random
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+from typing import List
+
+import av
+import cv2
+import mediapipe as mp
+import numpy as np
+import torch
+import torchvision
+from einops import rearrange
+from moviepy.editor import AudioFileClip, VideoClip
+from moviepy.editor import VideoFileClip, concatenate_videoclips
+from PIL import Image
+
+
+def seed_everything(seed):
+    """
+    Seed the torch (CPU and all CUDA devices), NumPy and ``random`` generators.
+
+    Args:
+        seed (int): The seed value. NumPy receives ``seed % 2**32`` because
+            the value is reduced modulo ``2**32`` before being passed on.
+    """
+    # torch: CPU generator first, then every CUDA device
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+    # numpy, with the seed folded into 32 bits
+    numpy_seed = seed % (2**32)
+    np.random.seed(numpy_seed)
+
+    # python stdlib
+    random.seed(seed)
+
+
+def import_filename(filename):
+    """
+    Import a module from a given file location.
+
+    The module is always registered in ``sys.modules`` under the fixed name
+    ``"mymodule"``, so a later call replaces the entry of an earlier one.
+
+    Args:
+        filename (str): The path to the file containing the module to be imported.
+
+    Returns:
+        module: The imported module.
+
+    Raises:
+        ImportError: If no import spec/loader can be created for ``filename``
+            (for example, when the file suffix is not an importable one).
+
+    Example:
+        >>> imported_module = import_filename('path/to/your/module.py')
+    """
+    # ``import importlib`` alone does not guarantee that the ``util``
+    # submodule is loaded, so import it explicitly.
+    import importlib.util
+
+    spec = importlib.util.spec_from_file_location("mymodule", filename)
+    if spec is None or spec.loader is None:
+        # without this check the failure would surface as an AttributeError on None
+        raise ImportError(f"Cannot import a module from {filename!r}")
+
+    module = importlib.util.module_from_spec(spec)
+    # register before executing, so the module can be found during its own import
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def delete_additional_ckpt(base_path, num_keep):
+    """
+    Remove the oldest ``checkpoint-<step>`` directories, keeping the newest ones.
+
+    Entries of ``base_path`` whose name starts with ``"checkpoint-"`` are
+    ordered by the integer after the last ``-``; all but the ``num_keep``
+    highest-numbered ones are deleted recursively. Nothing happens when
+    there are at most ``num_keep`` such entries.
+
+    Args:
+        base_path (str): The path to the directory containing the checkpoints.
+        num_keep (int): The number of most recent checkpoints to keep.
+
+    Returns:
+        None
+
+    Raises:
+        FileNotFoundError: If the base_path does not exist.
+
+    Example:
+        >>> delete_additional_ckpt('path/to/checkpoints', 1)
+        # This will delete all but the most recent checkpoint in 'path/to/checkpoints'.
+    """
+    ckpt_names = [name for name in os.listdir(base_path) if name.startswith("checkpoint-")]
+
+    surplus = len(ckpt_names) - num_keep
+    if surplus <= 0:
+        return
+
+    # order by step number so the earliest checkpoints come first
+    ckpt_names.sort(key=lambda name: int(name.split("-")[-1]))
+
+    for name in ckpt_names[:surplus]:
+        path_to_dir = osp.join(base_path, name)
+        if osp.exists(path_to_dir):
+            shutil.rmtree(path_to_dir)
+
+
+def save_videos_from_pil(pil_images, path, fps=8):
+    """
+    Save a sequence of PIL images as an .mp4 video or an animated .gif.
+
+    The output format is chosen from the suffix of ``path``: ``.mp4`` is
+    encoded with libx264 through PyAV, ``.gif`` is written with Pillow.
+    Missing parent directories of ``path`` are created.
+
+    Args:
+        pil_images (List[PIL.Image]): Frames of the video; the first frame
+            determines the output width and height.
+        path (str): The output file path for the video.
+        fps (int, optional): The frames per second rate of the video. Defaults to 8.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If the suffix of ``path`` is neither ``.mp4`` nor ``.gif``.
+    """
+    save_fmt = Path(path).suffix
+
+    # a bare filename has an empty dirname, and os.makedirs('') raises
+    out_dir = os.path.dirname(path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    width, height = pil_images[0].size
+
+    if save_fmt == ".mp4":
+        codec = "libx264"
+        container = av.open(path, "w")
+        try:
+            stream = container.add_stream(codec, rate=fps)
+
+            stream.width = width
+            stream.height = height
+
+            for pil_image in pil_images:
+                av_frame = av.VideoFrame.from_image(pil_image)
+                container.mux(stream.encode(av_frame))
+            # encode() without a frame flushes the packets still buffered in the encoder
+            container.mux(stream.encode())
+        finally:
+            # close the file even when encoding fails part-way
+            container.close()
+
+    elif save_fmt == ".gif":
+        pil_images[0].save(
+            fp=path,
+            format="GIF",
+            append_images=pil_images[1:],
+            save_all=True,
+            duration=(1 / fps * 1000),
+            loop=0,
+        )
+    else:
+        raise ValueError("Unsupported file type. Use .mp4 or .gif.")
+
+
+def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
+    """
+    Save a batch of videos as one grid video.
+
+    For every time step, the frames of all videos are tiled into a single
+    image with ``torchvision.utils.make_grid``; the resulting image sequence
+    is written with ``save_videos_from_pil``.
+
+    Args:
+        videos (torch.Tensor): A tensor of shape (batch_size, channels, time, height, width)
+            containing the videos to save.
+        path (str): The path to save the video grid. The suffix selects the
+            format; ``save_videos_from_pil`` accepts .mp4 and .gif.
+        rescale (bool, optional): If True, map values from [-1, 1] to [0, 1]
+            before converting to 8-bit. Defaults to False.
+        n_rows (int, optional): Passed to ``make_grid`` as ``nrow``. Defaults to 6.
+        fps (int, optional): The frame rate of the saved video. Defaults to 8.
+
+    Raises:
+        ValueError: If the video format is not supported.
+
+    Returns:
+        None
+    """
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)  # (c h w)
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)  # (h w c)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        # detach/cpu so tensors on an accelerator or with grad can be converted
+        x = (x * 255).detach().cpu().numpy().astype(np.uint8)
+        x = Image.fromarray(x)
+
+        outputs.append(x)
+
+    # a bare filename has an empty dirname, and os.makedirs('') raises
+    out_dir = os.path.dirname(path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    save_videos_from_pil(outputs, path, fps)
+
+
+def read_frames(video_path):
+    """
+    Decode every frame of the first video stream of a file.
+
+    Args:
+        video_path (str): The path to the video file.
+
+    Returns:
+        list: One RGB ``PIL.Image`` per decoded frame, in decoding order.
+
+    Raises:
+        StopIteration: If the file contains no video stream.
+
+    The file is opened with the Python AV library (av); whatever ``av.open``
+    raises for a missing or unreadable file propagates to the caller. The
+    container is closed before returning, also when decoding fails.
+    """
+    container = av.open(video_path)
+    try:
+        video_stream = next(s for s in container.streams if s.type == "video")
+        frames = []
+        for packet in container.demux(video_stream):
+            for frame in packet.decode():
+                image = Image.frombytes(
+                    "RGB",
+                    (frame.width, frame.height),
+                    frame.to_rgb().to_ndarray(),
+                )
+                frames.append(image)
+    finally:
+        # release the file handle / decoder instead of leaking it
+        container.close()
+
+    return frames
+
+
+def get_fps(video_path):
+    """
+    Get the frame rate (FPS) of a video file.
+
+    Args:
+        video_path (str): The path to the video file.
+
+    Returns:
+        The ``average_rate`` of the first video stream as reported by PyAV
+        (presumably a ``fractions.Fraction`` rather than an int -- confirm
+        for the installed PyAV version).
+
+    Raises:
+        StopIteration: If the file contains no video stream.
+    """
+    container = av.open(video_path)
+    try:
+        video_stream = next(s for s in container.streams if s.type == "video")
+        return video_stream.average_rate
+    finally:
+        # close the container on every path, including "no video stream"
+        container.close()
+
+
+def tensor_to_video(tensor, output_video_file, audio_source, fps=25):
+    """
+    Converts a Tensor with shape [c, f, h, w] into a video and adds an audio track from the specified audio file.
+
+    Tensor values are multiplied by 255 and clipped to [0, 255]. The audio is
+    trimmed to the first ``f / fps`` seconds of ``audio_source``.
+
+    Args:
+        tensor (Tensor): The Tensor to be converted, shaped [c, f, h, w].
+        output_video_file (str): The file path where the output video will be saved.
+        audio_source (str): The path to the audio file (WAV file) that contains the audio track to be added.
+        fps (int): The frame rate of the output video. Default is 25 fps.
+    """
+    tensor = tensor.permute(1, 2, 3, 0).cpu(
+    ).numpy()  # convert to [f, h, w, c]
+    tensor = np.clip(tensor * 255, 0, 255).astype(
+        np.uint8
+    )  # to [0, 255]
+    n_frames = tensor.shape[0]
+
+    def make_frame(t):
+        # map the timestamp to a frame index, clamped to the last frame
+        frame_index = min(int(t * fps), n_frames - 1)
+        return tensor[frame_index]
+
+    new_video_clip = VideoClip(make_frame, duration=n_frames / fps)
+    audio_file_clip = AudioFileClip(audio_source)
+    try:
+        audio_clip = audio_file_clip.subclip(0, n_frames / fps)
+        new_video_clip = new_video_clip.set_audio(audio_clip)
+        new_video_clip.write_videofile(output_video_file, fps=fps, audio_codec='aac')
+    finally:
+        # release the audio reader held by the clip, even if writing fails
+        audio_file_clip.close()
+
+def tensor_to_video_batch(tensor, output_video_file, start, audio_source, fps=25):
+    """
+    Converts a Tensor with shape [c, f, h, w] into a video whose audio is the matching slice of an audio file.
+
+    Args:
+        tensor (Tensor): Frames shaped [c, f, h, w]; scaled by 255 and clipped to [0, 255].
+        output_video_file (str): The file path where the output video will be saved.
+        start (int): Index of the batch's first frame; the audio slice begins at ``start / fps`` seconds.
+        audio_source (str): The path to the audio file (WAV file) that contains the audio track to be added.
+        fps (int): The frame rate of the output video. Default is 25 fps.
+    """
+    frames = tensor.permute(1, 2, 3, 0).cpu().numpy()  # convert to [f, h, w, c]
+    frames = np.clip(frames * 255, 0, 255).astype(np.uint8)  # to [0, 255]
+
+    def make_frame(t):
+        return frames[min(int(t * fps), frames.shape[0] - 1)]  # clamp to the last frame
+
+    audio = AudioFileClip(audio_source)
+    try:
+        clip = VideoClip(make_frame, duration=frames.shape[0] / fps)
+        clip = clip.set_audio(audio.subclip(start / fps, (start + frames.shape[0]) / fps))
+        clip.write_videofile(output_video_file, fps=fps, audio_codec='aac')
+    finally:
+        audio.close()  # release the audio reader even when writing fails
+
+def merge_videos(input_directory, output_file):
+    """
+    Concatenate the ``.mp4`` files of `input_directory` into `output_file`.
+
+    Clips are joined in sorted filename order and encoded with libx264.
+
+    Raises:
+        ValueError: If the directory contains no ``.mp4`` file.
+    """
+    names = sorted(f for f in os.listdir(input_directory) if f.endswith('.mp4'))
+    if not names:
+        raise ValueError(f"No .mp4 files found in {input_directory}")
+    clips = [VideoFileClip(os.path.join(input_directory, name)) for name in names]
+    try:
+        concatenate_videoclips(clips).write_videofile(output_file, codec="libx264")
+    finally:
+        for clip in clips:  # release the readers even when writing fails
+            clip.close()
+
+
+silhouette_ids = [  # landmark indices whose bounding box get_face_mask fills
+    10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288,
+    397, 365, 379, 378, 400, 377, 152, 148, 176, 149, 150, 136,
+    172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109
+]  # NOTE(review): presumably MediaPipe face-mesh numbering -- confirm
+lip_ids = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291,  # landmark indices whose
+           146, 91, 181, 84, 17, 314, 405, 321, 375]  # bounding box get_lip_mask fills
+
+
+def compute_face_landmarks(detection_result, h, w):
+    """
+    Convert the landmarks of a single detected face to pixel coordinates.
+
+    Args:
+        detection_result: Object whose ``face_landmarks`` lists one landmark sequence per face.
+        h (int): The height of the video frame.
+        w (int): The width of the video frame.
+
+    Returns:
+        list: ``[x * w, y * h]`` per landmark, or ``[]`` unless exactly one face is present.
+    """
+    faces = detection_result.face_landmarks
+    if len(faces) == 1:
+        return [[point.x * w, point.y * h] for point in faces[0]]
+    print("#face is invalid:", len(faces))
+    return []
+
+
+def get_landmark(
+    file,
+    model_path="pretrain_weights/face/face_landmarker_v2_with_blendshapes.task"
+):
+    """
+    Detect the facial landmarks of the single face in an image file.
+
+    A MediaPipe ``FaceLandmarker`` is created in IMAGE running mode for this
+    call and closed again before returning.
+
+    Args:
+        file (str): Path of the image to process; converted with ``str()``.
+        model_path (str): Path of the face landmarker ``.task`` model asset.
+
+    Returns:
+        tuple: ``(landmarks, height, width)``. ``landmarks`` is the
+        ``np.array`` of the ``[x, y]`` pixel coordinates produced by
+        ``compute_face_landmarks`` (an empty array when that helper returns
+        ``[]``, i.e. unless exactly one face is found); ``height`` and
+        ``width`` are the image size reported by MediaPipe.
+    """
+    vision = mp.tasks.vision
+    options = vision.FaceLandmarkerOptions(
+        base_options=mp.tasks.BaseOptions(model_asset_path=model_path),
+        running_mode=vision.RunningMode.IMAGE,
+    )
+    with vision.FaceLandmarker.create_from_options(options) as landmarker:
+        image = mp.Image.create_from_file(str(file))
+        height, width = image.height, image.width
+        face_landmark = compute_face_landmarks(landmarker.detect(image), height, width)
+
+    return np.array(face_landmark), height, width
+
+
+def get_landmark_overframes(landmark_model, frames_path):
+    """
+    Detect the facial landmarks of every frame image in a directory.
+
+    Args:
+        landmark_model: mediapipe landmark model instance
+        frames_path (str): Directory of frame images, visited in sorted name order.
+
+    Returns:
+        tuple: ``(face_landmarks, height, width)``: one landmark list per frame plus the last frame's size.
+
+    Raises:
+        ValueError: If ``frames_path`` contains no files (previously an UnboundLocalError).
+    """
+    frame_files = sorted(os.listdir(frames_path))
+    if not frame_files:
+        raise ValueError(f"No frames found in {frames_path}")
+    face_landmarks = []
+    for file in frame_files:
+        image = mp.Image.create_from_file(os.path.join(frames_path, file))
+        height, width = image.height, image.width
+        face_landmarks.append(compute_face_landmarks(landmark_model.detect(image), height, width))
+    return face_landmarks, height, width
+
+
+def get_lip_mask(landmarks, height, width, out_path=None, expand_ratio=2.0):
+    """
+    Build a white-on-black rectangular mask of the expanded lip bounding box.
+
+    Parameters:
+        landmarks (numpy.ndarray): Array of facial landmarks.
+        height (int): Height of the output lip mask image.
+        width (int): Width of the output lip mask image.
+        out_path (pathlib.Path): If given, the mask is written there instead of returned.
+        expand_ratio (float): Expand ratio of mask.
+
+    Returns:
+        numpy.ndarray or None: The uint8 mask, or None when ``out_path`` is set.
+    """
+    lip_points = np.take(landmarks, lip_ids, 0)
+    lo = np.round(np.min(lip_points, 0))
+    hi = np.round(np.max(lip_points, 0))
+    x0, x1, y0, y1 = expand_region([lo[0], hi[0], lo[1], hi[1]], width, height, expand_ratio)
+    lip_mask = np.zeros((height, width), dtype=np.uint8)
+    lip_mask[round(y0):round(y1), round(x0):round(x1)] = 255
+    if not out_path:
+        return lip_mask
+    cv2.imwrite(str(out_path), lip_mask)
+    return None
+
+
+def get_union_lip_mask(landmarks, height, width, expand_ratio=1):
+    """
+    Build one lip mask per landmark set and merge them with ``get_union_mask``.
+
+    Parameters:
+        landmarks: Iterable of landmark arrays; one lip mask is built per item.
+        height (int): Height of the output lip mask image.
+        width (int): Width of the output lip mask image.
+        expand_ratio (float): Expand ratio of mask.
+
+    Returns:
+        The value returned by ``get_union_mask`` for those masks.
+    """
+    lip_masks = [get_lip_mask(landmarks=item, height=height, width=width,
+                              expand_ratio=expand_ratio) for item in landmarks]
+    return get_union_mask(lip_masks)
+
+
+def get_face_mask(landmarks, height, width, out_path=None, expand_ratio=1.2):
+    """
+    Build a white-on-black rectangular mask of the expanded face bounding box.
+
+    The box spans the ``silhouette_ids`` landmarks, grown by ``expand_region``.
+
+    Args:
+        landmarks (numpy.ndarray): The landmarks of the face.
+        height (int): The height of the output face mask image.
+        width (int): The width of the output face mask image.
+        out_path (pathlib.Path): If given, the mask is written there instead of returned.
+        expand_ratio (float): Expand ratio of mask.
+
+    Returns:
+        numpy.ndarray or None: The uint8 mask, or None when ``out_path`` is set.
+    """
+    face_points = np.take(landmarks, silhouette_ids, 0)
+    lo = np.round(np.min(face_points, 0))
+    hi = np.round(np.max(face_points, 0))
+    x0, x1, y0, y1 = expand_region([lo[0], hi[0], lo[1], hi[1]], width, height, expand_ratio)
+    face_mask = np.zeros((height, width), dtype=np.uint8)
+    face_mask[round(y0):round(y1), round(x0):round(x1)] = 255
+    if not out_path:
+        return face_mask
+    cv2.imwrite(str(out_path), face_mask)
+    return None
+
+
+def get_union_face_mask(landmarks, height, width, expand_ratio=1):
+    """
+    Build one face mask per landmark set and merge them with ``get_union_mask``.
+
+    Args:
+        landmarks: Iterable of landmark arrays; one face mask is built per item.
+        height (int): The height of the output face mask image.
+        width (int): The width of the output face mask image.
+        expand_ratio (float): Expand ratio of mask.
+
+    Returns:
+        The value returned by ``get_union_mask`` for those masks; nothing is
+        written to disk by this function.
+    """
+    face_masks = [get_face_mask(landmarks=item, height=height, width=width,
+                                expand_ratio=expand_ratio) for item in landmarks]
+    return get_union_mask(face_masks)
+
+def get_mask(file, cache_dir, face_expand_raio):
+    """
+    Generate and cache the lip, face and background masks of an image.
+
+    Landmarks come from ``get_landmark``; each output is written to `cache_dir`
+    as ``<name>_<kind>.png``, ``<name>`` being the base name up to its first dot.
+
+    Args:
+        file (str): The path of the image to analyse.
+        cache_dir (str): The directory that receives the generated mask images.
+        face_expand_raio (float): Expand ratio forwarded to ``get_face_mask``.
+
+    Returns:
+        None
+    """
+    landmarks, height, width = get_landmark(file)
+    file_name = os.path.basename(file).split(".")[0]
+    lip_mask_path = os.path.join(cache_dir, f"{file_name}_lip_mask.png")
+    face_mask_path = os.path.join(cache_dir, f"{file_name}_face_mask.png")
+    face_blur_path = os.path.join(cache_dir, f"{file_name}_face_mask_blur.png")
+    sep_lip_path = os.path.join(cache_dir, f"{file_name}_sep_lip.png")
+    sep_background_path = os.path.join(cache_dir, f"{file_name}_sep_background.png")
+    sep_face_path = os.path.join(cache_dir, f"{file_name}_sep_face.png")
+
+    get_lip_mask(landmarks, height, width, lip_mask_path)
+    get_face_mask(landmarks, height, width, face_mask_path, face_expand_raio)
+    get_blur_mask(face_mask_path, face_blur_path, kernel_size=(51, 51))
+    get_blur_mask(lip_mask_path, sep_lip_path, kernel_size=(31, 31))
+    get_background_mask(face_blur_path, sep_background_path)
+    get_sep_face_mask(face_blur_path, sep_lip_path, sep_face_path)
+
+
+def expand_region(region, image_w, image_h, expand_ratio=1.0):
+    """
+    Expand the given region by a specified ratio, keeping it inside the image.
+
+    Each axis is scaled around its midpoint. A box that sticks out on one side is
+    shifted back into the image; if it is larger than the image it is clipped, so
+    the returned minimum is never negative (a negative value would wrap around
+    when used as a slice start).
+
+    Args:
+        region (tuple): A tuple containing the coordinates (min_x, max_x, min_y, max_y) of the region.
+        image_w (int): The width of the image.
+        image_h (int): The height of the image.
+        expand_ratio (float, optional): The ratio by which the region should be expanded. Defaults to 1.0.
+
+    Returns:
+        tuple: A tuple containing the expanded coordinates (min_x, max_x, min_y, max_y) of the region.
+    """
+
+    def expand_axis(low, high, limit):
+        # Scale [low, high] around its midpoint, then fit it into [0, limit].
+        mid = (high + low) // 2
+        half = (high - low) * expand_ratio // 2
+        low, high = mid - half, mid + half
+        if low < 0:
+            high -= low
+            low = 0
+        if high > limit:
+            low -= high - limit
+            high = limit
+        # Shifting left may push ``low`` below zero when the box exceeds the image.
+        return round(max(low, 0)), round(high)
+
+    min_x, max_x, min_y, max_y = region
+    # The two per-axis tuples concatenate into (min_x, max_x, min_y, max_y).
+    return expand_axis(min_x, max_x, image_w) + expand_axis(min_y, max_y, image_h)
+
+
+def get_blur_mask(file_path, output_file_path, resize_dim=(64, 64), kernel_size=(101, 101)):
+    """
+    Load a grayscale mask, process it with ``blur_mask`` and save the result.
+
+    Parameters:
+        file_path (str): Path to the input image file.
+        output_file_path (str): Path the processed mask is written to.
+        resize_dim (tuple): Dimensions to resize the images to.
+        kernel_size (tuple): Size of the kernel to use for Gaussian blur.
+
+    Returns:
+        str: A status message naming the saved file, or the file that failed to load.
+    """
+    mask = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
+    if mask is None:
+        # Nothing is written when the input could not be read.
+        return f"Failed to load image: {file_path}"
+
+    cv2.imwrite(output_file_path, blur_mask(mask, resize_dim=resize_dim, kernel_size=kernel_size))
+    return f"Processed, normalized, and saved: {output_file_path}"
+
+
+def blur_mask(mask, resize_dim=(64, 64), kernel_size=(51, 51)):
+    """
+    Resize a mask, apply a Gaussian blur and min-max normalize it to [0, 255].
+
+    Parameters:
+        mask (numpy.ndarray): The mask image, or None.
+        resize_dim (tuple): Dimensions to resize the images to.
+        kernel_size (tuple): Size of the kernel to use for Gaussian blur.
+
+    Returns:
+        numpy.ndarray or None: The processed mask, or None when ``mask`` is
+        None. No file is read or written here.
+    """
+    if mask is None:
+        return None
+
+    # Each step feeds the next: resize -> blur -> stretch to the 0..255 range.
+    resized = cv2.resize(mask, resize_dim)
+    blurred = cv2.GaussianBlur(resized, kernel_size, 0)
+    normalized_mask = cv2.normalize(blurred, None, 0, 255, cv2.NORM_MINMAX)
+    return normalized_mask
+
+def get_background_mask(file_path, output_file_path):
+    """
+    Invert a grayscale image and save the result.
+
+    The image is normalized to [0, 1], subtracted from 1.0 and scaled back to
+    uint8, assuming the input values are in the [0, 255] range.
+
+    Parameters:
+        file_path (str): Path to the input image file.
+        output_file_path (str): Path the inverted image is written to.
+
+    Returns:
+        None. Success or failure is reported with ``print``.
+    """
+    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
+
+    if image is not None:
+        # astype truncates the scaled float values rather than rounding them.
+        normalized = image / 255.0
+        inverted_image = ((1.0 - normalized) * 255).astype(np.uint8)
+        cv2.imwrite(output_file_path, inverted_image)
+        print(f"Processed and saved: {output_file_path}")
+        return
+
+    print(f"Failed to load image: {file_path}")
+
+
+def get_sep_face_mask(file_path1, file_path2, output_file_path):
+    """
+    Read two images, subtract the second one from the first, and save the result.
+
+    The subtraction uses ``cv2.subtract``. Nothing is written when either image
+    cannot be read or when their shapes differ; the problem is printed instead.
+
+    Parameters:
+        file_path1 (str): Path to the mask that is subtracted from.
+        file_path2 (str): Path to the mask that is subtracted.
+        output_file_path (str): Path the resulting mask is written to.
+    """
+    mask1 = cv2.imread(file_path1, cv2.IMREAD_GRAYSCALE)
+    mask2 = cv2.imread(file_path2, cv2.IMREAD_GRAYSCALE)
+
+    # Name exactly the unreadable file(s); the old message always blamed file_path1.
+    unreadable = [path for path, mask in ((file_path1, mask1), (file_path2, mask2)) if mask is None]
+    if unreadable:
+        print(f"Failed to load images: {', '.join(map(str, unreadable))}")
+        return
+
+    if mask1.shape != mask2.shape:
+        print(
+            f"Image shapes do not match for {file_path1}: {mask1.shape} vs {mask2.shape}"
+        )
+        return
+
+    cv2.imwrite(output_file_path, cv2.subtract(mask1, mask2))
+    print(f"Processed and saved: {output_file_path}")
+
+def resample_audio(input_audio_file: str, output_audio_file: str, sample_rate: int):
+    """Resample audio to ``sample_rate`` Hz with ffmpeg and return ``output_audio_file``."""
+    command = ["ffmpeg", "-y", "-v", "error", "-i", input_audio_file, "-ar", str(sample_rate), output_audio_file]
+    if subprocess.run(command, check=False).returncode != 0:
+        # Raised explicitly (same type as before) so the check survives ``python -O``.
+        raise AssertionError("Resample audio failed!")
+    return output_audio_file
+
+def get_face_region(image_path: str, detector):
+    """
+    Detect faces in an image and save a mask with their boxes filled white.
+
+    The mask is written to ``image_path`` with every "images" replaced by
+    "face_masks". Returns ``(image_path, mask)``; on any failure the error is
+    printed and ``(None, None)`` is returned instead.
+    """
+    try:
+        image = cv2.imread(image_path)
+        if image is None:
+            print(f"Failed to open image: {image_path}. Skipping...")
+            return None, None
+        # cv2.imread yields BGR, while ImageFormat.SRGB declares RGB channel order.
+        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image)
+        mask = np.zeros_like(image, dtype=np.uint8)  # three channels, like the image
+        for detection in detector.detect(mp_image).detections:
+            bbox = detection.bounding_box
+            top_left = (int(bbox.origin_x), int(bbox.origin_y))
+            bottom_right = (int(bbox.origin_x + bbox.width), int(bbox.origin_y + bbox.height))
+            cv2.rectangle(mask, top_left, bottom_right, (255, 255, 255), thickness=-1)
+        save_path = image_path.replace("images", "face_masks")
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        cv2.imwrite(save_path, mask)
+        return image_path, mask
+    except Exception as e:
+        print(f"Error processing image {image_path}: {e}")
+        return None, None
+
+
+def save_checkpoint(model: torch.nn.Module, save_dir: str, prefix: str, ckpt_num: int, total_limit: int = -1) -> None:
+    """
+    Save the model's state_dict to ``<save_dir>/<prefix>-<ckpt_num>.pth``.
+
+    If `total_limit` is positive, the oldest checkpoints of this `prefix` are
+    removed first so that at most `total_limit` remain once the new one is saved.
+    Only files named exactly ``<prefix>-<number>.pth`` count as checkpoints, so
+    a prefix that contains "-" works and files of another module whose name
+    merely starts with `prefix` are never deleted.
+
+    Args:
+        model (nn.Module): The model whose state_dict is to be saved.
+        save_dir (str): The directory where the checkpoint will be saved.
+        prefix (str): The prefix for the checkpoint file name.
+        ckpt_num (int): The checkpoint number to be saved.
+        total_limit (int, optional): The maximum number of checkpoints to keep.
+            Defaults to -1; any value <= 0 disables the removal.
+
+    Raises:
+        FileNotFoundError: If the save directory does not exist.
+        ValueError: If the checkpoint number is negative.
+        OSError: If there is an error saving the checkpoint.
+    """
+    import re  # local import: the pattern below is this function's only use
+    if not osp.exists(save_dir):
+        raise FileNotFoundError(
+            f"The save directory {save_dir} does not exist.")
+
+    if ckpt_num < 0:
+        raise ValueError(f"Checkpoint number {ckpt_num} must be non-negative.")
+
+    save_path = osp.join(save_dir, f"{prefix}-{ckpt_num}.pth")
+
+    if total_limit > 0:
+        # Parse the number with an anchored pattern instead of split("-"), which
+        # picked the wrong field (or raised) for prefixes containing "-".
+        pattern = re.compile(re.escape(prefix) + r"-(\d+)\.pth")
+        numbered = []
+        for name in os.listdir(save_dir):
+            match = pattern.fullmatch(name)
+            if match:
+                numbered.append((int(match.group(1)), name))
+        checkpoints = [name for _, name in sorted(numbered)]
+        if len(checkpoints) >= total_limit:
+            # Leave room for the checkpoint that is about to be written.
+            removing_checkpoints = checkpoints[:len(checkpoints) - total_limit + 1]
+            print(
+                f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+            )
+            print(f"Removing checkpoints: {', '.join(removing_checkpoints)}")
+            for removing_checkpoint in removing_checkpoints:
+                removing_checkpoint_path = osp.join(save_dir, removing_checkpoint)
+                try:
+                    os.remove(removing_checkpoint_path)
+                except OSError as e:
+                    print(f"Error removing checkpoint {removing_checkpoint_path}: {e}")
+
+    try:
+        torch.save(model.state_dict(), save_path)
+        print(f"Checkpoint saved at {save_path}")
+    except OSError as e:
+        raise OSError(f"Error saving checkpoint at {save_path}: {e}") from e
+
+
+def init_output_dir(dir_list: List[str]):
+    """
+    Create every directory in ``dir_list`` (parents included) if it is missing.
+
+    Directories that already exist are left untouched (``exist_ok=True``).
+
+    Args:
+        dir_list (List[str]): List of directory paths to create.
+    """
+    for directory in dir_list:
+        os.makedirs(directory, exist_ok=True)
+
+
+def load_checkpoint(cfg, save_dir, accelerator):
+    """
+    Restore accelerator state from a checkpoint and report the step to resume at.
+
+    When ``cfg.resume_from_checkpoint`` is "latest", the entry of `save_dir` whose
+    name starts with "checkpoint" and carries the highest number after the first
+    "-" is loaded; if there is none, nothing is loaded and training starts from
+    scratch. Any other value is taken as a checkpoint name inside `save_dir` and
+    loaded directly.
+
+    Args:
+        cfg: The configuration object containing training parameters.
+        save_dir (str): The directory where checkpoints are saved.
+        accelerator: The accelerator object for distributed training; its
+            ``load_state`` and ``print`` methods are used.
+
+    Returns:
+        int: The global step at which to resume training (the number parsed
+        from the checkpoint name, or 0 without a checkpoint).
+    """
+    def step_of(name):
+        return int(name.split("-")[1])
+
+    if cfg.resume_from_checkpoint != "latest":
+        resume_dir = cfg.resume_from_checkpoint
+        accelerator.load_state(os.path.join(save_dir, resume_dir))
+        accelerator.print(f"Resuming from checkpoint {resume_dir}")
+        return step_of(resume_dir)
+
+    resume_dir = save_dir
+    dirs = [d for d in os.listdir(resume_dir) if d.startswith("checkpoint")]
+    if not dirs:
+        accelerator.print(
+            f"Could not find checkpoint under {resume_dir}, start training from scratch")
+        return 0
+
+    path = sorted(dirs, key=step_of)[-1]
+    accelerator.load_state(os.path.join(resume_dir, path))
+    accelerator.print(f"Resuming from checkpoint {path}")
+    return step_of(path)
+
+
+def compute_snr(noise_scheduler, timesteps):
+    """
+    Compute the signal-to-noise ratio of a noise schedule at the given timesteps.
+
+    Follows
+    https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/
+            521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+
+    Args:
+        noise_scheduler: Object exposing an ``alphas_cumprod`` tensor.
+        timesteps (Tensor): Indices into ``alphas_cumprod``.
+
+    Returns:
+        Tensor: ``(sqrt(alphas_cumprod) / sqrt(1 - alphas_cumprod)) ** 2`` gathered
+        at `timesteps`, converted with ``.float()`` and expanded to their shape.
+    """
+
+    def gather(schedule):
+        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/
+        #              521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+        values = schedule.to(device=timesteps.device)[timesteps].float()
+        while len(values.shape) < len(timesteps.shape):
+            values = values[..., None]
+        return values.expand(timesteps.shape)
+
+    alphas_cumprod = noise_scheduler.alphas_cumprod
+    alpha = gather(alphas_cumprod**0.5)
+    sigma = gather((1.0 - alphas_cumprod) ** 0.5)
+    # Compute SNR.
+    return (alpha / sigma) ** 2
+
+
+def extract_audio_from_videos(video_path: Path, audio_output_path: Path) -> Path:
+    """
+    Extract the audio track of a video into a WAV file using ffmpeg.
+
+    The audio is written as 16-bit PCM (``pcm_s16le``) at 16000 Hz with two
+    channels, and an existing output file is overwritten (``-y``). The command
+    line is printed before it runs.
+
+    Args:
+        video_path (Path): The path to the input video file.
+        audio_output_path (Path): The path of the audio file to create.
+
+    Returns:
+        Path: `audio_output_path`, unchanged.
+
+    Raises:
+        subprocess.CalledProcessError: If the ffmpeg command fails to execute.
+    """
+    # Input side: overwrite the output (-y) and ignore the video stream (-vn).
+    ffmpeg_command = ['ffmpeg', '-y', '-i', str(video_path), '-vn']
+    # Output side: codec, sample rate and channel count.
+    ffmpeg_command += ['-acodec', "pcm_s16le", '-ar', '16000', '-ac', '2']
+    ffmpeg_command.append(str(audio_output_path))
+
+    print(f"Running command: {' '.join(ffmpeg_command)}")
+    try:
+        subprocess.run(ffmpeg_command, check=True)
+    except subprocess.CalledProcessError as e:
+        # Report the failure, then let the caller see the original exception.
+        print(f"Error extracting audio from video: {e}")
+        raise
+
+    return audio_output_path
+
+
+def convert_video_to_images(video_path: Path, output_dir: Path) -> Path:
+    """
+    Convert a video file into a sequence of PNG images using ffmpeg.
+
+    Frames are produced through the ``fps=25`` video filter and written
+    directly into `output_dir`, named with the ``%04d.png`` pattern (no
+    per-video sub-directory is created). The command line is printed before
+    it runs.
+
+    Args:
+        video_path (Path): The path to the input video file.
+        output_dir (Path): The directory that receives the images; joined with ``/``.
+
+    Returns:
+        Path: `output_dir`, unchanged.
+
+    Raises:
+        subprocess.CalledProcessError: If the ffmpeg command fails to execute.
+    """
+    ffmpeg_command = ['ffmpeg', '-i', str(video_path)]
+    # Frame-rate filter followed by the numbered output pattern.
+    ffmpeg_command += ['-vf', 'fps=25', str(output_dir / '%04d.png')]
+
+    print(f"Running command: {' '.join(ffmpeg_command)}")
+    try:
+        subprocess.run(ffmpeg_command, check=True)
+    except subprocess.CalledProcessError as e:
+        # Report the failure, then let the caller see the original exception.
+        print(f"Error converting video to images: {e}")
+        raise
+
+    return output_dir
+
+
+def get_union_mask(masks):
+    """
+    Compute the union of a list of masks and fill its bounding box.
+
+    The union is the per-pixel maximum. The bounding box of its non-zero pixels is
+    then set to the union's maximum value. The input masks are never modified.
+
+    Args:
+        masks (list of np.ndarray): List of masks to be combined.
+
+    Returns:
+        np.ndarray: The filled union mask. ``None`` is returned for an empty list and
+        ``0.0`` when no mask has a non-zero pixel (both kept for backward compatibility).
+    """
+    union_mask = None
+    for mask in masks:
+        if union_mask is None:
+            # Copy so that filling the bounding box cannot alter the caller's first mask.
+            union_mask = np.array(mask)
+        else:
+            union_mask = np.maximum(union_mask, mask)
+    if union_mask is None:
+        return None
+
+    # Find the bounding box of the non-zero regions in the mask
+    rows = np.any(union_mask, axis=1)
+    cols = np.any(union_mask, axis=0)
+    try:
+        ymin, ymax = np.where(rows)[0][[0, -1]]
+        xmin, xmax = np.where(cols)[0][[0, -1]]
+    except IndexError as e:  # raised when there is no non-zero pixel at all
+        print(str(e))
+        return 0.0
+    union_mask[ymin: ymax + 1, xmin: xmax + 1] = np.max(union_mask)
+    return union_mask
+
+
+def move_final_checkpoint(save_dir, module_dir, prefix):
+    """
+    Copy the latest ``<prefix>-<number>`` checkpoint to ``<save_dir>/<prefix>.pth``.
+
+    The file is copied, not moved, despite the function name.
+
+    Args:
+        save_dir (str): The directory where the final checkpoint file should be saved.
+        module_dir (str): The directory containing the checkpoint files.
+        prefix (str): The prefix used to identify checkpoint files.
+
+    Raises:
+        ValueError: If no checkpoint files are found with the specified prefix.
+    """
+    import re  # local import: only needed for the file-name pattern
+    pattern = re.compile(re.escape(prefix) + r"-(\d+)(?:\..*)?")
+    found = [m for m in map(pattern.fullmatch, os.listdir(module_dir)) if m]
+    if not found:
+        raise ValueError(f"No checkpoint files with prefix {prefix!r} found in {module_dir}")
+    latest = max(found, key=lambda m: int(m.group(1))).group(0)
+    shutil.copy2(os.path.join(module_dir, latest), os.path.join(save_dir, prefix + '.pth'))